<a href="https://colab.research.google.com/github/a-donat/Benchmarks_PyCaret/blob/main/Predicting_Urban_Water_Quality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# I. Set-Up

## I.A. Import Libraries and Download Data

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d mssmartypants/water-quality
! unzip '/content/water-quality.zip' -d '/content/data'

In [None]:
! pip install pycaret

In [3]:
#import matplotlib.pyplot as plt
#import seaborn as sns
import VisualizeDataAbbrev as viz

import numpy as np
import pandas as pd
from pycaret.classification import *
from sklearn.model_selection import train_test_split

## I.B. Load Data and Check Data Integrity

In [46]:
ds_df = pd.read_csv("/content/data/waterQuality1.csv")

In [47]:
ds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7999 entries, 0 to 7998
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   aluminium    7999 non-null   float64
 1   ammonia      7999 non-null   object 
 2   arsenic      7999 non-null   float64
 3   barium       7999 non-null   float64
 4   cadmium      7999 non-null   float64
 5   chloramine   7999 non-null   float64
 6   chromium     7999 non-null   float64
 7   copper       7999 non-null   float64
 8   flouride     7999 non-null   float64
 9   bacteria     7999 non-null   float64
 10  viruses      7999 non-null   float64
 11  lead         7999 non-null   float64
 12  nitrates     7999 non-null   float64
 13  nitrites     7999 non-null   float64
 14  mercury      7999 non-null   float64
 15  perchlorate  7999 non-null   float64
 16  radium       7999 non-null   float64
 17  selenium     7999 non-null   float64
 18  silver       7999 non-null   float64
 19  uraniu

In [48]:
ds_df.nunique()

aluminium       495
ammonia        2564
arsenic         107
barium          480
cadmium          23
chloramine      812
chromium         91
copper          201
flouride        151
bacteria        101
viruses          61
lead            200
nitrates       1803
nitrites        280
mercury          11
perchlorate    2999
radium          735
selenium         11
silver           51
uranium          10
is_safe           3
dtype: int64

In [49]:
ds_df["is_safe"].value_counts()

0        7084
1         912
#NUM!       3
Name: is_safe, dtype: int64

In [50]:
ds_df[ds_df["is_safe"]=="#NUM!"]

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
7551,0.03,#NUM!,0.08,0.79,0.07,0.08,0.05,0.58,0.34,0.0,...,0.183,4.37,1.43,0.007,0.62,2.54,0.07,0.05,0.05,#NUM!
7568,0.06,#NUM!,0.07,1.72,0.08,0.32,0.01,1.11,0.61,0.0,...,0.178,12.1,2.03,0.008,1.37,2.05,0.06,0.1,0.07,#NUM!
7890,0.01,#NUM!,0.08,0.49,0.0,0.07,0.09,0.06,0.72,0.57,...,0.088,9.57,1.45,0.009,7.67,7.7,0.03,0.05,0.02,#NUM!


In [51]:
ds_df = ds_df[ds_df["is_safe"]!="#NUM!"].copy()
ds_df["ammonia"] = ds_df["ammonia"].astype(float)
ds_df["is_safe"] = ds_df["ammonia"].astype(int)

In [52]:
ds_df.describe().round(5)

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
count,7996.0,7996.0,7996.0,7996.0,7996.0,7996.0,7996.0,7996.0,7996.0,7996.0,...,7996.0,7996.0,7996.0,7996.0,7996.0,7996.0,7996.0,7996.0,7996.0,7996.0
mean,0.6664,14.27821,0.16148,1.56793,0.0428,2.17759,0.2473,0.80594,0.77165,0.31971,...,0.09943,9.81925,1.32985,0.00519,16.46527,2.92011,0.04968,0.14781,0.04467,13.78914
std,1.26532,8.87893,0.25263,1.21623,0.03605,2.56721,0.27066,0.65359,0.43542,0.3295,...,0.05817,5.54198,0.57327,0.00297,17.68883,2.32281,0.02877,0.14357,0.02691,8.87189
min,0.0,-0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.04,6.5775,0.03,0.56,0.008,0.1,0.05,0.09,0.4075,0.0,...,0.048,5.0,1.0,0.003,2.17,0.82,0.02,0.04,0.02,6.0
50%,0.07,14.13,0.05,1.19,0.04,0.53,0.09,0.75,0.77,0.22,...,0.102,9.93,1.42,0.005,7.745,2.41,0.05,0.08,0.05,14.0
75%,0.28,22.1325,0.1,2.4825,0.07,4.24,0.44,1.39,1.16,0.61,...,0.151,14.61,1.76,0.008,29.4875,4.67,0.07,0.24,0.07,22.0
max,5.05,29.84,1.05,4.94,0.13,8.68,0.9,2.0,1.5,1.0,...,0.2,19.83,2.93,0.01,60.01,7.99,0.1,0.5,0.09,29.0



# II. Preprocessing

In [53]:
train_df, test_df = train_test_split(
    ds_df, test_size=0.20, stratify=ds_df["is_safe"], random_state=1)

# III. Create Models

In [54]:
exp_clf101 = setup(data=train_df, target = "is_safe", session_id=123,
                   fix_imbalance=True)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,is_safe
2,Target type,Multiclass
3,Original data shape,"(6396, 21)"
4,Transformed data shape,"(11189, 21)"
5,Transformed train set shape,"(9270, 21)"
6,Transformed test set shape,"(1919, 21)"
7,Numeric features,20
8,Preprocess,True
9,Imputation type,simple


In [55]:
best_model = compare_models(fold=5)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9958,0.9995,0.9958,0.9959,0.9958,0.9956,0.9956,10.014
dt,Decision Tree Classifier,0.9955,0.9977,0.9955,0.9957,0.9955,0.9954,0.9954,0.698
gbc,Gradient Boosting Classifier,0.9955,0.9977,0.9955,0.9957,0.9955,0.9954,0.9954,55.934
lda,Linear Discriminant Analysis,0.9535,0.9998,0.9535,0.9552,0.9534,0.9518,0.9519,0.138
lightgbm,Light Gradient Boosting Machine,0.9535,0.9996,0.9535,0.9557,0.9536,0.9518,0.9519,8.688
nb,Naive Bayes,0.9261,0.9993,0.9261,0.9281,0.926,0.9234,0.9234,0.33
qda,Quadratic Discriminant Analysis,0.862,0.9974,0.862,0.8671,0.861,0.8569,0.8571,0.216
rf,Random Forest Classifier,0.8566,0.9933,0.8566,0.86,0.8561,0.8513,0.8515,4.17
et,Extra Trees Classifier,0.8075,0.9932,0.8075,0.8114,0.8066,0.8004,0.8006,2.918
lr,Logistic Regression,0.4445,0.9625,0.4445,0.4524,0.4435,0.4246,0.4249,7.228


Processing:   0%|          | 0/65 [00:00<?, ?it/s]