# Predicting the quality of freshwater

### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("data\waterquality.csv", )

In [3]:
df.head()

Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Turbidity,Fluoride,Copper,Odor,Sulfate,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Color,Source,Target
0,6.703964,0.000137,10.552672,211.930128,1.888592e-48,2.848238,0.081378,1.61056,0.46088,1.278813,89.29089,570.051816,3.265975,7.472054e-12,330.24163,Colorless,Reservoir,1
1,5.396613,5e-06,5.703348,225.017979,1.917085e-28,0.400466,0.154662,0.070338,0.162591,1.038821,247.846145,435.979961,3.287618,2.900787e-07,80.026017,Colorless,Spring,1
2,7.337099,0.002543,5.505768,104.419913,1.744962e-112,1.710885,0.005547,0.790912,1.287845,3.087267,192.573085,634.09255,2.903765,7.605443e-07,541.543025,Colorless,River,1
3,7.662773,0.000504,8.314857,125.43499,4.419118e-53,0.177662,0.960557,0.399637,1.40097,1.120277,162.364348,456.736456,3.487772,5.379401e-07,26.562335,Near Colorless,Spring,1
4,6.300318,0.002024,6.444564,124.231733,3.7092310000000003e-54,1.669923,1.07809,0.078835,0.017711,1.417753,55.333114,397.080817,3.4982,0.000796002,84.817161,Near Colorless,Lake,1


In [4]:
df.columns

Index(['pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc', 'Turbidity',
       'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity', 'Chlorine',
       'Manganese', 'Total Dissolved Solids', 'Color', 'Source', 'Target'],
      dtype='object')

In [5]:
X = df.drop(['Target'], axis = 1)
y = df['Target']

#### Scaling and Encoding dataframe

In [6]:
numeric_columns = X.select_dtypes(exclude="object").columns
categorical_columns = X.select_dtypes(include="object").columns

In [7]:
print(numeric_columns)
print(categorical_columns)

Index(['pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc', 'Turbidity',
       'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity', 'Chlorine',
       'Manganese', 'Total Dissolved Solids'],
      dtype='object')
Index(['Color', 'Source'], dtype='object')


In [12]:
numeric_transformer = StandardScaler()
ohe_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", ohe_transformer, categorical_columns),
         ("StandardScaler", numeric_transformer, numeric_columns),
    ]
)

In [13]:
X = preprocessor.fit_transform(X)

In [14]:
X.shape

(536698, 28)

In [15]:
X[0]

array([ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.7495507 , -0.50809435,
        1.8940207 ,  0.58525485, -0.09965082,  1.26871761, -0.67195213,
        1.19317522,  0.11126334, -0.54258042, -0.92259869,  0.9535978 ,
        0.0351599 , -0.50683571,  0.35929633])

Train Test Split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((429358, 28), (107340, 28))

### Model building

In [12]:
def evaluate_model(true, predicted):
    score = accuracy_score(true, predicted)
    return score

In [13]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "AdaBoosting Classifier": AdaBoostClassifier(),
    "Catboost Clssifier" : CatBoostClassifier()
}
model_list = []
accuracyscore = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    score_train = evaluate_model(y_train, y_train_pred)
    score_test = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    
    print("- Accuracy Score: {:.4f}".format(score_train))

    print('----------------------------------')
    
    print("- Accuracy Score: {:.4f}".format(score_test))
    accuracyscore.append(score_test)
    
    print('='*35)
    print('\n')

Logistic Regression
- Accuracy Score: 0.7567
----------------------------------
- Accuracy Score: 0.7547


Decision Tree
- Accuracy Score: 1.0000
----------------------------------
- Accuracy Score: 0.8987


Random Forest Classifier
- Accuracy Score: 1.0000
----------------------------------
- Accuracy Score: 0.9446


XGBClassifier
- Accuracy Score: 0.9441
----------------------------------
- Accuracy Score: 0.9427


AdaBoosting Classifier
- Accuracy Score: 0.8836
----------------------------------
- Accuracy Score: 0.8823


Learning rate set to 0.137135
0:	learn: 0.5890482	total: 236ms	remaining: 3m 55s
1:	learn: 0.5146812	total: 290ms	remaining: 2m 24s
2:	learn: 0.4617299	total: 341ms	remaining: 1m 53s
3:	learn: 0.4208664	total: 394ms	remaining: 1m 38s
4:	learn: 0.3898967	total: 455ms	remaining: 1m 30s
5:	learn: 0.3633280	total: 513ms	remaining: 1m 25s
6:	learn: 0.3417333	total: 574ms	remaining: 1m 21s
7:	learn: 0.3233304	total: 634ms	remaining: 1m 18s
8:	learn: 0.3085184	total: 703m

145:	learn: 0.1865423	total: 13.4s	remaining: 1m 18s
146:	learn: 0.1865216	total: 13.5s	remaining: 1m 18s
147:	learn: 0.1864875	total: 13.6s	remaining: 1m 18s
148:	learn: 0.1864606	total: 13.7s	remaining: 1m 18s
149:	learn: 0.1864292	total: 13.8s	remaining: 1m 18s
150:	learn: 0.1863955	total: 13.9s	remaining: 1m 18s
151:	learn: 0.1863744	total: 14s	remaining: 1m 18s
152:	learn: 0.1863439	total: 14.1s	remaining: 1m 18s
153:	learn: 0.1863051	total: 14.2s	remaining: 1m 17s
154:	learn: 0.1862716	total: 14.3s	remaining: 1m 17s
155:	learn: 0.1862318	total: 14.4s	remaining: 1m 17s
156:	learn: 0.1861994	total: 14.5s	remaining: 1m 17s
157:	learn: 0.1861677	total: 14.6s	remaining: 1m 17s
158:	learn: 0.1861511	total: 14.6s	remaining: 1m 17s
159:	learn: 0.1861243	total: 14.7s	remaining: 1m 17s
160:	learn: 0.1860859	total: 14.8s	remaining: 1m 17s
161:	learn: 0.1860171	total: 14.9s	remaining: 1m 17s
162:	learn: 0.1859891	total: 15s	remaining: 1m 17s
163:	learn: 0.1859606	total: 15.1s	remaining: 1m 1

303:	learn: 0.1820778	total: 27.8s	remaining: 1m 3s
304:	learn: 0.1820440	total: 27.9s	remaining: 1m 3s
305:	learn: 0.1820153	total: 28s	remaining: 1m 3s
306:	learn: 0.1819946	total: 28.1s	remaining: 1m 3s
307:	learn: 0.1819647	total: 28.2s	remaining: 1m 3s
308:	learn: 0.1819364	total: 28.3s	remaining: 1m 3s
309:	learn: 0.1819169	total: 28.4s	remaining: 1m 3s
310:	learn: 0.1818893	total: 28.5s	remaining: 1m 3s
311:	learn: 0.1818737	total: 28.6s	remaining: 1m 2s
312:	learn: 0.1818586	total: 28.6s	remaining: 1m 2s
313:	learn: 0.1818276	total: 28.7s	remaining: 1m 2s
314:	learn: 0.1818038	total: 28.8s	remaining: 1m 2s
315:	learn: 0.1817812	total: 28.9s	remaining: 1m 2s
316:	learn: 0.1817460	total: 29s	remaining: 1m 2s
317:	learn: 0.1817135	total: 29.1s	remaining: 1m 2s
318:	learn: 0.1816916	total: 29.2s	remaining: 1m 2s
319:	learn: 0.1816662	total: 29.3s	remaining: 1m 2s
320:	learn: 0.1816429	total: 29.4s	remaining: 1m 2s
321:	learn: 0.1816183	total: 29.4s	remaining: 1m 1s
322:	learn: 0.18

463:	learn: 0.1781119	total: 42.3s	remaining: 48.8s
464:	learn: 0.1780894	total: 42.3s	remaining: 48.7s
465:	learn: 0.1780721	total: 42.4s	remaining: 48.6s
466:	learn: 0.1780537	total: 42.5s	remaining: 48.5s
467:	learn: 0.1780304	total: 42.6s	remaining: 48.4s
468:	learn: 0.1780168	total: 42.7s	remaining: 48.3s
469:	learn: 0.1779939	total: 42.8s	remaining: 48.3s
470:	learn: 0.1779828	total: 42.9s	remaining: 48.2s
471:	learn: 0.1779664	total: 43s	remaining: 48.1s
472:	learn: 0.1779397	total: 43.1s	remaining: 48s
473:	learn: 0.1779223	total: 43.2s	remaining: 47.9s
474:	learn: 0.1778931	total: 43.3s	remaining: 47.8s
475:	learn: 0.1778791	total: 43.3s	remaining: 47.7s
476:	learn: 0.1778598	total: 43.4s	remaining: 47.6s
477:	learn: 0.1778374	total: 43.5s	remaining: 47.5s
478:	learn: 0.1778140	total: 43.6s	remaining: 47.5s
479:	learn: 0.1777917	total: 43.7s	remaining: 47.4s
480:	learn: 0.1777600	total: 43.8s	remaining: 47.3s
481:	learn: 0.1777329	total: 43.9s	remaining: 47.2s
482:	learn: 0.17

622:	learn: 0.1745509	total: 56.6s	remaining: 34.2s
623:	learn: 0.1745383	total: 56.6s	remaining: 34.1s
624:	learn: 0.1745230	total: 56.7s	remaining: 34s
625:	learn: 0.1744984	total: 56.8s	remaining: 34s
626:	learn: 0.1744867	total: 56.9s	remaining: 33.9s
627:	learn: 0.1744648	total: 57s	remaining: 33.8s
628:	learn: 0.1744507	total: 57.1s	remaining: 33.7s
629:	learn: 0.1744275	total: 57.2s	remaining: 33.6s
630:	learn: 0.1744049	total: 57.3s	remaining: 33.5s
631:	learn: 0.1743876	total: 57.4s	remaining: 33.4s
632:	learn: 0.1743664	total: 57.4s	remaining: 33.3s
633:	learn: 0.1743545	total: 57.5s	remaining: 33.2s
634:	learn: 0.1743377	total: 57.6s	remaining: 33.1s
635:	learn: 0.1743085	total: 57.7s	remaining: 33s
636:	learn: 0.1742877	total: 57.8s	remaining: 32.9s
637:	learn: 0.1742670	total: 57.9s	remaining: 32.8s
638:	learn: 0.1742475	total: 58s	remaining: 32.7s
639:	learn: 0.1742315	total: 58s	remaining: 32.7s
640:	learn: 0.1742052	total: 58.1s	remaining: 32.6s
641:	learn: 0.1741841	to

782:	learn: 0.1710310	total: 1m 11s	remaining: 19.7s
783:	learn: 0.1710215	total: 1m 11s	remaining: 19.6s
784:	learn: 0.1710034	total: 1m 11s	remaining: 19.5s
785:	learn: 0.1709933	total: 1m 11s	remaining: 19.4s
786:	learn: 0.1709809	total: 1m 11s	remaining: 19.3s
787:	learn: 0.1709565	total: 1m 11s	remaining: 19.2s
788:	learn: 0.1709348	total: 1m 11s	remaining: 19.1s
789:	learn: 0.1709123	total: 1m 11s	remaining: 19.1s
790:	learn: 0.1708890	total: 1m 11s	remaining: 19s
791:	learn: 0.1708576	total: 1m 11s	remaining: 18.9s
792:	learn: 0.1708373	total: 1m 11s	remaining: 18.8s
793:	learn: 0.1708159	total: 1m 12s	remaining: 18.7s
794:	learn: 0.1708028	total: 1m 12s	remaining: 18.6s
795:	learn: 0.1707840	total: 1m 12s	remaining: 18.5s
796:	learn: 0.1707666	total: 1m 12s	remaining: 18.4s
797:	learn: 0.1707465	total: 1m 12s	remaining: 18.3s
798:	learn: 0.1707256	total: 1m 12s	remaining: 18.2s
799:	learn: 0.1707019	total: 1m 12s	remaining: 18.1s
800:	learn: 0.1706891	total: 1m 12s	remaining: 1

938:	learn: 0.1679044	total: 1m 25s	remaining: 5.53s
939:	learn: 0.1678837	total: 1m 25s	remaining: 5.44s
940:	learn: 0.1678699	total: 1m 25s	remaining: 5.35s
941:	learn: 0.1678431	total: 1m 25s	remaining: 5.26s
942:	learn: 0.1678218	total: 1m 25s	remaining: 5.17s
943:	learn: 0.1678036	total: 1m 25s	remaining: 5.08s
944:	learn: 0.1677759	total: 1m 25s	remaining: 4.99s
945:	learn: 0.1677550	total: 1m 25s	remaining: 4.9s
946:	learn: 0.1677357	total: 1m 25s	remaining: 4.81s
947:	learn: 0.1677085	total: 1m 25s	remaining: 4.72s
948:	learn: 0.1676898	total: 1m 26s	remaining: 4.63s
949:	learn: 0.1676709	total: 1m 26s	remaining: 4.53s
950:	learn: 0.1676594	total: 1m 26s	remaining: 4.44s
951:	learn: 0.1676416	total: 1m 26s	remaining: 4.35s
952:	learn: 0.1676217	total: 1m 26s	remaining: 4.26s
953:	learn: 0.1675999	total: 1m 26s	remaining: 4.17s
954:	learn: 0.1675751	total: 1m 26s	remaining: 4.08s
955:	learn: 0.1675575	total: 1m 26s	remaining: 3.99s
956:	learn: 0.1675428	total: 1m 26s	remaining: 

In [23]:
params = {'bootstrap': [True],
#           'criterion' : ['gini','entropy','log_loss'],
#               'max_depth': list(range(1,100)),
              'max_features': ['auto'],
              'min_samples_leaf': [1, 2, 4],
              'n_estimators': [10, 20, 30, 40]
              }
grid_search = GridSearchCV(RandomForestClassifier(), param_grid=params, n_jobs=-1, return_train_score=True)
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)

  warn(


RandomForestClassifier(max_features='auto', n_estimators=40)


In [24]:
print(grid_search.best_estimator_)

RandomForestClassifier(max_features='auto', n_estimators=40)


In [26]:
model = RandomForestClassifier(max_features='sqrt', n_estimators=40)
model.fit(X_train, y_train)

In [27]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [28]:
score_train = evaluate_model(y_train, y_train_pred)
score_test = evaluate_model(y_test, y_test_pred)
print(score_train)
print(score_test)

0.9993082695559417
0.9441587479038569
