In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score, confusion_matrix
import sklearn
import math
import warnings
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA


warnings.filterwarnings("ignore")


In [None]:
data = pd.read_csv("water_potability.csv")

print(data.head())

columns  = data.columns


   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [None]:
# Missing values

def change_mean():
  for i in data.columns[data.isnull().any(axis=0)]: 
    data[i] = data.groupby("Potability")[i].transform(lambda x: x.fillna(x.mean()))

def drop_cols(data):
    for i in range(data.shape[0]):
        if (math.isnan(data.loc[i,'ph']) or math.isnan(data.loc[i, 'Sulfate']) or math.isnan(data.loc[i, "Trihalomethanes"])):
            data = data.drop(i, axis = 0)
    return data

# Outliers

def change_mean_out():
  for col in columns:
      mean = np.mean(data.loc[:, col])
      std = np.std(data.loc[:, col])
      for i in range(data.shape[0]):
        if data.loc[i,col] > mean + 2 * std:
          data[col] = data.groupby("Potability")[col].transform(lambda x: x.fillna(x.mean()))

def drop_out():
    for col in columns:
      mean = np.mean(data.loc[:, col])
      std = np.std(data.loc[:, col])
      for i in range(data.shape[0]):
        if data.loc[i,col] > mean + 2 * std:
          data.drop(i, axis = 0)

#data = drop_cols(data)
change_mean()
change_mean_out()

print(data.isnull().sum())
print(data.shape)

pca = PCA(n_components=3)
pca.fit(data)
pca.transform(data)

y = data["Potability"]
x = data.drop(["Potability"], axis = 1)


fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
(1599, 12)


In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)

In [None]:
model = RandomForestClassifier()

search_space = { "criterion": ["gini","entropy"],
                "max_depth" : [10, 20, None]
                }

model1 = GridSearchCV(model, search_space)

model1.fit(x_train, y_train) 

y_pred = model1.predict(x_test)

In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.19      0.27      0.22        15
           5       0.75      0.74      0.74       214
           6       0.61      0.56      0.59       181
           7       0.54      0.61      0.57        61
           8       0.33      0.29      0.31         7

    accuracy                           0.63       480
   macro avg       0.40      0.41      0.41       480
weighted avg       0.64      0.63      0.64       480



In [7]:
scores = cross_validate(model1, x_train, y_train, cv=5, scoring=['accuracy', 'f1_weighted', 'recall_weighted'])

print(np.mean(scores['test_accuracy']), np.mean(scores['test_f1_weighted']), np.mean(scores['test_recall_weighted']))

0.872245352686529 0.8672495513934988 0.872245352686529


In [8]:
y_pred = model1.predict(x_test)

print(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average="weighted"), recall_score(y_test, y_pred, average="weighted"))
print()

confusion_matrix(y_test, y_pred)

0.6333333333333333 0.6374067524263239 0.6333333333333333



array([[  0,   0,   2,   0,   0,   0],
       [  3,   4,   4,   3,   1,   0],
       [  1,   7, 159,  42,   5,   0],
       [  1,   9,  45, 102,  22,   2],
       [  0,   1,   3,  18,  37,   2],
       [  0,   0,   0,   2,   3,   2]])