In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score, confusion_matrix
import sklearn
import math
import warnings
import seaborn as sns
from imblearn.over_sampling import SMOTE


warnings.filterwarnings("ignore")


In [2]:
data = pd.read_csv("water_potability.csv")

print(data.head())

columns  = data.columns


         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0       NaN  204.890455  20791.318981     7.300212  368.516441    564.308654   
1  3.716080  129.422921  18630.057858     6.635246         NaN    592.885359   
2  8.099124  224.236259  19909.541732     9.275884         NaN    418.606213   
3  8.316766  214.373394  22018.417441     8.059332  356.886136    363.266516   
4  9.092223  181.101509  17978.986339     6.546600  310.135738    398.410813   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       10.379783        86.990970   2.963135           0  
1       15.180013        56.329076   4.500656           0  
2       16.868637        66.420093   3.055934           0  
3       18.436524       100.341674   4.628771           0  
4       11.558279        31.997993   4.075075           0  


In [3]:
# Missing values

def change_mean():
  for i in data.columns[data.isnull().any(axis=0)]: 
    data[i] = data.groupby("Potability")[i].transform(lambda x: x.fillna(x.mean()))

def drop_cols(data):
    for i in range(data.shape[0]):
        if (math.isnan(data.loc[i,'ph']) or math.isnan(data.loc[i, 'Sulfate']) or math.isnan(data.loc[i, "Trihalomethanes"])):
            data = data.drop(i, axis = 0)
    return data

# Outliers

def change_mean_out():
  for col in columns:
      mean = np.mean(data.loc[:, col])
      std = np.std(data.loc[:, col])
      for i in range(data.shape[0]):
        if data.loc[i,col] > mean + 2 * std:
          data[col] = data.groupby("Potability")[col].transform(lambda x: x.fillna(x.mean()))

def drop_out():
    for col in columns:
      mean = np.mean(data.loc[:, col])
      std = np.std(data.loc[:, col])
      for i in range(data.shape[0]):
        if data.loc[i,col] > mean + 2 * std:
          data.drop(i, axis = 0)

#data = drop_cols(data)
change_mean()
change_mean_out()

print(data.isnull().sum())
print(data.shape)

y = data["Potability"]
x = data.drop(["Potability"], axis = 1)

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64
(3276, 10)


In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)

In [14]:
svc = SVC()

search_space = { "C": [1,2],
                "kernel" : ["linear", "rbf"]}

model1 = GridSearchCV(svc, search_space)

model1.fit(x_train, y_train) 

y_pred = model1.predict(x_test)

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.66      0.70       621
           1       0.51      0.60      0.55       362

    accuracy                           0.64       983
   macro avg       0.62      0.63      0.62       983
weighted avg       0.65      0.64      0.64       983



In [16]:
scores = cross_validate(model1, x_train, y_train, cv=5, scoring=['accuracy', 'f1', 'recall'])

print(np.mean(scores['test_accuracy']), np.mean(scores['test_f1']), np.mean(scores['test_recall']))

0.6623250288731233 0.6641085802645277 0.668171277997365


In [28]:
y_pred = model1.predict(x_test)

print(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred), recall_score(y_test, y_pred))
print()

confusion_matrix(y_test, y_pred)

0.6378433367243134 0.549367088607595 0.5994475138121547



array([[410, 211],
       [145, 217]])