In [1]:
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler

df1 = pd.read_csv('data/dataset_1.csv')

# Filter NaNs
df1 = df1.dropna()

df1

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
5,5.584087,188.313324,28748.687739,7.544869,326.678363,280.467916,8.399735,54.917862,2.559708,0
6,10.223862,248.071735,28749.716544,7.513408,393.663396,283.651634,13.789695,84.603556,2.672989,0
7,8.635849,203.361523,13672.091764,4.563009,303.309771,474.607645,12.363817,62.798309,4.401425,0
...,...,...,...,...,...,...,...,...,...,...
3267,8.989900,215.047358,15921.412018,6.297312,312.931022,390.410231,9.899115,55.069304,4.613843,1
3268,6.702547,207.321086,17246.920347,7.708117,304.510230,329.266002,16.217303,28.878601,3.442983,1
3269,11.491011,94.812545,37188.826022,9.263166,258.930600,439.893618,16.172755,41.558501,4.369264,1
3270,6.069616,186.659040,26138.780191,7.747547,345.700257,415.886955,12.067620,60.419921,3.669712,1


In [2]:
# Normalize the data
df = pd.DataFrame(MinMaxScaler().fit_transform(df1), columns=df1.columns)
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,0.587349,0.577747,0.386298,0.568199,0.647347,0.292985,0.654522,0.795029,0.630115,0.0
1,0.643654,0.441300,0.314381,0.439304,0.514545,0.356685,0.377248,0.202914,0.520358,0.0
2,0.388934,0.470876,0.506122,0.524364,0.561537,0.142913,0.249922,0.401487,0.219973,0.0
3,0.725820,0.715942,0.506141,0.521683,0.751819,0.148683,0.467200,0.658678,0.242428,0.0
4,0.610517,0.532588,0.237701,0.270288,0.495155,0.494792,0.409721,0.469762,0.585049,0.0
...,...,...,...,...,...,...,...,...,...,...
2006,0.636224,0.580511,0.277748,0.418063,0.522486,0.342184,0.310364,0.402799,0.627156,1.0
2007,0.470143,0.548826,0.301347,0.538273,0.498565,0.231359,0.565061,0.175889,0.395061,1.0
2008,0.817826,0.087434,0.656389,0.670774,0.369089,0.431872,0.563265,0.285745,0.578674,1.0
2009,0.424187,0.464092,0.459656,0.541633,0.615572,0.388360,0.397780,0.449156,0.440004,1.0


In [3]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets

train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, test_df

(            ph  Hardness    Solids  Chloramines   Sulfate  Conductivity  \
 1424  0.525466  0.629528  0.467144     0.363573  0.487715      0.236440   
 432   0.429643  0.563830  0.350030     0.665700  0.403650      0.237399   
 1286  0.721535  0.495438  0.509095     0.503643  0.532696      0.140552   
 76    0.485395  0.337296  0.314277     0.300391  0.417317      0.431427   
 1927  0.472168  0.452840  0.159127     0.563585  0.613787      0.215775   
 ...        ...       ...       ...          ...       ...           ...   
 1130  0.561832  0.320078  0.444271     0.486973  0.634366      0.591001   
 1294  0.456878  0.450013  0.248607     0.596296  0.693382      0.532510   
 860   0.536435  0.538774  0.143897     0.393779  0.821830      0.198577   
 1459  0.557049  0.562822  0.277302     0.470030  0.542245      0.447773   
 1126  0.481209  0.313154  0.282763     0.370133  0.700844      0.269565   
 
       Organic_carbon  Trihalomethanes  Turbidity  Potability  
 1424        0.558431 

In [4]:
# bisect the data into features and target (potability)
train_x = train_df.drop('Potability', axis=1)
train_y = train_df['Potability']

test_x = test_df.drop('Potability', axis=1)
test_y = test_df['Potability']

In [5]:
# Train a baseline linear SVM

from sklearn.svm import SVC

baseline_svm = SVC(kernel='linear', C=1).fit(train_x, train_y)
print(f'Baseline SVM score: {baseline_svm.score(test_x, test_y):0.2%}')


Baseline SVM score: 59.93%


In [6]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters to search
param_grid = {
    'C': [0.1, 0.25, 1, 1.5],
    'kernel': ['poly', 'rbf'],
    'degree': [2, 3],
    'gamma': ['scale', 'auto'],
}
cv = 5

grid_search = GridSearchCV(SVC(), param_grid, cv=cv, verbose=4, n_jobs=-1)
grid_search.fit(train_x, train_y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [7]:
best_model = grid_search.best_estimator_
print(f'Best model: {best_model}')

print(f'Best model score: {best_model.score(test_x, test_y):0.2%}')

Best model: SVC(C=0.1, kernel='poly')
Best model score: 69.87%


In [8]:
# Try a random forest model

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(
  n_estimators=500, 
  criterion='entropy',
  oob_score=True, 
  random_state=42, 
  n_jobs=-1
)
random_forest.fit(train_x, train_y)

In [9]:
print(f'Random Forest train accuracy: {random_forest.score(train_x, train_y):0.2%}')
print(f'The out of bag accuracy is: {random_forest.oob_score_:0.2%}')
print(f'Random Forest test accuracy: {random_forest.score(test_x, test_y):0.2%}')

Random Forest train accuracy: 100.00%
The out of bag accuracy is: 68.46%
Random Forest test accuracy: 70.20%
