In [30]:
import pandas as pd

# Location of the dataset, relative to the notebook's working directory.
DATA_PATH = "water_potability.csv"
# Load the raw measurements into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [12]:
# Summarise column dtypes and non-null counts; some records don't have values for all attributes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [31]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [33]:
# Fill the gaps in each incomplete column with that column's own mean
# (Series.mean() skips NaN by default, so the mean is over observed values).
# NOTE(review): the means are taken over every row, before the train/test
# split further down, so held-out rows contribute to the fill values.
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    df[column] = df[column].fillna(df[column].mean())

In [34]:
# Re-count missing values to confirm the imputation left no NaN behind.
df.isna().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [35]:
# Separate the label column (y) from the remaining feature columns (X).
target = 'Potability'
y = df[target]
X = df.drop(columns=[target])
# Show both shapes as a sanity check that rows line up.
X.shape, y.shape

((3276, 9), (3276,))

In [None]:
# Many machine learning algorithms perform better when numerical input
# variables are scaled to a standard range.
# Standardization scales each input variable separately by subtracting the mean
# (called centering) and dividing by the standard deviation, which shifts the
# distribution to have a mean of zero and a standard deviation of one.
# Data scaling is a recommended pre-processing step when working with many machine learning algorithms.
# Data scaling can be achieved by normalizing or standardizing real-valued input
# and output variables.

In [36]:
# import StandardScaler to perform scaling
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [37]:
#spliting the dataset for  training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size = 0.2) #20% for training

In [None]:
#learning and prediction

In [38]:
# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Seed the estimator: the tree's internal randomness (feature permutation at
# each split) otherwise makes the fitted model vary from run to run.
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)  # train on the training split
pred_dt = model_dt.predict(X_test)  # predicted labels for the held-out rows

In [39]:
#accuracy for DecisionTree
# Fraction of held-out rows the decision tree labelled correctly.
accuracy_score(y_true=y_test, y_pred=pred_dt)

0.5685975609756098

In [40]:
# k-nearest neighbours classifier
from sklearn.neighbors import KNeighborsClassifier
# Use 8 neighbours rather than the library default of 5; fit() returns the
# estimator itself, so construction and training can be chained.
model_kn = KNeighborsClassifier(n_neighbors=8).fit(X_train, y_train)
pred_kn = model_kn.predict(X_test)  # predicted labels for the held-out rows

In [41]:
#accuracy for KN
# Fraction of held-out rows the k-NN model labelled correctly.
accuracy_score(y_true=y_test, y_pred=pred_kn)

0.6387195121951219

In [42]:
# Support vector machine classifier
from sklearn.svm import SVC
# Default hyper-parameters; fit() returns the estimator, so it can be chained.
model_svm = SVC().fit(X_train, y_train)
pred_svm = model_svm.predict(X_test)  # predicted labels for the held-out rows


In [43]:
# Calculating Accuracy Score
# Fraction of held-out rows the SVM labelled correctly.
accuracy_score(y_true=y_test, y_pred=pred_svm)

0.6783536585365854