In [None]:
#Load Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
from matplotlib import cm
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier


from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, classification_report
#from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
import missingno as msno

In [None]:
# Read the water-potability CSV into a DataFrame and preview its first rows.
csv_path = './water_potability.csv'
df = pd.read_csv(csv_path)
df.head()

### DATA PRE-PROCESSING

In [None]:
# Rescale Solids by a factor of 1/100. The column is overwritten from itself,
# so re-running this cell divides it again.
df['Solids'] = df['Solids'].div(100)

In [None]:
# Preview the first rows again now that Solids has been divided by 100.
df.head()

In [None]:
# Build a pandas-profiling report for df, write it to an HTML file, and embed
# that file in the notebook through an IFrame.
import pandas_profiling as pp
from IPython.display import IFrame

report_file = 'WaterReport.html'
DataReport = pp.ProfileReport(df)
DataReport.to_file(report_file)
display(IFrame(report_file, width=900, height=350))

# EXPLORATORY DATA ANALYSIS 

In [None]:
# Overview of dataset characteristics: column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

In [None]:
# Mean of every column within each Potability class.
df.groupby('Potability').mean()

In [None]:
# Summary statistics for the numeric columns, transposed so each column of df
# becomes a row, and shaded with a viridis gradient.
numeric_kinds = ['float64', 'int64', 'float', 'int']
describeNum = df.describe(include=numeric_kinds)
describeNum.T.style.background_gradient(cmap='viridis', low=0.2, high=0.1)

In [None]:
# Pie chart of the class balance: one wedge per Potability value, sized by row count.
colors = ['#f94144', '#48cae4']
labels = ['Not Potable', 'Potable']
pieplot = df.groupby('Potability').size()
pieplot.plot(
    kind='pie',
    colors=colors,
    subplots=True,
    shadow=True,
    figsize=(7, 7),
    fontsize=9,
    autopct='%1.1f%%',
)
plt.title("Potability Values Distribution")
plt.legend(labels)
plt.ylabel("")

### Univariate Statistics

In [None]:
# Histogram of every numeric column; the figure is written to disk, then shown.
num_bins = 10

df.hist(figsize=(20, 15), bins=num_bins)
plt.savefig("water_histogram_plots")
plt.show()

In [None]:
# Skewness: one distribution plot per column, titled with the column name and
# labelled on the x-axis with that column's sample skewness.

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever name this installation provides.
dark_style = 'seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark'
plt.style.use(dark_style)
colors=['#00a8e8', '#00afb9',  '#48bfe3', '#006e90', '#20a4f3', '#00b4d8', '#0466c8', '#20a4f3', '#00008B','#1E90FF']

# Iterate the columns directly, so the loop runs exactly once per column
# whatever the column count is; the palette wraps around if there are more
# columns than colours.
for i, col in enumerate(df.columns):
    plt.figure(figsize=(6,4))
    sns.distplot(df[col],color=colors[i % len(colors)])
    plt.title(f'Distribution plot for {col}')
    plt.xlabel(f'Skewness = {round(df[col].skew(),3)}',fontsize=14)
    plt.show()

In [None]:
# 3x3 grid of 10-bin histograms with KDE overlays for the first nine columns.
plt.figure(figsize=(12, 10))
first_nine = df.columns[:9]
for panel, column in enumerate(first_nine, start=1):
    plt.subplot(3, 3, panel)
    sns.histplot(df[column], kde=True, alpha=0.3, bins=10, color='blue', common_norm=False)

In [None]:
# Sample skewness of every column, most positively skewed first.
df.skew().sort_values(ascending = False)

### Bivariate Statistics

In [None]:
# Bar chart of the number of rows in each Potability class.
sns.countplot(data = df, x = 'Potability')

In [None]:
# Skewness between -0.5 and 0.5 is treated here as approximately normal, which
# covers most features. Solids is slightly above 0.5 but is still treated as not skewed.

# Pairwise scatter plots of all columns, coloured by Potability class.
sns.pairplot(df, hue ='Potability')

In [None]:
# ph spread within each Potability class, as horizontal box plots.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='ph', y='Potability', orient='h', ax=ax)
ax.set(title='Ph distribution');

In [None]:
# Hardness spread within each Potability class, as horizontal box plots.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='Hardness', y='Potability', orient='h', ax=ax)
ax.set(title='Hardness distribution');

In [None]:
# Solids spread within each Potability class, as horizontal box plots.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='Solids', y='Potability', orient='h', ax=ax)
ax.set(title='Solids distribution');

In [None]:
# Chloramines spread within each Potability class, as horizontal box plots.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='Chloramines', y='Potability', orient='h', ax=ax)
ax.set(title='Chloramines distribution');

In [None]:
# Sulfate spread within each Potability class, as horizontal box plots.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='Sulfate', y='Potability', orient='h', ax=ax)
ax.set(title='Sulfate distribution');

In [None]:
# Conductivity spread within each Potability class, as horizontal box plots.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='Conductivity', y='Potability', orient='h', ax=ax)
ax.set(title='Conductivity distribution');

In [None]:
# Organic_carbon spread within each Potability class, as horizontal box plots.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='Organic_carbon', y='Potability', orient='h', ax=ax)
ax.set(title='Organic_carbon distribution');

In [None]:
# Trihalomethanes spread within each Potability class, as horizontal box plots.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='Trihalomethanes', y='Potability', orient='h', ax=ax)
ax.set(title='Trihalomethanes distribution');

In [None]:
# Turbidity spread within each Potability class, as horizontal box plots.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='Turbidity', y='Potability', orient='h', ax=ax)
ax.set(title='Turbidity distribution');

### Multivariate Statistics

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
fig, ax = plt.subplots(figsize=(10, 7))
feature_corr = df.corr()
sns.heatmap(feature_corr, annot=True, ax=ax)

In [None]:
# Single-column heatmap: correlation of each column with Potability, highest first.
plt.figure(figsize=(7, 10))
potability_corr = df.corr()[['Potability']].sort_values(by='Potability', ascending=False)
heatmap = sns.heatmap(potability_corr, annot=True, cmap='GnBu_r')
plt.title('Descending Correlation with Potability', pad=20, fontsize=16)

## HANDLING OUTLIERS

In [None]:
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Second name for the current frame (an alias, not a copy), used later for the
# before/after comparison around outlier removal.
df1=df

In [None]:
# Box plot of every column of df on a 6x3 grid, to eyeball outliers.
plt.figure(figsize=(15, 25))
for panel, feature in enumerate(df.columns, start=1):
    plt.subplot(6, 3, panel)
    sns.boxplot(y=df[feature])

In [None]:
#Removing outliers

# Only the feature columns are screened. The 0/1 target is excluded: for a
# binary label whose Q1 and Q3 coincide the IQR is 0, and every row of the
# minority class would be flagged as an outlier.
cols = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity'] # one or more

Q1 = df[cols].quantile(0.25)
Q3 = df[cols].quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: a row is an outlier if any screened column lies more than
# 1.5 * IQR below Q1 or above Q3. Comparisons against NaN are False, so rows
# with missing values are not flagged by those columns.
is_outlier = ((df[cols] < (Q1 - 1.5 * IQR)) | (df[cols] > (Q3 + 1.5 * IQR))).any(axis=1)

# .copy() makes the filtered frame independent of the original, so later
# column assignments on df do not hit chained-assignment warnings.
df = df[~is_outlier].copy()

print("Old Shape: ", df1.shape)
print("New Shape: ", df.shape)

In [None]:
# Box plot of every column of df1 (the frame captured before outlier removal) on a 6x3 grid.
plt.figure(figsize=(15, 25))
for panel, feature in enumerate(df1.columns, start=1):
    plt.subplot(6, 3, panel)
    sns.boxplot(y=df1[feature])

In [None]:
# Box plot of every column of df (the frame after outlier removal) on a 6x3 grid.
plt.figure(figsize=(15, 25))
for panel, feature in enumerate(df.columns, start=1):
    plt.subplot(6, 3, panel)
    sns.boxplot(y=df[feature])

## DUPLICATE VALUES

In [None]:
# Number of rows that exactly repeat an earlier row. A single count answers
# "are there duplicates?" without printing one boolean per row.
df.duplicated().sum()

## NULL VALUES - PREPROCESSING

In [None]:
# Number of missing (NaN) values in each column.
df.isnull().sum()

In [None]:
# missingno bar chart drawn from df, used here to visualise data completeness per column.
msno.bar(df, figsize = (16,5),color = "#483D8B")
plt.show()

In [None]:
# Count and percentage of missing entries per column, restricted to the columns
# that actually have gaps, shaded with a viridis gradient.
missing_counts = df.isnull().sum()
null = pd.DataFrame(missing_counts, columns=["Null Values"])
null["% Missing Values"] = df.isna().sum() / len(df) * 100
has_gaps = null["% Missing Values"] > 0
null = null[has_gaps]
null.style.background_gradient(cmap='viridis', low=0.2, high=0.1)

The columns with missing values are:

'ph', 14.98%  
'Sulfate', 23.84%  
'Trihalomethanes', 4.94%

It might not be a good idea to drop every column that has missing values. 
Let's continue exploring the dataset and then deal with these missing values.

The difference between mean and median values of potable water is also small.

ph: 7.0367 (median) 7.0737 (mean)
Sulfate: 331.8381 (median) 332.5670 (mean)
Trihalomethanes: 66.6782 (median) 66.5397 (mean)

## FILL THE GAP IN DATA

We can use the overall median of each feature to impute its missing values.

In [None]:
# Impute the three incomplete columns with their own overall medians.
# The fill is done with one frame-level fillna and the result is rebound to df:
# calling fillna(inplace=True) on df['col'] acts on an intermediate Series, which
# raises chained-assignment warnings and leaves df unchanged under pandas
# Copy-on-Write.
median_fill = {col: df[col].median() for col in ['ph', 'Sulfate', 'Trihalomethanes']}
df = df.fillna(value=median_fill)

In [None]:
# Re-check non-null counts and dtypes after imputation.
df.info()

In [None]:
# Missing values remaining per column after imputation.
df.isnull().sum()

# MODELLING

In [None]:
# Standardise the predictors so that no feature dominates a model purely
# because of its larger magnitude.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set statistics influence the scaling; consider fitting on the
# training rows only.
from sklearn.preprocessing import StandardScaler

y = df['Potability']
X = df.drop(columns='Potability')
features = X.columns

sc = StandardScaler()
X[features] = sc.fit_transform(X[features])
X

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

In [None]:
# import train test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [None]:
# (rows, features) of the training predictors.
X_train.shape

In [None]:
# (rows, features) of the test predictors.
X_test.shape

### Decision Tree

In [None]:
# Decision tree (entropy criterion, depth capped at 4): fit on the training
# split, predict the test split, and record accuracy and precision.
DeTree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    min_samples_leaf=1,
    random_state=42,
)
DeTree_pred = DeTree.fit(X_train, y_train).predict(X_test)
DeTree_acc = accuracy_score(y_test, DeTree_pred)
DeTree_prec = precision_score(y_test, DeTree_pred)

In [None]:
# Print the test-set accuracy, then the classification report for the Decision Tree predictions.
print("The accuracy for Decision Tree is", DeTree_acc)
print("The classification report using Decision Tree is:")
print(classification_report(y_test, DeTree_pred))

In [None]:
# Decision Tree confusion matrix with each cell shown as a share of all test rows.
# The figure is saved as 'Decision Tree'; the raw-count cell that follows saves
# to the same name.
DeTree_cm = confusion_matrix(y_test, DeTree_pred)
DeTree_share = DeTree_cm / DeTree_cm.sum()
sns.heatmap(DeTree_share, annot=True, fmt='0.2%', cmap='Blues')
plt.title('Decision Tree')
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.savefig('Decision Tree')

In [None]:
# Decision Tree confusion matrix as raw counts, saved as 'Decision Tree'.
DeTree_cm = confusion_matrix(y_test, DeTree_pred)
sns.heatmap(DeTree_cm, fmt='.2f', annot=True)
plt.title('Decision Tree')
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.savefig('Decision Tree')

### Random Forest

In [None]:
# Random forest (100 trees, at least 2 samples per leaf): fit on the training
# split, predict the test split, and record accuracy and precision.
RmTree = RandomForestClassifier(n_estimators=100, min_samples_leaf=2, random_state=42)
RmTree_pred = RmTree.fit(X_train, y_train).predict(X_test)
RmTree_acc = accuracy_score(y_test, RmTree_pred)
RmTree_prec = precision_score(y_test, RmTree_pred)

In [None]:
# Print the test-set accuracy, then the classification report for the Random Forest predictions.
print("The accuracy for Random Forest is", RmTree_acc)
print("The classification report using Random Forest is:")
print(classification_report(y_test, RmTree_pred))

In [None]:
# Random Forest confusion matrix with each cell shown as a share of all test rows.
RmTree_cm = confusion_matrix(y_test, RmTree_pred)
RmTree_share = RmTree_cm / RmTree_cm.sum()
sns.heatmap(RmTree_share, annot=True, fmt='0.2%', cmap='Blues')

In [None]:
# Confusion matrix for Random Forest (raw counts), saved as 'Random Forest'.
# The matrix is stored under the model's own name so the plot uses the value
# computed in this cell and the Decision Tree matrix is not overwritten.
RmTree_cm = confusion_matrix(y_test, RmTree_pred)
sns.heatmap(RmTree_cm, annot=True, fmt='.2f')
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Random Forest')
plt.savefig('Random Forest')

### Logistic Regression

In [None]:
# Logistic regression: fit on the training split, predict the test split, and
# record accuracy and precision.
LogReg = LogisticRegression(random_state=42)
LogReg_pred = LogReg.fit(X_train, y_train).predict(X_test)
LogReg_acc = accuracy_score(y_test, LogReg_pred)
LogReg_prec = precision_score(y_test, LogReg_pred)

In [None]:
# Print the test-set accuracy, then the classification report for the Logistic Regression predictions.
print("The accuracy for Logistic Regression is", LogReg_acc)
print("The classification report using Logistic Regression is:")
print(classification_report(y_test, LogReg_pred))

In [None]:
# Logistic Regression confusion matrix with each cell shown as a share of all test rows.
LogReg_cm = confusion_matrix(y_test, LogReg_pred)
LogReg_share = LogReg_cm / LogReg_cm.sum()
sns.heatmap(LogReg_share, annot=True, fmt='0.2%', cmap='Blues')

In [None]:
# Confusion matrix for Logistic Regression (raw counts), saved as 'Logistic Regression'.
# The matrix is stored under the model's own name so the plot uses the value
# computed in this cell and the Decision Tree matrix is not overwritten.
LogReg_cm = confusion_matrix(y_test, LogReg_pred)
sns.heatmap(LogReg_cm, annot=True, fmt='.2f')
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Logistic Regression')
plt.savefig('Logistic Regression')

### XGBoost

In [None]:
# XGBoost (250 trees of depth up to 8, learning rate 0.03): fit on the training
# split, predict the test split, and record accuracy and precision.
XGB = XGBClassifier(
    n_estimators=250,
    max_depth=8,
    learning_rate=0.03,
    random_state=0,
    n_jobs=5,
)
XGB.fit(X_train, y_train)
XGB_pred = XGB.predict(X_test)
XGB_acc = accuracy_score(y_test, XGB_pred)
XGB_prec = precision_score(y_test, XGB_pred)

In [None]:
# Print the test-set accuracy, then the classification report for the XGBoost predictions.
# The report header line carries no value: the report itself is printed on the next line.
print("The accuracy for XGBoost is", XGB_acc)
print("The classification report using XGBoost is:")
print(classification_report(y_test, XGB_pred))

In [None]:
# XGBoost confusion matrix with each cell shown as a share of all test rows.
XGB_cm = confusion_matrix(y_test, XGB_pred)
XGB_share = XGB_cm / XGB_cm.sum()
sns.heatmap(XGB_share, annot=True, fmt='0.2%', cmap='Blues')

In [None]:
# Confusion matrix for XGBoost (raw counts), saved as 'XGB'.
# The matrix is stored under the model's own name so the plot uses the value
# computed in this cell and the Decision Tree matrix is not overwritten.
XGB_cm = confusion_matrix(y_test, XGB_pred)
sns.heatmap(XGB_cm, annot=True, fmt='.2f')
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('XGB')
plt.savefig('XGB')

### KNeighbors Classifier

In [None]:
# k-nearest neighbours (k = 8): fit on the training split, predict the test
# split, and record accuracy and precision.
KNN = KNeighborsClassifier(n_neighbors=8, leaf_size=20)
KNN_pred = KNN.fit(X_train, y_train).predict(X_test)
KNN_acc = accuracy_score(y_test, KNN_pred)
KNN_prec = precision_score(y_test, KNN_pred)

In [None]:
# Print the test-set accuracy, then the classification report for the KNeighbors predictions.
# The report header line carries no value: the report itself is printed on the next line.
print("The accuracy for KNeighbors is", KNN_acc)
print("The classification report using KNeighbors is:")
print(classification_report(y_test, KNN_pred))

In [None]:
# KNN confusion matrix with each cell shown as a share of all test rows.
KNN_cm = confusion_matrix(y_test, KNN_pred)
KNN_share = KNN_cm / KNN_cm.sum()
sns.heatmap(KNN_share, annot=True, fmt='0.2%', cmap='Blues')

In [None]:
# Confusion matrix for KNN (raw counts), saved as 'KNN'.
# The matrix is stored under the model's own name so the plot uses the value
# computed in this cell and the Decision Tree matrix is not overwritten.
KNN_cm = confusion_matrix(y_test, KNN_pred)
sns.heatmap(KNN_cm, annot=True, fmt='.2f')
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('KNN')
plt.savefig('KNN')

### SVM

In [None]:
# create the model
SVM = SVC(kernel ='rbf', random_state = 42)
# model training
SVM.fit(X_train, y_train)
# prediction
SVM_pred = SVM.predict(X_test)
# accuracy
SVM_acc = accuracy_score(y_test, SVM_pred)
# precision (read by the precision summary table at the end of the notebook)
SVM_prec = precision_score(y_test, SVM_pred)
# The accuracy and classification report are printed by the next cell, as for
# the other models.


In [None]:
# Print the test-set accuracy, then the classification report for the SVM predictions.
# The report header line carries no value: the report itself is printed on the next line.
print("The accuracy for SVM is", SVM_acc)
print("The classification report using SVM is:")
print(classification_report(y_test, SVM_pred))

In [None]:
# SVM confusion matrix with each cell shown as a share of all test rows.
# Both the numerator and the total come from SVM_cm, the matrix computed here.
SVM_cm = confusion_matrix(y_test, SVM_pred)
sns.heatmap(SVM_cm/np.sum(SVM_cm), annot = True, fmt = '0.2%', cmap = 'Blues')

In [None]:
# Confusion matrix for SVM (raw counts), saved as 'SVM'.
# The matrix is stored under the model's own name so the plot uses the value
# computed in this cell and the Decision Tree matrix is not overwritten.
SVM_cm = confusion_matrix(y_test, SVM_pred)
sns.heatmap(SVM_cm, annot=True, fmt='.2f')
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('SVM')
plt.savefig('SVM')

### AdaBoost Classifier

In [None]:
# AdaBoost (200 estimators, learning rate 0.08): fit on the training split,
# predict the test split, and record accuracy and precision.
AdaBoost = AdaBoostClassifier(n_estimators=200, learning_rate=0.08, random_state=42)
AdaBoost_pred = AdaBoost.fit(X_train, y_train).predict(X_test)
AdaBoost_acc = accuracy_score(y_test, AdaBoost_pred)
AdaBoost_prec = precision_score(y_test, AdaBoost_pred)

In [None]:
# Print the test-set accuracy, then the classification report for the AdaBoost predictions.
# The report header line carries no value: the report itself is printed on the next line.
print("The accuracy for AdaBoost is", AdaBoost_acc)
print("The classification report using AdaBoost is:")
print(classification_report(y_test, AdaBoost_pred))

In [None]:
# AdaBoost confusion matrix with each cell shown as a share of all test rows.
# The heatmap is drawn from AdaBoost_cm, the matrix computed in this cell.
AdaBoost_cm = confusion_matrix(y_test, AdaBoost_pred)
sns.heatmap(AdaBoost_cm/np.sum(AdaBoost_cm), annot = True, fmt = '0.2%', cmap = 'Blues')

In [None]:
# AdaBoost confusion matrix as raw counts, saved as 'AdaBoost'.
AdaBoost_cm = confusion_matrix(y_test, AdaBoost_pred)
sns.heatmap(AdaBoost_cm, fmt='.2f', annot=True)
plt.title('AdaBoost')
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.savefig('AdaBoost')

# SUMMARY

In [None]:
# Test-set accuracy of every model, best first.
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost', 'KNeighbours', 'SVM', 'AdaBoost']
accuracy_values = [LogReg_acc, DeTree_acc, RmTree_acc, XGB_acc, KNN_acc, SVM_acc, AdaBoost_acc]
models = pd.DataFrame({'Model': model_names, 'Accuracy': accuracy_values})
models.sort_values(by='Accuracy', ascending=False)

In [None]:
# Test-set precision of every model, best first.
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost', 'KNeighbours', 'SVM', 'AdaBoost']
precision_values = [LogReg_prec, DeTree_prec, RmTree_prec, XGB_prec, KNN_prec, SVM_prec, AdaBoost_prec]
models1 = pd.DataFrame({'Model': model_names, 'Precision': precision_values})
models1.sort_values(by='Precision', ascending=False)



In [None]:
# Bar chart of model accuracy, bars ordered from lowest to highest.
plt.figure(figsize=(10, 5))
by_accuracy = models.sort_values("Accuracy").Model
sns.barplot(data=models, x='Model', y='Accuracy', order=by_accuracy, palette='Blues_d')


In [None]:
# Bar chart of model precision, bars ordered from lowest to highest.
plt.figure(figsize=(10, 5))
by_precision = models1.sort_values("Precision").Model
sns.barplot(data=models1, x='Model', y='Precision', order=by_precision, palette='Blues_d')

In [None]:
from sklearn import metrics

In [None]:
# ROC curves for six classifiers on one set of axes, each labelled with its AUC.

#set up plotting area
plt.figure(0).clf()

# (legend label, estimator class) in plotting order. Every estimator is built
# with its default hyper-parameters, separately from the models fitted earlier.
roc_candidates = [
    ("Logistic Regression", LogisticRegression),
    ("AdaBoostClassifier", AdaBoostClassifier),
    ("DecisionTreeClassifier", DecisionTreeClassifier),
    ("KNeighborsClassifier", KNeighborsClassifier),
    ("XGBClassifier", XGBClassifier),
    ("RandomForestClassifier", RandomForestClassifier),
]

for roc_label, estimator_cls in roc_candidates:
    # fit the model, score the positive class, and plot its ROC curve
    model = estimator_cls()
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc = round(metrics.roc_auc_score(y_test, y_pred), 4)
    plt.plot(fpr, tpr, label=roc_label + ", AUC=" + str(auc))

#add legend
plt.legend()

In [None]:
# ROC curves for seven classifiers (the six above plus an SVM) on one set of
# axes, each labelled with its AUC.

#set up plotting area with SVC
plt.figure(0).clf()

# (legend label, zero-argument factory) in plotting order. Every estimator uses
# default hyper-parameters except the SVM: SVC only provides predict_proba when
# built with probability=True, and random_state pins the internal
# cross-validation used to calibrate those probabilities.
roc_candidates = [
    ("Logistic Regression", LogisticRegression),
    ("AdaBoostClassifier", AdaBoostClassifier),
    ("DecisionTreeClassifier", DecisionTreeClassifier),
    ("KNeighborsClassifier", KNeighborsClassifier),
    ("XGBClassifier", XGBClassifier),
    ("RandomForestClassifier", RandomForestClassifier),
    ("SVM", lambda: SVC(probability=True, random_state=42)),
]

for roc_label, make_model in roc_candidates:
    # fit the model, score the positive class, and plot its ROC curve
    model = make_model()
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc = round(metrics.roc_auc_score(y_test, y_pred), 4)
    plt.plot(fpr, tpr, label=roc_label + ", AUC=" + str(auc))

#add legend
plt.legend()