In [None]:
## importing the Modules 
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tqdm
from tqdm import tqdm_notebook

import warnings
warnings.simplefilter(action='ignore', category=Warning)

In [None]:
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, r2_score, accuracy_score
from sklearn.model_selection import (GridSearchCV, KFold, train_test_split, cross_val_score)

from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import svm

from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier

In [None]:
# Load the water-quality table from the working directory into `wd`.
wd = pd.read_csv("water_potability.csv")

# Preview the first rows.
wd.head()

In [None]:
# Number of missing values in each column.
wd.isnull().sum()

In [None]:
# Column-wise minimum.
wd.min()

In [None]:
# Column-wise maximum.
wd.max()

In [None]:
# Spread of every column (max - min).
wd_range  = wd.max() - wd.min()
print(wd_range)

In [None]:
# Smallest Hardness value.
wd["Hardness"].min()

In [None]:
# Summary statistics, transposed so that each column of `wd` becomes a row.
wd.describe().T

In [None]:
# Mean of Solids.
wd["Solids"].mean()

In [None]:
# 25th percentile of Solids.
np.percentile(wd["Solids"],25)

In [None]:
# Horizontal bar chart of how many rows fall in each Potability class.
plt.figure(figsize=(12, 8))
dark_edges = sns.color_palette("dark")
sns.countplot(data=wd, y="Potability", orient="h", edgecolor=dark_edges);

In [None]:
# Histogram of pH with its mean (red) and median (yellow) marked.
plt.figure(figsize=(12, 8))
wd["ph"].hist(bins=30)
plt.axvline(wd["ph"].mean(), color='r', label="mean")
# The yellow line is the median; it used to carry the label "mean".
plt.axvline(wd["ph"].median(), color='y', label="median")
# The labels are only shown once a legend is drawn.
plt.legend();

In [None]:
# Histogram of Sulfate with its mean (red) and median (yellow) marked.
plt.figure(figsize=(12, 8))
wd["Sulfate"].hist(bins=30)
plt.axvline(wd["Sulfate"].mean(), color='r', label="mean")
# The yellow line is the median; it used to carry the label "mean".
plt.axvline(wd["Sulfate"].median(), color='y', label="median")
# The labels are only shown once a legend is drawn.
plt.legend();

In [None]:
# Columns that are each box-plotted against Potability in the next cell.
cols = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity']

In [None]:
def boxPlotter(dataset, columnName):
    """
    Draw one box plot of `columnName`, split by the "Potability" column.

    dataset    -- DataFrame holding both `columnName` and "Potability".
    columnName -- name of the column placed on the y axis.
    """
    plot_options = dict(x="Potability", y=columnName, data=dataset, kind="box")
    sns.catplot(**plot_options)


# One figure per entry of `cols`, with a notebook progress bar.
for column in tqdm_notebook(cols, desc = "Your Charts are being ready"):
    boxPlotter(wd, column)

In [None]:
# Annotated correlation heatmap; the upper-triangular copy of the matrix is
# used as the mask, so only the cells below the diagonal are drawn.
plt.figure(figsize=(20, 17))
wd_corr = wd.corr()
matrix = np.triu(wd_corr)
sns.heatmap(wd_corr, annot=True, linewidth=.8, mask=matrix, cmap="rocket");

In [None]:
# Pairwise plot grid of the columns of `wd`, coloured by the Potability class.
sns.pairplot(wd, hue="Potability", palette="husl")

In [None]:
def pieChartPlotter(dataset, columnName):
    """
    Draw a pie chart of how often each value of one column occurs.

    dataset    -- DataFrame that contains `columnName`.
    columnName -- name of the (categorical) column to summarise.

    Prints the value counts and the category labels, then shows the figure.
    Returns nothing.
    """
    values = dataset[columnName].value_counts()
    print(values)
    # Take the labels from the counts themselves: value_counts() is ordered by
    # frequency while unique() is ordered by first appearance, so pairing the
    # two could attach a label to the wrong wedge.
    labels = values.index.to_numpy()
    print(labels)

    pie, ax = plt.subplots(figsize=[10, 6])

    # One identical offset per wedge. The previous `[0.06] * unique()` was an
    # element-wise product with the category values, not list repetition.
    explode = [0.06] * len(values)
    patches, texts, autotexts = ax.pie(values, labels=labels, autopct='%1.2f%%', shadow=True,
                                       pctdistance=.5, explode=explode)

    plt.legend(patches, labels, loc="best")
    plt.title(columnName, color='white', fontsize=14)
    plt.setp(texts, color='white', fontsize=20)
    # Colours every percentage label, so no per-index call is needed (indexing
    # autotexts[1] failed for a column with a single category).
    plt.setp(autotexts, size=15, color='black')

    plt.show()


pieChartPlotter(wd, 'Potability')

In [None]:
# Non-null observation count of every column within each Potability class.
wd.groupby("Potability").count()

In [None]:
# Preview the first rows again.
wd.head()

In [None]:
# Lower fence for Sulfate: Q1 - 1.5 * IQR (missing values are omitted from the IQR).
sulfate_q1 = wd["Sulfate"].quantile(.25)
sulfate_iqr = stats.iqr(wd["Sulfate"], rng=(25, 75), nan_policy='omit')
lower_outlier = sulfate_q1 - 1.5*sulfate_iqr
lower_outlier

In [None]:
# Name the column by keyword: current seaborn reads the first positional
# argument as `data`, so a bare Series is no longer taken as the x variable.
sns.countplot(x="Potability", data=wd)
plt.show()

In [None]:
# Box plot of the Sulfate column.
plt.figure(figsize = (12,8))
wd["Sulfate"].plot(kind = "box")

In [None]:
# Number of missing values in each column.
wd.isnull().sum()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that is chained assignment, which leaves `wd` unchanged
# when pandas Copy-on-Write is active.

# ph values are evenly distributed, so the mean is used.
wd['ph'] = wd['ph'].fillna(wd['ph'].mean())
# Sulfate is slightly right-skewed and has outliers that would pull the mean, so the median is used.
wd['Sulfate'] = wd['Sulfate'].fillna(wd['Sulfate'].median())
# Trihalomethanes values are evenly distributed, so the mean is used.
wd['Trihalomethanes'] = wd['Trihalomethanes'].fillna(wd['Trihalomethanes'].mean())

In [None]:
# Re-check the missing-value counts after the imputation cell.
wd.isnull().sum()

In [None]:
# Box plot of each of the first nine columns of `wd` on a 3x3 grid.
colors = ['#ff0000','#fff000','#18fff9','#8f139f']
fig, axes = plt.subplots(3,3 ,figsize=(12,12))
column = wd.columns
fig.suptitle('Boxplots of each variable')
# Fill the grid row by row; the four colours repeat cyclically.
for panel_no, panel in enumerate(axes.flat):
    sns.boxplot(ax=panel, x=column[panel_no], data=wd, color=colors[panel_no % len(colors)])
plt.show()

In [None]:
# Kernel density estimate of the ph column.
wd["ph"].plot(kind = "kde")

In [None]:
## Kernel density estimate of each of the first nine columns of `wd` on a 3x3 grid
colors = ['#ff0000','#fff000','#18fff9','#8f139f']
fig, axes = plt.subplots(3,3 ,figsize=(12,12))
column = wd.columns
fig.suptitle('kdeplots of each variable')
# Fill the grid row by row; the four colours repeat cyclically.
for panel_no, panel in enumerate(axes.flat):
    sns.kdeplot(ax=panel, x=column[panel_no], data=wd, color=colors[panel_no % len(colors)])
plt.show()

In [None]:
# `column[0-8]` is plain arithmetic, i.e. `column[-8]`, which for the ten
# columns of this table is "Solids". The column is now named explicitly so the
# plot no longer depends on the column count.
# NOTE(review): if a plot per feature was intended, loop over the feature names instead.
sns.scatterplot(x="ph", y="Solids", hue="Potability", data=wd)

In [None]:
# Number of rows in each Potability class.
wd['Potability'].value_counts()

In [None]:
# Skewness of every column.
wd.skew()

In [None]:
# The same skewness values as a one-column frame.
wd.skew(axis=0).to_frame(name='Skewed Values')

In [None]:
# This cell previously held only the unfinished expression `wd.`, a SyntaxError
# that stops "Restart & Run All". A structural summary is shown instead.
# NOTE(review): the original intent is unknown; replace or delete this cell as appropriate.
wd.info()

In [None]:
# Bar chart of the number of rows per Potability class.
plt.figure(figsize=(8,5))

sns.countplot(x='Potability',data=wd,saturation=0.95)

In [None]:
# The same class counts as numbers.
wd['Potability'].value_counts()

In [None]:
# Share of each Potability class as a percentage of all rows.
class_counts = wd['Potability'].value_counts()
row_total = len(wd)
print("Potable water %", class_counts[1]/row_total*100)
print("Non-Potable water %", class_counts[0]/row_total*100)

In [None]:
# Mean of every column except the last one, shown as a single-row frame.
wd.iloc[:,:-1].mean().to_frame().T

In [None]:
# NOTE(review): this binds a second name to the same DataFrame, it is not a copy;
# every later change made through `df` is also visible through `wd`.
df=wd

In [None]:
# Kurtosis of every column.
df.kurtosis()

In [None]:
# Overlaid Solids distributions of potable (maroon) and non-potable (blue) samples.
# NOTE(review): sns.distplot is deprecated in current seaborn; it is kept here so the figure stays the same.
plt.figure(figsize=(13,12))
sns.distplot(df['Solids'][df.Potability==1],hist=True, rug=True,color='Maroon',hist_kws={'alpha':0.50},label='Potability')
sns.distplot(df['Solids'][df.Potability==0],hist=True, rug=True,color='Blue',hist_kws={'alpha':0.60},label='Non Potability')
# Each distribution carries its own label; passing a bare list to legend()
# attached the names to the first two drawn artists instead.
plt.legend()

In [None]:
# Overlaid ph distributions of potable (maroon) and non-potable (blue) samples.
# NOTE(review): sns.distplot is deprecated in current seaborn; it is kept here so the figure stays the same.
plt.figure(figsize=(13,12))
sns.distplot(df['ph'][df.Potability==1],hist=True, rug=True,color='Maroon',hist_kws={'alpha':0.50},label='Potability')
sns.distplot(df['ph'][df.Potability==0],hist=True, rug=True,color='Blue',hist_kws={'alpha':0.60},label='Non Potability')
# Each distribution carries its own label; passing a bare list to legend()
# attached the names to the first two drawn artists instead.
plt.legend()

- The pH of both potable and non-potable water is within the WHO standard range, i.e. between 6.5 and 8.5.
- The solids content of both potable and non-potable water is high, but it is lower in potable
   water.
- The sulfate content of non-potable water is much higher than that of potable water, but both are still drinkable in this respect because the sulfate content is below 1000 mg/L.
- The trihalomethanes content is higher in non-potable water; although it is below the permissible limit, this non-potable water is still not recommended for drinking.
- The conductivity of the non-potable water is far above the permissible limit. This is because of its content of solids, organic matter, etc.
- From the mean table we can also see that most water is below pH 8.5, but its conductivity is high, which makes it unhealthy. Because the conductivity, the trihalomethanes level and other levels are considerably higher than in potable water, it is unhealthy to drink. Therefore almost 60% of the water is non-potable.

In [None]:
# Annotated correlation heatmap of every column except the last one.
plt.figure(figsize=(18,15))
all_but_last = df.iloc[:, :-1]
corr = all_but_last.corr()
corr_feat = corr.index
# `corr` already holds the correlations of exactly the `corr_feat` columns.
sns.heatmap(corr, cmap='cividis', annot=True, linewidths=2)

In [None]:
# Cut ph into 14 equal-width bins labelled 1..14.
# The edge array is no longer called `bin`, which shadowed the Python builtin.
ph_bin_edges = np.linspace(df['ph'].min(), df['ph'].max(), 15)
groups = list(range(1, 15))
# include_lowest=True closes the first interval on the left; without it the
# row holding the minimum ph falls outside every bin and becomes NaN.
df['ph-binned'] = pd.cut(df['ph'], bins=ph_bin_edges, labels=groups, include_lowest=True)

In [None]:
# Rows per ph bin, split by Potability class.
plt.figure(figsize=(10,8))
# Columns are named by keyword: current seaborn reads the first positional
# argument as `data`, so a bare Series is no longer taken as the x variable.
sns.countplot(x='ph-binned', hue='Potability', data=df, palette='viridis')

In [None]:
from scipy import stats
import pylab
def normality(data,feature):
    """
    Show a KDE curve and a probability plot of one column side by side.

    data    -- DataFrame that contains `feature`.
    feature -- name of the column to inspect.

    Displays the figure and returns nothing.
    """
    column_values = data[feature]
    plt.figure(figsize=(10,5))

    # Left panel: kernel density estimate.
    plt.subplot(1,2,1)
    sns.kdeplot(column_values)

    # Right panel: probability plot drawn on the current axes through pylab.
    plt.subplot(1,2,2)
    stats.probplot(column_values,plot=pylab)

    plt.show()

In [None]:
# Distribution of Solids before the transform.
normality(df,'Solids')

In [None]:
# Replace Solids with its Box-Cox transform; `param` receives the second value returned by boxcox.
# NOTE(review): the column is overwritten, so re-running this cell transforms already-transformed data.
df['Solids'],param=stats.boxcox(df['Solids'])

In [None]:
# Distribution of Solids after the transform.
normality(df,'Solids')

In [None]:
# Distribution of Conductivity before the transform.
normality(df,'Conductivity')

In [None]:
# Same in-place Box-Cox replacement for Conductivity.
# NOTE(review): `param` is reused, so the value stored for Solids is lost here.
df['Conductivity'],param=stats.boxcox(df['Conductivity'])

In [None]:
# Distribution of Conductivity after the transform.
normality(df,'Conductivity')

In [None]:
# Skewness of every column after the two transforms.
df.skew()

In [None]:
# Feature matrix: everything except the target and the derived 'ph-binned'
# helper column (a categorical re-encoding of ph created for plotting only).
# Dropping by name replaces the boolean mask built from `wd.columns`, which
# only lined up with `df` because both names point at the same frame.
X = df.drop(columns=["Potability", "ph-binned"], errors="ignore")
X.head()

In [None]:
# Target vector: the Potability column.
y = wd['Potability']

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Fit an extra-trees ensemble on all rows to obtain per-feature importances.
model = ExtraTreesClassifier()
model.fit(X,y)
# The fitted attribute is `feature_importances_`; the former
# `feature_importances_es` raised AttributeError.
print(model.feature_importances_)


In [None]:

# Horizontal bar chart of the fitted model's importances, labelled with the feature names.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.plot(kind='barh')
plt.show()

In [None]:
# 80/20 train/test split with a fixed seed.
# NOTE(review): these lower-case names are not used by the later cells visible here,
# which build their own X_train/X_test split.
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=45)

In [None]:
print('Correlation of Potability with feature variables:')
# Only numeric columns are correlated; this keeps the categorical 'ph-binned'
# helper column out of the table.
features = list(df.select_dtypes(include='number').columns.drop('Potability'))

# A comprehension keeps its variable local. The former `for cols in ...` loop
# overwrote the module-level `cols` list used by the box-plot cell.
Corr = [df[feature].corr(df['Potability']) for feature in features]

# Absolute correlation with the target, weakest first.
corrDf = pd.DataFrame({'Features' : features, 'Corr' : Corr})
corrDf['Corr'] = corrDf['Corr'].abs()
corrDf.sort_values(by='Corr', ascending = True)

In [None]:
# NOTE(review): another name for the same DataFrame, not a copy; it carries every
# column change made to `wd`/`df` in the cells that ran before this one.
waterData = wd


In [None]:
# NOTE(review): the same three fills already ran in an earlier cell, so on a
# top-to-bottom run nothing is left to fill here.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that is chained assignment, which leaves `wd` unchanged
# when pandas Copy-on-Write is active.

# ph values are evenly distributed, so the mean is used.
wd['ph'] = wd['ph'].fillna(wd['ph'].mean())
# Sulfate is slightly right-skewed and has outliers that would pull the mean, so the median is used.
wd['Sulfate'] = wd['Sulfate'].fillna(wd['Sulfate'].median())
# Trihalomethanes values are evenly distributed, so the mean is used.
wd['Trihalomethanes'] = wd['Trihalomethanes'].fillna(wd['Trihalomethanes'].mean())

In [None]:
# Model inputs: every column except the target and the derived 'ph-binned'
# helper (a categorical re-encoding of ph made for plotting, which can contain
# NaN and would otherwise reach SMOTE and the scaler).
X = waterData.drop(columns=['Potability', 'ph-binned'], errors='ignore').copy()
y = waterData['Potability'].copy()
# Show a preview rather than printing the whole frame.
X.head()

In [None]:

############################# Train-Test split ############################
# 75/25 split. The seed is fixed (same value as the earlier split cell) so that
# the reported scores are reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=45)



In [None]:
########################## Synthetic OverSampling ###########################
# NOTE(review): this cell overwrites X_train/y_train/X_test, so running it
# twice resamples and rescales data that was already processed.
print('Balancing the data by SMOTE - Oversampling of Minority level\n')
# Seeded so that the synthetic samples are the same on every run.
smt = SMOTE(random_state=45)
counter = Counter(y_train)
print('Before SMOTE', counter)
# Only the training part is resampled; the test part keeps its class ratio.
X_train, y_train = smt.fit_resample(X_train, y_train)
counter = Counter(y_train)
print('\nAfter SMOTE', counter)

################################# Scaling #################################
ssc = StandardScaler()

# The scaler is fitted on the training part and then applied to the test part.
X_train = ssc.fit_transform(X_train)
X_test = ssc.transform(X_test)

# Filled by the per-model cells further down.
modelAccuracy = list()

In [None]:
# Candidate classifiers, all with default settings. CatBoost gets
# verbose=False (as in its dedicated cell) because it otherwise logs every
# boosting iteration for each of the 11 fits below.
model = [LogisticRegression(), DecisionTreeClassifier(), GaussianNB(), RandomForestClassifier(), ExtraTreesClassifier(),
        svm.LinearSVC(), CatBoostClassifier(verbose=False)]
trainAccuracy = list()
testAccuracy = list()
kfold = KFold(n_splits=10, random_state=7, shuffle=True)

# NOTE(review): cross-validation runs on the training data after SMOTE and
# scaling, so the validation folds contain synthetic samples and the
# cross-validated accuracy is likely optimistic.
for mdl in model:
    # Mean 10-fold accuracy on the training data.
    trainResult = cross_val_score(mdl, X_train, y_train, scoring='accuracy', cv=kfold)
    trainAccuracy.append(trainResult.mean())
    # Refit on the whole training set and score on the held-out test set.
    mdl.fit(X_train, y_train)
    y_pred = mdl.predict(X_test)
    testResult = metrics.accuracy_score(y_test, y_pred)
    testAccuracy.append(testResult)

In [None]:
print('The comparision\n')
# Show class names in the Model column; putting the estimator objects
# themselves into the frame renders their full repr.
modelNames = [type(mdl).__name__ for mdl in model]
modelScore = pd.DataFrame({'Model' : modelNames, 'Train_Accuracy' : trainAccuracy, 'Test_Accuracy' : testAccuracy})
modelScore

In [None]:
print('Random Forest Classifier\n')
Rfc = RandomForestClassifier()
Rfc.fit(X_train, y_train)

y_Rfc = Rfc.predict(X_test)
print(metrics.classification_report(y_test, y_Rfc))
# list.append returns None, so printing its result printed "None". The score
# is computed once, stored, then printed.
rfcAccuracy = metrics.accuracy_score(y_test, y_Rfc)
modelAccuracy.append(rfcAccuracy)
print(rfcAccuracy)

# Confusion matrix of the test-set predictions, annotated with integer counts.
sns.heatmap(confusion_matrix(y_test, y_Rfc), annot=True, fmt='d')
plt.show()

In [None]:
print('CatBoostClassifier\n')
cat = CatBoostClassifier(verbose=False)
cat.fit(X_train, y_train)

y_cat = cat.predict(X_test)
print(metrics.classification_report(y_test, y_cat))
# list.append returns None, so printing its result printed "None". The score
# is computed once, stored, then printed.
catAccuracy = metrics.accuracy_score(y_test, y_cat)
modelAccuracy.append(catAccuracy)
print(catAccuracy)

# Confusion matrix of the test-set predictions, annotated with integer counts.
sns.heatmap(confusion_matrix(y_test, y_cat), annot=True, fmt='d')
plt.show()