In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from catboost import CatBoostClassifier

import warnings
warnings.simplefilter("ignore")

%matplotlib inline


In [None]:
# Load the water-potability dataset from a CSV file located relative to the working directory.
df_raw = pd.read_csv("water_potability.csv")
# Bare last expression: the frame itself is rendered as the cell output.
df_raw

In [None]:
# Checking data types: info() prints each column's dtype and non-null count
df_raw.info()

In [None]:
# Check for missing data: number of NaN values in each column
df_raw.isna().sum()

In [None]:
# Class balance: bar chart of the target, plus the percentage of samples labelled 1.
target = df_raw["Potability"]
sns.countplot(x=target)
potable_pct = target[target == 1].count() / target.count() * 100
print(f'{potable_pct:.2f} % of samples are potable (1)')

In [None]:
# Correlation matrix for dataset, drawn as an annotated heatmap
plt.figure(figsize=(15, 10))
corr_matrix = df_raw.corr()
sns.heatmap(data=corr_matrix, cmap='YlGnBu', annot=True)

In [None]:
# Distribution of features: one KDE panel per feature, split by target class.
# The legend labels encode the target as 1 = potable and 0 = not potable, so each
# frame is named after the rows it actually holds (they were previously swapped,
# which put the wrong label on every curve).
potable = df_raw.query('Potability == 1')
not_potable = df_raw.query('Potability == 0')

# Every column except the target is a feature to plot.
feature_columns = df_raw.columns.drop('Potability')

fig, axes = plt.subplots(3, 3, figsize=(20, 15))

# zip stops at the shorter sequence, so at most 9 features fill the 3x3 grid.
for ax, column in zip(axes.ravel(), feature_columns):
    sns.kdeplot(x=not_potable[column], label='Not Potable(0)', ax=ax)
    sns.kdeplot(x=potable[column], label='Potable(1)', ax=ax)
    ax.set_title(f'Distribution of {column} values')
    ax.legend(prop=dict(size=10))

plt.tight_layout()

In [None]:
# Imputing missing data

def fill_nan(df, target='Potability'):
    """
    Return a copy of `df` with missing feature values replaced by class means.

    For every column other than `target`, each NaN is replaced by the mean of
    that column over the rows sharing the same `target` value. The input frame
    is left untouched, so the raw data stays available to earlier cells.

    df: DataFrame containing the feature columns and the `target` column
    target: name of the class-label column used for grouping (default 'Potability')

    Returns a new DataFrame with the same shape, index and columns as `df`.
    Raises KeyError if `target` is not a column of `df`.

    NOTE(review): the fill value depends on the row's label, so this cannot be
    applied to unlabeled data, and running it before a train/test split lets
    label information reach the test features.
    """
    filled = df.copy()
    # Group the untouched input so the means never see already-imputed values.
    grouped = df.groupby(target)
    for column in filled.columns:
        if column == target:
            continue
        class_mean = grouped[column].transform('mean')
        filled[column] = filled[column].fillna(class_mean)
    return filled
        
# Impute the raw frame, then count the missing values left in each column.
df = fill_nan(df_raw)

df.isnull().sum()

In [None]:
# Splitting: separate the features from the target, then hold out 25% for testing.
X = df.drop(['Potability'], axis = 1)
y = df['Potability']

# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=18, stratify=y)

# Scaling: fit on the training split only, then apply the same transform to the test split.
# Done before SMOTE because SMOTE chooses neighbours by distance, which would otherwise
# be dominated by the features with the largest raw magnitudes.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Balancing data - oversampling minority (training split only; the test split is untouched).
# A fixed random_state makes the synthetic samples reproducible across runs.
smt = SMOTE(random_state=18)
X_train, y_train = smt.fit_resample(X_train, y_train)


In [None]:
# Put models in a dictionary.
# Each stochastic model gets an explicit seed so its result does not depend on
# global NumPy state or on the order in which the models are fitted.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=18),
    "Random Forest": RandomForestClassifier(random_state=18),
    "XgBoost": XGBClassifier(random_state=18),
    # verbose=0 silences CatBoost's per-iteration training log.
    "CatBoost Classifier": CatBoostClassifier(random_seed=18, verbose=0)
}

# Create a function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test, scoring="roc_auc"):
    """
    Fit each model on the training split and score it on the test split.

    models: a dict mapping a display name to a scikit-learn compatible estimator
    X_train: training data (no labels)
    X_test: testing data (no labels)
    y_train: training labels
    y_test: test labels
    scoring: name of a scikit-learn scorer used for evaluation (default "roc_auc")

    Returns a dict mapping each model name to its score on (X_test, y_test).
    The estimators stored in `models` are fitted in place.
    """
    # Local import: the scorer lookup is only needed here.
    from sklearn.metrics import get_scorer

    # Set random seed
    np.random.seed(18)
    scorer = get_scorer(scoring)

    # Make a dictionary to keep model scores
    model_scores = {}
    for name, model in models.items():
        # Fit model to the training data
        model.fit(X_train, y_train)
        # Score the model that was just fitted on the held-out split.
        # (Cross-validating on the test split instead would re-fit fresh clones
        # on test folds and never use the training data for the score.)
        model_scores[name] = float(scorer(model, X_test, y_test))

    return model_scores

In [None]:
# Fit every candidate model and collect its score, keyed by model name.
model_scores = fit_and_score(
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

model_scores

In [None]:
# The XGB Classifier seems to show the most promise, with a ROC AUC of about 0.79
# (the score is ROC AUC, not accuracy; this figure came from 5-fold cross-validation)

# A single "roc_auc" row with one column per model; transposed so each model gets its own bar.
model_compare = pd.DataFrame(data=model_scores, index=["roc_auc"])
model_compare.transpose().plot(kind="bar", color="green");

In [None]:
# Hyperparameter Tuning

np.random.seed(18)

# Create a hyperparameter grid for XGB Classifier (5 values each -> 625 combinations)
xgb_grid = {
    "learning_rate" : [0.01, 0.05, 0.10, 0.20, 0.30],
    "n_estimators" : [50, 100, 200, 500, 1000],
    "max_depth" : [ 3, 5, 8, 11, 15],
    "min_child_weight" : [ 1, 3, 5, 7, 10]
}


# Setup random hyperparameter search for XGB Classifier.
# scoring="roc_auc" tunes on the same metric the models were compared on; without it
# the search falls back to the estimator's default score.
# random_state pins which n_iter combinations are sampled from the grid.
rs_xgb = RandomizedSearchCV(XGBClassifier(),
                            param_distributions=xgb_grid,
                            cv=2,
                            n_iter=100,
                            scoring="roc_auc",
                            random_state=18,
                            verbose=0
                           )

# Fit random hyperparameter search model for XGB Classifier
rs_xgb.fit(X_train, y_train)

# Find best hyperparameters
rs_xgb.best_params_


In [None]:
# Score the search's refit best estimator on the test split; the metric is the one the
# search was configured with (the estimator's default score when no scoring is given).
rs_xgb.score(X_test, y_test)

In [None]:
# Final model - XGBClassifier
# Built from the search result rather than hand-copied values, so the final model
# always uses the hyperparameters rs_xgb actually selected in this run.
model = XGBClassifier(**rs_xgb.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix of the final model's test predictions (rows: true class, columns: predicted class)
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, cmap='YlGnBu', annot=True, fmt='d')


In [None]:
# Text summary of the classification metrics for the test predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Feature importance

def plot_features(columns, importances, n=20):
    """
    Plot the `n` largest feature importances as a horizontal bar chart.

    columns: feature names, one per entry of `importances`
    importances: importance value for each feature, in the same order as `columns`
    n: maximum number of features to show (default 20)

    Draws on a new matplotlib figure and returns None.
    """
    # Rank features from most to least important and keep the top n.
    # Both bar positions and bar lengths come from this one frame, so they
    # always have the same length regardless of n.
    ranked = (pd.DataFrame({"features": columns,
                            "feature_importances": importances})
              .sort_values("feature_importances", ascending=False)
              .reset_index(drop=True)
              .head(n))
    # Plot dataframe
    fig, ax = plt.subplots()
    ax.barh(ranked["features"], ranked["feature_importances"])
    ax.set_ylabel("Features")
    ax.set_xlabel("Feature Importance")
    # barh puts the first row at the bottom; invert so the top-ranked feature is on top.
    ax.invert_yaxis()
    
# Feature names are every column except the target, paired with the final model's importances.
plot_features(columns=df.columns.drop('Potability'), importances=model.feature_importances_)