# Machine Learning Modeling with K-Fold Cross Validation

In [34]:
#Copied libraries from 'Full_Notebook.ipynb'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandasql import sqldf
import missingno as msno
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from IPython.core import display as ICD

import category_encoders as ce
import time
# New Libraries added for K-Fold Cross Validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

pd.set_option('mode.chained_assignment',None)

In [35]:
df = pd.read_csv(r'C:\Users\joe98\OneDrive\Documents\GitHub\Binary-Classification-Model-Predicting-Functionality-Of-WaterPipelines-In-Tanzania\Notebook\DF.csv')

In [36]:
df.head()


Unnamed: 0.1,Unnamed: 0,status_group,amount_tsh,funder,gps_height,installer,wpt_name,basin,subvillage,region,...,scheme_management,permit,construction_year,extraction_type_class,management,payment,water_quality,quantity,source_type,waterpoint_type
0,0,1,6000.0,Roman,1390,Roman,none,Lake Nyasa,Mnyusi B,Iringa,...,VWC,0.0,1999,gravity,vwc,pay annually,soft,enough,spring,communal standpipe
1,1,1,0.0,Grumeti,1399,GRUMETI,Zahanati,Lake Victoria,Nyamara,Mara,...,Other,1.0,2010,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe
2,2,1,25.0,Lottery Club,686,World vision,Kwa Mahundi,Pangani,Majengo,Manyara,...,VWC,1.0,2009,gravity,vwc,pay per bucket,soft,enough,dam,communal standpipe multiple
3,3,0,0.0,Unicef,263,UNICEF,Zahanati Ya Nanyumbu,Ruvuma / Southern Coast,Mahakamani,Mtwara,...,VWC,1.0,1986,submersible,vwc,never pay,soft,dry,borehole,communal standpipe multiple
4,4,1,0.0,Action In A,0,Artisan,Shuleni,Lake Victoria,Kyanyamisa,Kagera,...,,1.0,0,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe


In [37]:
df = df.drop(['Unnamed: 0'], axis=1)

In [38]:
X = df.drop(['status_group'], axis=1)
y = df['status_group']


In [39]:
# To deal with categorical values, I will group columns together based on the frequency of values
# create column with numeric values = num_cols 
# create column with values less than 10 categories = ohe_cols (one hot encoder)
# create column with values more than 10 categories = freq_cols (high frequency values)

num_cols = []
ohe_cols = []
freq_cols = []

for c in X.columns:
    if df[c].dtype in ['float64', 'int64']:
        num_cols.append(c)
    elif len(df[c].unique()) <= 10:
        ohe_cols.append(c)
    else:
        freq_cols.append(c)

In [40]:
# Dealing with numeric values and adding a min-max scaler and replacing null values with the median value
numeric_transformer = Pipeline(steps=[("num_imputer", SimpleImputer(strategy="median")), 
                                      ("scaler", MinMaxScaler())])
# Dealing with categorical columns with less than 10 unique values by One Hot Encoding them. No null values
ohe_transformer = Pipeline(steps=[("ohe", OneHotEncoder(handle_unknown='ignore'))])

# Dealing with Categorical columns with more than 10 unique values by using Count Encoder. Count Encoder handles null values.
freq_transformer = Pipeline(steps=[("freq_enc", ce.CountEncoder(normalize=True, 
                                                                min_group_size=0.01,
                                                                min_group_name='other'))])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat_ohe", ohe_transformer, ohe_cols),
        ("cat_freq", freq_transformer, freq_cols)], remainder='passthrough')




In [41]:
#Instantiating K-Fold with 7 splits to be used for each of the models
cv = KFold(n_splits=7, random_state=42, shuffle=True)

## Logistic Regression

In [42]:
clf_logreg = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', LogisticRegression(class_weight='balanced',solver='liblinear'))])



In [43]:
# Creating an evaluation function to show the average scores for each of the models
def evaluate(estimator, X, y, cv):
    '''
    Evalutaion function to show the average scoring metrics for f1, ROC AUC and accuracy 
    in the K-Fold with 7 splits
    '''
    # set predictions
    scores = cross_validate(estimator, X, y, scoring = ('accuracy', 'f1', 'roc_auc'), cv = cv, n_jobs=-1)
    
    # print scores
    print("------------")
    print(f"avg_accuracy: ", np.round(np.mean(scores['test_accuracy']),2))
    print(f"avg_f1: ", np.round(np.mean(scores['test_f1']),2))
    print(f"avg_roc_auc: ", np.round(np.mean(scores['test_roc_auc']),2))

    
    


In [44]:
evaluate(clf_logreg, X, y, cv)

------------
avg_accuracy:  0.77
avg_f1:  0.81
avg_roc_auc:  0.83


## Decision Tree

In [45]:
clf_decision_tree = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', DecisionTreeClassifier(random_state=0,
                                                                          max_depth=10,
                                                                          min_samples_leaf=2,
                                                                          min_samples_split=7))])

evaluate(clf_decision_tree, X, y, cv)

------------
avg_accuracy:  0.79
avg_f1:  0.84
avg_roc_auc:  0.85


## K-Nearest Neighbors

In [46]:
clf_knn = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', KNeighborsClassifier(n_neighbors=11,
                                                              weights='uniform'))])

evaluate(clf_knn, X, y, cv)

------------
avg_accuracy:  0.82
avg_f1:  0.85
avg_roc_auc:  0.89


## Random Forest

In [47]:
clf_RF= Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', RandomForestClassifier(random_state=0,
                                                                criterion= 'gini',
                                                                max_depth=15,
                                                                min_samples_leaf=1,
                                                                min_samples_split=2))])

evaluate(clf_RF, X, y, cv)


------------
avg_accuracy:  0.84
avg_f1:  0.87
avg_roc_auc:  0.91


## XGBoost

In [48]:
clf_XG = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', XGBClassifier(use_label_encoder=False, 
                                                       disable_default_eval_metric=True,
                                                       gamma=0,
                                                       learning_rate=0.1,
                                                       max_depth=15,
                                                       min_child_weight=1,
                                                       n_estimators=100,
                                                       subsample=0.5))])

evaluate(clf_XG, X, y, cv)

------------
avg_accuracy:  0.85
avg_f1:  0.87
avg_roc_auc:  0.92


## Support Vector Machine

In [49]:
clf_svm = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', SVC(probability=True,
                                             C=10,
                                             gamma='scale',
                                             kernel='rbf'))])

evaluate(clf_svm, X, y, cv)

------------
avg_accuracy:  0.82
avg_f1:  0.86
avg_roc_auc:  0.86
