K_Fold Test

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandasql import sqldf
import missingno as msno
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from IPython.core import display as ICD

import category_encoders as ce
import time

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.set_option('mode.chained_assignment',None)

In [34]:
# Load the prepared dataset. A path relative to the notebook keeps it runnable on
# any machine or checkout, unlike an absolute, user-specific Windows path.
# NOTE(review): assumes DF.csv sits next to this notebook (the previous absolute path
# pointed into the repository's `Notebook` folder) - adjust DATA_PATH if it lives elsewhere.
DATA_PATH = 'DF.csv'
df = pd.read_csv(DATA_PATH)

In [43]:
# Preview the first rows to sanity-check the load (note the leftover 'Unnamed: *' columns).
df.head()


Unnamed: 0.1,Unnamed: 0,status_group,amount_tsh,funder,gps_height,installer,wpt_name,basin,subvillage,region,...,scheme_management,permit,construction_year,extraction_type_class,management,payment,water_quality,quantity,source_type,waterpoint_type
0,0,1,6000.0,Roman,1390,Roman,none,Lake Nyasa,Mnyusi B,Iringa,...,VWC,0.0,1999,gravity,vwc,pay annually,soft,enough,spring,communal standpipe
1,1,1,0.0,Grumeti,1399,GRUMETI,Zahanati,Lake Victoria,Nyamara,Mara,...,Other,1.0,2010,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe
2,2,1,25.0,Lottery Club,686,World vision,Kwa Mahundi,Pangani,Majengo,Manyara,...,VWC,1.0,2009,gravity,vwc,pay per bucket,soft,enough,dam,communal standpipe multiple
3,3,0,0.0,Unicef,263,UNICEF,Zahanati Ya Nanyumbu,Ruvuma / Southern Coast,Mahakamani,Mtwara,...,VWC,1.0,1986,submersible,vwc,never pay,soft,dry,borehole,communal standpipe multiple
4,4,1,0.0,Action In A,0,Artisan,Shuleni,Lake Victoria,Kyanyamisa,Kagera,...,,1.0,0,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe


In [44]:
# Drop every leftover 'Unnamed: *' column (they appear to be row indexes saved into the CSV).
# The preview of the loaded frame shows both 'Unnamed: 0.1' and 'Unnamed: 0'; removing only
# 'Unnamed: 0' would leave the other one in the frame, where it would later be treated as a
# numeric feature. Selecting by prefix also keeps this cell safe to re-run, because it does
# not raise KeyError once the columns are already gone.
df = df.drop(columns=[col for col in df.columns if str(col).startswith('Unnamed')])

In [46]:
# Inspect dtypes and non-null counts; these drive the imputation/encoding choices below.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55083 entries, 0 to 55082
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   status_group           55083 non-null  int64  
 1   amount_tsh             55083 non-null  float64
 2   funder                 51885 non-null  object 
 3   gps_height             55083 non-null  int64  
 4   installer              51868 non-null  object 
 5   wpt_name               55083 non-null  object 
 6   basin                  55083 non-null  object 
 7   subvillage             54713 non-null  object 
 8   region                 55083 non-null  object 
 9   public_meeting         51905 non-null  float64
 10  scheme_management      51429 non-null  object 
 11  permit                 52327 non-null  float64
 12  construction_year      55083 non-null  int64  
 13  extraction_type_class  55083 non-null  object 
 14  management             55083 non-null  object 
 15  pa

In [48]:
# Target vector: the `status_group` label column.
y = df['status_group']
# Feature matrix: every column except the target.
X = df.drop(columns=['status_group'])


In [49]:
from sklearn.model_selection import KFold

In [61]:
from sklearn.model_selection import cross_validate

In [52]:
# Partition the feature columns by the encoding each group will receive:
#   num_cols  -> numeric columns (any integer/float width, not only int64/float64)
#   ohe_cols  -> non-numeric columns with at most MAX_OHE_CATEGORIES distinct values (one-hot encoded)
#   freq_cols -> non-numeric columns with more distinct values (count/frequency encoded)
MAX_OHE_CATEGORIES = 10

num_cols = []
ohe_cols = []
freq_cols = []

for c in X.columns:
    # Inspect X itself (not df) so the groups always describe the frame that is actually modelled.
    column = X[c]
    # Booleans stay out of the numeric group, exactly as with the former int64/float64 check.
    if pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
        num_cols.append(c)
    # dropna=False counts NaN as its own category, the same result as len(column.unique()).
    elif column.nunique(dropna=False) <= MAX_OHE_CATEGORIES:
        ohe_cols.append(c)
    else:
        freq_cols.append(c)

In [53]:
# 7-fold splitter; rows are shuffled with a fixed seed so the folds are reproducible.
cv = KFold(
    n_splits=7,
    shuffle=True,
    random_state=42,
)

In [54]:
# Numeric columns: fill missing values with the column median, then rescale with MinMaxScaler.
numeric_transformer = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

# Low-cardinality categorical columns (ohe_cols): one-hot encode them.
# handle_unknown='ignore' lets a validation fold contain categories not seen during fit.
ohe_transformer = Pipeline(
    steps=[
        ("ohe", OneHotEncoder(handle_unknown='ignore')),
    ]
)

# High-cardinality categorical columns (freq_cols): CountEncoder with normalize=True,
# grouping categories under the min_group_size threshold into 'other'.
# No separate imputer is used here; the encoder is relied on to deal with null values.
freq_transformer = Pipeline(
    steps=[
        (
            "freq_enc",
            ce.CountEncoder(
                normalize=True,
                min_group_size=0.01,
                min_group_name='other',
            ),
        ),
    ]
)

# Route each column group to its transformer; columns in none of the lists pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat_ohe", ohe_transformer, ohe_cols),
        ("cat_freq", freq_transformer, freq_cols),
    ],
    remainder='passthrough',
)

# Full model: preprocessing followed by a class-weight-balanced logistic regression (liblinear solver).
clf_logreg = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(class_weight='balanced', solver='liblinear')),
    ]
)



In [74]:
# Cross-validate the logistic-regression pipeline on all folds in parallel.
# The result is a dict with one array of per-fold values per metric (e.g. scores['test_f1']).
scores = cross_validate(
    clf_logreg,
    X,
    y,
    cv=cv,
    scoring=('accuracy', 'f1', 'roc_auc'),
    n_jobs=-1,
)

In [57]:
# `scores` is the dict returned by cross_validate, so np.mean(scores) raises TypeError.
# Average the per-fold test accuracy explicitly instead.
print(np.mean(scores['test_accuracy']))

0.7683132726975653


In [73]:
# Per-fold F1 on the held-out split; the printed label matches the metric being iterated.
for fold_f1 in scores['test_f1']:
    print('test_f1 ', fold_f1)

test_accuracy  0.8094400165614326
test_accuracy  0.8113127902068384
test_accuracy  0.8081172970675732
test_accuracy  0.805019305019305
test_accuracy  0.8106627331155831
test_accuracy  0.8029520295202951
test_accuracy  0.8037482696198489


In [110]:
def evaluate(estimator, X, y, cv, scoring=('accuracy', 'f1', 'roc_auc')):
    '''
    Cross-validate an estimator and print the mean held-out score of each metric.

    Only the test-fold scores returned by ``cross_validate`` are reported; train
    scores, precision/recall and a confusion matrix are NOT computed here.

    Parameters
    ----------
    estimator : estimator or pipeline, passed unchanged to ``cross_validate``
    X : feature data, passed unchanged to ``cross_validate``
    y : target values, passed unchanged to ``cross_validate``
    cv : cross-validation splitter (or anything ``cross_validate`` accepts as ``cv``)
    scoring : str or iterable of str, default ('accuracy', 'f1', 'roc_auc')
        Scorer name(s) to evaluate. A single name is wrapped in a tuple so the
        result keys always follow the ``test_<name>`` pattern.

    Returns
    -------
    None
        One ``avg_<metric>:  <mean rounded to 2 decimals>`` line is printed per metric.
    '''
    metrics = (scoring,) if isinstance(scoring, str) else tuple(scoring)

    # Fit and score the estimator on every fold, in parallel.
    scores = cross_validate(estimator, X, y, scoring=metrics, cv=cv, n_jobs=-1)

    # Print the average of the per-fold test scores for each requested metric.
    print("------------")
    for metric in metrics:
        print(f"avg_{metric}: ", np.round(np.mean(scores[f"test_{metric}"]), 2))

    # TODO: confusion-matrix plot is not implemented.


In [111]:
# Report the mean cross-validated accuracy, F1 and ROC AUC of the logistic-regression pipeline.
evaluate(clf_logreg, X, y, cv)

------------
avg_accuracy:  0.77
avg_f1:  0.81
avg_roc_auc:  0.83
