# Import, read_csv, target and split, OHE

In [59]:
import pandas as pd
import numpy as np

from IPython.display import Image

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import tree
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.metrics import accuracy_score,recall_score,f1_score,classification_report
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, RandomizedSearchCV, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, Imputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

import xgboost
from xgboost.sklearn import XGBClassifier

In [3]:
target = pd.read_csv('../data/cleaned_water_Modeling.csv')

In [4]:
target.drop(columns='Unnamed: 0', inplace=True)

In [5]:
target.head()

Unnamed: 0,status_group,amount_tsh,funder,gps_height,installer,basin,region,lga,ward,population,...,permit,extraction_type_class,management,payment_type,water_quality,quantity,source,source_class,waterpoint_type,age
0,0,6000.0,Roman,1390,Roman,Lake Nyasa,Iringa,Ludewa,Mundindi,109,...,False,gravity,vwc,annually,soft,enough,spring,groundwater,communal standpipe,21
1,0,25.0,Lottery Club,686,World vision,Pangani,Manyara,Simanjiro,Ngorika,250,...,True,gravity,vwc,per bucket,soft,enough,dam,surface,communal standpipe multiple,11
2,1,0.0,Unicef,263,UNICEF,Ruvuma / Southern Coast,Mtwara,Nanyumbu,Nanyumbu,58,...,True,submersible,vwc,never pay,soft,dry,machine dbh,groundwater,communal standpipe multiple,34
3,0,20.0,Mkinga Distric Coun,0,DWE,Pangani,Tanga,Mkinga,Moa,1,...,True,submersible,vwc,per bucket,salty,enough,other,unknown,communal standpipe multiple,11
4,1,0.0,Dwsp,0,DWSP,Internal,Shinyanga,Shinyanga Rural,Samuye,0,...,True,handpump,vwc,never pay,soft,enough,machine dbh,groundwater,hand pump,23


In [6]:
X = target.drop(columns='status_group')
y = target['status_group']

**Train, test, split**

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 77, 
                                                    stratify = y, # in classification problems 
                                                                  # when you split the data 
                                                                  # you want to keep the ratio in the classes.
                                                    test_size = .2 # This is usually the ratio but it might change 
                                                                   # according to the problem at hand.
                                                   )

**Extracting feature names**

In [8]:
cat_cols = X.select_dtypes(include=['object','bool']).columns.tolist()
cont_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols,cont_cols

(['funder',
  'installer',
  'basin',
  'region',
  'lga',
  'ward',
  'public_meeting',
  'scheme_management',
  'permit',
  'extraction_type_class',
  'management',
  'payment_type',
  'water_quality',
  'quantity',
  'source',
  'source_class',
  'waterpoint_type'],
 ['amount_tsh', 'gps_height', 'population', 'age'])

**OHE**

In [9]:
## create an encoder object. This will help us to convert
## categorical variables to new columns
ss = StandardScaler()
encoder = OneHotEncoder(handle_unknown= 'ignore',
                        #drop='first',
                        categories= 'auto')

## Create an columntransformer object.
## This will help us to merge transformed columns
## with the rest of the dataset.

ct = ColumnTransformer(transformers =[('ohe', encoder, cat_cols)], #('scaler', ss, cont_cols)],
                                    remainder= ss)
X_train_ohe = ct.fit_transform(X_train)
X_test_ohe = ct.transform(X_test)

In [10]:
X_train_ohe

<38920x5134 sparse matrix of type '<class 'numpy.float64'>'
	with 817320 stored elements in Compressed Sparse Row format>

**Create a train_train train_test split**

- use randomsearch with wide parameters to get guidance

- then use gridsearch with the  refined parameters

- then...?

In [15]:
# X_train_train, y_train_train, X_train_test, y_train_test = train_test_split(X_train_ohe,y_train)

---
# Logistic Regression

## LR Vanilla

In [64]:
log_reg = LogisticRegression()
log_reg.fit(X_train_ohe, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
log_reg.score(X_train_ohe, y_train)

0.8307297019527236

---
# Decision Tree

## DT Vanilla

In [16]:
clf = DecisionTreeClassifier()
clf.fit(X_train_ohe,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [17]:
clf.score(X_test_ohe, y_test)

0.7957044496968452

In [72]:
#print(classification_report(y_test,clf.predict(X_test_ohe)))

## DT RandomizedSearchCV

In [26]:
clf_rs = DecisionTreeClassifier()

min_samples_leaf = range(1,11)

max_depth_params = range(2, 12) ## values between 2 to 11 total: 10 values

max_features_param = [None, 'auto', 'sqrt', 'log2', 25] ## total of 5 values

param_grid = {'max_depth': max_depth_params, 'max_features': max_features_param, 'min_samples_leaf': min_samples_leaf}


randomsearch = RandomizedSearchCV(clf_rs, param_grid, random_state=100, n_iter=10, cv=5, verbose=0, n_jobs=-1, return_train_score=False)

In [27]:
rscv = randomsearch.fit(X_train_ohe, y_train)

In [28]:
rscv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [29]:
rscv.best_score_

0.7144655704008221

In [30]:
clf_rs.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

## DT GridSearchCV

In [33]:
clf_gs = DecisionTreeClassifier()

min_samples_leaf = range(1,3)

#max_depth_params = None ## values between 2 to 11 total: 10 values

max_features_param = [None, 'auto', 'sqrt', 'log2', 25] ## total of 5 values

param_grid = { 'max_features': max_features_param, 'min_samples_leaf': min_samples_leaf}

gridsearch = GridSearchCV(estimator = clf_gs, 
                          param_grid = param_grid,
                          n_jobs = -1, ## paralllel computation
                          verbose = 1, ## gives feedback
                          cv = 2,     ## cross-validate
                          scoring = 'recall',  ## you can use multiple scoring too
                          return_train_score= True)

gridsearch = gridsearch.fit(X_train_ohe, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.9s finished


In [34]:
gridsearch.best_estimator_

(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'),
 0.7406520747834017)

In [57]:
gridsearch.best_score_

0.7406520747834017

## DT Crossvalidation

In [36]:
cv = cross_validate(clf,X_train_ohe,y_train, cv=4)

In [38]:
cv

{'fit_time': array([3.20178986, 3.23990297, 3.14910698, 3.3185401 ]),
 'score_time': array([0.00607514, 0.00591421, 0.00583005, 0.00641394]),
 'test_score': array([0.78078109, 0.78468654, 0.78920863, 0.78581706])}

In [None]:
cross val predict

use X_test to get y_pred

---
# RandomForest

## RF Vanilla 

In [45]:
rf = RandomForestClassifier()
rf.fit(X_train_ohe,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [48]:
rf.score(X_test_ohe, y_test)

0.8160517932381051

## RF RandomSearchCV

In [52]:
rf_rs = RandomForestClassifier()

min_samples_leaf = range(1,11)

max_depth_params = range(2, 12) ## values between 2 to 11 total: 10 values

max_features_param = [None, 'auto', 'sqrt', 'log2', 25] ## total of 5 values

param_grid = {'max_depth': max_depth_params, 'max_features': max_features_param, 'min_samples_leaf': min_samples_leaf}


randomsearch = RandomizedSearchCV(clf_rs, param_grid, random_state=100, n_iter=10, cv=5, verbose=0, n_jobs=-1, return_train_score=False)

In [53]:
rfrscv = randomsearch.fit(X_train_ohe, y_train)

In [54]:
rfrscv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [55]:
rfrscv.best_score_

0.7144398766700925

In [56]:
rf_rs.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## RF GridSearchCV

In [33]:
clf_gs = DecisionTreeClassifier()

min_samples_leaf = range(1,3)

#max_depth_params = None ## values between 2 to 11 total: 10 values

max_features_param = [None, 'auto', 'sqrt', 'log2', 25] ## total of 5 values

param_grid = { 'max_features': max_features_param, 'min_samples_leaf': min_samples_leaf}

gridsearch = GridSearchCV(estimator = clf_gs, 
                          param_grid = param_grid,
                          n_jobs = -1, ## paralllel computation
                          verbose = 1, ## gives feedback
                          cv = 2,     ## cross-validate
                          scoring = 'recall',  ## you can use multiple scoring too
                          return_train_score= True)

gridsearch = gridsearch.fit(X_train_ohe, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.9s finished


In [34]:
gridsearch.best_estimator_

(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'),
 0.7406520747834017)

In [57]:
gridsearch.best_score_

0.7406520747834017

## RandomForest Final

compare y_test to y_pred