In [4]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

In [5]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "/Users/vishali/Desktop/SPRING 2021/project-ml/ML"
CHAPTER_ID = "Project-MidTerm"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

np.random.seed(2042)

In [40]:
import pandas as pd
class StringConverter(dict):

    def __contains__(self, item):

        return True

    def __getitem__(self, item):

        return str

    def get(self, default=None):

        return str

def load_data(data_path = "/Users/vishali/Desktop/SPRING 2021/project-ml"):
    csv_path = os.path.join(data_path, "ch_imputed.csv")
    return pd.read_csv(csv_path, low_memory = False)

In [41]:
ch = load_data()

In [42]:
ch.describe()

Unnamed: 0.1,Unnamed: 0,X,sd1,hv204,hv206,hv207,hv208,hv209,hv210,hv211,...,PET_2015,Proximity_to_tiol_Borders,Proximity_to_Protected_Areas,Proximity_to_Water,Rainfall_2015,Travel_Times_2015,UN_Population_Density_2015,Wet_Days_2015,washhands,smoke
count,9395.0,9395.0,9395.0,9395.0,9395.0,9395.0,9395.0,9395.0,9395.0,9395.0,...,9395.0,9395.0,9395.0,9395.0,9395.0,9395.0,9395.0,9395.0,9395.0,9395.0
mean,4698.0,22262.091964,0.482278,128.520809,0.219372,0.531985,0.112826,0.01703,0.168175,0.019159,...,2.861483,23235.013448,29222.281985,22814.013185,1079.743748,38.247741,1003.719147,16.182889,0.108675,0.825652
std,2712.247223,12738.764883,0.499712,286.127055,0.413843,0.499002,0.316397,0.129391,0.374041,0.137091,...,0.188219,14578.638448,18926.488291,13932.925922,219.290823,27.971789,1533.360813,0.627466,0.311247,0.379429
min,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.45,225.64,0.0,0.0,740.6,0.0,145.52,15.18,0.0,0.0
25%,2349.5,11275.5,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.69,10497.34,12487.38,11861.15,872.0,16.64,458.85,15.63,0.0,1.0
50%,4698.0,22154.0,0.0,30.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2.89,21882.13,26923.4,22821.1,1053.33,35.33,552.06,16.39,0.0,1.0
75%,7046.5,32813.0,1.0,60.0,0.0,1.0,0.0,0.0,0.0,0.0,...,3.0,36685.81,45813.81,32177.58,1243.9,53.95,738.38,16.56,0.0,1.0
max,9395.0,44985.0,1.0,996.0,1.0,1.0,1.0,1.0,1.0,1.0,...,3.17,55444.9,71428.82,65493.43,1621.0,175.81,10562.06,17.31,1.0,1.0


In [44]:
ch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9395 entries, 0 to 9394
Data columns (total 42 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      9395 non-null   int64  
 1   X                               9395 non-null   int64  
 2   whsd                            9395 non-null   object 
 3   sd1                             9395 non-null   int64  
 4   hv204                           9395 non-null   int64  
 5   hv206                           9395 non-null   int64  
 6   hv207                           9395 non-null   int64  
 7   hv208                           9395 non-null   int64  
 8   hv209                           9395 non-null   int64  
 9   hv210                           9395 non-null   int64  
 10  hv211                           9395 non-null   int64  
 11  hv212                           9395 non-null   int64  
 12  hv216                           93

In [45]:
# drop object dtypes
del ch['whsd']
del ch['v025']

In [46]:
ch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9395 entries, 0 to 9394
Data columns (total 40 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      9395 non-null   int64  
 1   X                               9395 non-null   int64  
 2   sd1                             9395 non-null   int64  
 3   hv204                           9395 non-null   int64  
 4   hv206                           9395 non-null   int64  
 5   hv207                           9395 non-null   int64  
 6   hv208                           9395 non-null   int64  
 7   hv209                           9395 non-null   int64  
 8   hv210                           9395 non-null   int64  
 9   hv211                           9395 non-null   int64  
 10  hv212                           9395 non-null   int64  
 11  hv216                           9395 non-null   int64  
 12  hv225                           93

In [48]:
X = ch.drop('sd1', axis=1)
y = ch['sd1']

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) 

In [51]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier()

In [52]:
y_pred = classifier.predict(X_test)

In [53]:
from sklearn.metrics import classification_report, confusion_matrix
CM = confusion_matrix(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[888  89]
 [ 83 819]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       977
           1       0.90      0.91      0.90       902

    accuracy                           0.91      1879
   macro avg       0.91      0.91      0.91      1879
weighted avg       0.91      0.91      0.91      1879



In [106]:
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]

FNR = FN/(TP+FN)
FPR = FP/(FP+TN)
TPR = TP/(TP+FN)
print(FNR)
print(FPR)
print(TPR)

0.09201773835920177
0.09109518935516889
0.9079822616407982


In [109]:
from sklearn import metrics
roc_curve = metrics.auc(FPR, TPR)

TypeError: Singleton array 0.09109518935516889 cannot be considered a valid collection.

[[182 780]
 [141 776]]
              precision    recall  f1-score   support

           0       0.56      0.19      0.28       962
           1       0.50      0.85      0.63       917

    accuracy                           0.51      1879
   macro avg       0.53      0.52      0.46      1879
weighted avg       0.53      0.51      0.45      1879



DecisionTreeClassifier(max_depth=10, random_state=42)

[[901  61]
 [604 313]]
              precision    recall  f1-score   support

           0       0.60      0.94      0.73       962
           1       0.84      0.34      0.48       917

    accuracy                           0.65      1879
   macro avg       0.72      0.64      0.61      1879
weighted avg       0.71      0.65      0.61      1879



NameError: name 'randint' is not defined

In [65]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV
import random
from random import randint

In [101]:
parameters = {"min_samples_leaf": range(1, 9)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs=4)
clf.fit(X=X, y=y)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_) 

0.5002660989888238 {'min_samples_leaf': 2}


In [102]:
from sklearn.tree import DecisionTreeClassifier
classifier4 = DecisionTreeClassifier(max_depth = 2)
classifier4.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=2)

In [103]:
y_pred = classifier3.predict(X_val)

In [104]:
from sklearn.metrics import classification_report, confusion_matrix
CM2 = confusion_matrix(y_val, y_pred)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[933  52]
 [578 316]]
              precision    recall  f1-score   support

           0       0.62      0.95      0.75       985
           1       0.86      0.35      0.50       894

    accuracy                           0.66      1879
   macro avg       0.74      0.65      0.62      1879
weighted avg       0.73      0.66      0.63      1879



In [105]:
TN = CM2[0][0]
FN = CM2[1][0]
TP = CM2[1][1]
FP = CM2[0][1]

FNR = FN/(TP+FN)
FPR = FP/(FP+TN)
print(FNR)
print(FPR)

0.6465324384787472
0.05279187817258883
