In [40]:
import re
import sys

# Python >= 3.5 is required
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn

# Compare the version numerically. A plain string comparison is lexicographic,
# so an old release such as "0.9" would wrongly satisfy `>= "0.20"`.
_sk_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_match is not None, "Unrecognised Scikit-Learn version: " + sklearn.__version__
assert tuple(int(part) for part in _sk_match.groups()) >= (0, 20), (
    "Scikit-Learn >=0.20 is required, found " + sklearn.__version__
)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

In [41]:
import pandas as pd
class StringConverter(dict):
    """Dict stand-in that maps every possible key to the ``str`` type.

    Membership tests always succeed and every lookup returns ``str``,
    regardless of the key and of what the underlying dict holds.

    NOTE(review): presumably meant to be passed to ``pd.read_csv`` (e.g. as
    ``converters``/``dtype``) so every column is read as text -- it is not
    used by any cell visible in this notebook; confirm before relying on it.
    """

    def __contains__(self, item):
        # Claim to contain every key.
        return True

    def __getitem__(self, item):
        # Any key resolves to the ``str`` type.
        return str

    def get(self, item=None, default=None):
        """Return ``str`` for any key, accepting ``dict.get(key, default)`` calls.

        Both arguments are accepted only for call compatibility and are
        ignored: a signature taking a single argument would make the standard
        two-argument form ``get(key, default)`` raise ``TypeError``.
        """
        return str

def load_data(data_path="/Users/vishali/Desktop/SPRING 2021/project-ml"):
    """Read ``topcleandata.csv`` from the directory ``data_path``.

    Parameters
    ----------
    data_path : str
        Directory containing ``topcleandata.csv``. The default is an absolute
        path on the author's machine, so pass your own directory elsewhere.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, read with ``low_memory=False``.
    """
    return pd.read_csv(
        os.path.join(data_path, "topcleandata.csv"),
        low_memory=False,
    )

In [14]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    Adapted from "Hands-On Machine Learning with Scikit-Learn and
    TensorFlow", p. 89.

    The last element of ``precisions`` and of ``recalls`` is dropped before
    plotting, so each is expected to be one element longer than
    ``thresholds``. Draws on a new 8x8 matplotlib figure and returns nothing.
    """
    # (scores, line format, legend label) for each curve, in drawing order.
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    plt.figure(figsize=(8, 8))
    plt.title("Precision and Recall Scores as a function of the decision threshold")
    for scores, line_format, legend_label in curves:
        plt.plot(thresholds, scores[:-1], line_format, label=legend_label)
    plt.ylabel("Score")
    plt.xlabel("Decision Threshold")
    plt.legend(loc='best')

In [42]:
# Read the project CSV into a DataFrame, using load_data's default data_path.
new = load_data()

In [43]:
# Keep only the rows that have a value in 'ha57' (the column used as target `y`).
new = new.dropna(subset=['ha57'])

In [44]:
# Store the target column as a pandas categorical.
new['ha57'] = pd.Categorical(new.ha57)

In [45]:
# Target vector: the 'ha57' column.
y = new['ha57']

In [46]:
# Feature matrix: every column except the target ('ha57') and the three
# columns excluded from modelling ('hc11', 'DHSCC', 'DHSCLUST').
X = new.drop(columns=['ha57', 'hc11', 'DHSCC', 'DHSCLUST'])

In [47]:
from sklearn.model_selection import train_test_split
# First hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Then split the remaining 80% again: 25% of it becomes the validation set,
# i.e. 60% train / 20% validation / 20% test of the original rows.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [48]:
from sklearn.decomposition import PCA

# Project the whole feature matrix X onto its first two principal components
# and wrap the result in a DataFrame with one named column per component.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [49]:
import matplotlib.pyplot as plt

# Pair each 2-D PCA projection with its class label. `principalDf` carries a
# fresh 0..n-1 index, whereas `new` keeps its original row labels (rows were
# removed by dropna), so the label index is reset to align the two row-by-row.
finalDf = pd.concat(
    [principalDf, new[['ha57']].reset_index(drop=True)],
    axis=1,
)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

# One colour per class actually present in the target column. zip() stops at
# the shorter sequence, so at most len(colors) classes are drawn.
targets = sorted(finalDf['ha57'].unique())
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['ha57'] == target
    ax.scatter(
        finalDf.loc[indicesToKeep, 'principal component 1'],
        finalDf.loc[indicesToKeep, 'principal component 2'],
        c=color,
        s=50,
        label=str(target),  # legend entry stays tied to the points drawn
    )
ax.legend()
ax.grid()

In [50]:
# Fraction of the total variance captured by each of the two fitted components.
pca.explained_variance_ratio_


array([0.87596476, 0.10163377])

In [61]:
from sklearn.model_selection import train_test_split
# test_size: what proportion of original data is used for test set
# NOTE: this rebinds X_train / y_train to the full 80% split, replacing the
# 60% training portion from the earlier three-way split; X_val / y_val are
# not touched by this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [76]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only, so no statistics from the test set leak into the scaling.
scaler.fit(X_train)
# Apply transform to both the training set and the test set.
# NOTE: X_train / X_test are overwritten, so re-running this cell without
# re-running the split scales data that has already been scaled.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [77]:
from sklearn.decomposition import PCA
# Make an instance of the Model.
# A float n_components between 0 and 1 asks scikit-learn to keep as many
# components as are needed to explain that fraction (here 95%) of the variance.
pca = PCA(.95)
pca

In [78]:
# Fit the PCA on the (already standardized) training split only.
pca.fit(X_train)

PCA(n_components=0.95)

In [79]:
# Project both splits onto the components learned from the training split.
# NOTE: X_train / X_test are overwritten with the reduced data, so this cell
# is not safe to re-run on its own (the second run would feed already-projected
# data back into pca.transform).
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [81]:
from sklearn.linear_model import LogisticRegression

In [82]:
# all parameters not specified are set to their defaults
# the solver is set explicitly to 'liblinear' instead of relying on the library default
logisticRegr = LogisticRegression(solver = 'liblinear')


In [84]:
# Train the logistic regression on the PCA-reduced training split.
logisticRegr.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [85]:
# Predict the class of every observation in the test set (displayed only, not stored).
logisticRegr.predict(X_test)


array([0., 0., 0., ..., 0., 0., 0.])

In [86]:
# Predict the class of every test-set observation and keep the result for evaluation.
y_pred_LR = logisticRegr.predict(X_test)

In [88]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the logistic-regression predictions on the test set.
CM = confusion_matrix(y_test, y_pred_LR)
print(CM)
# When a class is never predicted (true positive + false positive == 0), its
# precision is reported as 0 with an UndefinedMetricWarning; this behavior can
# be modified with zero_division.
print(classification_report(y_test, y_pred_LR))

# Binary confusion-matrix layout (rows = true class, columns = predicted class):
# [[TN, FP],
#  [FN, TP]]
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]

# True Negative Rate | Specificity.
# TN / (TN + FP) is the specificity, not the false positive rate, so the
# variable is named for what it actually holds.
specificity = TN / (TN + FP)
print(specificity)

print(accuracy_score(y_test, y_pred_LR))

[[1669   15]
 [ 878    7]]
              precision    recall  f1-score   support

         0.0       0.66      0.99      0.79      1684
         1.0       0.32      0.01      0.02       885

    accuracy                           0.65      2569
   macro avg       0.49      0.50      0.40      2569
weighted avg       0.54      0.65      0.52      2569

0.9910926365795725
0.6523939275982873


In [89]:
# Decision Trees
# Fit a decision tree (fixed seed, other settings left at their defaults) on
# the same PCA-reduced training split used for the logistic regression.
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state = 42)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [90]:
# Decision-tree predictions for every test-set observation.
y_pred_DT = classifier.predict(X_test)

In [91]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the decision-tree predictions on the test set.
CM = confusion_matrix(y_test, y_pred_DT)
print(CM)
# When a class is never predicted (true positive + false positive == 0), its
# precision is reported as 0 with an UndefinedMetricWarning; this behavior can
# be modified with zero_division.
print(classification_report(y_test, y_pred_DT))

# Binary confusion-matrix layout (rows = true class, columns = predicted class):
# [[TN, FP],
#  [FN, TP]]
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]

# True Negative Rate | Specificity.
# TN / (TN + FP) is the specificity, not the false positive rate, so the
# variable is named for what it actually holds.
specificity = TN / (TN + FP)
print(specificity)

print(accuracy_score(y_test, y_pred_DT))

[[1395  289]
 [ 512  373]]
              precision    recall  f1-score   support

         0.0       0.73      0.83      0.78      1684
         1.0       0.56      0.42      0.48       885

    accuracy                           0.69      2569
   macro avg       0.65      0.62      0.63      2569
weighted avg       0.67      0.69      0.68      2569

0.8283847980997625
0.6882055274425847


In [94]:
from sklearn.svm import LinearSVC
# Linear SVM with a raised iteration cap (max_iter=100000) and a fixed seed,
# trained on the PCA-reduced training split.
lin_svc = LinearSVC(max_iter=100000,random_state=42)
lin_svc.fit(X_train, y_train)
# Predictions for every test-set observation.
y_pred = lin_svc.predict(X_test)

In [95]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the linear SVM's test-set predictions.
CM = confusion_matrix(y_test, y_pred)
print(CM)
print(classification_report(y_test, y_pred))

# Confusion-matrix cells: first index is the true class, second the predicted class.
TN, FP = CM[0][0], CM[0][1]
FN, TP = CM[1][0], CM[1][1]

# Overall accuracy on the test set.
print(accuracy_score(y_test, y_pred))

[[1669   15]
 [ 878    7]]
              precision    recall  f1-score   support

         0.0       0.66      0.99      0.79      1684
         1.0       0.32      0.01      0.02       885

    accuracy                           0.65      2569
   macro avg       0.49      0.50      0.40      2569
weighted avg       0.54      0.65      0.52      2569

0.6523939275982873


In [96]:
# random forests
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): `rf` is never fitted or used in the cells visible here (the
# model that gets trained is `clf`) and it has no random_state -- candidate
# for removal.
rf = RandomForestClassifier()    

In [97]:
# Shallow random forest: trees limited to depth 2, fixed seed for repeatable results.
clf = RandomForestClassifier(max_depth=2, random_state=0)

In [100]:
# Train the random forest on the PCA-reduced training split.
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [101]:
# Random-forest predictions for every test-set observation.
y_rf = clf.predict(X_test)

In [102]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the random forest's test-set predictions.
CM = confusion_matrix(y_test, y_rf)
print(CM)
print(classification_report(y_test, y_rf))

# Confusion-matrix cells: first index is the true class, second the predicted class.
TN, FP = CM[0][0], CM[0][1]
FN, TP = CM[1][0], CM[1][1]

# False positive rate: share of actual negatives that were predicted positive.
FPR = FP / (TN + FP)
print(FPR)

# Overall accuracy, left as the last expression so it is shown as the cell output.
accuracy_score(y_test, y_rf)


[[1684    0]
 [ 885    0]]
              precision    recall  f1-score   support

         0.0       0.66      1.00      0.79      1684
         1.0       0.00      0.00      0.00       885

    accuracy                           0.66      2569
   macro avg       0.33      0.50      0.40      2569
weighted avg       0.43      0.66      0.52      2569

0.0


  _warn_prf(average, modifier, msg_start, len(result))


0.6555079797586609