In [1]:
import re
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
# Compare the version numerically. A plain string comparison is lexicographic,
# so e.g. "0.9" >= "0.20" would wrongly pass. Only the leading major/minor
# numbers are used, which also copes with suffixes such as "rc1" or ".dev0".
_sklearn_version = tuple(int(num) for num in re.findall(r"\d+", sklearn.__version__)[:2])
assert _sklearn_version >= (0, 20), "Scikit-Learn >= 0.20 is required"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

In [2]:
import pandas as pd
class StringConverter(dict):
    """Dict subclass that reports every key as present and maps every key to ``str``.

    NOTE(review): presumably meant to be passed as a per-column ``dtype`` /
    ``converters`` mapping when reading a CSV so that all columns are read as
    strings; it is not used by the cells visible here -- confirm before relying on it.
    """

    def __contains__(self, item):
        """Report every key as present."""
        return True

    def __getitem__(self, item):
        """Return ``str`` for any key."""
        return str

    def get(self, key=None, default=None):
        """Return ``str`` for any key, ignoring ``default``.

        The signature mirrors ``dict.get(key, default)`` so that two-argument
        calls such as ``conv.get("col", None)`` work. Both parameters are
        optional, so calls that passed only ``default`` (positionally or by
        keyword) keep working.
        """
        return str

def load_data(data_path = "/Users/vishali/Desktop/SPRING 2021/project-ml", filename = "hc11final.csv"):
    """Read a CSV file from ``data_path`` into a pandas DataFrame.

    Parameters
    ----------
    data_path : str or path-like
        Directory containing the CSV file.
        NOTE(review): the default is an absolute path on one specific machine;
        pass ``data_path`` explicitly when running anywhere else.
    filename : str
        Name of the CSV file inside ``data_path``. Defaults to the file this
        notebook was written for.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    csv_path = os.path.join(data_path, filename)
    # low_memory=False makes pandas parse the file in one pass rather than in
    # chunks, which avoids mixed-dtype inference within a column.
    return pd.read_csv(csv_path, low_memory = False)

In [3]:
# Load the dataset from load_data's default location into `new`.
new = load_data()

In [42]:
# Discard rows whose 'hc11' value is missing (rows are dropped, not columns).
new = new.dropna(axis=0, subset=["hc11"])

In [43]:
# Re-encode the 'hc11' column as a pandas Categorical.
new["hc11"] = pd.Categorical(new["hc11"])

In [45]:
# Target vector: the 'hc11' column.
y = new["hc11"]

In [46]:
# Feature matrix: every column except the target ('hc11') and the three
# other columns excluded below.
X = new.drop(columns=["ha57", "hc11", "DHSCC", "DHSCLUST"])

In [47]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set (test_size is the proportion of
# the original data used for testing); random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [48]:
from sklearn.preprocessing import StandardScaler
# Fit on the training set only; fit() returns the scaler itself, so
# construction and fitting are chained.
scaler = StandardScaler().fit(X_train)
# Apply the same fitted transform to both the training set and the test set.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [49]:
from sklearn.decomposition import PCA
# Create the PCA model; a fractional n_components asks for as many components
# as are needed to explain that share (here 95%) of the variance.
pca = PCA(n_components=0.95)

In [50]:
# Fit the PCA model on X_train; the fitted estimator's repr is the cell output.
pca.fit(X_train)

PCA(n_components=0.95)

In [51]:
# Project both splits with the fitted PCA model.
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

In [52]:
from sklearn.linear_model import LogisticRegression

In [53]:
# all parameters not specified are set to their defaults
# NOTE: the solver is set explicitly to 'liblinear' (an earlier version of this
# comment said 'lbfgs', which did not match the code below)
logisticRegr = LogisticRegression(solver = 'liblinear')

In [54]:
# Train the logistic regression model on the training split.
logisticRegr.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [55]:
# Predict labels for the test split with the fitted logistic regression model.
y_pred_LR = logisticRegr.predict(X_test)

In [56]:
from sklearn.metrics import classification_report, confusion_matrix
# Compute the confusion matrix once and reuse it for printing and for the
# rates derived below.
CM = confusion_matrix(y_test, y_pred_LR)
print(CM)
# Note: for a class that is never predicted (true positive + false positive == 0)
# the report shows precision 0 and raises UndefinedMetricWarning. This behavior
# can be modified with zero_division.
print(classification_report(y_test, y_pred_LR))


# confusion_matrix rows are true labels and columns are predicted labels.
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]

# True Negative Rate | Specificity
TNR = TN / (TN + FP)
# Legacy alias: this value used to be stored under the name FPR even though it
# is the specificity, not the false positive rate (which would be
# FP / (FP + TN) == 1 - TNR). Kept only so code that reads FPR keeps working.
FPR = TNR
print(TNR)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred_LR))

[[810 317]
 [496 349]]
              precision    recall  f1-score   support

         0.0       0.62      0.72      0.67      1127
         1.0       0.52      0.41      0.46       845

    accuracy                           0.59      1972
   macro avg       0.57      0.57      0.56      1972
weighted avg       0.58      0.59      0.58      1972

0.7187222715173026
0.5877281947261663


In [57]:
# Decision Trees
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state = 42)
classifier.fit(X_train, y_train)
y_pred_DT = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
# Compute the confusion matrix once and reuse it for printing and for the
# rates derived below.
CM = confusion_matrix(y_test, y_pred_DT)
print(CM)
# Note: for a class that is never predicted (true positive + false positive == 0)
# the report shows precision 0 and raises UndefinedMetricWarning. This behavior
# can be modified with zero_division.
print(classification_report(y_test, y_pred_DT))


# confusion_matrix rows are true labels and columns are predicted labels.
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]

# True Negative Rate | Specificity
TNR = TN / (TN + FP)
# Legacy alias: this value used to be stored under the name FPR even though it
# is the specificity, not the false positive rate (which would be
# FP / (FP + TN) == 1 - TNR). Kept only so code that reads FPR keeps working.
FPR = TNR
print(TNR)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred_DT))

[[743 384]
 [451 394]]
              precision    recall  f1-score   support

         0.0       0.62      0.66      0.64      1127
         1.0       0.51      0.47      0.49       845

    accuracy                           0.58      1972
   macro avg       0.56      0.56      0.56      1972
weighted avg       0.57      0.58      0.57      1972

0.6592724046140195
0.5765720081135902


In [58]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

# Linear support vector classifier with a raised iteration cap and a fixed seed.
lin_svc = LinearSVC(max_iter=100000, random_state=42)
lin_svc.fit(X_train, y_train)
y_pred = lin_svc.predict(X_test)

# Confusion matrix (rows: true labels, columns: predicted labels) and the
# per-class precision / recall / f1 report.
CM = confusion_matrix(y_test, y_pred)
print(CM)
print(classification_report(y_test, y_pred))

# Unpack the 2x2 matrix row by row into its four cells.
(TN, FP), (FN, TP) = CM

# or using accuracy scores
print(accuracy_score(y_test, y_pred))

[[810 317]
 [499 346]]
              precision    recall  f1-score   support

         0.0       0.62      0.72      0.67      1127
         1.0       0.52      0.41      0.46       845

    accuracy                           0.59      1972
   macro avg       0.57      0.56      0.56      1972
weighted avg       0.58      0.59      0.58      1972

0.5862068965517241
