In [None]:
import pickle
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd

In [None]:
def row_is_valid(feature_row):
    """Return True if the record passes all sentinel checks, else False.

    A record is rejected when ``min_lead_quantity`` equals -1, or when any of
    ``lat``, ``long`` or ``tract`` equals the string 'unknown' or -1
    (presumably placeholders for missing values -- confirm against the
    code that builds the dataset).

    feature_row: mapping indexable by the four field names above.
    """
    if feature_row['min_lead_quantity'] == -1:
        return False
    # The three location fields share the same pair of sentinel values.
    for field in ('lat', 'long', 'tract'):
        value = feature_row[field]
        if value == 'unknown' or value == -1:
            return False
    return True

In [None]:
# Location of the pickled dataset.
# NOTE(review): absolute path on one machine -- consider a configurable data directory.
DATA_PATH = "/Users/23amrutad/Projects/LeadProj/ProjectData/final_data.pkl"

# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle even if unpickling fails.
with open(DATA_PATH, 'rb') as data_file:
    name_map = pickle.load(data_file)

# Build parallel lists: one feature vector and one binary label per valid record.
all_features = []
labels = []
for k, v in name_map.items():
    if not row_is_valid(v):
        continue
    # Label is 1 when min_lead_quantity is at least 5.0, otherwise 0.
    labels.append(1 if v['min_lead_quantity'] >= 5.0 else 0)
    features = []
    # NOTE(review): feature order follows each record's own key order; this
    # assumes every record has the same keys in the same order -- confirm.
    for f in v.keys():
        if f != 'addr' and f != 'min_lead_quantity':
            features.append(v[f])
            # Surface any string-valued feature: a single string would turn
            # the whole feature array into a string dtype downstream.
            if isinstance(v[f], str):
                print(v[f])
    all_features.append(features)

In [None]:
# Convert the per-record feature lists and the labels into NumPy arrays.
X, y = np.array(all_features), np.array(labels)
# Starts empty; nothing in the visible cells appends to it.
results = list()

In [None]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=10,
)

# Dummy classifier (random-guess baseline)

In [None]:
# Baseline that predicts each class uniformly at random, ignoring the features.
# random_state is fixed so the baseline F1 is reproducible across runs and so
# later predict() calls on this estimator return the same predictions.
dummy_clf = DummyClassifier(strategy="uniform", random_state=10)
dummy_clf.fit(X_train, y_train)
y_pred_dummy = dummy_clf.predict(X_test)
f1_dummy = f1_score(y_test, y_pred_dummy)
print("f1: ", f1_dummy)

In [None]:
# ROC curve (the legend reports the AUC) for the dummy baseline.
# metrics.plot_roc_curve was removed in scikit-learn 1.2;
# RocCurveDisplay.from_estimator is its replacement.
metrics.RocCurveDisplay.from_estimator(dummy_clf, X_test, y_test)

In [None]:
# Confusion matrices for the dummy baseline: raw counts, then row-normalized.
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),
]
# display_labels are applied in sorted label order, so index 0 names label 0
# (below the lead threshold) and index 1 names label 1 (lead).
class_names = ['not_lead', 'lead']
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        dummy_clf,
        X_test,
        y_test,
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)
    print(title)
    print(disp.confusion_matrix)
plt.show()

# Random Forest

In [None]:
# Train a 15-tree random forest (no depth limit, fixed seed) and report its
# F1 score on the held-out split.
clf = RandomForestClassifier(
    n_estimators=15,
    max_depth=None,
    random_state=1,
).fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("f1: ", f1)

In [None]:
# ROC curve (the legend reports the AUC) for the current classifier.
# metrics.plot_roc_curve was removed in scikit-learn 1.2;
# RocCurveDisplay.from_estimator is its replacement.
metrics.RocCurveDisplay.from_estimator(clf, X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrices for the current classifier: raw counts, then row-normalized.
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),
]
class_names = ['not_lead', 'lead']
# Despite their names, these two variables hold ROWS of the confusion matrix
# (row 0 = samples whose true label is 0, row 1 = true label 1), not metric
# values. The names are kept for any later cell that reads them.
precision = 0
recall = 0
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        clf,
        X_test,
        y_test,
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)
    print(title)
    print(disp.confusion_matrix)
    if normalize is None:
        # Save rows from the raw-count matrix only. Without this guard the
        # second pass overwrote them with row-normalized fractions, which are
        # not valid inputs for count-based precision/recall arithmetic.
        precision = disp.confusion_matrix[0]
        recall = disp.confusion_matrix[1]
plt.show()

In [None]:
# Raw confusion counts for the positive class (label 1).
# scikit-learn puts true labels on rows and predicted labels on columns, so
# with labels=[0, 1] the matrix is [[TN, FP], [FN, TP]]; ravel() yields that
# order. Passing labels explicitly keeps the matrix 2x2 even if one class is
# absent from y_test or y_pred.
# Computing from y_test / y_pred directly also keeps this cell re-runnable
# after `precision` and `recall` are rebound to floats further down.
TN, FP, FN, TP = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
print(TP, FP, FN, TN)

In [None]:
# Derive precision, recall and F1 by hand from the confusion counts, then
# print the hand-computed F1 next to the stored f1 value for comparison.
precision, recall = TP / (TP + FP), TP / (TP + FN)
print(precision, recall)
f1_ = (2 * precision * recall) / (precision + recall)
print(f1_, f1)

# SVM

In [None]:
# Train a support-vector classifier with default settings and report its F1
# score on the held-out split.
clf = svm.SVC()
clf.fit(X_train, y_train)
# Predict with the SVM itself; without this line the score below was computed
# from whatever y_pred an earlier cell had left behind.
y_pred = clf.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("f1: ", f1)

In [None]:
# ROC curve (the legend reports the AUC) for the current classifier.
# metrics.plot_roc_curve was removed in scikit-learn 1.2;
# RocCurveDisplay.from_estimator is its replacement.
metrics.RocCurveDisplay.from_estimator(clf, X_test, y_test)

In [None]:
# Confusion matrices for the current classifier: raw counts first, then
# normalized over the true labels (each row sums to 1).
class_names = ['not_lead', 'lead']
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),
]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        clf, X_test, y_test,
        display_labels=class_names, cmap=plt.cm.Blues, normalize=normalize,
    )
    disp.ax_.set_title(title)
    # Title and matrix on separate lines, as two consecutive prints would give.
    print(title, disp.confusion_matrix, sep="\n")
plt.show()