In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the CA accident data from the working directory and preview 10 rows.
# NOTE(review): the preview shows an "Unnamed: 0" column holding 0, 1, 2, ...,
# which looks like a row index saved into the CSV -- confirm, and consider
# reading with index_col=0.
full_data = pd.read_csv("full_data.csv")
full_data.head(10)

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance,Side,County,Pressure,Wind_Direction,Weather_Condition,Amenity,...,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,Year,Month,Day,Hour,Weekday,Time_Duration,Severity3
0,3,0.586512,0.216692,0.0,R,Solano,0.49046,VAR,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.112849,1
1,3,0.538525,0.231791,0.0,R,Alameda,0.49155,W,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.20788,1
2,2,0.56654,0.233155,0.0,R,Contra Costa,0.489915,N,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.112849,0
3,3,0.507039,0.237035,0.0,R,Santa Clara,0.491187,N,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.112849,1
4,2,0.4982,0.249386,0.0,R,Santa Clara,0.49155,VAR,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.219759,0
5,3,0.545908,0.249769,0.0,R,Alameda,0.49046,VAR,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.201941,1
6,3,0.50641,0.253332,0.0,R,Santa Clara,0.491187,VAR,Cloudy,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.20788,1
7,3,0.547768,0.194857,0.0,R,San Francisco,0.491187,NE,Cloudy,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.112849,1
8,2,0.56353,0.220454,0.0,R,Contra Costa,0.489915,N,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.112849,0
9,3,0.545841,0.260438,0.0,R,Alameda,0.49046,VAR,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.112849,1


# Part 1

In [3]:
#Import
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import label_binarize

In [4]:
# Per-model metric stores, keyed by model name; the test-set evaluation cells
# further down fill them in.
accuracy, precision, recall = {}, {}, {}
f1, fpr, tpr = {}, {}, {}

### 1. Set up training, validation and test data

In [5]:
# 1. Count the rows per (scaled) Year value to judge whether the data could be
#    separated by year.
full_data['Year'].value_counts()

0.00    80101
1.00    71262
0.25    51706
0.75    10206
Name: Year, dtype: int64

In [6]:
# Class balance of the Severity3 target within each Year value.
data = full_data.groupby(full_data['Year'])['Severity3'].value_counts()
data

Year  Severity3
0.00  0            43382
      1            36719
0.25  0            27942
      1            23764
0.75  0             6686
      1             3520
1.00  0            50053
      1            21209
Name: Severity3, dtype: int64

In [7]:
#2. MinMaxScaler
# Min-max scale the listed numeric columns and write them back into full_data.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split, so test rows contribute to the min/max. The head() preview also matches
# the one shown right after loading, which suggests the CSV was already scaled
# -- confirm whether this step is still needed.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features = ['Pressure','Distance','Start_Lng','Start_Lat','Year','Day','Hour','Time_Duration']
full_data[features] = scaler.fit_transform(full_data[features])
full_data.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance,Side,County,Pressure,Wind_Direction,Weather_Condition,Amenity,...,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,Year,Month,Day,Hour,Weekday,Time_Duration,Severity3
0,3,0.586512,0.216692,0.0,R,Solano,0.49046,VAR,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.112849,1
1,3,0.538525,0.231791,0.0,R,Alameda,0.49155,W,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.20788,1
2,2,0.56654,0.233155,0.0,R,Contra Costa,0.489915,N,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.112849,0
3,3,0.507039,0.237035,0.0,R,Santa Clara,0.491187,N,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.112849,1
4,2,0.4982,0.249386,0.0,R,Santa Clara,0.49155,VAR,Clear,False,...,False,False,Day,0.0,Jun,0.666667,0.434783,Tue,0.219759,0


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [9]:
#3. Split train set and test set
# Features are every column except the target (Severity3), Severity and Year.
x = full_data.drop(['Year', 'Severity', 'Severity3'], axis=1)
# "Unnamed: 0" is the unnamed first CSV column picked up by read_csv (0, 1, 2, ...
# in the preview): a row counter, not an accident attribute. Left in, it becomes
# an unscaled feature that encodes row order, so remove it when present.
x = x.drop(columns=['Unnamed: 0'], errors='ignore')
y = full_data['Severity3']

# Encode boolean flags as 1/0.
x = x.replace([True, False], [1,0])

# One-hot encode the categorical columns; drop_first=True leaves out one level
# per variable.
category = ['Side','Wind_Direction','Weekday', 'County', 'Weather_Condition', 'Sunrise_Sunset', 'Month']
x[category] = x[category].astype('category')
x = pd.get_dummies(x, columns=category, drop_first=True)

# 70/30 train/test split with a fixed seed.
x_Train, x_test, y_Train, y_test = train_test_split(x, y, test_size=0.30, random_state=88)
x_Train.shape,x_test.shape

((149292, 113), (63983, 113))

In [10]:
#4. Split train and validation set
# Hold out 10% of the training portion as a validation set (same seed as the
# train/test split).
X_train, x_val, Y_train, y_val = train_test_split(x_Train, y_Train, test_size=0.10, random_state=88)
X_train.shape,x_val.shape

((134362, 113), (14930, 113))

### 2. Logistic Regression Model

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# Tune C over 1e-2 ... 1e2 with 5-fold cross-validation, selecting on
# macro-averaged F1; class_weight is fixed to 'balanced'.
clf_base = LogisticRegression(max_iter = 1000)
grid = {'C': 10.0 ** np.arange(-2, 3),
        'class_weight': ['balanced']}
clf_lr = GridSearchCV(clf_base, grid, cv=5, n_jobs=8, scoring='f1_macro')

clf_lr.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=1000), n_jobs=8,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'class_weight': ['balanced']},
             scoring='f1_macro')

In [12]:
# Report the selected parameters and the search's score on the training and
# validation splits (the search was configured with scoring='f1_macro').
print("Best parameters scores:")
print(clf_lr.best_params_)
print("Train score:", clf_lr.score(X_train, Y_train))
print("Validation score:", clf_lr.score(x_val, y_val))

# Coefficients and intercept of the best estimator; stored but not used in
# this cell.
coef = clf_lr.best_estimator_.coef_
intercept = clf_lr.best_estimator_.intercept_
print (classification_report(y_val, clf_lr.predict(x_val)))

Best parameters scores:
{'C': 100.0, 'class_weight': 'balanced'}
Train score: 0.7074505882625046
Validation score: 0.7106317425328467
              precision    recall  f1-score   support

           0       0.84      0.64      0.73      8934
           1       0.60      0.81      0.69      5996

    accuracy                           0.71     14930
   macro avg       0.72      0.73      0.71     14930
weighted avg       0.74      0.71      0.71     14930



In [13]:
#Use the best model
# Fit a fresh LogisticRegression on the training split with the parameters the
# grid search selected. NOTE(review): clf_lr.best_estimator_ may already be this
# same model (GridSearchCV refits by default) -- confirm before dropping the refit.
lr = LogisticRegression(**clf_lr.best_params_, max_iter = 1000)
lr.fit(X_train, Y_train)

LogisticRegression(C=100.0, class_weight='balanced', max_iter=1000)

## 3. Random Forest Model

In [None]:
import time
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold

# Hyper-parameter grid for the random forest: number of trees x maximum depth.
grid_values = {"n_estimators": [50, 100, 200, 500], "max_depth": [5, 10, 15, 30]}

tic = time.time()

# Severity3 is a 0/1 class label and the forest fitted and evaluated later is a
# RandomForestClassifier, so tune a classifier (not a regressor) and select on
# macro F1, the same criterion as the logistic-regression search, instead of R^2.
rf2 = RandomForestClassifier(random_state=333)

cv = KFold(n_splits=5,random_state=333,shuffle=True)
grid = GridSearchCV(rf2, param_grid=grid_values, scoring='f1_macro', cv=cv,verbose=2)
grid.fit(X_train, Y_train)

toc = time.time()

# Wall-clock time of the search, the winning parameters, and the macro-F1 of the
# refitted best model on the training and validation splits.
print('time:', round(toc-tic, 2),'s')
print("Best parameters scores:")
print(grid.best_params_)
print("Train score:", grid.score(X_train, Y_train))
print("Validation score:", grid.score(x_val, y_val))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] max_depth=5, n_estimators=50 ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..................... max_depth=5, n_estimators=50, total=  24.2s
[CV] max_depth=5, n_estimators=50 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.2s remaining:    0.0s


[CV] ..................... max_depth=5, n_estimators=50, total=  25.4s
[CV] max_depth=5, n_estimators=50 ....................................
[CV] ..................... max_depth=5, n_estimators=50, total=  24.6s
[CV] max_depth=5, n_estimators=50 ....................................
[CV] ..................... max_depth=5, n_estimators=50, total=  24.6s
[CV] max_depth=5, n_estimators=50 ....................................
[CV] ..................... max_depth=5, n_estimators=50, total=  24.9s
[CV] max_depth=5, n_estimators=100 ...................................
[CV] .................... max_depth=5, n_estimators=100, total=  50.5s
[CV] max_depth=5, n_estimators=100 ...................................
[CV] .................... max_depth=5, n_estimators=100, total=  49.0s
[CV] max_depth=5, n_estimators=100 ...................................
[CV] .................... max_depth=5, n_estimators=100, total=  49.6s
[CV] max_depth=5, n_estimators=100 ...................................
[CV] .

In [None]:
# Re-report the random-forest search results. toc is sampled again here while
# tic comes from the search cell, so the printed time covers everything since
# that cell started, not only the search itself.
toc = time.time()

print('time:', round(toc-tic, 2),'s')
print("Best parameters scores:")
print(grid.best_params_)
print("Train score:", grid.score(X_train, Y_train))
print("Validation score:", grid.score(x_val, y_val))

In [None]:
# Final random forest, fitted on the training split with fixed hyper-parameters.
# A seed is set so the fitted forest (and every metric derived from it) is
# reproducible; n_jobs=-1 only parallelises tree building.
# NOTE(review): max_depth / n_estimators are hard-coded rather than read from
# grid.best_params_ -- confirm they match the search result.
rf2 = RandomForestClassifier(max_depth = 30, n_estimators = 500, random_state = 88, n_jobs = -1)

# These are not default-parameter scores, so label them by model instead.
print("Random Forest scores:")
rf2.fit(X_train, Y_train)
# .score() on a classifier is mean accuracy.
print("Train score:", rf2.score(X_train, Y_train))
print("Validation score:", rf2.score(x_val, y_val))

## 4. Naive Bayes

In [None]:
# Gaussian Naive Bayes baseline with default settings, fitted on the training
# split; .score() reports mean accuracy.
gnb = GaussianNB()
gnb.fit(X_train, Y_train)

print("Train score:", gnb.score(X_train, Y_train))
print("Validation score:", gnb.score(x_val, y_val))

## 5. Test Set Performance

### (a) Logistic Regression Model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Evaluate the logistic regression on the held-out test split and record its
# accuracy and macro-averaged F1 under the model's name.
y_pred = lr.predict(x_test)

accuracy["Logistic Regression"] = accuracy_score(y_test, y_pred)
f1["Logistic Regression"] = f1_score(y_test, y_pred, average="macro")

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the logistic regression on the test split.
y_pred = lr.predict(x_test)
# Pin the row/column order to the two Severity3 classes so the labels below
# always line up with the matrix.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])

# The target is the binary Severity3 column (0/1), not Severity levels 1 and 2,
# so name the axes after the classes that are actually predicted.
index = ["Actual Severity3 = 0", "Actual Severity3 = 1"]
columns = ["Predicted Severity3 = 0", "Predicted Severity3 = 1"]
conf_matrix = pd.DataFrame(data=confmat, columns=columns, index=index)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Confusion Matrix - Logistic Regression")
plt.show()

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.preprocessing import OneHotEncoder
# Build a one-hot version of the test labels -- presumably two columns, to line
# up with the predict_proba columns; confirm the shapes.
enc = OneHotEncoder()
Y = label_binarize(y_test, classes=[0, 1])
Y = enc.fit_transform(Y).toarray()

y_score = lr.predict_proba(x_test)
print(y_score)
# Both arrays are flattened before the curves are computed, so the PR and ROC
# curves pool all columns together rather than describing one class alone.
precision["Logistic Regression"], recall["Logistic Regression"], _ = precision_recall_curve(Y.ravel(), y_score.ravel())
fpr["Logistic Regression"], tpr["Logistic Regression"], _ = roc_curve(Y.ravel(), y_score.ravel())

plt.step(recall["Logistic Regression"], precision["Logistic Regression"], where="post")

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.xlim([0, 1])
plt.ylim([0, 1.01])
plt.title("PR Curve - Logisitc Regression")
plt.show()

In [None]:
# ROC curve of the logistic regression, drawn from the stored fpr/tpr arrays.
fig, ax = plt.subplots()
ax.step(fpr["Logistic Regression"], tpr["Logistic Regression"], where="post")
ax.set_title("ROC curve - Logistic Regression")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_xlim([-0.01, 1.01])
ax.set_ylim([-0.01, 1.01])
plt.show()

### (b) Random Forest 

In [None]:
# Evaluate the random forest on the held-out test split and record its accuracy
# and macro-averaged F1 under the model's name.
y_pred = rf2.predict(x_test)

accuracy["Random Forest"] = accuracy_score(y_test.values, y_pred)
f1["Random Forest"] = f1_score(y_test, y_pred, average="macro")

print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the random forest on the test split (uses the y_pred
# computed in the previous cell). Row/column order is pinned to the two
# Severity3 classes so the labels below always line up with the matrix.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])

# The target is the binary Severity3 column (0/1), not Severity levels 1 and 2,
# so name the axes after the classes that are actually predicted.
index = ["Actual Severity3 = 0", "Actual Severity3 = 1"]
columns = ["Predicted Severity3 = 0", "Predicted Severity3 = 1"]
conf_matrix = pd.DataFrame(data=confmat, columns=columns, index=index)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Confusion Matrix - Random Forest")
plt.show()

In [None]:
# Rank the features by the forest's importance scores and plot the top 30.
importances = pd.DataFrame(
    {"importance": rf2.feature_importances_},
    index=X_train.columns,
)
importances = importances.sort_values(by="importance", ascending=False).head(30)

plt.figure(figsize=(15, 10))
sns.barplot(x="importance", y=importances.index, data=importances)
plt.show()

In [None]:
#PR curve
# One-hot encode the test labels and flatten them together with the forest's
# predict_proba output, so the stored PR and ROC curves pool all columns rather
# than describing one class alone.
enc = OneHotEncoder()
Y = label_binarize(y_test, classes=[0, 1])
Y = enc.fit_transform(Y).toarray()

y_score = rf2.predict_proba(x_test)

precision["Random Forest"], recall["Random Forest"], _ = precision_recall_curve(Y.ravel(), y_score.ravel())
fpr["Random Forest"], tpr["Random Forest"], _ = roc_curve(Y.ravel(), y_score.ravel())

plt.step(recall["Random Forest"], precision["Random Forest"], where="post")

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.xlim([0, 1])
plt.ylim([0, 1.01])
plt.title("PR Curve - Random Forest")
plt.show()

In [None]:
# ROC curve of the random forest, drawn from the stored fpr/tpr arrays.
fig, ax = plt.subplots()
ax.step(fpr["Random Forest"], tpr["Random Forest"], where="post")
ax.set_title("ROC curve - Random Forest")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_xlim([0, 1])
ax.set_ylim([0, 1.01])
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

y_pred = rf2.predict(x_test)
# ROC AUC needs a continuous score: hard 0/1 predictions reduce the curve to a
# single operating point. Score with the predicted probability of class 1
# (second predict_proba column).
y_score_pos = rf2.predict_proba(x_test)[:, 1]
print("AUC score:", roc_auc_score(y_test, y_score_pos))

### (c) Naive Bayes

In [None]:
# Evaluate Gaussian Naive Bayes on the held-out test split and record its
# accuracy and macro-averaged F1 under the model's name.
y_pred = gnb.predict(x_test)

accuracy["Gaussian Naive Bayes"] = accuracy_score(y_test, y_pred)
f1["Gaussian Naive Bayes"] = f1_score(y_test, y_pred, average="macro")

print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of Gaussian Naive Bayes on the test split.
y_pred = gnb.predict(x_test)
# Pin the row/column order to the two Severity3 classes so the labels below
# always line up with the matrix.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])

# The target is the binary Severity3 column (0/1), not Severity levels 1 and 2,
# so name the axes after the classes that are actually predicted.
index = ["Actual Severity3 = 0", "Actual Severity3 = 1"]
columns = ["Predicted Severity3 = 0", "Predicted Severity3 = 1"]
conf_matrix = pd.DataFrame(data=confmat, columns=columns, index=index)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Confusion Matrix - Gaussian Naive Bayes")
plt.show()

In [None]:
# One-hot encode the test labels and flatten them together with the model's
# predict_proba output, so the stored PR and ROC curves pool all columns rather
# than describing one class alone.
enc = OneHotEncoder()
Y = label_binarize(y_test, classes=[0, 1])
Y = enc.fit_transform(Y).toarray()

y_score = gnb.predict_proba(x_test)

precision["Gaussian Naive Bayes"], recall["Gaussian Naive Bayes"], _ = precision_recall_curve(Y.ravel(), y_score.ravel())
fpr["Gaussian Naive Bayes"], tpr["Gaussian Naive Bayes"], _ = roc_curve(Y.ravel(), y_score.ravel())

plt.figure(figsize=(18, 10))
plt.step(recall["Gaussian Naive Bayes"], precision["Gaussian Naive Bayes"], where="post")

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.xlim([0, 1])
plt.ylim([0, 1.01])
plt.title("PR Curve - Gaussian Naive Bayes")
plt.show()

### (d) Result

In [None]:
# The accuracy values were computed from y_test predictions, so the chart shows
# test-set (not validation-set) accuracy.
plt.title("Accuracy on Test set for each model")
# Pass x/y by keyword (current seaborn rejects positional data vectors) and use
# the model names directly as categories, which also labels the ticks.
sns.barplot(x=list(accuracy.keys()), y=list(accuracy.values()))
plt.show()

In [None]:
# The F1 values were computed from y_test predictions, so the chart shows
# test-set (not validation-set) macro F1.
plt.title("F1 Score on Test set for each model")
# Pass x/y by keyword (current seaborn rejects positional data vectors) and use
# the model names directly as categories, which also labels the ticks.
sns.barplot(x=list(f1.keys()), y=list(f1.values()))
plt.show()

In [None]:
# Overlay the PR curves of every model that has an F1 entry; each of those
# models must also have precision/recall entries or the lookup raises KeyError.
plt.figure(figsize=(15, 8))
for key in f1.keys():
    plt.step(recall[key], precision[key], where="post", label=key)

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.xlim([0, 1])
plt.ylim([0, 1.01])
plt.title("PR curve")
plt.legend()
plt.show()

In [None]:
# Overlay the ROC curves of every model that has an F1 entry; each of those
# models must also have fpr/tpr entries or the lookup raises KeyError.
plt.figure(figsize=(15, 8))
for key in f1.keys():
    plt.step(fpr[key], tpr[key], where="post", label=key)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0, 1])
plt.ylim([0, 1.01])
plt.title("ROC curve")
plt.legend()
plt.show()

# Part 2 ---- Add Population Density

## 1. Adding Variable and Split Data

In [None]:
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import label_binarize

In [None]:
# Load the population-density table and min-max scale its
# 'Population Density' column in place.
CA_popdensity = pd.read_csv("CA_pop_density.csv")
scaler = MinMaxScaler()
features = ['Population Density']
CA_popdensity[features] = scaler.fit_transform(CA_popdensity[features])
# NOTE(review): this displays the whole frame; .head() may be enough.
CA_popdensity

In [None]:
# Join the population-density table onto the accident data. With no `on=` or
# `how=`, pandas joins on every column name the two frames share and keeps only
# matching rows (inner join).
# NOTE(review): presumably the shared key is County -- confirm, and compare the
# row count with full_data to make sure no accidents were dropped.
full_data_new = pd.merge(full_data, CA_popdensity)
full_data_new

In [None]:
#3. Split train set and test set
# Features are every column except the target (Severity3), Severity and Year.
x = full_data_new.drop(['Year', 'Severity', 'Severity3'], axis=1)
# "Unnamed: 0" is the unnamed first CSV column picked up by read_csv (0, 1, 2, ...
# in the preview): a row counter, not an accident attribute. Left in, it becomes
# an unscaled feature that encodes row order, so remove it when present.
x = x.drop(columns=['Unnamed: 0'], errors='ignore')
y = full_data_new['Severity3']

# Encode boolean flags as 1/0.
x = x.replace([True, False], [1,0])

# One-hot encode the categorical columns; drop_first=True leaves out one level
# per variable.
category = ['Side','Wind_Direction','Weekday', 'County', 'Weather_Condition', 'Sunrise_Sunset', 'Month']
x[category] = x[category].astype('category')
x = pd.get_dummies(x, columns=category, drop_first=True)

# 70/30 train/test split with a fixed seed.
x_Train, x_test, y_Train, y_test = train_test_split(x, y, test_size=0.30, random_state=88)
x_Train.shape,x_test.shape

In [None]:
#4. Split train and validation set
# Hold out 10% of the training portion as a validation set (same seed as the
# train/test split).
X_train, x_val, Y_train, y_val = train_test_split(x_Train, y_Train, test_size=0.10, random_state=88)
X_train.shape,x_val.shape

## 2. Logistic Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# Same search as in Part 1, now on the data with population density: tune C
# over 1e-2 ... 1e2 with 5-fold cross-validation, selecting on macro F1.
clf_base = LogisticRegression(max_iter = 1000)
grid = {'C': 10.0 ** np.arange(-2, 3),
        'class_weight': ['balanced']}
clf_lr = GridSearchCV(clf_base, grid, cv=5, n_jobs=8, scoring='f1_macro')

clf_lr.fit(X_train, Y_train)

In [None]:
# Report the selected parameters and the search's score on the training and
# validation splits (the search was configured with scoring='f1_macro').
print("Best parameters scores:")
print(clf_lr.best_params_)
print("Train score:", clf_lr.score(X_train, Y_train))
print("Validation score:", clf_lr.score(x_val, y_val))

# Coefficients and intercept of the best estimator; stored but not used in
# this cell.
coef = clf_lr.best_estimator_.coef_
intercept = clf_lr.best_estimator_.intercept_
print (classification_report(y_val, clf_lr.predict(x_val)))

In [None]:
#Use the best model
# Fit a fresh LogisticRegression on the training split (which now includes
# population density) with the parameters the grid search selected.
lr_population = LogisticRegression(**clf_lr.best_params_, max_iter = 1000)
lr_population.fit(X_train, Y_train)

## 3. Random Forest Model

In [None]:
import time
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold

# Hyper-parameter grid for the random forest: number of trees x maximum depth.
grid_values = {"n_estimators": [50, 100, 200, 500], "max_depth": [5, 10, 15, 30]}

tic = time.time()

# Seeded so the search is reproducible.
rf2 = RandomForestClassifier(random_state=333)

cv = KFold(n_splits=5,random_state=333,shuffle=True)
# Severity3 is a 0/1 class label, so select on macro F1 (the same criterion as
# the logistic-regression search) rather than the regression metric R^2.
grid = GridSearchCV(rf2, param_grid=grid_values, scoring='f1_macro', cv=cv,verbose=2)
grid.fit(X_train, Y_train)

toc = time.time()

# Wall-clock time of the search, the winning parameters, and the macro-F1 of the
# refitted best model on the training and validation splits.
print('time:', round(toc-tic, 2),'s')
print("Best parameters scores:")
print(grid.best_params_)
print("Train score:", grid.score(X_train, Y_train))
print("Validation score:", grid.score(x_val, y_val))

In [None]:
# Final random forest for the population-density data, fitted on the training
# split with fixed hyper-parameters. A seed is set so the fitted forest (and
# every metric derived from it) is reproducible; n_jobs=-1 only parallelises
# tree building.
# NOTE(review): max_depth / n_estimators are hard-coded rather than read from
# grid.best_params_ -- confirm they match the search result.
rf_population = RandomForestClassifier(max_depth = 30, n_estimators = 500, random_state = 88, n_jobs = -1)

# These are not default-parameter scores, so label them by model instead.
print("Random Forest scores:")
rf_population.fit(X_train, Y_train)
# .score() on a classifier is mean accuracy.
print("Train score:", rf_population.score(X_train, Y_train))
print("Validation score:", rf_population.score(x_val, y_val))

## 4. Naive Bayes

In [None]:
# Gaussian Naive Bayes baseline with default settings, fitted on the training
# split that includes population density; .score() reports mean accuracy.
gnb_population = GaussianNB()
gnb_population.fit(X_train, Y_train)

print("Train score:", gnb_population.score(X_train, Y_train))
print("Validation score:", gnb_population.score(x_val, y_val))

## 5. Test Performance

### (a). Logistic Regression Model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Evaluate the population-density logistic regression on the test split.
# The results are stored under the same "Logistic Regression" key used in
# Part 1, so the Part 1 entries in accuracy/f1 are replaced.
y_pred = lr_population.predict(x_test)

accuracy["Logistic Regression"] = accuracy_score(y_test, y_pred)
f1["Logistic Regression"] = f1_score(y_test, y_pred, average="macro")

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the population-density logistic regression on the test split.
y_pred = lr_population.predict(x_test)
# Pin the row/column order to the two Severity3 classes so the labels below
# always line up with the matrix.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])

# The target is the binary Severity3 column (0/1), not Severity levels 1 and 2,
# so name the axes after the classes that are actually predicted.
index = ["Actual Severity3 = 0", "Actual Severity3 = 1"]
columns = ["Predicted Severity3 = 0", "Predicted Severity3 = 1"]
conf_matrix = pd.DataFrame(data=confmat, columns=columns, index=index)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Confusion Matrix - Logistic Regression")
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.preprocessing import OneHotEncoder, label_binarize

# The "Logistic Regression" curve entries were last written by the Part 1 model
# and Part 2 never recomputed them, so this plot (and the combined PR/ROC plots
# at the end) would show the old model. Recompute them from lr_population using
# the same pooled one-hot construction as the other curve cells.
enc = OneHotEncoder()
Y = label_binarize(y_test, classes=[0, 1])
Y = enc.fit_transform(Y).toarray()

y_score = lr_population.predict_proba(x_test)

precision["Logistic Regression"], recall["Logistic Regression"], _ = precision_recall_curve(Y.ravel(), y_score.ravel())
fpr["Logistic Regression"], tpr["Logistic Regression"], _ = roc_curve(Y.ravel(), y_score.ravel())

plt.step(fpr["Logistic Regression"], tpr["Logistic Regression"], where="post")

plt.title("ROC curve - Logistic Regression (population)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.show()

### (b). Random Forest

In [None]:
# Evaluate the population-density random forest on the test split.
# The results are stored under the same "Random Forest" key used in Part 1,
# so the Part 1 entries in accuracy/f1 are replaced.
y_pred = rf_population.predict(x_test)

accuracy["Random Forest"] = accuracy_score(y_test.values, y_pred)
f1["Random Forest"] = f1_score(y_test, y_pred, average="macro")

print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the population-density random forest on the test split
# (uses the y_pred computed in the previous cell). Row/column order is pinned to
# the two Severity3 classes so the labels below always line up with the matrix.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])

# The target is the binary Severity3 column (0/1), not Severity levels 1 and 2,
# so name the axes after the classes that are actually predicted.
index = ["Actual Severity3 = 0", "Actual Severity3 = 1"]
columns = ["Predicted Severity3 = 0", "Predicted Severity3 = 1"]
conf_matrix = pd.DataFrame(data=confmat, columns=columns, index=index)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Confusion Matrix - Random Forest")
plt.show()

In [None]:
# Rank the features by the forest's importance scores and plot the top 30.
importances = pd.DataFrame(
    {"importance": rf_population.feature_importances_},
    index=X_train.columns,
)
importances = importances.sort_values(by="importance", ascending=False).head(30)

plt.figure(figsize=(15, 10))
sns.barplot(x="importance", y=importances.index, data=importances)
plt.show()

In [None]:
# One-hot encode the test labels and flatten them together with the forest's
# predict_proba output, so the stored PR and ROC curves pool all columns rather
# than describing one class alone. Overwrites the Part 1 "Random Forest" curves.
enc = OneHotEncoder()
Y = label_binarize(y_test, classes=[0, 1])
Y = enc.fit_transform(Y).toarray()

y_score = rf_population.predict_proba(x_test)

precision["Random Forest"], recall["Random Forest"], _ = precision_recall_curve(Y.ravel(), y_score.ravel())
fpr["Random Forest"], tpr["Random Forest"], _ = roc_curve(Y.ravel(), y_score.ravel())

plt.step(recall["Random Forest"], precision["Random Forest"], where="post")

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.xlim([0, 1])
plt.ylim([0, 1.01])
plt.title("PR Curve - Random Forest(population)")
plt.show()

In [None]:
# ROC curve of the random forest, drawn from the stored fpr/tpr arrays.
fig, ax = plt.subplots()
ax.step(fpr["Random Forest"], tpr["Random Forest"], where="post")
ax.set_title("ROC curve - Random Forest(population)")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_xlim([0, 1])
ax.set_ylim([0, 1.01])
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

y_pred = rf_population.predict(x_test)
# ROC AUC needs a continuous score: hard 0/1 predictions reduce the curve to a
# single operating point. Score with the predicted probability of class 1
# (second predict_proba column).
y_score_pos = rf_population.predict_proba(x_test)[:, 1]
print("AUC score:", roc_auc_score(y_test, y_score_pos))

### (c). Naive Bayes

In [None]:
# Evaluate the population-density Gaussian Naive Bayes on the test split.
# The results are stored under the same "Gaussian Naive Bayes" key used in
# Part 1, so the Part 1 entries in accuracy/f1 are replaced.
y_pred = gnb_population.predict(x_test)

accuracy["Gaussian Naive Bayes"] = accuracy_score(y_test, y_pred)
f1["Gaussian Naive Bayes"] = f1_score(y_test, y_pred, average="macro")

print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the population-density Gaussian Naive Bayes on the test split.
y_pred = gnb_population.predict(x_test)
# Pin the row/column order to the two Severity3 classes so the labels below
# always line up with the matrix.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])

# The target is the binary Severity3 column (0/1), not Severity levels 1 and 2,
# so name the axes after the classes that are actually predicted.
index = ["Actual Severity3 = 0", "Actual Severity3 = 1"]
columns = ["Predicted Severity3 = 0", "Predicted Severity3 = 1"]
conf_matrix = pd.DataFrame(data=confmat, columns=columns, index=index)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Confusion Matrix - Gaussian Naive Bayes")
plt.show()

In [None]:
# One-hot encode the test labels and flatten them together with the model's
# predict_proba output, so the stored PR and ROC curves pool all columns rather
# than describing one class alone. Overwrites the Part 1 "Gaussian Naive Bayes"
# curves.
enc = OneHotEncoder()
Y = label_binarize(y_test, classes=[0, 1])
Y = enc.fit_transform(Y).toarray()

y_score = gnb_population.predict_proba(x_test)

precision["Gaussian Naive Bayes"], recall["Gaussian Naive Bayes"], _ = precision_recall_curve(Y.ravel(), y_score.ravel())
fpr["Gaussian Naive Bayes"], tpr["Gaussian Naive Bayes"], _ = roc_curve(Y.ravel(), y_score.ravel())

plt.figure(figsize=(18, 10))
plt.step(recall["Gaussian Naive Bayes"], precision["Gaussian Naive Bayes"], where="post")

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.xlim([0, 1])
plt.ylim([0, 1.01])
plt.title("PR Curve - Gaussian Naive Bayes")
plt.show()

# Overall Result

In [None]:
# The accuracy values were computed from y_test predictions, so the chart shows
# test-set (not validation-set) accuracy.
plt.title("Accuracy on Test set for each model")
# Pass x/y by keyword (current seaborn rejects positional data vectors) and use
# the model names directly as categories, which also labels the ticks.
sns.barplot(x=list(accuracy.keys()), y=list(accuracy.values()))
plt.show()

In [None]:
# The F1 values were computed from y_test predictions, so the chart shows
# test-set (not validation-set) macro F1.
plt.title("F1 Score on Test set for each model")
# Pass x/y by keyword (current seaborn rejects positional data vectors) and use
# the model names directly as categories, which also labels the ticks.
sns.barplot(x=list(f1.keys()), y=list(f1.values()))
plt.show()

In [None]:
# Overlay the PR curves of every model that has an F1 entry.
# NOTE(review): the dictionaries are shared with Part 1 -- verify that every
# model's precision/recall entry was recomputed for the Part 2 models.
for key in f1.keys():
    plt.step(recall[key], precision[key], where="post", label=key)

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.xlim([0, 1])
plt.ylim([0, 1.01])
plt.title("PR curve")
plt.legend()
plt.show()

In [None]:
# Overlay the ROC curves of every model that has an F1 entry.
# NOTE(review): the dictionaries are shared with Part 1 -- verify that every
# model's fpr/tpr entry was recomputed for the Part 2 models.
for key in f1.keys():
    plt.step(fpr[key], tpr[key], where="post", label=key)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0, 1])
plt.ylim([0, 1.01])
plt.title("ROC curve")
plt.legend()
plt.show()