In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split


In [32]:
# Load the modelling dataset and apply basic cleaning

# Read bbNoText_pop.csv and replace every NaN with 0
bbSimple = pd.read_csv("bbNoText_pop.csv").fillna(0)

# Parse Date into datetime format, and take the log of population so it sits
# on a scale similar to the other variables.
# NOTE(review): NaNs were filled with 0 above and np.log(0) is -inf -- confirm
# the population column never contains missing or zero values.
bbSimple = bbSimple.assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    population=lambda frame: np.log(frame['population']),
)


In [54]:
# Split the data into training (80%) and testing (20%) sets with a fixed seed
train_set, test_set = train_test_split(bbSimple, test_size=0.2, random_state=755)

# Column holding the label the classifiers predict
TARGET_COLUMN = 'econ_index_change_dummy'

# Columns excluded from the feature matrix (the target plus the other
# non-predictor columns). Defined once so the training and testing feature
# sets can never drift apart.
NON_FEATURE_COLUMNS = [
    TARGET_COLUMN,
    'econ_index',
    'econ_index_change',
    'Year',
    'District',
    'file_name',
    'Date',
    'sentiment_trigram',
    'variance_sentiment',
]

# Divide up X and Y variables in testing and training sets
X_train = train_set.drop(columns=NON_FEATURE_COLUMNS)
y_train = train_set[TARGET_COLUMN]
X_test = test_set.drop(columns=NON_FEATURE_COLUMNS)
y_test = test_set[TARGET_COLUMN]

## Random forest

In [55]:
# Random forest with 100 trees and a fixed random_state
model = RandomForestClassifier(n_estimators=100, random_state=755)
model.fit(X_train, y_train)

# Class predictions and second-column (label 1) probabilities on the test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Test-set metrics; specificity is recall measured on the 0 class,
# and AUC is computed from the predicted probabilities
conf_matrixRF = confusion_matrix(y_test, y_pred)
aucRF = roc_auc_score(y_test, y_prob)
recallRF = recall_score(y_test, y_pred)
specificityRF = recall_score(y_test, y_pred, pos_label=0)
precisionRF = precision_score(y_test, y_pred)
accuracyRF = accuracy_score(y_test, y_pred)

# Report each metric rounded to 2 decimal places, then the confusion matrix
for label, value in [
    ("Accuracy:", accuracyRF),
    ("Precision:", precisionRF),
    ("Recall:", recallRF),
    ("Specificity:", specificityRF),
    ("AUC:", aucRF),
]:
    print(label, round(value, 2))
print("Confusion Matrix:")
print(conf_matrixRF)


Accuracy: 0.85
Precision: 0.88
Recall: 0.96
Specificity: 0.25
AUC: 0.79
Confusion Matrix:
[[ 36 106]
 [ 35 763]]


## Support Vector Machine

In [56]:
from sklearn.svm import SVC

# Support vector classifier; probability=True enables predict_proba for the AUC.
# NOTE(review): the features are passed in without any standardization step
# visible in this notebook -- consider scaling, since SVMs are scale-sensitive.
model = SVC(probability=True, random_state=755)
model.fit(X_train, y_train)

# Class predictions and second-column (label 1) probabilities on the test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Test-set metrics; specificity is recall measured on the 0 class,
# and AUC is computed from the predicted probabilities
conf_matrixSVM = confusion_matrix(y_test, y_pred)
aucSVM = roc_auc_score(y_test, y_prob)
recallSVM = recall_score(y_test, y_pred)
specificitySVM = recall_score(y_test, y_pred, pos_label=0)
precisionSVM = precision_score(y_test, y_pred)
accuracySVM = accuracy_score(y_test, y_pred)

# Report each metric rounded to 2 decimal places, then the confusion matrix
for label, value in [
    ("Accuracy:", accuracySVM),
    ("Precision:", precisionSVM),
    ("Recall:", recallSVM),
    ("Specificity:", specificitySVM),
    ("AUC:", aucSVM),
]:
    print(label, round(value, 2))
print("Confusion Matrix:")
print(conf_matrixSVM)


Accuracy: 0.85
Precision: 0.87
Recall: 0.96
Specificity: 0.22
AUC: 0.68
Confusion Matrix:
[[ 31 111]
 [ 34 764]]


## Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default settings and a fixed random_state
model = LogisticRegression(random_state=755)
model.fit(X_train, y_train)

# Class predictions and second-column (label 1) probabilities on the test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Test-set metrics; specificity is recall measured on the 0 class,
# and AUC is computed from the predicted probabilities
conf_matrixLogit = confusion_matrix(y_test, y_pred)
aucLogit = roc_auc_score(y_test, y_prob)
recallLogit = recall_score(y_test, y_pred)
specificityLogit = recall_score(y_test, y_pred, pos_label=0)
precisionLogit = precision_score(y_test, y_pred)
accuracyLogit = accuracy_score(y_test, y_pred)

# Report each metric rounded to 2 decimal places, then the confusion matrix
for label, value in [
    ("Accuracy:", accuracyLogit),
    ("Precision:", precisionLogit),
    ("Recall:", recallLogit),
    ("Specificity:", specificityLogit),
    ("AUC:", aucLogit),
]:
    print(label, round(value, 2))
print("Confusion Matrix:")
print(conf_matrixLogit)

Accuracy: 0.84
Precision: 0.87
Recall: 0.96
Specificity: 0.18
AUC: 0.76
Confusion Matrix:
[[ 25 117]
 [ 31 767]]


## k-Nearest Neighbors

In [58]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training points.
# NOTE(review): no feature standardization is visible in this notebook --
# consider scaling, since distance-based models are scale-sensitive.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Class predictions and second-column (label 1) probabilities on the test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Test-set metrics; specificity is recall measured on the 0 class,
# and AUC is computed from the predicted probabilities
conf_matrixKNN = confusion_matrix(y_test, y_pred)
aucKNN = roc_auc_score(y_test, y_prob)
recallKNN = recall_score(y_test, y_pred)
specificityKNN = recall_score(y_test, y_pred, pos_label=0)
precisionKNN = precision_score(y_test, y_pred)
accuracyKNN = accuracy_score(y_test, y_pred)

# Report each metric rounded to 2 decimal places, then the confusion matrix
for label, value in [
    ("Accuracy:", accuracyKNN),
    ("Precision:", precisionKNN),
    ("Recall:", recallKNN),
    ("Specificity:", specificityKNN),
    ("AUC:", aucKNN),
]:
    print(label, round(value, 2))
print("Confusion Matrix:")
print(conf_matrixKNN)

Accuracy: 0.85
Precision: 0.88
Recall: 0.95
Specificity: 0.28
AUC: 0.7
Confusion Matrix:
[[ 40 102]
 [ 38 760]]


## XGBoost

In [59]:
from xgboost import XGBClassifier

# Gradient-boosted trees with library-default settings and a fixed random_state
model = XGBClassifier(random_state=755)
model.fit(X_train, y_train)

# Class predictions and second-column (label 1) probabilities on the test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Test-set metrics; specificity is recall measured on the 0 class,
# and AUC is computed from the predicted probabilities
conf_matrixXG = confusion_matrix(y_test, y_pred)
aucXG = roc_auc_score(y_test, y_prob)
recallXG = recall_score(y_test, y_pred)
specificityXG = recall_score(y_test, y_pred, pos_label=0)
precisionXG = precision_score(y_test, y_pred)
accuracyXG = accuracy_score(y_test, y_pred)

# Report each metric rounded to 2 decimal places, then the confusion matrix
for label, value in [
    ("Accuracy:", accuracyXG),
    ("Precision:", precisionXG),
    ("Recall:", recallXG),
    ("Specificity:", specificityXG),
    ("AUC:", aucXG),
]:
    print(label, round(value, 2))
print("Confusion Matrix:")
print(conf_matrixXG)

Accuracy: 0.86
Precision: 0.88
Recall: 0.96
Specificity: 0.28
AUC: 0.8
Confusion Matrix:
[[ 40 102]
 [ 31 767]]


## AdaBoost

In [60]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost ensemble of 100 estimators with a fixed random_state
model = AdaBoostClassifier(n_estimators=100, random_state=755)
model.fit(X_train, y_train)

# Class predictions and second-column (label 1) probabilities on the test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Test-set metrics; specificity is recall measured on the 0 class,
# and AUC is computed from the predicted probabilities
conf_matrixADA = confusion_matrix(y_test, y_pred)
aucADA = roc_auc_score(y_test, y_prob)
recallADA = recall_score(y_test, y_pred)
specificityADA = recall_score(y_test, y_pred, pos_label=0)
precisionADA = precision_score(y_test, y_pred)
accuracyADA = accuracy_score(y_test, y_pred)

# Report each metric rounded to 2 decimal places, then the confusion matrix
for label, value in [
    ("Accuracy:", accuracyADA),
    ("Precision:", precisionADA),
    ("Recall:", recallADA),
    ("Specificity:", specificityADA),
    ("AUC:", aucADA),
]:
    print(label, round(value, 2))
print("Confusion Matrix:")
print(conf_matrixADA)

Accuracy: 0.85
Precision: 0.88
Recall: 0.95
Specificity: 0.25
AUC: 0.78
Confusion Matrix:
[[ 35 107]
 [ 36 762]]


## Artificial Neural Network

In [61]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with library-default settings and a fixed random_state
model = MLPClassifier(random_state=755)
model.fit(X_train, y_train)

# Class predictions and second-column (label 1) probabilities on the test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Test-set metrics; specificity is recall measured on the 0 class,
# and AUC is computed from the predicted probabilities
conf_matrixANN = confusion_matrix(y_test, y_pred)
aucANN = roc_auc_score(y_test, y_prob)
recallANN = recall_score(y_test, y_pred)
specificityANN = recall_score(y_test, y_pred, pos_label=0)
precisionANN = precision_score(y_test, y_pred)
accuracyANN = accuracy_score(y_test, y_pred)

# Report each metric rounded to 2 decimal places, then the confusion matrix
for label, value in [
    ("Accuracy:", accuracyANN),
    ("Precision:", precisionANN),
    ("Recall:", recallANN),
    ("Specificity:", specificityANN),
    ("AUC:", aucANN),
]:
    print(label, round(value, 2))
print("Confusion Matrix:")
print(conf_matrixANN)

Accuracy: 0.84
Precision: 0.87
Recall: 0.96
Specificity: 0.17
AUC: 0.76
Confusion Matrix:
[[ 24 118]
 [ 30 768]]


## Comparing results

In [72]:
# Side-by-side comparison of the seven classifiers on the test set,
# reporting Accuracy, Recall, Specificity and AUC

# One record per model: (name, accuracy, recall, specificity, auc)
model_rows = [
    ('Random Forest', accuracyRF, recallRF, specificityRF, aucRF),
    ('Support Vector Machine', accuracySVM, recallSVM, specificitySVM, aucSVM),
    ('Logistic Regression', accuracyLogit, recallLogit, specificityLogit, aucLogit),
    ('K-Nearest-Neighbors', accuracyKNN, recallKNN, specificityKNN, aucKNN),
    ('XGBoost', accuracyXG, recallXG, specificityXG, aucXG),
    ('AdaBoost', accuracyADA, recallADA, specificityADA, aucADA),
    ('Artificial Neural Network', accuracyANN, recallANN, specificityANN, aucANN),
]
column_names = ['Model', 'Accuracy', 'Recall', 'Specificity', 'AUC']

# Transpose the records into a column -> list-of-values dictionary
metrics = {
    column: list(values)
    for column, values in zip(column_names, zip(*model_rows))
}

# Build the table, round to 2 decimal places, and rank by AUC (best first)
metrics_df = (
    pd.DataFrame(metrics)
    .round(2)
    .sort_values('AUC', ascending=False)
)

# Print the table
print(metrics_df)


                       Model  Accuracy  Recall  Specificity   AUC
4                    XGBoost      0.86    0.96         0.28  0.80
0              Random Forest      0.85    0.96         0.25  0.79
5                   AdaBoost      0.85    0.95         0.25  0.78
2        Logistic Regression      0.84    0.96         0.18  0.76
6  Artificial Neural Network      0.84    0.96         0.17  0.76
3        K-Nearest-Neighbors      0.85    0.95         0.28  0.70
1     Support Vector Machine      0.85    0.96         0.22  0.68


In [92]:
from great_tables import GT, md, style, loc, exibble

# Render the comparison table with a bold markdown title and light-grey
# banding on every other body row.
# GT and md are used through the names imported above: no `gt` module alias
# is imported in this notebook, so `gt.GT` / `gt.md` fail on a fresh kernel.
(
    GT(metrics_df)
    .tab_style(
        style=style.fill(color='lightgrey'),
        # Row positions 1, 3, 5, ... derived from the table length, so the
        # banding never references a row that does not exist
        locations=loc.body(rows=list(range(1, len(metrics_df), 2)))
    )
    .tab_header(
        title=md('**Model Performance Comparison**')
    )
)



Model Performance Comparison,Model Performance Comparison,Model Performance Comparison,Model Performance Comparison,Model Performance Comparison
Model,Accuracy,Recall,Specificity,AUC
XGBoost,0.86,0.96,0.28,0.8
Random Forest,0.85,0.96,0.25,0.79
AdaBoost,0.85,0.95,0.25,0.78
Logistic Regression,0.84,0.96,0.18,0.76
Artificial Neural Network,0.84,0.96,0.17,0.76
K-Nearest-Neighbors,0.85,0.95,0.28,0.7
Support Vector Machine,0.85,0.96,0.22,0.68
