In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as skm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from datetime import datetime

### Data Processing, Test-Train Split

In [2]:
df = pd.read_csv('./data/merged.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.sample(n=10000, random_state=42)
df['award'] = df['label']
df

Unnamed: 0,datetime-publish,keywords,label,word-count,award
1934,2013-06-11 2013-06-11,they that qaida,1,672,1
2974,2001-10-24,"xenakis,ensemble,music",0,394,0
6896,2010-11-03,"mitsubishi,royal,mitsubishi",0,88,0
11503,2014-05-29,"hospitals,fires,dementia",0,479,0
4216,2009-09-27,"venezuela,uranium,iran",0,154,0
...,...,...,...,...,...
9382,2013-03-20,"fur,shopping,fashion",0,746,0
1125,2013-11-01,that avenue arts,1,468,1
1241,1993-05-30 1992-11-27 1993-08-24 1993-07-09 19...,said that with,1,19780,1
10034,2009-05-05,"swine,epidemics,tamiflu",0,1100,0


In [3]:
def process_datetime(x):
    string_dates = x.split(" ")
    dates = [datetime.strptime(i,"%Y-%m-%d") for i in string_dates]
    return min(dates).timestamp()

df['datetime-publish'] = df['datetime-publish'].apply(process_datetime)
df = df.drop(['keywords','label'], axis=1) # only for baseline model
df

Unnamed: 0,datetime-publish,word-count,award
1934,1.370934e+09,672,1
2974,1.003907e+09,394,0
6896,1.288768e+09,88,0
11503,1.401347e+09,479,0
4216,1.254035e+09,154,0
...,...,...,...
9382,1.363763e+09,746,0
1125,1.383289e+09,468,1
1241,7.118748e+08,19780,1
10034,1.241507e+09,1100,0


In [4]:
baseline_df = df[:8000]
validation_df = df[8000:]
X, y = baseline_df.iloc[:,:-1], baseline_df.iloc[:,-1]
X_final, y_final = validation_df.iloc[:,:-1], validation_df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Logistic Regression Classifier (Baseline)

In [5]:
model = LogisticRegressionCV(cv=10, random_state=42).fit(X_train, y_train)
pred = model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1653
           1       0.96      0.26      0.41       347

    accuracy                           0.87      2000
   macro avg       0.91      0.63      0.67      2000
weighted avg       0.88      0.87      0.84      2000



In [6]:
print("Validation Set Accuracy: " + str(skm.accuracy_score(y_test, pred)))
print("Validation Set F1 Score: " + str(skm.f1_score(y_test, pred)))
print("Validation Set Precision: " + str(skm.precision_score(y_test, pred)))
print("Validation Set Recall: " + str(skm.recall_score(y_test, pred)))
print("Validation Set ROC-AUC: " + str(skm.roc_auc_score(y_test, pred)))

Validation Set Accuracy: 0.87
Validation Set F1 Score: 0.411764705882353
Validation Set Precision: 0.9578947368421052
Validation Set Recall: 0.2622478386167147
Validation Set ROC-AUC: 0.6299139979532454


In [7]:
pred = model.predict(X_final)
print(classification_report(y_final, pred))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1648
           1       0.97      0.33      0.49       352

    accuracy                           0.88      2000
   macro avg       0.92      0.66      0.71      2000
weighted avg       0.89      0.88      0.85      2000



In [8]:
print("Test Set Accuracy: " + str(skm.accuracy_score(y_final, pred)))
print("Test Set F1 Score: " + str(skm.f1_score(y_final, pred)))
print("Test Set Precision: " + str(skm.precision_score(y_final, pred)))
print("Test Set Recall: " + str(skm.recall_score(y_final, pred)))
print("Test Set ROC-AUC: " + str(skm.roc_auc_score(y_final, pred)))

Test Set Accuracy: 0.8795
Test Set F1 Score: 0.4883227176220807
Test Set Precision: 0.9663865546218487
Test Set Recall: 0.32670454545454547
Test Set ROC-AUC: 0.6621386804942629


### Random Forest Classifier (Baseline)

In [9]:
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
pred = model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1653
           1       0.91      0.84      0.87       347

    accuracy                           0.96      2000
   macro avg       0.94      0.91      0.92      2000
weighted avg       0.96      0.96      0.96      2000



In [10]:
print("Validation Set Accuracy: " + str(skm.accuracy_score(y_test, pred)))
print("Validation Set F1 Score: " + str(skm.f1_score(y_test, pred)))
print("Validation Set Precision: " + str(skm.precision_score(y_test, pred)))
print("Validation Set Recall: " + str(skm.recall_score(y_test, pred)))
print("Validation Set ROC-AUC: " + str(skm.roc_auc_score(y_test, pred)))

Validation Set Accuracy: 0.958
Validation Set F1 Score: 0.8738738738738738
Validation Set Precision: 0.9122257053291536
Validation Set Recall: 0.8386167146974063
Validation Set ROC-AUC: 0.9108389078629197


In [11]:
pred = model.predict(X_final)
print(classification_report(y_final, pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1648
           1       0.91      0.82      0.86       352

    accuracy                           0.95      2000
   macro avg       0.93      0.90      0.92      2000
weighted avg       0.95      0.95      0.95      2000



In [12]:
print("Test Set Accuracy: " + str(skm.accuracy_score(y_final, pred)))
print("Test Set F1 Score: " + str(skm.f1_score(y_final, pred)))
print("Test Set Precision: " + str(skm.precision_score(y_final, pred)))
print("Test Set Recall: " + str(skm.recall_score(y_final, pred)))
print("Test Set ROC-AUC: " + str(skm.roc_auc_score(y_final, pred)))

Test Set Accuracy: 0.9535
Test Set F1 Score: 0.8614008941877794
Test Set Precision: 0.9059561128526645
Test Set Recall: 0.8210227272727273
Test Set ROC-AUC: 0.9014094218887908
