In [198]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score
import matplotlib.pyplot as plt
from top2vec import Top2Vec
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.datasets import load_iris

In [199]:
training_data = pd.read_csv("training_data/training_data.csv")
test_data = pd.read_csv("test_data/test_data.csv")

In [200]:
bert_train_vecs = pd.read_csv("vecs/bert_train_vecs.csv")
bert_train_vecs = bert_train_vecs.values.tolist()

bert_test_vecs = pd.read_csv("vecs/bert_test_vecs.csv")
bert_test_vecs = bert_test_vecs.values.tolist()

In [201]:
gensim_train_vecs = pd.read_csv("vecs/gensim_train_vecs.csv")
gensim_train_vecs = gensim_train_vecs.values.tolist()

gensim_test_vecs = pd.read_csv("vecs/gensim_test_vecs.csv")
gensim_test_vecs = gensim_test_vecs.values.tolist()

In [202]:
t2v_train_model = Top2Vec.load("models/top2vec_train_model")
t2v_train_vecs = t2v_train_model.document_vectors

t2v_test_vecs = pd.read_csv("vecs/t2v_test_vecs.csv")
t2v_test_vecs = t2v_test_vecs.values.tolist()

In [203]:
X_test = np.array(t2v_test_vecs)
y_test = np.array(test_data.target)
X = np.array(t2v_train_vecs)
y = np.array(training_data.target)
y = np.pad(y, pad_width=((463,0)), mode='constant')

In [204]:
# X_test = np.array(gensim_test_vecs)
# y_test = np.array(test_data.target)
# X_test = np.pad(X_test, pad_width=((0,0),(0,10)), mode='constant')
# X = np.array(gensim_train_vecs)

# y = np.array(training_data.target)
# y = np.pad(y, pad_width=((463,0)), mode='constant')

In [205]:
# X_test = np.array(bert_test_vecs)
# y_test = np.array(test_data.target)
# X = np.array(bert_train_vecs)

# y = np.array(training_data.target)
# y = np.pad(y, pad_width=((463,0)), mode='constant')

In [206]:
kf = KFold(5, shuffle=True, random_state=48)
cv_lr_f1, cv_lrsgd_f1, cv_svcsgd_f1,  = [], [], []

for train_ind, val_ind in kf.split(X, y):
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]
    
    # Scale Data
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)

    # Logisitic Regression
    lr = LogisticRegression(
        class_weight= 'balanced',
        solver='newton-cg',
        fit_intercept=True,
    ).fit(X_train_scale, y_train)

    test_data["lr_scores"] = lr.predict(scaler.transform(X_test))
    test_data["lr_scores_prob_1"] = lr.predict_proba(scaler.transform(X_test))[:, 1]

    y_pred = lr.predict(scaler.transform(X_val_scale))
    cv_lr_f1.append(f1_score(y_val, y_pred, average='weighted'))
    
    # Logistic Regression Mini-Batch SGD
    sgd = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        loss='log',
        class_weight='balanced'
    ).fit(X_train_scale, y_train)

    test_data["sgd_scores"] = sgd.predict(scaler.transform(X_test))
    test_data["sgd_scores_prob_1"] = sgd.predict_proba(scaler.transform(X_test))[:, 1]

    y_pred = sgd.predict(X_val_scale)
    cv_lrsgd_f1.append(f1_score(y_val, y_pred, average="weighted"))
    
    # SGD Modified Huber
    sgd_huber = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        alpha=20,
        loss='modified_huber',
        class_weight='balanced'
    ).fit(X_train_scale, y_train)

    test_data["sgd_huber_scores"] = sgd_huber.predict(scaler.transform(X_test))
    test_data["sgd_huber_scores_prob_1"] = sgd_huber.predict_proba(scaler.transform(X_test))[:, 1]
    
    y_pred = sgd_huber.predict(X_val_scale)
    cv_svcsgd_f1.append(f1_score(y_val, y_pred, average="weighted"))

print(f'Logistic Regression Val f1: {np.mean(cv_lr_f1):.3f} +- {np.std(cv_lr_f1):.3f}')
print(f'Logisitic Regression SGD Val f1: {np.mean(cv_lrsgd_f1):.3f} +- {np.std(cv_lrsgd_f1):.3f}')
print(f'SVM Huber Val f1: {np.mean(cv_svcsgd_f1):.3f} +- {np.std(cv_svcsgd_f1):.3f}')



Logistic Regression Val f1: 0.596 +- 0.031
Logisitic Regression SGD Val f1: 0.639 +- 0.016
SVM Huber Val f1: 0.856 +- 0.027


In [207]:
test_pred_df = test_data[test_data.target.astype(str).isin(["1"])]
test_pred_df

Unnamed: 0,id,allegation_desc,meta_agency,meta_tracking_id,label,target,lr_scores,lr_scores_prob_1,sgd_scores,sgd_scores_prob_1,sgd_huber_scores,sgd_huber_scores_prob_1
3,14419,failed to follow oder of rank/failed to put ar...,new-orleans-so,H-022-20,internal misconduct; administrative infractions,1,1,5.592766e-01,0,7.060580e-03,0,0.387652
12,17415,complainant accused officer failing to notify ...,new-orleans-pd,2018-0713-r,internal misconduct; administrative infractions,1,0,6.628243e-02,1,1.000000e+00,0,0.381182
24,14607,involved in a verbal altercation with deputy d...,new-orleans-so,c-037-21,internal misconduct; administrative infractions,1,1,9.963395e-01,1,1.000000e+00,0,0.388502
26,13879,extended her lunch without prior approval.,new-orleans-so,K-015-20,internal misconduct; administrative infractions,1,0,8.827398e-03,0,7.202053e-46,0,0.398400
32,16633,accused officer was instructed to remain on as...,new-orleans-pd,2017-0265-r,internal misconduct; administrative infractions,1,0,3.150310e-06,0,2.711280e-85,0,0.372972
...,...,...,...,...,...,...,...,...,...,...,...,...
444,14652,officer failed to appear in court for a trial.,new-orleans-pd,2014-0043-r,internal misconduct; administrative infractions,1,1,9.979519e-01,1,1.000000e+00,0,0.330571
446,15760,the officer was assigned to 40 hours of in-ser...,new-orleans-pd,2016-0135-r,internal misconduct; administrative infractions,1,0,1.104949e-08,0,2.519773e-139,0,0.346612
457,16308,accused failed to comply with ncic's procedure...,new-orleans-pd,2016-0746-d,internal misconduct; administrative infractions,1,0,4.082695e-01,0,3.392672e-19,0,0.311705
459,15543,the officers allegedly acted in retaliation wi...,new-orleans-pd,2015-0652-n,internal misconduct; administrative infractions,1,0,1.600595e-07,0,4.747597e-142,0,0.299261
