In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import DBSCAN, MeanShift
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve, confusion_matrix
from sklearn import svm
import pymongo

## Modeling with All Features

In [8]:
X = pd.read_pickle('data/X_train.pkl')
y = pd.read_pickle('data/y_train.pkl')

In [9]:
# X = X.drop(0,axis=1)
# X = X.drop(67,axis=1)

In [11]:
y.shape

(22554,)

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
###  GridSearch Cross-Validation  ###
###  Change scoring for different evaluations  ###

# top_models = []
# models1 = [LogisticRegression(class_weight='balanced'),
#           svm.LinearSVC(class_weight='balanced'),
#           svm.SVC(kernel = 'rbf', class_weight='balanced')]
# grid = {'C':np.logspace(-3,2,10)}
# for model in models1:
#     print(model)
#     gscv = RandomizedSearchCV(model,param_distributions=grid,n_jobs=-1,cv=4, scoring='precision')
#     gscv.fit(X_train,y_train)
#     print(gscv.best_estimator_)
#     print(gscv.best_score_)
#     top_models.append(gscv.best_estimator_)
#     print()
#     #print(gscv)

In [4]:
###  RandomSearchCV Round 3  ###
###  Change scoring for different evaluations  ###

# models3 =[DecisionTreeClassifier(class_weight='balanced'),
#           RandomForestClassifier(class_weight='balanced'),
#           GradientBoostingClassifier()]
# grid = {}
# for model in models3:
#     print(model)
#     gscv3 = RandomizedSearchCV(model,param_distributions=grid,n_iter=20,n_jobs=-1,cv=4,scoring='precision')
#     gscv3.fit(X_train,y_train)
#     print(gscv3.best_estimator_)
#     print(gscv3.best_score_)
#     top_models.append(gscv3.best_estimator_)
#     print()

In [13]:
###  Top Predictor Optimized for Recall  ###
###  Use for first-sign-of-danger  ###

top_recall = svm.SVC(C=0.001, class_weight='balanced', kernel='rbf')
top_recall.fit(X_train,y_train)
print(classification_report(y_test,top_recall.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6716
           1       0.13      0.16      0.14        51

   micro avg       0.99      0.99      0.99      6767
   macro avg       0.56      0.57      0.57      6767
weighted avg       0.99      0.99      0.99      6767



In [14]:
###  Top Predictor Optimized for F1  ###
###  Use for Medium Alert Threshold  ###

top_fone = svm.SVC(C=7.74, kernel='rbf',class_weight='balanced')
top_fone.fit(X_train,y_train)
print(classification_report(y_test,top_fone.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6716
           1       0.22      0.25      0.24        51

   micro avg       0.99      0.99      0.99      6767
   macro avg       0.61      0.62      0.62      6767
weighted avg       0.99      0.99      0.99      6767



In [15]:
###  Top Predictor Optimized for Precision  ###
###  Use for Red-Alert Choose-New-Candidate  ###

top_precision = RandomForestClassifier(class_weight='balanced')
top_precision.fit(X_train,y_train)
print(classification_report(y_test,top_precision.predict(X_test)))



              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6716
           1       0.50      0.04      0.07        51

   micro avg       0.99      0.99      0.99      6767
   macro avg       0.75      0.52      0.53      6767
weighted avg       0.99      0.99      0.99      6767



## Testing the Models on Completely Unseen Data

Here, the models will be tested on politicians they have not encountered before.

In [16]:
X_holdout = pd.read_pickle('data/X_test.pkl')
y_holdout = pd.read_pickle('data/y_test.pkl')

In [18]:
#X_holdout = X_holdout.drop([i for i in range(50,65,1)],axis=1)

In [29]:
###  Balancing Columns from Training and Testing
for col in X.columns:
    if col not in X_holdout.columns:
        X_holdout[col] = 0
for col in X_holdout:
    if col not in X.columns:
        X_holdout = X_holdout.drop(col,axis=1)

In [19]:
#X_holdout = X_holdout.drop([0,67], axis=1)
X_holdout = scaler.transform(X_holdout)

In [21]:
print('Top Recall Model')
print(classification_report(y_holdout,top_recall.predict(X_holdout)))
print(top_recall.score(X_holdout, y_holdout))
print()
print('Top F1 Model')
print(classification_report(y_holdout,top_fone.predict(X_holdout)))
print(top_fone.score(X_holdout,y_holdout))
print()
print('Top Precision Model')
print(classification_report(y_holdout,top_precision.predict(X_holdout)))
print(top_precision.score(X_holdout,y_holdout))

Top Recall Model
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      9595
           1       0.21      0.24      0.22        72

   micro avg       0.99      0.99      0.99      9667
   macro avg       0.60      0.61      0.61      9667
weighted avg       0.99      0.99      0.99      9667

0.9875866349436226

Top F1 Model
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9595
           1       0.31      0.38      0.34        72

   micro avg       0.99      0.99      0.99      9667
   macro avg       0.65      0.68      0.67      9667
weighted avg       0.99      0.99      0.99      9667

0.9891383055756698

Top Precision Model
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      9595
           1       1.00      0.06      0.11        72

   micro avg       0.99      0.99      0.99      9667
   macro avg       1.00      0.53     

In [49]:
ord_df = pd.read_pickle('ordered_df.pkl')
ord_df = ord_df.reset_index()
predictions = ord_df.iloc[-100:].drop('index',axis=1)
X_predict = predictions.drop([0,1,67,'label','Week_Label'], axis=1)
X_predict = X_predict.drop([i for i in range(50,65,1)],axis=1)

In [50]:
X_predict.shape

(100, 81)

In [51]:
top_fone.predict(X_predict)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Modeling with only Document Features

In [38]:
X_doc = X
for column in X.columns:
    if type(column) == str:
        X_doc = X_doc.drop(column, axis=1)

In [40]:
cluster_dummies = pd.get_dummies(X_doc[65])

In [42]:
X_doc = pd.merge(X_doc,cluster_dummies,left_index=True, right_index=True)

In [43]:
X_train2,X_test2,y_train2,y_test2 = train_test_split(X_doc,y,test_size = 0.3, stratify=y)
scaler2 = StandardScaler()
X_train2 = scaler2.fit_transform(X_train2)
X_test2 = scaler2.transform(X_test2)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [49]:
###  GridSearch Cross-Validation  ###
###  Change scoring for different evaluations  ###

top_models = []
models1 = [LogisticRegression(class_weight='balanced'),
          svm.LinearSVC(class_weight='balanced'),
          svm.SVC(kernel = 'rbf', class_weight='balanced')]
grid = {'C':np.logspace(-3,2,10)}
for model in models1:
    print(model)
    gscv4 = RandomizedSearchCV(model,param_distributions=grid,n_jobs=-1,cv=4, scoring='precision')
    gscv4.fit(X_train2,y_train2)
    print(gscv4.best_estimator_)
    print(gscv4.best_score_)
    top_models.append(gscv4.best_estimator_)
    print()

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)




LogisticRegression(C=0.046415888336127795, class_weight='balanced',
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
          random_state=None, solver='warn', tol=0.0001, verbose=0,
          warm_start=False)
0.01505556883188548

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)




LinearSVC(C=0.5994842503189409, class_weight='balanced', dual=True,
     fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
     max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
     tol=0.0001, verbose=0)
0.031536588689770474

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
SVC(C=27.825594022071257, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
0.044197113505036086



In [50]:
###  RandomSearchCV Round 3  ###
###  Change scoring for different evaluations  ###

models3 =[DecisionTreeClassifier(class_weight='balanced'),
          RandomForestClassifier(class_weight='balanced'),
          GradientBoostingClassifier()]
grid = {}
for model in models3:
    print(model)
    gscv5 = RandomizedSearchCV(model,param_distributions=grid,n_iter=20,n_jobs=-1,cv=4,scoring='precision')
    gscv5.fit(X_train2,y_train2)
    print(gscv5.best_estimator_)
    print(gscv5.best_score_)
    top_models.append(gscv5.best_estimator_)
    print()

DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')




DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.014425677897751702

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators='warn', n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False)




RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False)
0.0

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)




GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
0.05968338725554741



In [46]:
###  Top Predictor Optimized for Recall  ###
###  Use for Low Alert Threshold  ###

top_recall = svm.SVC(C=0.001, class_weight='balanced', kernel='rbf')
top_recall.fit(X_train2,y_train2)
print(classification_report(y_test2,top_recall.predict(X_test2)))

              precision    recall  f1-score   support

           0       1.00      0.13      0.23      9595
           1       0.01      0.97      0.02        72

   micro avg       0.14      0.14      0.14      9667
   macro avg       0.50      0.55      0.12      9667
weighted avg       0.99      0.14      0.23      9667



In [None]:
###  Top Predictor Optimized for F1  ###
###  Use for Medium Alert Threshold  ###

#no good predictor, all f1s are very low

In [None]:
###  Top Predictor Optimized for Precision  ###
###  Use for Red-Alert Choose-New-Candidate  ###

top_precision = RandomForestClassifier(class_weight='balanced')
top_precision.fit(X_train,y_train)
print(classification_report(y_test,top_precision.predict(X_test)))

(9341, 81)