# Set up

In [32]:
import pandas as pd
import numpy as np
import json
import pickle

# Load training set

In [33]:
train_set = pd.read_pickle('train_set.pkl')
train_set

Unnamed: 0,claim,claimant,date,label,related_articles,id
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,2017-10-31,1,"[34218, 55700, 18736, 39031, 34219, 34220]",10354
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,2014-09-12,0,"[73190, 76997, 38841, 77415, 77303, 9280, 8332...",2053
11035,Says Target installed urinals in a women’s bat...,Facebook posts,2016-04-22,0,"[9619, 22197]",12160
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,2019-04-15,0,"[57163, 31528, 40908, 31536, 68904, 44601]",13458
11354,: The AMBER Alert system has been discontinu...,,2013-10-13,0,"[103978, 121475, 121849]",12504
...,...,...,...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,2014-09-23,1,"[9581, 89571, 7836, 7945, 7949, 77360, 83491, ...",3208
6096,"A photograph captures Harriet Tubman as a ""Gun...",,2019-03-25,0,"[125108, 125968, 126005]",6701
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,2014-06-14,0,"[80115, 93998, 5968, 175, 91475, 8710, 89881, ...",11514
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,2008-02-25,1,"[96453, 71123, 61, 69968, 96477]",5966


# Load test data

In [34]:
test_set = pd.read_pickle('test_set.pkl')

# Combine claim and claimant

In [35]:
train_set["combined"] = train_set["claim"].map(str) + ' ' + train_set["claimant"]

In [36]:
df = pd.DataFrame(train_set, columns=['combined', 'label'])
df

Unnamed: 0,combined,label
9389,While arguing over President Reagan’s 1981 tax...,1
1861,"Recently Rick Scott ""closed 30 women’s health ...",0
11035,Says Target installed urinals in a women’s bat...,0
12221,"Says ""combined doses of vaccines"" have never b...",0
11354,: The AMBER Alert system has been discontinu...,0
...,...,...
2910,Health insurance costs for Floridians are up 3...,1
6096,"A photograph captures Harriet Tubman as a ""Gun...",0
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",0
5414,"""The board of a nonprofit organization on whic...",1


In [37]:
df.iloc[2,0]

'Says Target installed urinals in a women’s bathroom to "accommodate the ones who have giblets." Facebook posts'

# Train model

In [40]:
import sklearn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE

In [41]:
X_train = df.combined
y_train = df.label
X_test = test_set.claim
y_test = test_set.label

In [58]:
seed = 1075
np.random.seed(seed)

In [59]:
sm = SMOTE()
cv = CountVectorizer()
tf = TfidfVectorizer()

In [60]:
X_train_vect = cv.fit_transform(X_train)
X_test_vect = cv.transform(X_test)

In [61]:
# Create classifiers
nb = MultinomialNB()
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
knn = KNeighborsClassifier()
svc = LinearSVC()
rg = RidgeClassifier()

In [69]:
clf_list = [nb, rf, et, knn, svc, rg]

In [77]:
for clf in clf_list:
    vanilla_scores = cross_val_score(clf, X_train_vect, y_train, cv=10, n_jobs=-1)
    bagging_clf = BaggingClassifier(clf, 
       max_samples=0.4, max_features=10, random_state=seed)
    bagging_scores = cross_val_score(bagging_clf, X_train_vect, y_train, cv=10, 
       n_jobs=-1)
    
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]".format(clf.__class__.__name__, 
                                                              vanilla_scores.mean(), vanilla_scores.std()))
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n".format(clf.__class__.__name__, 
                                                                        bagging_scores.mean(), bagging_scores.std()))

Mean of: 0.607, std: (+/-) 0.008 [MultinomialNB]
Mean of: 0.478, std: (+/-) 0.001 [Bagging MultinomialNB]

Mean of: 0.578, std: (+/-) 0.004 [RandomForestClassifier]
Mean of: 0.479, std: (+/-) 0.003 [Bagging RandomForestClassifier]

Mean of: 0.585, std: (+/-) 0.013 [ExtraTreesClassifier]
Mean of: 0.479, std: (+/-) 0.002 [Bagging ExtraTreesClassifier]

Mean of: 0.512, std: (+/-) 0.009 [KNeighborsClassifier]
Mean of: 0.473, std: (+/-) 0.019 [Bagging KNeighborsClassifier]

Mean of: 0.538, std: (+/-) 0.010 [LinearSVC]
Mean of: 0.478, std: (+/-) 0.000 [Bagging LinearSVC]

Mean of: 0.608, std: (+/-) 0.008 [RidgeClassifier]
Mean of: 0.478, std: (+/-) 0.000 [Bagging RidgeClassifier]



In [78]:
# Set up voting
from sklearn.ensemble import VotingClassifier
import warnings

warnings.filterwarnings('ignore')

eclf = VotingClassifier(estimators=[('Naive Bayes', nb), ('Random Forests', rf), ('Extra Trees', et), 
                                    ('KNeighbors', knn), ('SVC', svc), ('Ridge Classifier', rg)], voting='hard')

for clf, label in zip([nb, rf, et, knn, svc, rg, eclf], ['Naive Bayes', 'Random Forest', 'Extra Trees', 
                                                     'KNeighbors', 'SVC', 'Ridge Classifier', 'Ensemble']):
    scores = cross_val_score(clf, X_train_vect, y_train, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

Mean: 0.607, std: (+/-) 0.008 [Naive Bayes]
Mean: 0.581, std: (+/-) 0.011 [Random Forest]
Mean: 0.591, std: (+/-) 0.010 [Extra Trees]
Mean: 0.512, std: (+/-) 0.009 [KNeighbors]
Mean: 0.538, std: (+/-) 0.010 [SVC]
Mean: 0.608, std: (+/-) 0.008 [Ridge Classifier]
Mean: 0.605, std: (+/-) 0.008 [Ensemble]


In [79]:
# Set up ensemble voting for bagging
ebclf_array = []

for clf in clf_list:
    ebclf_array.append(BaggingClassifier(clf, max_samples=0.25, 
                                   max_features=10, random_state=seed))

v_eclf = VotingClassifier(estimators=zip(['Bagging Naive Bayes', 'Bagging Random Forest', 'Bagging Extra Trees', 'Bagging KNeighbors',
                                          'Bagging SVC', 'Bagging Ridge Classifier', 'Bagging Ensemble'],
                                         ebclf_array), 
                          voting='hard')

ebclf_array.append(v_eclf)

for clf, label in zip(ebclf_array, ['Bagging Naive Bayes', 'Bagging Random Forest', 'Bagging Extra Trees', 'Bagging KNeighbors',
                              'Bagging SVC', 'BaggingRidge Classifier', 'Bagging Ensemble']):
    scores = cross_val_score(clf, X_train_vect, y_train, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

Mean: 0.478, std: (+/-) 0.001 [Bagging Naive Bayes]
Mean: 0.479, std: (+/-) 0.002 [Bagging Random Forest]
Mean: 0.478, std: (+/-) 0.002 [Bagging Extra Trees]
Mean: 0.446, std: (+/-) 0.033 [Bagging KNeighbors]
Mean: 0.478, std: (+/-) 0.000 [Bagging SVC]
Mean: 0.478, std: (+/-) 0.000 [BaggingRidge Classifier]


TypeError: object of type 'zip' has no len()

In [81]:
from mlxtend.classifier import EnsembleVoteClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

warnings.filterwarnings('ignore')

# Create boosting classifiers
ada_boost = AdaBoostClassifier()
grad_boost = GradientBoostingClassifier()
xgb_boost = XGBClassifier()

boost_array = [ada_boost, grad_boost, xgb_boost]

eclf = EnsembleVoteClassifier(clfs=[ada_boost, grad_boost, xgb_boost], voting='hard')

labels = ['Ada Boost', 'Grad Boost', 'XG Boost', 'Ensemble']

for clf, label in zip([ada_boost, grad_boost, xgb_boost, eclf], labels):
    scores = cross_val_score(clf, X_train_vect, y_train, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

Mean: 0.560, std: (+/-) 0.012 [Ada Boost]
Mean: 0.591, std: (+/-) 0.009 [Grad Boost]
Mean: 0.586, std: (+/-) 0.012 [XG Boost]
Mean: 0.586, std: (+/-) 0.012 [Ensemble]


In [82]:
pipeline1 = Pipeline([('cv', cv), ('nb', nb)])

In [83]:
pipeline1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [84]:
y_pred = pipeline1.predict(X_test)

In [85]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 59.56%

F1 Score: 43.08

Confusion Matrix:
 [[910 532  12]
 [388 937   9]
 [131 186   6]]


In [86]:
pipeline2 = Pipeline([('cv', cv),('sm', sm), ('nb', nb)])

In [87]:
pipeline2.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('sm',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('nb',
                 Mu

In [88]:
y_pred = pipeline1.predict(X_test)

In [89]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 57.73%

F1 Score: 47.59

Confusion Matrix:
 [[820 519 115]
 [343 920  71]
 [104 163  56]]


In [90]:
counts = combined.label.value_counts()
print(counts)

0    5954
1    5117
2    1373
Name: label, dtype: int64


In [96]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
sm_enn = SMOTEENN()

In [92]:
pipeline3 = Pipeline([('cv', cv),('sm_enn', sm_enn), ('nb', nb)])

In [93]:
pipeline3.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('sm_enn',
                 SMOTEENN(enn=None, n_jobs=1, random_state=None, ratio=None,
                          sampling_strategy='auto', smote=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [94]:
y_pred = pipeline3.predict(X_test)

In [95]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 10.67%

F1 Score: 6.81

Confusion Matrix:
 [[   0    9 1445]
 [   0   11 1323]
 [   0    2  321]]


In [97]:
from imblearn.combine import SMOTETomek
sm_tomek = SMOTETomek(random_state=0)

In [98]:
pipeline4 = Pipeline([('cv', cv),('sm_tomek', sm_tomek), ('nb', nb)])

In [101]:
pipeline4.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('sm_tomek',
                 SMOTETomek(n_jobs=1, random_state=0, ratio=None,
                            sampling_strategy='auto', smote=None, tomek=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [102]:
y_pred = pipeline4.predict(X_test)

In [103]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 57.67%

F1 Score: 47.20

Confusion Matrix:
 [[829 515 110]
 [348 913  73]
 [101 170  52]]


In [104]:
pipeline5 = Pipeline([('cv', cv),('sm', sm), ('rg', rg)])

In [105]:
pipeline5.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('sm',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('rg',
                 Ri

In [106]:
y_pred = pipeline5.predict(X_test)

In [107]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 49.86%

F1 Score: 43.59

Confusion Matrix:
 [[710 422 322]
 [327 744 263]
 [ 96 130  97]]


In [109]:
with open("../models/combined_claim_claimant.pkl", 'wb') as f:
    pickle.dump(pipeline1, f)