# Set up

In [1]:
import pandas as pd
import numpy as np
import json
import pickle

# Load training set

In [2]:
train_set = pd.read_pickle('train_set.pkl')
train_set

Unnamed: 0,claim,claimant,date,label,related_articles,id
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,2017-10-31,1,"[34218, 55700, 18736, 39031, 34219, 34220]",10354
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,2014-09-12,0,"[73190, 76997, 38841, 77415, 77303, 9280, 8332...",2053
11035,Says Target installed urinals in a women’s bat...,Facebook posts,2016-04-22,0,"[9619, 22197]",12160
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,2019-04-15,0,"[57163, 31528, 40908, 31536, 68904, 44601]",13458
11354,: The AMBER Alert system has been discontinu...,,2013-10-13,0,"[103978, 121475, 121849]",12504
...,...,...,...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,2014-09-23,1,"[9581, 89571, 7836, 7945, 7949, 77360, 83491, ...",3208
6096,"A photograph captures Harriet Tubman as a ""Gun...",,2019-03-25,0,"[125108, 125968, 126005]",6701
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,2014-06-14,0,"[80115, 93998, 5968, 175, 91475, 8710, 89881, ...",11514
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,2008-02-25,1,"[96453, 71123, 61, 69968, 96477]",5966


# Load test data

In [3]:
test_set = pd.read_pickle('test_set.pkl')

# Separate training data into different claim types

In [4]:
claims = pd.DataFrame(train_set, columns=['claim', 'claimant', 'label'])
claims

Unnamed: 0,claim,claimant,label
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,1
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,0
11035,Says Target installed urinals in a women’s bat...,Facebook posts,0
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,0
11354,: The AMBER Alert system has been discontinu...,,0
...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,1
6096,"A photograph captures Harriet Tubman as a ""Gun...",,0
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,0
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,1


In [5]:
# Replace all partly true and true claims with -1 label
false_claims = claims.copy(deep=True)
false_claims['label'] = false_claims['label'].replace([1, 2], -1)
false_claims

Unnamed: 0,claim,claimant,label
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,-1
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,0
11035,Says Target installed urinals in a women’s bat...,Facebook posts,0
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,0
11354,: The AMBER Alert system has been discontinu...,,0
...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,-1
6096,"A photograph captures Harriet Tubman as a ""Gun...",,0
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,0
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,-1


In [6]:
false_counts = false_claims.label.value_counts()
print(false_counts)

-1    6490
 0    5954
Name: label, dtype: int64


In [7]:
# Replace all false and true claims with -1 label
partly_claims = claims.copy(deep=True)
partly_claims['label'] = partly_claims['label'].replace([0, 2], -1)
partly_claims

Unnamed: 0,claim,claimant,label
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,1
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,-1
11035,Says Target installed urinals in a women’s bat...,Facebook posts,-1
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,-1
11354,: The AMBER Alert system has been discontinu...,,-1
...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,1
6096,"A photograph captures Harriet Tubman as a ""Gun...",,-1
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,-1
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,1


In [8]:
partly_counts = partly_claims.label.value_counts()
print(partly_counts)

-1    7327
 1    5117
Name: label, dtype: int64


In [9]:
# Replace all false and partly true with -1 label
true_claims = claims.copy(deep=True)
true_claims['label'] = true_claims['label'].replace([0, 1], -1)
true_claims

Unnamed: 0,claim,claimant,label
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,-1
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,-1
11035,Says Target installed urinals in a women’s bat...,Facebook posts,-1
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,-1
11354,: The AMBER Alert system has been discontinu...,,-1
...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,-1
6096,"A photograph captures Harriet Tubman as a ""Gun...",,-1
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,-1
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,-1


In [10]:
true_counts = true_claims.label.value_counts()
print(true_counts)

-1    11071
 2     1373
Name: label, dtype: int64


# Train the models

In [32]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE

In [33]:
sm = SMOTE()
cv = CountVectorizer()
tf = TfidfVectorizer()
nb_m = MultinomialNB()
nb_b = BernoulliNB()
svc = LinearSVC()
rf = RandomForestClassifier()

In [56]:
test_false_claims = test_set.copy(deep=True)
test_false_claims['label'] = test_false_claims['label'].replace([1, 2], -1)
X_test_false = test_false_claims.claim
y_test_false = test_false_claims.label

In [57]:
X_train_false = false_claims.claim
y_train_false = false_claims.label

In [58]:
p1 = Pipeline([('cv', cv),('sm', sm), ('nb_m', nb_m)])

In [59]:
p1.fit(X_train_false, y_train_false)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('sm',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('nb_m',
                 

In [60]:
y_pred_false = p1.predict(X_test_false)

In [62]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test_false, y_pred_false) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test_false, y_pred_false, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_false, y_pred_false))

Accuracy: 66.96%

F1 Score: 66.56

Confusion Matrix:
 [[1210  447]
 [ 581  873]]


In [70]:
test_partly_claims = test_set.copy(deep=True)
test_partly_claims['label'] = test_partly_claims['label'].replace([0, 2], -1)
X_test_partly = test_partly_claims.claim
y_test_partly = test_partly_claims.label

In [72]:
X_train_partly = partly_claims.claim
y_train_partly = partly_claims.label

In [73]:
p2 = Pipeline([('cv', cv), ('sm', sm), ('nb_m', nb_m)])

In [74]:
p2.fit(X_train_partly, y_train_partly)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('sm',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('nb_m',
                 

In [75]:
y_pred_partly = p2.predict(X_test_partly)

In [76]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test_partly, y_pred_partly) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test_partly, y_pred_partly, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_partly, y_pred_partly))

Accuracy: 64.83%

F1 Score: 64.76

Confusion Matrix:
 [[1078  699]
 [ 395  939]]


In [77]:
test_true_claims = test_set.copy(deep=True)
test_true_claims['label'] = test_true_claims['label'].replace([0, 1], -1)
X_test_true = test_true_claims.claim
y_test_true = test_true_claims.label

In [78]:
X_train_true = true_claims.claim
y_train_true = true_claims.label

In [79]:
p3 = Pipeline([('cv', cv), ('sm', sm), ('nb_m', nb_m)])

In [80]:
p3.fit(X_train_true, y_train_true)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('sm',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('nb_m',
                 

In [81]:
y_pred_true = p3.predict(X_test_true)

In [82]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test_true, y_pred_true) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test_true, y_pred_true, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_true, y_pred_true))

Accuracy: 81.26%

F1 Score: 55.24

Confusion Matrix:
 [[2450  338]
 [ 245   78]]


In [83]:
with open("../models/train_false_claims.pkl", 'wb') as f:
    pickle.dump(p1, f)

In [84]:
with open("../models/train_partly_claims.pkl", 'wb') as f:
    pickle.dump(p2, f)

In [85]:
with open("../models/train_true_claims.pkl", 'wb') as f:
    pickle.dump(p3, f)