# Set up

In [1]:
import pandas as pd
import numpy as np
import json
import pickle

# Load training set

In [2]:
df1 = pd.read_csv('../reference/liar/train.tsv', sep='\t', header=None)
df1.columns = ['id', 'label', 'claim', 'topic', 'claimant', 'position', 'location', 'party', 'pants on fire', 'false', 'barely true', 'half true', 'mostly true', 'context']

In [3]:
df1.head(5)

Unnamed: 0,id,label,claim,topic,claimant,position,location,party,pants on fire,false,barely true,half true,mostly true,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


# Load validation data

In [4]:
df2 = pd.read_csv('../reference/liar/valid.tsv', sep='\t', header=None)
df2.columns = ['id', 'label', 'claim', 'topic', 'claimant', 'position', 'location', 'party', 'pants on fire', 'false', 'barely true', 'half true', 'mostly true', 'context']

In [5]:
df2.head(1)

Unnamed: 0,id,label,claim,topic,claimant,position,location,party,pants on fire,false,barely true,half true,mostly true,context
0,12134.json,barely-true,We have less Americans working now than in the...,"economy,jobs",vicky-hartzler,U.S. Representative,Missouri,republican,1,0,1,0,0,an interview with ABC17 News


# Load validation data

In [6]:
df3 = pd.read_csv('../reference/liar/test.tsv', sep='\t', header=None)
df3.columns = ['id', 'label', 'claim', 'topic', 'claimant', 'position', 'location', 'party', 'pants on fire', 'false', 'barely true', 'half true', 'mostly true', 'context']

In [7]:
df3.head(1)

Unnamed: 0,id,label,claim,topic,claimant,position,location,party,pants on fire,false,barely true,half true,mostly true,context
0,11972.json,True,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview


# Concatenate all data

In [8]:
frames = [df1, df2, df3]

In [9]:
result = pd.concat(frames)

In [10]:
result.head(1)

Unnamed: 0,id,label,claim,topic,claimant,position,location,party,pants on fire,false,barely true,half true,mostly true,context
0,2635.json,False,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer


In [11]:
liar = result[['label', 'claim', 'claimant']]

In [12]:
liar.head(10)

Unnamed: 0,label,claim,claimant
0,false,Says the Annies List political group supports ...,dwayne-bohac
1,half-true,When did the decline of coal start? It started...,scott-surovell
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",barack-obama
3,false,Health care reform legislation is likely to ma...,blog-posting
4,half-true,The economic turnaround started at the end of ...,charlie-crist
5,true,The Chicago Bears have had more starting quart...,robin-vos
6,barely-true,Jim Dunnam has not lived in the district he re...,republican-party-texas
7,half-true,I'm the only person on this stage who has work...,barack-obama
8,half-true,"However, it took $19.5 million in Oregon Lotte...",oregon-lottery
9,mostly-true,Says GOP primary opponents Glenn Grothman and ...,duey-stroebel


In [13]:
len(liar)

12791

In [14]:
# Reorder to match contest data

cols = liar.columns.tolist()
cols = cols[-2:] + cols[:-2]
liar = liar[cols] 

In [15]:
liar.head(10)

Unnamed: 0,claim,claimant,label
0,Says the Annies List political group supports ...,dwayne-bohac,false
1,When did the decline of coal start? It started...,scott-surovell,half-true
2,"Hillary Clinton agrees with John McCain ""by vo...",barack-obama,mostly-true
3,Health care reform legislation is likely to ma...,blog-posting,false
4,The economic turnaround started at the end of ...,charlie-crist,half-true
5,The Chicago Bears have had more starting quart...,robin-vos,true
6,Jim Dunnam has not lived in the district he re...,republican-party-texas,barely-true
7,I'm the only person on this stage who has work...,barack-obama,half-true
8,"However, it took $19.5 million in Oregon Lotte...",oregon-lottery,half-true
9,Says GOP primary opponents Glenn Grothman and ...,duey-stroebel,mostly-true


In [16]:
# Replace text labels with integer labels
liar1 = liar.copy(deep=True) 
liar1['label'] = liar['label'].replace(['pants-fire', 'false'], 0)

In [17]:
liar1.head(10)

Unnamed: 0,claim,claimant,label
0,Says the Annies List political group supports ...,dwayne-bohac,0
1,When did the decline of coal start? It started...,scott-surovell,half-true
2,"Hillary Clinton agrees with John McCain ""by vo...",barack-obama,mostly-true
3,Health care reform legislation is likely to ma...,blog-posting,0
4,The economic turnaround started at the end of ...,charlie-crist,half-true
5,The Chicago Bears have had more starting quart...,robin-vos,true
6,Jim Dunnam has not lived in the district he re...,republican-party-texas,barely-true
7,I'm the only person on this stage who has work...,barack-obama,half-true
8,"However, it took $19.5 million in Oregon Lotte...",oregon-lottery,half-true
9,Says GOP primary opponents Glenn Grothman and ...,duey-stroebel,mostly-true


In [18]:
liar2 = liar1.copy(deep=True) 
liar2['label'] = liar1['label'].replace(['barely-true', 'half-true', 'mostly-true'], 1)

In [19]:
liar2.head(10)

Unnamed: 0,claim,claimant,label
0,Says the Annies List political group supports ...,dwayne-bohac,0
1,When did the decline of coal start? It started...,scott-surovell,1
2,"Hillary Clinton agrees with John McCain ""by vo...",barack-obama,1
3,Health care reform legislation is likely to ma...,blog-posting,0
4,The economic turnaround started at the end of ...,charlie-crist,1
5,The Chicago Bears have had more starting quart...,robin-vos,true
6,Jim Dunnam has not lived in the district he re...,republican-party-texas,1
7,I'm the only person on this stage who has work...,barack-obama,1
8,"However, it took $19.5 million in Oregon Lotte...",oregon-lottery,1
9,Says GOP primary opponents Glenn Grothman and ...,duey-stroebel,1


In [20]:
liar3 = liar2.copy(deep=True) 
liar3['label'] = liar2['label'].replace('true', 2)

In [21]:
liar3.head(10)

Unnamed: 0,claim,claimant,label
0,Says the Annies List political group supports ...,dwayne-bohac,0
1,When did the decline of coal start? It started...,scott-surovell,1
2,"Hillary Clinton agrees with John McCain ""by vo...",barack-obama,1
3,Health care reform legislation is likely to ma...,blog-posting,0
4,The economic turnaround started at the end of ...,charlie-crist,1
5,The Chicago Bears have had more starting quart...,robin-vos,2
6,Jim Dunnam has not lived in the district he re...,republican-party-texas,1
7,I'm the only person on this stage who has work...,barack-obama,1
8,"However, it took $19.5 million in Oregon Lotte...",oregon-lottery,1
9,Says GOP primary opponents Glenn Grothman and ...,duey-stroebel,1


# Load contest data

In [22]:
with open('../dataset/metadata.json', 'r') as f:
    data = json.load(f)

In [23]:
df = pd.DataFrame(data)

In [24]:
# Split into training set and test set
df_copy = df.copy()
train1 = df_copy.sample(frac=0.8, random_state=0)
test = df_copy.drop(train1.index)

In [25]:
train1.head(10)

Unnamed: 0,claim,claimant,date,label,related_articles,id
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,2017-10-31,1,"[34218, 55700, 18736, 39031, 34219, 34220]",10354
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,2014-09-12,0,"[73190, 76997, 38841, 77415, 77303, 9280, 8332...",2053
11035,Says Target installed urinals in a women’s bat...,Facebook posts,2016-04-22,0,"[9619, 22197]",12160
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,2019-04-15,0,"[57163, 31528, 40908, 31536, 68904, 44601]",13458
11354,: The AMBER Alert system has been discontinu...,,2013-10-13,0,"[103978, 121475, 121849]",12504
8662,"Judge Gonzalo Curiel ""is giving us very unfair...",Donald Trump,2016-06-05,0,"[77208, 21821, 63461, 58883]",9554
14307,Continental Airlines CEO Gordon Bethune once p...,,2016-10-23,0,"[112600, 114369, 114400, 123364]",15756
31,"""Expanding Medicaid would require borrowing mo...",Will Weatherford,2013-05-09,1,"[66749, 1228, 7897, 10786]",36
12687,Says a Washington Post reporter who broke news...,Bloggers,2017-11-10,0,"[12178, 53283, 89573, 19796]",13981
6422,About half the people who would qualify for Me...,Will Weatherford,2014-02-27,1,"[93033, 9294, 7826, 11450, 93049, 6737, 92851,...",7062


In [26]:
train2 = train1[['claim', 'claimant', 'label']]
test_set = test[['claim', 'claimant', 'label']]

In [27]:
train2.head(10)

Unnamed: 0,claim,claimant,label
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,1
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,0
11035,Says Target installed urinals in a women’s bat...,Facebook posts,0
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,0
11354,: The AMBER Alert system has been discontinu...,,0
8662,"Judge Gonzalo Curiel ""is giving us very unfair...",Donald Trump,0
14307,Continental Airlines CEO Gordon Bethune once p...,,0
31,"""Expanding Medicaid would require borrowing mo...",Will Weatherford,1
12687,Says a Washington Post reporter who broke news...,Bloggers,0
6422,About half the people who would qualify for Me...,Will Weatherford,1


# Join liar and contest data

In [28]:
frames = [liar3, train2]
train_set = pd.concat(frames)

In [29]:
train_set.head(10)

Unnamed: 0,claim,claimant,label
0,Says the Annies List political group supports ...,dwayne-bohac,0
1,When did the decline of coal start? It started...,scott-surovell,1
2,"Hillary Clinton agrees with John McCain ""by vo...",barack-obama,1
3,Health care reform legislation is likely to ma...,blog-posting,0
4,The economic turnaround started at the end of ...,charlie-crist,1
5,The Chicago Bears have had more starting quart...,robin-vos,2
6,Jim Dunnam has not lived in the district he re...,republican-party-texas,1
7,I'm the only person on this stage who has work...,barack-obama,1
8,"However, it took $19.5 million in Oregon Lotte...",oregon-lottery,1
9,Says GOP primary opponents Glenn Grothman and ...,duey-stroebel,1


In [30]:
print(len(train_set))
print(len(test_set))

25235
3111


In [31]:
# # Save training and test data frames
# train_set.to_pickle('liar_train.pkl')
# test_set.to_pickle('liar_test.pkl')

In [32]:
# # Save training and test sets as json files
# test.to_json('../input/liar_test.json', orient='records')

# Train claims model

In [33]:
import sklearn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [34]:
X_train_claims = train_set.claim
y_train_claims = train_set.label
X_test_claims = test_set.claim
y_test_claims = test_set.label

In [35]:
sm = SMOTE()
cv = CountVectorizer()
tfidf = TfidfVectorizer()
nb = MultinomialNB()
svm = LinearSVC()

In [36]:
pipeline1 = Pipeline([('cv', cv), ('sm', sm), ('nb', nb)])
pipeline1.fit(X_train_claims, y_train_claims)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('sm',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('nb',
                 Mu

In [37]:
y_pred_claims = pipeline1.predict(X_test_claims)

In [38]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test_claims, y_pred_claims) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test_claims, y_pred_claims, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_claims, y_pred_claims))

Accuracy: 60.98%

F1 Score: 52.88

Confusion Matrix:
 [[999 353 102]
 [377 798 159]
 [118 105 100]]


In [39]:
# with open("../models/liar_claims_cv_nb.pkl", 'wb') as f:
#     pickle.dump(pipeline1, f)

In [49]:
y_pred_claims = pipeline4.predict(X_test_claims)

In [50]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test_claims, y_pred_claims) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test_claims, y_pred_claims, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_claims, y_pred_claims))

Accuracy: 63.58%

F1 Score: 57.45

Confusion Matrix:
 [[936 364 154]
 [303 898 133]
 [ 87  92 144]]


In [51]:
# with open("../models/liar_claims_cv_svm.pkl", 'wb') as f:
#     pickle.dump(pipeline4, f)

# Train claimant model

In [65]:
train_claimants = train_set['claimant'].replace('', np.nan)
test_claimants = test_set['claimant'].replace('', np.nan)

25235

In [72]:
train_claimants

0                       dwayne-bohac
1                     scott-surovell
2                       barack-obama
3                       blog-posting
4                      charlie-crist
                    ...             
5974                    Steve Cortes
11698                 Garnet Coleman
2910     Republican Party of Florida
10446                 Jeanine  Pirro
5414      Tennessee Republican Party
Name: claimant, Length: 21246, dtype: object

In [68]:
train_claimants.dropna(how='all', inplace=True)
test_claimants.dropna(how='all', inplace=True)
len(train_claimants)

21246