# Data Analysis

## Set up

In [1]:
import pandas as pd
import pickle
import json
import glob
import os

## Load training data

In [2]:
train_set = pd.read_pickle('../input/train_set.pkl')

In [3]:
train_set

Unnamed: 0,claim,claimant,date,label,related_articles,id
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,2017-10-31,1,"[34218, 55700, 18736, 39031, 34219, 34220]",10354
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,2014-09-12,0,"[73190, 76997, 38841, 77415, 77303, 9280, 8332...",2053
11035,Says Target installed urinals in a women’s bat...,Facebook posts,2016-04-22,0,"[9619, 22197]",12160
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,2019-04-15,0,"[57163, 31528, 40908, 31536, 68904, 44601]",13458
11354,: The AMBER Alert system has been discontinu...,,2013-10-13,0,"[103978, 121475, 121849]",12504
...,...,...,...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,2014-09-23,1,"[9581, 89571, 7836, 7945, 7949, 77360, 83491, ...",3208
6096,"A photograph captures Harriet Tubman as a ""Gun...",,2019-03-25,0,"[125108, 125968, 126005]",6701
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,2014-06-14,0,"[80115, 93998, 5968, 175, 91475, 8710, 89881, ...",11514
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,2008-02-25,1,"[96453, 71123, 61, 69968, 96477]",5966


In [4]:
test_set = pd.read_pickle('../input/test_set.pkl')

## Load article data

In [5]:
articles = []

for file in glob.glob(os.path.join('../dataset/articles', '*.txt')):
    with open(file) as f:
        body = " ".join(line for line in f)
    
    base = os.path.basename(file)
    file_name = os.path.splitext(base)[0]
    
    article = (os.path.basename(file_name), body)
    articles.append(article)

In [6]:
articles_df = pd.DataFrame(articles)
articles_df.columns = ['article_id', 'article']

In [7]:
articles_df

Unnamed: 0,article_id,article
0,60583,These Republicans are misleading voters about ...
1,120801,They sued for Clinton's emails. Now they want ...
2,66570,Heritage's Fun With 'Defund Obamacare' Polling...
3,123469,Size of U.S. Unauthoriized Immigrant Workforce...
4,13133,"Border Crossing/Entry Data\n Hide Coverage, Av..."
...,...,...
64969,93026,Florida legislators hope to fix nuclear advanc...
64970,9904,"A Chart Is Worth 1,000 Words\n Todd Harrison, ..."
64971,87442,"Prison plan assailed as 'sneaky,' misleading\n..."
64972,59427,Updated: Do Russia probe attorneys’ donations ...


# Related Articles

In [8]:
train_articles = []

for index, row in train_set.iterrows():
    for article in row['related_articles']:
        train_articles.append((article, row['id'], row['label']))

In [9]:
train_articles_df = pd.DataFrame(train_articles)
train_articles_df.columns = ['article_id', 'claim_id', 'label']
train_articles_df

Unnamed: 0,article_id,claim_id,label
0,34218,10354,1
1,55700,10354,1
2,18736,10354,1
3,39031,10354,1
4,34219,10354,1
...,...,...,...
62323,61,5966,1
62324,69968,5966,1
62325,96477,5966,1
62326,120293,7328,1


In [10]:
test_articles = []

for index, row in test_set.iterrows():
    for article in row['related_articles']:
        test_articles.append((article, row['id'], row['label']))

In [11]:
test_articles_df = pd.DataFrame(test_articles)
test_articles_df.columns = ['article_id', 'claim_id', 'label']

In [12]:
len(test_articles_df)

15352

In [13]:
# Associate each article with the labels of the claims that reference it

train_article_labels = train_articles_df.groupby('article_id')['label'].apply(list).to_dict()
train_article_labels

{2: [0],
 8: [0],
 15: [2],
 18: [1, 1],
 19: [1],
 21: [1, 1],
 22: [1, 0],
 23: [1, 1],
 31: [0],
 32: [0],
 33: [0],
 34: [0],
 35: [1],
 39: [1],
 40: [1],
 41: [1],
 42: [0],
 43: [0, 0],
 57: [1],
 61: [1],
 66: [0],
 67: [0],
 70: [0],
 73: [0],
 82: [1, 2],
 84: [1],
 88: [0],
 89: [2],
 90: [0, 0, 0, 0],
 92: [0],
 93: [1],
 94: [2],
 97: [1],
 98: [0],
 100: [1, 1],
 101: [0],
 105: [1],
 111: [0],
 114: [0],
 117: [0],
 119: [1],
 120: [1],
 121: [1],
 122: [0],
 123: [1],
 125: [0],
 127: [0],
 129: [1],
 131: [0],
 132: [1],
 134: [0],
 137: [1],
 138: [1],
 142: [1],
 145: [1],
 146: [1],
 148: [1],
 149: [1],
 151: [0],
 152: [1, 0],
 153: [1],
 154: [0],
 155: [1],
 156: [1],
 157: [1],
 158: [1, 0],
 159: [1],
 160: [1],
 161: [0],
 162: [1],
 163: [0],
 164: [0],
 168: [0],
 171: [0, 0, 1],
 172: [0],
 173: [0],
 175: [0, 0],
 176: [1],
 177: [0],
 178: [1],
 179: [0],
 180: [1],
 181: [0],
 183: [1],
 184: [1],
 185: [0, 0, 0, 0],
 187: [1],
 190: [0],
 200: [0],
 20

In [14]:
test_article_labels = test_articles_df.groupby('article_id')['label'].apply(list).to_dict()

In [15]:
# Assign the average value of the label of the claims that reference the article
train_article_single_label = {}

for article_id, label in train_article_labels.items():
    train_article_single_label[article_id] = round(sum(label) / len(label))

In [16]:
# train_article_single_label
len(train_article_single_label)

51580

In [17]:
test_article_single_label = {}

for article_id, label in test_article_labels.items():
    test_article_single_label[article_id] = round(sum(label) / len(label))

In [18]:
len(test_article_single_label)

14396

In [19]:
labelled_train_articles = articles_df.copy()
labelled_test_articles = articles_df.copy()

In [20]:
import numpy as np
# Initialize all article labels to -1
labelled_train_articles['label'] = -1
labelled_test_articles['label'] = -1

In [21]:
labelled_train_articles

Unnamed: 0,article_id,article,label
0,60583,These Republicans are misleading voters about ...,-1
1,120801,They sued for Clinton's emails. Now they want ...,-1
2,66570,Heritage's Fun With 'Defund Obamacare' Polling...,-1
3,123469,Size of U.S. Unauthoriized Immigrant Workforce...,-1
4,13133,"Border Crossing/Entry Data\n Hide Coverage, Av...",-1
...,...,...,...
64969,93026,Florida legislators hope to fix nuclear advanc...,-1
64970,9904,"A Chart Is Worth 1,000 Words\n Todd Harrison, ...",-1
64971,87442,"Prison plan assailed as 'sneaky,' misleading\n...",-1
64972,59427,Updated: Do Russia probe attorneys’ donations ...,-1


In [22]:
# Assign labels to articles that are cited by claims in the training data
for index, row in labelled_train_articles.iterrows():
    key = int(row['article_id'])
    if key in train_article_single_label:
       labelled_train_articles.at[index, 'label'] = train_article_single_label[key]

In [23]:
labelled_train_articles

Unnamed: 0,article_id,article,label
0,60583,These Republicans are misleading voters about ...,0
1,120801,They sued for Clinton's emails. Now they want ...,-1
2,66570,Heritage's Fun With 'Defund Obamacare' Polling...,1
3,123469,Size of U.S. Unauthoriized Immigrant Workforce...,1
4,13133,"Border Crossing/Entry Data\n Hide Coverage, Av...",1
...,...,...,...
64969,93026,Florida legislators hope to fix nuclear advanc...,-1
64970,9904,"A Chart Is Worth 1,000 Words\n Todd Harrison, ...",-1
64971,87442,"Prison plan assailed as 'sneaky,' misleading\n...",0
64972,59427,Updated: Do Russia probe attorneys’ donations ...,0


In [24]:
# Drop any articles that were not cited by any claims
df_labelled_train_articles = labelled_train_articles[labelled_train_articles.label != -1]

In [25]:
df_labelled_train_articles

Unnamed: 0,article_id,article,label
0,60583,These Republicans are misleading voters about ...,0
2,66570,Heritage's Fun With 'Defund Obamacare' Polling...,1
3,123469,Size of U.S. Unauthoriized Immigrant Workforce...,1
4,13133,"Border Crossing/Entry Data\n Hide Coverage, Av...",1
5,21607,Pelosi Floor Speech in Opposition to the Repub...,0
...,...,...,...
64965,58727,Trump’s claim that Kerry and Obama said discus...,0
64968,45171,FACT CHECK: Trump's Oval Office Pitch For A Bo...,1
64971,87442,"Prison plan assailed as 'sneaky,' misleading\n...",0
64972,59427,Updated: Do Russia probe attorneys’ donations ...,0


In [26]:
# Save articles
df_labelled_train_articles.to_pickle('../input/labelled_train_articles.pkl')

In [27]:
# Assign labels to articles that are cited by claims in the test data
for index, row in labelled_test_articles.iterrows():
    key = int(row['article_id'])
    if key in test_article_single_label:
       labelled_test_articles.at[index, 'label'] = test_article_single_label[key]

In [28]:
# Drop any articles that were not cited by any claims
df_labelled_test_articles = labelled_test_articles[labelled_test_articles.label != -1]

In [29]:
# Save articles
df_labelled_test_articles.to_pickle('../input/labelled_test_articles.pkl')

In [30]:
# Prepare data set for training
X_train = df_labelled_train_articles.article
y_train = df_labelled_train_articles.label
X_test = df_labelled_test_articles.article
y_test = df_labelled_test_articles.label

In [31]:
import sklearn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [32]:
sm = SMOTE()
cv = CountVectorizer()
nb = MultinomialNB()

pipeline = Pipeline([('cv', cv),('sm', sm), ('nb', nb)])

In [33]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('sm',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('nb',
                 Mu

In [34]:
y_pred = pipeline.predict(X_test)

In [35]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 57.49%

F1 Score: 43.41

Confusion Matrix:
 [[4615 2061  206]
 [2404 3577  229]
 [ 570  650   84]]


In [36]:
with open("../models/train_articles_nb.pkl", 'wb') as f:
    pickle.dump(pipeline, f)

In [37]:
tf = TfidfVectorizer()
svm = LinearSVC()

pipeline2 = Pipeline([('tf', tf),('sm', sm), ('nb', nb)])

In [38]:
pipeline2.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('sm',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='de

In [39]:
y_pred = pipeline2.predict(X_test)

In [40]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 46.58%

F1 Score: 41.82

Confusion Matrix:
 [[3165 1856 1861]
 [1188 3015 2007]
 [ 312  467  525]]


In [41]:
pipeline3 = Pipeline([('tf', tf), ('nb', nb)])

In [42]:
pipeline3.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [43]:
y_pred = pipeline3.predict(X_test)

In [44]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 57.51%

F1 Score: 39.21

Confusion Matrix:
 [[5438 1444    0]
 [3367 2841    2]
 [ 802  502    0]]


In [45]:
counts = train_set.label.value_counts()
print(counts)

0    5954
1    5117
2    1373
Name: label, dtype: int64


In [46]:
pipeline4 = Pipeline([('cv', cv),('sm', sm), ('svm', svm)])

In [47]:
pipeline4.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)...
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('svm',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_in

In [48]:
y_pred = pipeline4.predict(X_test)

In [49]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred, average='macro') * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 51.27%

F1 Score: 42.12

Confusion Matrix:
 [[4008 2138  736]
 [2238 3130  842]
 [ 459  602  243]]
