# Data Analysis

## Set up

In [1]:
import pandas as pd
import pickle
import json
import glob
import os

METADATA_FILEPATH = '../dataset/metadata.json'
ARTICLES_FILEPATH = '../dataset/articles'

## Load training data

In [2]:
df = pd.read_pickle('train_set.pkl')

In [3]:
df

Unnamed: 0,claim,claimant,date,label,related_articles,id
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,2017-10-31,1,"[34218, 55700, 18736, 39031, 34219, 34220]",10354
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,2014-09-12,0,"[73190, 76997, 38841, 77415, 77303, 9280, 8332...",2053
11035,Says Target installed urinals in a women’s bat...,Facebook posts,2016-04-22,0,"[9619, 22197]",12160
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,2019-04-15,0,"[57163, 31528, 40908, 31536, 68904, 44601]",13458
11354,: The AMBER Alert system has been discontinu...,,2013-10-13,0,"[103978, 121475, 121849]",12504
...,...,...,...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,2014-09-23,1,"[9581, 89571, 7836, 7945, 7949, 77360, 83491, ...",3208
6096,"A photograph captures Harriet Tubman as a ""Gun...",,2019-03-25,0,"[125108, 125968, 126005]",6701
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,2014-06-14,0,"[80115, 93998, 5968, 175, 91475, 8710, 89881, ...",11514
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,2008-02-25,1,"[96453, 71123, 61, 69968, 96477]",5966


## Load article data

In [4]:
articles = []

for file in glob.glob(os.path.join(ARTICLES_FILEPATH, '*.txt')):
    with open(file) as f:
        body = " ".join(line for line in f)
    
    base = os.path.basename(file)
    file_name = os.path.splitext(base)[0]
    
    article = (os.path.basename(file_name), body)
    articles.append(article)

In [5]:
articles_df = pd.DataFrame(articles)
articles_df.columns = ['article_id', 'article']

# Related Articles

In [6]:
related_articles = []

for index, row in df.iterrows():
    for article in row['related_articles']:
        related_articles.append((article, row['id'], row['label']))

In [7]:
related_articles_df = pd.DataFrame(related_articles)
related_articles_df.columns = ['article_id', 'claim_id', 'label']
related_articles_df

Unnamed: 0,article_id,claim_id,label
0,34218,10354,1
1,55700,10354,1
2,18736,10354,1
3,39031,10354,1
4,34219,10354,1
...,...,...,...
62323,61,5966,1
62324,69968,5966,1
62325,96477,5966,1
62326,120293,7328,1


In [8]:
# Associate each article with the labels of the claims that reference it

labelled_articles = related_articles_df.groupby('article_id')['label'].apply(list).to_dict()
labelled_articles

{2: [0],
 8: [0],
 15: [2],
 18: [1, 1],
 19: [1],
 21: [1, 1],
 22: [1, 0],
 23: [1, 1],
 31: [0],
 32: [0],
 33: [0],
 34: [0],
 35: [1],
 39: [1],
 40: [1],
 41: [1],
 42: [0],
 43: [0, 0],
 57: [1],
 61: [1],
 66: [0],
 67: [0],
 70: [0],
 73: [0],
 82: [1, 2],
 84: [1],
 88: [0],
 89: [2],
 90: [0, 0, 0, 0],
 92: [0],
 93: [1],
 94: [2],
 97: [1],
 98: [0],
 100: [1, 1],
 101: [0],
 105: [1],
 111: [0],
 114: [0],
 117: [0],
 119: [1],
 120: [1],
 121: [1],
 122: [0],
 123: [1],
 125: [0],
 127: [0],
 129: [1],
 131: [0],
 132: [1],
 134: [0],
 137: [1],
 138: [1],
 142: [1],
 145: [1],
 146: [1],
 148: [1],
 149: [1],
 151: [0],
 152: [1, 0],
 153: [1],
 154: [0],
 155: [1],
 156: [1],
 157: [1],
 158: [1, 0],
 159: [1],
 160: [1],
 161: [0],
 162: [1],
 163: [0],
 164: [0],
 168: [0],
 171: [0, 0, 1],
 172: [0],
 173: [0],
 175: [0, 0],
 176: [1],
 177: [0],
 178: [1],
 179: [0],
 180: [1],
 181: [0],
 183: [1],
 184: [1],
 185: [0, 0, 0, 0],
 187: [1],
 190: [0],
 200: [0],
 20

In [9]:
labelled_articles_df = pd.DataFrame.from_dict(labelled_articles, orient='index')
labelled_articles_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
2,0,,,,,,,,,,...,,,,,,,,,,
8,0,,,,,,,,,,...,,,,,,,,,,
15,2,,,,,,,,,,...,,,,,,,,,,
18,1,1.0,,,,,,,,,...,,,,,,,,,,
19,1,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163086,1,,,,,,,,,,...,,,,,,,,,,
163089,0,,,,,,,,,,...,,,,,,,,,,
163090,1,,,,,,,,,,...,,,,,,,,,,
163091,1,,,,,,,,,,...,,,,,,,,,,


In [10]:
pickle.dump( labelled_articles, open( "../input/train_articles_ids.p", "wb" ) )

In [11]:
dict1 = {}

for key, value in labelled_articles.items():
    if len(value) > 1:
        dict1[key] = value

In [12]:
dict1

{18: [1, 1],
 21: [1, 1],
 22: [1, 0],
 23: [1, 1],
 43: [0, 0],
 82: [1, 2],
 90: [0, 0, 0, 0],
 100: [1, 1],
 152: [1, 0],
 158: [1, 0],
 171: [0, 0, 1],
 175: [0, 0],
 185: [0, 0, 0, 0],
 223: [0, 0],
 234: [1, 1, 1],
 240: [1, 0, 0],
 246: [1, 1],
 259: [0, 0, 1],
 277: [1, 1, 1],
 286: [1, 1],
 303: [0, 0],
 322: [0, 0],
 323: [0, 0, 0, 0],
 324: [1, 0, 0, 0, 0],
 331: [0, 0],
 348: [1, 0],
 352: [0, 1, 0, 0],
 355: [0, 0],
 356: [0, 0, 0],
 363: [0, 0],
 373: [2, 1],
 376: [1, 0, 1],
 377: [0, 0, 0, 0],
 378: [0, 0, 0, 0, 0],
 382: [0, 0],
 390: [1, 1],
 392: [0, 0, 0, 0],
 420: [0, 0],
 427: [0, 0],
 490: [1, 1, 2],
 499: [1, 1],
 509: [1, 1],
 518: [0, 1],
 538: [1, 1],
 551: [2, 1],
 558: [1, 1],
 560: [0, 0],
 562: [1, 1],
 609: [1, 1, 1, 1, 1, 1],
 610: [1, 1, 1],
 706: [1, 2],
 798: [0, 0, 0],
 802: [0, 0],
 807: [1, 1],
 809: [1, 1],
 812: [0, 0, 0],
 852: [0, 0, 0],
 905: [1, 1],
 913: [0, 1, 0],
 960: [0, 0],
 977: [1, 1, 1],
 978: [0, 0],
 1036: [0, 0],
 1069: [0, 1],
 

In [13]:
df1 = pd.DataFrame.from_dict(dict1, orient='index')

In [14]:
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
18,1,1,,,,,,,,,...,,,,,,,,,,
21,1,1,,,,,,,,,...,,,,,,,,,,
22,1,0,,,,,,,,,...,,,,,,,,,,
23,1,1,,,,,,,,,...,,,,,,,,,,
43,0,0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162934,0,0,,,,,,,,,...,,,,,,,,,,
162971,0,0,,,,,,,,,...,,,,,,,,,,
162992,1,1,,,,,,,,,...,,,,,,,,,,
162993,1,1,,,,,,,,,...,,,,,,,,,,


In [15]:
pickle.dump( dict1, open( "../input/train_articles1.p", "wb" ) )

In [16]:
dict2 = {}

for key, value in labelled_articles.items():
    if len(value) > 2:
        dict2[key] = value

In [17]:
dict2

{90: [0, 0, 0, 0],
 171: [0, 0, 1],
 185: [0, 0, 0, 0],
 234: [1, 1, 1],
 240: [1, 0, 0],
 259: [0, 0, 1],
 277: [1, 1, 1],
 323: [0, 0, 0, 0],
 324: [1, 0, 0, 0, 0],
 352: [0, 1, 0, 0],
 356: [0, 0, 0],
 376: [1, 0, 1],
 377: [0, 0, 0, 0],
 378: [0, 0, 0, 0, 0],
 392: [0, 0, 0, 0],
 490: [1, 1, 2],
 609: [1, 1, 1, 1, 1, 1],
 610: [1, 1, 1],
 798: [0, 0, 0],
 812: [0, 0, 0],
 852: [0, 0, 0],
 913: [0, 1, 0],
 977: [1, 1, 1],
 1184: [0, 0, 0, 0],
 1248: [0, 0, 0],
 1292: [1, 1, 0],
 1322: [1, 1, 1],
 1539: [0, 0, 0],
 1628: [1, 1, 1],
 1742: [0, 1, 0],
 1882: [1, 0, 0],
 2010: [0, 0, 0, 0],
 2252: [0, 1, 0],
 2475: [0, 1, 0],
 2563: [0, 0, 0, 0],
 2734: [1, 0, 1, 1],
 2806: [0, 1, 1, 1],
 2920: [1, 1, 1, 2],
 2964: [2, 1, 2],
 2974: [0, 1, 0],
 3177: [0, 0, 1],
 3214: [2, 0, 0],
 3221: [0, 0, 0, 0, 0, 0, 0, 0, 0],
 3223: [0, 0, 0, 0, 0, 0, 2, 0],
 3224: [1, 1, 1],
 3226: [1, 1, 1],
 3237: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 3256: [0, 0, 0, 0, 0, 0],
 3259: [0, 0, 0, 0, 0, 0],
 3635: [0,

In [18]:
df2 = pd.DataFrame.from_dict(dict2, orient='index')
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
90,0,0,0,0.0,,,,,,,...,,,,,,,,,,
171,0,0,1,,,,,,,,...,,,,,,,,,,
185,0,0,0,0.0,,,,,,,...,,,,,,,,,,
234,1,1,1,,,,,,,,...,,,,,,,,,,
240,1,0,0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161333,0,0,0,,,,,,,,...,,,,,,,,,,
161334,0,0,0,,,,,,,,...,,,,,,,,,,
162148,2,2,1,0.0,2.0,1.0,2.0,2.0,1.0,,...,,,,,,,,,,
162172,2,2,1,1.0,2.0,1.0,,,,,...,,,,,,,,,,


In [19]:
pickle.dump( dict2, open( "../input/train_articles2.p", "wb" ) )

In [20]:
num_related_articles = related_articles_df.drop_duplicates(subset=['article_id'], keep='first')

print("Number of related articles in claims:", len(num_related_articles))
print("Number of labelled articles:", len(article_id_labels))
print("Number of total articles:", len(articles_df))

Number of related articles in claims: 51580


NameError: name 'article_id_labels' is not defined

In [None]:
# Assign the most frequent label to each article, or the lowest label value in the event of a tie

from statistics import mode

article_id_single_label = {}

for article_id, label in article_id_labels.items():
    try:
        article_id_single_label[article_id] = mode(label)
    except:
        article_id_single_label[article_id] = min(label)

In [None]:
article_id_single_label

In [None]:
pickle.dump( article_id_single_label, open( "../preprocessing/labelled_article_ids.p", "wb" ) )

In [None]:
labelled_articles_df = articles_df

In [None]:
labelled_articles_df['label'] = -1

In [None]:
labelled_articles_df

In [None]:
for index, row in labelled_articles_df.iterrows():
    key = int(row['article_id'])
    if key in article_single_label:
       labelled_articles_df.at[index, 'label'] = article_single_label[key]

In [None]:
labelled_articles_df

In [None]:
labelled_articles_df.head(20)

In [None]:
print(article_single_label[60583])
print(article_single_label[120801])
print(article_single_label[66570])
print(article_single_label[123469])
print(article_single_label[69314])