# Set up

In [1]:
import pandas as pd
import numpy as np
import json
import pickle
import glob
import os

# Load data

In [2]:
with open('../dataset/metadata.json', 'r') as f:
    data = json.load(f)

In [3]:
df = pd.DataFrame(data)

In [4]:
df.head(10)

Unnamed: 0,claim,claimant,date,label,related_articles,id
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,"[122094, 122580, 130685, 134765]",0
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,2,"[106868, 127320, 128060]",1
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,1,"[132130, 132132, 149722]",4
3,In 1988 author Roald Dahl penned an open lette...,,2019-02-04,2,"[123254, 123418, 127464]",5
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6
5,"Rhode Island is ""almost dead last"" among North...",Leonidas Raptakis,2014-02-11,2,"[8284, 3768, 20091, 82368, 73148, 4493]",7
6,The poorest counties in the U.S. are in Appala...,Jim Webb,2014-11-19,1,"[70709, 70708]",8
7,Koch Industries paid the legal fees of George ...,,2013-07-18,0,"[120591, 120592, 127866, 129483]",9
8,"""Minnesota, Michigan, Iowa already have 70 mph...",Robin Vos,2013-08-22,1,"[69547, 80095, 7994, 81116, 77621]",11
9,"""FBI Uniform Crime Report for 2016 shows more ...",Nick Schroer,2017-10-17,1,"[72012, 26005, 43481, 55671]",12


# Split into training and test data

In [5]:
# Split into training set and test set
df_copy = df.copy()
train_set = df_copy.sample(frac=0.8, random_state=0)
test_set = df_copy.drop(train_set.index)

In [6]:
train_set

Unnamed: 0,claim,claimant,date,label,related_articles,id
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,2017-10-31,1,"[34218, 55700, 18736, 39031, 34219, 34220]",10354
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,2014-09-12,0,"[73190, 76997, 38841, 77415, 77303, 9280, 8332...",2053
11035,Says Target installed urinals in a women’s bat...,Facebook posts,2016-04-22,0,"[9619, 22197]",12160
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,2019-04-15,0,"[57163, 31528, 40908, 31536, 68904, 44601]",13458
11354,: The AMBER Alert system has been discontinu...,,2013-10-13,0,"[103978, 121475, 121849]",12504
...,...,...,...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,2014-09-23,1,"[9581, 89571, 7836, 7945, 7949, 77360, 83491, ...",3208
6096,"A photograph captures Harriet Tubman as a ""Gun...",,2019-03-25,0,"[125108, 125968, 126005]",6701
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,2014-06-14,0,"[80115, 93998, 5968, 175, 91475, 8710, 89881, ...",11514
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,2008-02-25,1,"[96453, 71123, 61, 69968, 96477]",5966


In [7]:
len(test_set)

3111

# Save training and test data

In [8]:
# Save training and test data frames
train_set.to_pickle('train_set.pkl')
test_set.to_pickle('test_set.pkl')

In [9]:
# Save training and test sets as json files
train_set.to_json('../input/train.json', orient='records')
test_set.to_json('../input/test.json', orient='records')

In [10]:
# Save training solutions
with open("../output/train_sol.txt", 'w') as f:
    for index, row in train_set.iterrows():
        f.write('%d,%d\n' % (row['id'], row['label']) )

In [11]:
# Save test solutions
with open("../output/test_sol.txt", 'w') as f:
    for index, row in test_set.iterrows():
        f.write('%d,%d\n' % (row['id'], row['label']) )

# Related articles data

In [12]:
articles = []

for file in glob.glob(os.path.join("../dataset/articles", '*.txt')):
    with open(file) as f:
        body = " ".join(line for line in f)
    
    base = os.path.basename(file)
    file_name = os.path.splitext(base)[0]
    
    article = (os.path.basename(file_name), body)
    articles.append(article)

In [13]:
articles_df = pd.DataFrame(articles)
articles_df.columns = ['article_id', 'article']
articles_df.to_pickle('articles.pkl')

In [14]:
articles_df

Unnamed: 0,article_id,article
0,60583,These Republicans are misleading voters about ...
1,120801,They sued for Clinton's emails. Now they want ...
2,66570,Heritage's Fun With 'Defund Obamacare' Polling...
3,123469,Size of U.S. Unauthoriized Immigrant Workforce...
4,13133,"Border Crossing/Entry Data\n Hide Coverage, Av..."
...,...,...
64969,93026,Florida legislators hope to fix nuclear advanc...
64970,9904,"A Chart Is Worth 1,000 Words\n Todd Harrison, ..."
64971,87442,"Prison plan assailed as 'sneaky,' misleading\n..."
64972,59427,Updated: Do Russia probe attorneys’ donations ...
