# Data Analysis

## Set up

In [1]:
import pandas as pd
import json
import glob
import os

## File paths ##
# data/train.json
# data/train_articles/

## Load metadata

In [2]:
# Parse the metadata file. The encoding is given explicitly so decoding does
# not depend on the platform's default text encoding (JSON interchange is UTF-8).
with open("data/train.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [3]:
# Tabulate the parsed JSON payload as a DataFrame.
meta = pd.DataFrame(data=data)

In [4]:
# Sanity check: preview the first 10 rows of the metadata frame
meta.iloc[:10]

Unnamed: 0,claim,claimant,date,label,related_articles,id
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,"[122094, 122580, 130685, 134765]",0
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,2,"[106868, 127320, 128060]",1
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,1,"[132130, 132132, 149722]",4
3,In 1988 author Roald Dahl penned an open lette...,,2019-02-04,2,"[123254, 123418, 127464]",5
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6
5,"Rhode Island is ""almost dead last"" among North...",Leonidas Raptakis,2014-02-11,2,"[8284, 3768, 20091, 82368, 73148, 4493]",7
6,The poorest counties in the U.S. are in Appala...,Jim Webb,2014-11-19,1,"[70709, 70708]",8
7,Koch Industries paid the legal fees of George ...,,2013-07-18,0,"[120591, 120592, 127866, 129483]",9
8,"""Minnesota, Michigan, Iowa already have 70 mph...",Robin Vos,2013-08-22,1,"[69547, 80095, 7994, 81116, 77621]",11
9,"""FBI Uniform Crime Report for 2016 shows more ...",Nick Schroer,2017-10-17,1,"[72012, 26005, 43481, 55671]",12


In [5]:
# Rows whose label is 0 (the "false" class)
is_false = meta["label"].eq(0)
false_claims = meta[is_false]
false_claims

Unnamed: 0,claim,claimant,date,label,related_articles,id
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,"[122094, 122580, 130685, 134765]",0
7,Koch Industries paid the legal fees of George ...,,2013-07-18,0,"[120591, 120592, 127866, 129483]",9
10,"""Pelosi Sinks to New Low, Tells Dems: If You ...",Western Journal,2018-08-21,0,"[27062, 27061, 20679, 61872, 20677]",13
13,NASA Has Just Confirmed Earth Has A New Moon,Bloggers,2018-03-29,0,"[91455, 72179, 18903, 42080]",17
15,"""Justin Amash is rated Michigan’s No. 1 conser...",Justin Amash,2014-07-01,0,"[22383, 72467, 72466, 86512, 73422, 83732, 83730]",19
...,...,...,...,...,...,...
15542,"""The average premium across this country has ...",Mike Pence,2017-05-25,0,"[32457, 7958]",17129
15547,"""I haven’t really proposed (phasing out aid to...",Rand Paul,2014-08-04,0,"[88399, 91476, 11371, 91483, 7021]",17134
15549,"""They (Clinton and Obama) have never to my kno...",John McCain,2008-05-13,0,"[67611, 67699, 67610, 82239, 86166, 3653, 7112...",17136
15551,Representative Maxine Waters said Muslims were...,,2017-06-06,0,"[103780, 104726, 126025]",17138


In [6]:
# Rows whose label is 1 (the "partially true" class)
is_part_true = meta["label"].eq(1)
part_true_claims = meta[is_part_true]
part_true_claims

Unnamed: 0,claim,claimant,date,label,related_articles,id
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,1,"[132130, 132132, 149722]",4
6,The poorest counties in the U.S. are in Appala...,Jim Webb,2014-11-19,1,"[70709, 70708]",8
8,"""Minnesota, Michigan, Iowa already have 70 mph...",Robin Vos,2013-08-22,1,"[69547, 80095, 7994, 81116, 77621]",11
9,"""FBI Uniform Crime Report for 2016 shows more ...",Nick Schroer,2017-10-17,1,"[72012, 26005, 43481, 55671]",12
11,Socialist teachers at South Charlotte Middle S...,,2018-10-17,1,"[104287, 144516]",14
...,...,...,...,...,...,...
15535,"Says Rep. Martha McSally ""is a #FlipFlopBorder...",Kelli Ward,2018-03-16,1,"[67183, 20180, 41193, 28711, 20181, 34090, 201...",17122
15543,"At an Arizona town hall event, Sen. Jeff Flake...",,2017-04-17,1,"[143451, 145474]",17130
15546,President Obama signed a law permanently prote...,,2017-01-09,1,"[107369, 122972, 147969, 38987, 151939]",17133
15548,"Says Aaron Rodgers ""is not the highest tax rat...",Paul Ryan,2017-08-21,1,"[53671, 30934, 94982, 30953, 30949]",17135


In [7]:
# Rows whose label is 2 (the "true" class)
is_true = meta["label"].eq(2)
true_claims = meta[is_true]
true_claims

Unnamed: 0,claim,claimant,date,label,related_articles,id
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,2,"[106868, 127320, 128060]",1
3,In 1988 author Roald Dahl penned an open lette...,,2019-02-04,2,"[123254, 123418, 127464]",5
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6
5,"Rhode Island is ""almost dead last"" among North...",Leonidas Raptakis,2014-02-11,2,"[8284, 3768, 20091, 82368, 73148, 4493]",7
14,"""We are always going to need architects, docto...",Mike Parson,2019-01-24,2,"[42685, 32007, 33562]",18
...,...,...,...,...,...,...
15539,A photograph shows a musher riding over snowle...,,2017-11-06,2,"[108255, 109040, 110398, 114040, 114042, 11410...",17126
15544,"A photograph shows Donald Trump, Muhammad Ali,...",,2018-08-13,2,"[118938, 125644, 127592]",17131
15545,A photograph shows a man mowing his lawn durin...,,2017-06-05,2,"[107244, 115705, 142189]",17132
15550,"The omnibus spending bill has ""9,427 pork barr...",John McCain,2009-02-25,2,"[82947, 93503]",17137


## Simple statistics

In [8]:
# Row counts: the full metadata frame, then each per-label subset.
num_claims = len(meta.index)
num_false_claims = len(false_claims.index)
num_part_true_claims = len(part_true_claims.index)
num_true_claims = len(true_claims.index)

print("Total number of claims:\t\t\t", num_claims)

print("\nNumber of false claims:\t\t\t", num_false_claims)
print("Number of partially true claims:\t", num_part_true_claims)
# This line reports the count of true claims; the label must say "true"
# (it was previously a copy of the "false" label).
print("Number of true claims:\t\t\t", num_true_claims)

# Share of each class relative to all claims.
percent_false = num_false_claims / num_claims
percent_part_true = num_part_true_claims / num_claims
percent_true = num_true_claims / num_claims

# Rendered as percentages with two decimal places.
print("\nPercentage of false claims:\t\t", "{0:.2%}".format(percent_false))
print("Percentage of partially true claims:\t", "{0:.2%}".format(percent_part_true))
print("Percentage of true claims:\t\t", "{0:.2%}".format(percent_true))

Total number of claims:			 15555

Number of false claims:			 7408
Number of partially true claims:	 6451
Number of false claims:			 1696

Percentage of false claims:		 47.62%
Percentage of partially true claims:	 41.47%
Percentage of true claims:		 10.90%


## Load article data

In [9]:
path = "data/train_articles"
articles = []

# Collect one (article_id, body) tuple per .txt file in the articles directory.
# NOTE: glob returns matches in arbitrary (filesystem-dependent) order, so the
# order of `articles` is not guaranteed to be the same on every machine.
for file in glob.glob(os.path.join(path, '*.txt')):
    # Decode explicitly instead of relying on the platform default encoding;
    # assumes the article files are UTF-8 -- TODO confirm against the dataset.
    with open(file, encoding="utf-8") as f:
        # Each line keeps its trailing newline; lines are joined with one space.
        body = " ".join(f)

    # Article id is the file name without its directory and extension.
    file_name = os.path.splitext(os.path.basename(file))[0]

    articles.append((file_name, body))

In [10]:
# Build the articles frame with its column names set at construction time.
# Unlike assigning `.columns` afterwards, this also works when `articles` is
# empty (it yields an empty two-column frame instead of a length-mismatch error).
news = pd.DataFrame(articles, columns=["article_id", "article"])

In [11]:
# Display the articles frame as the cell's output.
news

Unnamed: 0,article_id,article
0,60583,These Republicans are misleading voters about ...
1,120801,They sued for Clinton's emails. Now they want ...
2,66570,Heritage's Fun With 'Defund Obamacare' Polling...
3,123469,Size of U.S. Unauthoriized Immigrant Workforce...
4,13133,"Border Crossing/Entry Data\n Hide Coverage, Av..."
...,...,...
64969,93026,Florida legislators hope to fix nuclear advanc...
64970,9904,"A Chart Is Worth 1,000 Words\n Todd Harrison, ..."
64971,87442,"Prison plan assailed as 'sneaky,' misleading\n..."
64972,59427,Updated: Do Russia probe attorneys’ donations ...
