# Claims

## Set Up

In [1]:
import json
import pickle
from collections import Counter

import numpy as np
import pandas as pd

# Location of the claims metadata JSON that is loaded below (relative path).
METADATA_FILEPATH = '../dataset/metadata.json'
# NOTE(review): not referenced anywhere in the visible notebook — confirm whether it is still needed.
SOLUTIONS_FILEPATH = 'solutions.txt'

## Load metadata

In [2]:
# Parse the claims metadata file into Python objects.
# The encoding is pinned to UTF-8 because the claimant names contain non-ASCII
# characters (curly quotes); relying on the platform default encoding can
# raise UnicodeDecodeError or corrupt those names on some systems.
with open(METADATA_FILEPATH, 'r', encoding='utf-8') as metadata_file:
    claims = json.load(metadata_file)

In [3]:
# Wrap the parsed JSON in a DataFrame so the claims can be explored with pandas.
claims_df = pd.DataFrame(claims)

In [4]:
# Preview the first ten rows to check which columns were loaded.
claims_df.head(10)

Unnamed: 0,claim,claimant,date,label,related_articles,id
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,"[122094, 122580, 130685, 134765]",0
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,2,"[106868, 127320, 128060]",1
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,1,"[132130, 132132, 149722]",4
3,In 1988 author Roald Dahl penned an open lette...,,2019-02-04,2,"[123254, 123418, 127464]",5
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6
5,"Rhode Island is ""almost dead last"" among North...",Leonidas Raptakis,2014-02-11,2,"[8284, 3768, 20091, 82368, 73148, 4493]",7
6,The poorest counties in the U.S. are in Appala...,Jim Webb,2014-11-19,1,"[70709, 70708]",8
7,Koch Industries paid the legal fees of George ...,,2013-07-18,0,"[120591, 120592, 127866, 129483]",9
8,"""Minnesota, Michigan, Iowa already have 70 mph...",Robin Vos,2013-08-22,1,"[69547, 80095, 7994, 81116, 77621]",11
9,"""FBI Uniform Crime Report for 2016 shows more ...",Nick Schroer,2017-10-17,1,"[72012, 26005, 43481, 55671]",12


## Claimants

In [5]:
# Keep only the columns needed to relate each claimant to the labels of their
# claims. Explicit column selection fails loudly (KeyError) if a column name is
# wrong, whereas building the frame through the DataFrame constructor would
# silently create an all-NaN column. The .copy() makes this frame independent
# of claims_df, so later edits here can never leak back into the raw data.
claimaints = claims_df[['claim', 'claimant', 'label']].copy()
claimaints

Unnamed: 0,claim,claimant,label
0,A line from George Orwell's novel 1984 predict...,,0
1,Maine legislature candidate Leslie Gibson insu...,,2
2,A 17-year-old girl named Alyssa Carson is bein...,,1
3,In 1988 author Roald Dahl penned an open lette...,,2
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2
...,...,...,...
15550,"The omnibus spending bill has ""9,427 pork barr...",John McCain,2
15551,Representative Maxine Waters said Muslims were...,,0
15552,"""We were not, I repeat, were not told that wat...",Nancy Pelosi,0
15553,"As of August 2017, members of the public could...",,2


In [6]:
# Convert empty claimant strings to NaN so they count as missing values.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: that chained in-place form emits a warning on recent
# pandas releases and leaves the frame unchanged when Copy-on-Write is active.
# Re-running this cell is harmless because the replacement is idempotent.
claimaints = claimaints.assign(claimant=claimaints['claimant'].replace('', np.nan))
claimaints

Unnamed: 0,claim,claimant,label
0,A line from George Orwell's novel 1984 predict...,,0
1,Maine legislature candidate Leslie Gibson insu...,,2
2,A 17-year-old girl named Alyssa Carson is bein...,,1
3,In 1988 author Roald Dahl penned an open lette...,,2
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2
...,...,...,...
15550,"The omnibus spending bill has ""9,427 pork barr...",John McCain,2
15551,Representative Maxine Waters said Muslims were...,,0
15552,"""We were not, I repeat, were not told that wat...",Nancy Pelosi,0
15553,"As of August 2017, members of the public could...",,2


In [7]:
# Drop every row that has a missing value in any column; with empty claimants
# converted to NaN this removes the claims that have no claimant.
# `how` and `thresh` must not be passed together: recent pandas versions raise
# TypeError when both are supplied, even if `thresh` is None. The remaining
# arguments are the defaults, so the rows that are dropped are unchanged.
# NOTE(review): despite the name, claimants are not de-duplicated here — one
# row per claim is kept.
unique_claimaints = claimaints.dropna(axis=0, how='any')
unique_claimaints

Unnamed: 0,claim,claimant,label
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2
5,"Rhode Island is ""almost dead last"" among North...",Leonidas Raptakis,2
6,The poorest counties in the U.S. are in Appala...,Jim Webb,1
8,"""Minnesota, Michigan, Iowa already have 70 mph...",Robin Vos,1
9,"""FBI Uniform Crime Report for 2016 shows more ...",Nick Schroer,1
...,...,...,...
15548,"Says Aaron Rodgers ""is not the highest tax rat...",Paul Ryan,1
15549,"""They (Clinton and Obama) have never to my kno...",John McCain,0
15550,"The omnibus spending bill has ""9,427 pork barr...",John McCain,2
15552,"""We were not, I repeat, were not told that wat...",Nancy Pelosi,0


In [8]:
# Collect, for every claimant, the list of labels given to their claims.
# Iterating the grouped column yields (claimant, labels) pairs in sorted
# claimant order.
labelled_claimants = {
    claimant: list(labels)
    for claimant, labels in unique_claimaints.groupby('claimant')['label']
}
labelled_claimants

{'"A Woman’s Right to Know Information Material”': [1],
 '"suburban mom" for Scott Taylor': [1],
 '@LagBeachAntifa9': [0],
 '@Sowellnomics': [1, 1, 0],
 '@WhiteHouse': [0],
 'A Facebook page': [0],
 'A Stronger Wisconsin': [1],
 'A.J. Jacobs': [2],
 'AARP': [0],
 'ABC NEWS-US': [0],
 'ACLU Foundation of Georgia': [2],
 'ACLU of North Carolina': [2],
 'AFL-CIO': [1],
 'AFL-CIO of New Jersey': [1],
 'AFP Fact Check': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 'AFP Fact CheckBobby Mazaratti': [0],
 'AFSCME': [1, 0, 1],
 'AFSCME People': [1, 1],
 'Aaron DeGroot': [1],
 'Abdikadir Mohamed': [0],
 'Abia Pulse News': [0],
 'Abigail Spanberger': [1],
 'Abubakar Bwari': [0],
 'ActionAid UK': [1, 1],
 'Activist Mommy': [1],
 'Adam Hasner': [1, 2, 1, 1],
 'Adam Kinzinger': [0, 0],
 'Adam Putnam': [1, 0, 0, 1, 1, 1, 1, 1],
 'Adam Schefter': [1],
 'Adam Schiff': [2, 0],
 'Addicting Information': [0],
 'Aden Duale': [0, 2, 0],
 'Adrian Garcia': [1],
 'African Bro': [0],
 'African Na

In [9]:
def most_frequent_label(labels):
    """Collapse one claimant's labels into a single representative label.

    Args:
        labels: Non-empty list of labels recorded for one claimant.

    Returns:
        The label that occurs most often. When several labels tie for the
        highest count, the smallest label present in ``labels`` is returned.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    counts = Counter(labels)
    highest_count = max(counts.values())
    modes = [label for label, count in counts.items() if count == highest_count]
    if len(modes) == 1:
        return modes[0]
    # Tie: fall back to the smallest label in the list. This used to be done by
    # catching the error statistics.mode raised for multimodal data, but since
    # Python 3.8 mode() returns the first mode instead of raising, so that
    # fallback silently stopped running. Counting explicitly gives the same
    # answer on every Python version and avoids a bare `except`.
    # NOTE(review): this is the minimum over all labels, not only the tied
    # ones — kept to match the previously recorded outputs; confirm intent.
    return min(labels)


single_label_claimants = {
    claimant: most_frequent_label(labels)
    for claimant, labels in labelled_claimants.items()
}

single_label_claimants

{'"A Woman’s Right to Know Information Material”': 1,
 '"suburban mom" for Scott Taylor': 1,
 '@LagBeachAntifa9': 0,
 '@Sowellnomics': 1,
 '@WhiteHouse': 0,
 'A Facebook page': 0,
 'A Stronger Wisconsin': 1,
 'A.J. Jacobs': 2,
 'AARP': 0,
 'ABC NEWS-US': 0,
 'ACLU Foundation of Georgia': 2,
 'ACLU of North Carolina': 2,
 'AFL-CIO': 1,
 'AFL-CIO of New Jersey': 1,
 'AFP Fact Check': 0,
 'AFP Fact CheckBobby Mazaratti': 0,
 'AFSCME': 1,
 'AFSCME People': 1,
 'Aaron DeGroot': 1,
 'Abdikadir Mohamed': 0,
 'Abia Pulse News': 0,
 'Abigail Spanberger': 1,
 'Abubakar Bwari': 0,
 'ActionAid UK': 1,
 'Activist Mommy': 1,
 'Adam Hasner': 1,
 'Adam Kinzinger': 0,
 'Adam Putnam': 1,
 'Adam Schefter': 1,
 'Adam Schiff': 0,
 'Addicting Information': 0,
 'Aden Duale': 0,
 'Adrian Garcia': 1,
 'African Bro': 0,
 'African National Congess': 1,
 'African National Congress': 0,
 'Afrikan Daily': 0,
 'Aftab Pureval': 1,
 'Ahmednasir Abdullahi': 0,
 'Ainsley Earhardt': 0,
 'Airline Ambassadors International

In [10]:
# Persist the claimant -> label mapping. The context manager guarantees the
# file is flushed and closed even if pickling fails; passing a bare open(...)
# call to pickle.dump leaves the handle open until garbage collection.
with open("preprocessing/labelled_claimants.p", "wb") as pickle_file:
    pickle.dump(single_label_claimants, pickle_file)

## Top Claimants

In [11]:
# Count the claims made by each claimant, most frequent first. The empty
# string (claims with no claimant) is counted like any other value.
top_claimants = claims_df['claimant'].value_counts()
# value_counts() returns a Series, which has no columns to rename; the column
# names are set while converting it to a two-column DataFrame instead.
top_claimants_df = top_claimants.rename_axis('claimant').reset_index(name='claims')
top_claimants_df

Unnamed: 0,claimant,claims
0,,4962
1,Donald Trump,1233
2,Bloggers,372
3,Barack Obama,234
4,Hillary Clinton,220
...,...,...
3100,Florida Strong,1
3101,Americans United for Change,1
3102,National Association of Realtors,1
3103,Igor Korotchenko,1


## False Claimants

In [12]:
# Claims labelled 0 (false), then the number of such claims per claimant.
false_claims = claims_df[claims_df['label'].eq(0)]
false_claimants = false_claims['claimant'].value_counts()
false_claimants_df = (
    false_claimants
    .rename_axis('claimant')
    .reset_index(name='claims')
)

In [13]:
# Ten claimants with the most claims labelled 0; the empty claimant is included.
false_claimants_df.head(10)

Unnamed: 0,claimant,claims
0,,3034
1,Donald Trump,821
2,Bloggers,309
3,Viral image,115
4,Various websites,106
5,Facebook posts,79
6,multiple sources,69
7,Chain email,61
8,Facebook user,58
9,Hillary Clinton,47


In [14]:
# Keep claimants having at least one claim labelled 0.
has_false_claims = false_claimants_df['claims'].gt(0)
top_false_claimants = false_claimants_df.loc[has_false_claims]

In [15]:
# Display the claimant counts that passed the filter.
top_false_claimants

Unnamed: 0,claimant,claims
0,,3034
1,Donald Trump,821
2,Bloggers,309
3,Viral image,115
4,Various websites,106
...,...,...
1449,Someonesbones,1
1450,Evelyn Sanguinetti,1
1451,Joseph Henchman,1
1452,Kay Bailey Hutchison,1


In [16]:
# Tag every named claimant that has a false claim with label 0.
# The empty claimant is excluded by value rather than by dropping the first
# row: a positional drop would discard a real claimant whenever the empty
# string is not the most frequent entry. The original index is preserved.
has_claimant_name = top_false_claimants['claimant'] != ''
labelled_false_claimants = (
    top_false_claimants
    .loc[has_claimant_name, ['claimant']]
    .assign(label=0)
)
labelled_false_claimants

Unnamed: 0,claimant,label
1,Donald Trump,0
2,Bloggers,0
3,Viral image,0
4,Various websites,0
5,Facebook posts,0
...,...,...
1449,Someonesbones,0
1450,Evelyn Sanguinetti,0
1451,Joseph Henchman,0
1452,Kay Bailey Hutchison,0


## Partly True Claimants

In [17]:
# Claims labelled 1 (partly true), then the number of such claims per claimant.
partly_claims = claims_df[claims_df['label'].eq(1)]
partly_claimants = partly_claims['claimant'].value_counts()
partly_claimants_df = (
    partly_claimants
    .rename_axis('claimant')
    .reset_index(name='claims')
)

In [18]:
# Ten claimants with the most claims labelled 1; the empty claimant is included.
partly_claimants_df.head(10)

Unnamed: 0,claimant,claims
0,,1287
1,Donald Trump,387
2,Barack Obama,150
3,Hillary Clinton,132
4,Bernie Sanders,74
5,Marco Rubio,65
6,Ted Cruz,64
7,Scott Walker,59
8,Bloggers,59
9,Rick Scott,57


In [19]:
# Keep claimants having at least one claim labelled 1.
has_partly_claims = partly_claimants_df['claims'].gt(0)
top_partly_claimants = partly_claimants_df.loc[has_partly_claims]

In [20]:
# Display the claimant counts that passed the filter.
top_partly_claimants

Unnamed: 0,claimant,claims
0,,1287
1,Donald Trump,387
2,Barack Obama,150
3,Hillary Clinton,132
4,Bernie Sanders,74
...,...,...
1899,Aaron DeGroot,1
1900,Mark Emmert,1
1901,Unite Here Florida PAC,1
1902,Laurence Tribe,1


In [21]:
# Tag every named claimant that has a partly-true claim with label 1.
# The empty claimant is excluded by value rather than by dropping the first
# row: a positional drop would discard a real claimant whenever the empty
# string is not the most frequent entry. The original index is preserved.
has_claimant_name = top_partly_claimants['claimant'] != ''
labelled_partly_claimants = (
    top_partly_claimants
    .loc[has_claimant_name, ['claimant']]
    .assign(label=1)
)
labelled_partly_claimants

Unnamed: 0,claimant,label
1,Donald Trump,1
2,Barack Obama,1
3,Hillary Clinton,1
4,Bernie Sanders,1
5,Marco Rubio,1
...,...,...
1899,Aaron DeGroot,1
1900,Mark Emmert,1
1901,Unite Here Florida PAC,1
1902,Laurence Tribe,1


## True Claimants

In [22]:
# Claims labelled 2 (true), then the number of such claims per claimant.
true_claims = claims_df[claims_df['label'].eq(2)]
true_claimants = true_claims['claimant'].value_counts()
true_claimants_df = (
    true_claimants
    .rename_axis('claimant')
    .reset_index(name='claims')
)

In [23]:
# Ten claimants with the most claims labelled 2; the empty claimant is included.
true_claimants_df.head(10)

Unnamed: 0,claimant,claims
0,,641
1,Barack Obama,42
2,Hillary Clinton,41
3,Donald Trump,25
4,President Cyril Ramaphosa,16
5,John McCain,16
6,Scott Walker,13
7,Bernie Sanders,12
8,Jeb Bush,12
9,Rick Perry,11


In [24]:
# Keep claimants having at least one claim labelled 2.
has_true_claims = true_claimants_df['claims'].gt(0)
top_true_claimants = true_claimants_df.loc[has_true_claims]

In [25]:
# Display the claimant counts that passed the filter.
top_true_claimants

Unnamed: 0,claimant,claims
0,,641
1,Barack Obama,42
2,Hillary Clinton,41
3,Donald Trump,25
4,President Cyril Ramaphosa,16
...,...,...
617,Texas State Employees Union,1
618,Kirk Cox,1
619,Aden Duale,1
620,John Plumb,1


In [26]:
# Tag every named claimant that has a true claim with label 2.
# The empty claimant is excluded by value rather than by dropping the first
# row: a positional drop would discard a real claimant whenever the empty
# string is not the most frequent entry. The original index is preserved.
has_claimant_name = top_true_claimants['claimant'] != ''
labelled_true_claimants = (
    top_true_claimants
    .loc[has_claimant_name, ['claimant']]
    .assign(label=2)
)
labelled_true_claimants

Unnamed: 0,claimant,label
1,Barack Obama,2
2,Hillary Clinton,2
3,Donald Trump,2
4,President Cyril Ramaphosa,2
5,John McCain,2
...,...,...
617,Texas State Employees Union,2
618,Kirk Cox,2
619,Aden Duale,2
620,John Plumb,2


## Labelled Claimants

In [27]:
# claimants_df1 = labelled_false_claimants
# claimants_df1

In [28]:
# claimants_df2 = claimants_df1.append(labelled_partly_claimants, ignore_index=True)
# claimants_df2

In [29]:
# claimants_df3 = claimants_df2.append(labelled_true_claimants, ignore_index=True)
# claimants_df3

In [30]:
# unique_labelled_claimants = claimants_df3.drop_duplicates(subset=['claimant'], keep='first')
# unique_labelled_claimants

In [31]:
# labelled_claimants = unique_labelled_claimants.groupby('claimant')['label'].apply(list).to_dict()
# labelled_claimants
# pickle.dump( labelled_claimants, open( "labelled_claimants.p", "wb" ) )

In [32]:
# labelled_claimants = claimants_df3.groupby('claimant')['label'].apply(list).to_dict()

In [33]:
# labelled_claimants

In [34]:
# # Assign the most frequent label to each claimant, or the lowest label value in the event of a tie

# from statistics import mode

# single_label_claimants = {}

# for claimant, label in labelled_claimants.items():
#     try:
#         single_label_claimants[claimant] = mode(label)
#     except:
#         single_label_claimants[claimant] = min(label)

In [35]:
# single_label_claimants

In [36]:
# pickle.dump( single_label_claimants, open( "labelled_claimants.p", "wb" ) )