# Set up

In [37]:
import pandas as pd
import numpy as np
import json
import pickle

# Load training data

In [38]:
# Path of the pickled training set, resolved against the current working directory.
train_set_path = 'train_set.pkl'
# Unpickling can execute arbitrary code, so this file must come from a trusted source.
df = pd.read_pickle(train_set_path)

# Claimant classifier

In [39]:
# Build a frame restricted to the claim, claimant and label columns of `df`.
claimant_columns = ['claim', 'claimant', 'label']
claimaints = pd.DataFrame(df, columns=claimant_columns)
claimaints

Unnamed: 0,claim,claimant,label
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,1
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,0
11035,Says Target installed urinals in a women’s bat...,Facebook posts,0
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,0
11354,: The AMBER Alert system has been discontinu...,,0
...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,1
6096,"A photograph captures Harriet Tubman as a ""Gun...",,0
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,0
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,1


In [40]:
# Turn empty-string claimants into NaN so NaN-aware operations treat them as missing.
# The result is assigned back to the column rather than using
# `claimaints['claimant'].replace(..., inplace=True)`: that chained in-place call
# operates on a temporary Series, emits a warning on recent pandas, and leaves the
# frame unchanged under Copy-on-Write.
claimaints['claimant'] = claimaints['claimant'].replace('', np.nan)
claimaints

Unnamed: 0,claim,claimant,label
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,1
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,0
11035,Says Target installed urinals in a women’s bat...,Facebook posts,0
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,0
11354,: The AMBER Alert system has been discontinu...,,0
...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,1
6096,"A photograph captures Harriet Tubman as a ""Gun...",,0
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,0
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,1


In [41]:
# Keep only the rows with no missing value in any column. These are the dropna
# defaults (axis=0, how='any', returning a new frame), so no arguments are needed.
# Passing `how` and `thresh` together, as the earlier call did, is rejected with a
# TypeError by recent pandas versions.
# Note: despite the name, rows are not de-duplicated here; only rows with NaN are removed.
unique_claimaints = claimaints.dropna()
unique_claimaints

Unnamed: 0,claim,claimant,label
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,1
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,0
11035,Says Target installed urinals in a women’s bat...,Facebook posts,0
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,0
8662,"Judge Gonzalo Curiel ""is giving us very unfair...",Donald Trump,0
...,...,...,...
5974,"""We have right now, … in the state of Californ...",Steve Cortes,1
11698,"""Transgender individuals in the U.S. have a 1-...",Garnet Coleman,0
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,1
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,0


In [42]:
# For every claimant, gather the labels of all of their claims into a list,
# then expose the result as a plain {claimant: [labels...]} dictionary.
labels_per_claimant = unique_claimaints.groupby('claimant')['label'].apply(list)
labelled_claimants = labels_per_claimant.to_dict()
labelled_claimants

{'"A Woman’s Right to Know Information Material”': [1],
 '"suburban mom" for Scott Taylor': [1],
 '@LagBeachAntifa9': [0],
 '@Sowellnomics': [1, 0],
 'A Facebook page': [0],
 'A Stronger Wisconsin': [1],
 'AARP': [0],
 'ABC NEWS-US': [0],
 'ACLU Foundation of Georgia': [2],
 'AFL-CIO': [1],
 'AFL-CIO of New Jersey': [1],
 'AFP Fact Check': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 'AFSCME': [1, 1, 0],
 'AFSCME People': [1, 1],
 'Aaron DeGroot': [1],
 'Abdikadir Mohamed': [0],
 'Abia Pulse News': [0],
 'Abigail Spanberger': [1],
 'Abubakar Bwari': [0],
 'ActionAid UK': [1],
 'Activist Mommy': [1],
 'Adam Hasner': [1, 1, 1, 2],
 'Adam Kinzinger': [0],
 'Adam Putnam': [1, 1, 0, 1],
 'Adam Schefter': [1],
 'Adam Schiff': [2, 0],
 'Addicting Information': [0],
 'Aden Duale': [0, 0],
 'Adrian Garcia': [1],
 'African National Congess': [1],
 'African National Congress': [2, 0, 0, 0],
 'Afrikan Daily': [0, 0, 0, 0],
 'Ahmednasir Abdullahi': [0, 0],
 'Ainsley Earhardt': [0],
 'Airline Amb

In [43]:
# Collapse each claimant's list of labels into a single value: the mean label,
# rounded to an integer.
# Note: round() sends exact halves to the nearest even integer (0.5 -> 0, 1.5 -> 2).
from statistics import mode  # imported here but not used in this cell

single_label_claimants = {
    claimant: round(sum(labels) / len(labels))
    for claimant, labels in labelled_claimants.items()
}

single_label_claimants

{'"A Woman’s Right to Know Information Material”': 1,
 '"suburban mom" for Scott Taylor': 1,
 '@LagBeachAntifa9': 0,
 '@Sowellnomics': 0,
 'A Facebook page': 0,
 'A Stronger Wisconsin': 1,
 'AARP': 0,
 'ABC NEWS-US': 0,
 'ACLU Foundation of Georgia': 2,
 'AFL-CIO': 1,
 'AFL-CIO of New Jersey': 1,
 'AFP Fact Check': 0,
 'AFSCME': 1,
 'AFSCME People': 1,
 'Aaron DeGroot': 1,
 'Abdikadir Mohamed': 0,
 'Abia Pulse News': 0,
 'Abigail Spanberger': 1,
 'Abubakar Bwari': 0,
 'ActionAid UK': 1,
 'Activist Mommy': 1,
 'Adam Hasner': 1,
 'Adam Kinzinger': 0,
 'Adam Putnam': 1,
 'Adam Schefter': 1,
 'Adam Schiff': 1,
 'Addicting Information': 0,
 'Aden Duale': 0,
 'Adrian Garcia': 1,
 'African National Congess': 1,
 'African National Congress': 0,
 'Afrikan Daily': 0,
 'Ahmednasir Abdullahi': 0,
 'Ainsley Earhardt': 0,
 'Airline Ambassadors International': 0,
 'Al Franken': 0,
 'Al Gore': 1,
 'Al Jazeera': 0,
 'Al Jazeera America': 0,
 'Alan  Essig': 2,
 'Alan Grayson': 1,
 'Alberta Darling': 1,


In [44]:
# Persist the {claimant: label} mapping. The context manager guarantees the file
# handle is closed (and its buffer flushed) even if pickle.dump raises; the bare
# open(...) call used before never closed the handle explicitly.
with open("../input/train_claimants.p", "wb") as claimants_file:
    pickle.dump(single_label_claimants, claimants_file)

# Related articles classifier

In [45]:
# Flatten the per-claim article lists into (article, claim id, claim label) tuples,
# one tuple for every entry in a row's 'related_articles'.
related_articles = [
    (article_id, claim_row['id'], claim_row['label'])
    for _, claim_row in df.iterrows()
    for article_id in claim_row['related_articles']
]

In [46]:
# Tabulate the (article, claim id, label) tuples. Naming the columns in the
# constructor, instead of assigning `.columns` afterwards, also works when
# `related_articles` is empty: the result is then an empty frame with these three
# columns rather than a length-mismatch ValueError.
related_articles_df = pd.DataFrame(
    related_articles, columns=['article_id', 'claim_id', 'label']
)
related_articles_df

Unnamed: 0,article_id,claim_id,label
0,34218,10354,1
1,55700,10354,1
2,18736,10354,1
3,39031,10354,1
4,34219,10354,1
...,...,...,...
62323,61,5966,1
62324,69968,5966,1
62325,96477,5966,1
62326,120293,7328,1


In [47]:
# For every article id, gather the labels of all claims that list the article,
# then expose the result as a plain {article_id: [labels...]} dictionary.
labels_per_article = related_articles_df.groupby('article_id')['label'].apply(list)
article_id_labels = labels_per_article.to_dict()
article_id_labels

{2: [0],
 8: [0],
 15: [2],
 18: [1, 1],
 19: [1],
 21: [1, 1],
 22: [1, 0],
 23: [1, 1],
 31: [0],
 32: [0],
 33: [0],
 34: [0],
 35: [1],
 39: [1],
 40: [1],
 41: [1],
 42: [0],
 43: [0, 0],
 57: [1],
 61: [1],
 66: [0],
 67: [0],
 70: [0],
 73: [0],
 82: [1, 2],
 84: [1],
 88: [0],
 89: [2],
 90: [0, 0, 0, 0],
 92: [0],
 93: [1],
 94: [2],
 97: [1],
 98: [0],
 100: [1, 1],
 101: [0],
 105: [1],
 111: [0],
 114: [0],
 117: [0],
 119: [1],
 120: [1],
 121: [1],
 122: [0],
 123: [1],
 125: [0],
 127: [0],
 129: [1],
 131: [0],
 132: [1],
 134: [0],
 137: [1],
 138: [1],
 142: [1],
 145: [1],
 146: [1],
 148: [1],
 149: [1],
 151: [0],
 152: [1, 0],
 153: [1],
 154: [0],
 155: [1],
 156: [1],
 157: [1],
 158: [1, 0],
 159: [1],
 160: [1],
 161: [0],
 162: [1],
 163: [0],
 164: [0],
 168: [0],
 171: [0, 0, 1],
 172: [0],
 173: [0],
 175: [0, 0],
 176: [1],
 177: [0],
 178: [1],
 179: [0],
 180: [1],
 181: [0],
 183: [1],
 184: [1],
 185: [0, 0, 0, 0],
 187: [1],
 190: [0],
 200: [0],
 20

In [48]:
# Assign each article a single label: the mean of the labels it appears with,
# rounded to an integer.
# Note: round() sends exact halves to the nearest even integer (0.5 -> 0, 1.5 -> 2).

from statistics import mode  # imported here but not used in this cell

article_id_single_label = {
    article_id: round(sum(labels) / len(labels))
    for article_id, labels in article_id_labels.items()
}

In [49]:
# Persist the {article_id: label} mapping. The context manager guarantees the file
# handle is closed (and its buffer flushed) even if pickle.dump raises; the bare
# open(...) call used before never closed the handle explicitly.
with open("../input/train_article_ids.p", "wb") as article_ids_file:
    pickle.dump(article_id_single_label, article_ids_file)

# Analyze claim text

In [50]:
# import nltk

# claim_text = train_set
# claim_text["claim"] = train_set["claim"].apply(nltk.word_tokenize)

In [51]:
# claim_text

In [52]:
# counts = dict()

# for index, row in claim_text.iterrows():
#     for word in row['claim']:
#         if row['label'] == 0:
#             counts[word] = counts.get(word, 0) + 1

In [53]:
# print(counts)

In [54]:
# false_claims = train_set.loc[train_set['label'] == 0]
# false_claims

In [55]:
# false_counts = dict()

# for index, row in false_claims.iterrows():
#     for word in row['claim']:
#         false_counts[word] = false_counts.get(word, 0) + 1

# false_counts

In [56]:
# partly_claims = train_set.loc[train_set['label'] == 1]
# partly_claims

In [57]:
# partly_counts = dict()

# for index, row in partly_claims.iterrows():
#     for word in row['claim']:
#         partly_counts[word] = partly_counts.get(word, 0) + 1

# partly_counts

In [58]:
# true_claims = train_set.loc[train_set['label'] == 2]
# true_claims

In [59]:
# true_counts = dict()

# for index, row in true_claims.iterrows():
#     for word in row['claim']:
#         true_counts[word] = true_counts.get(word, 0) + 1

# true_counts

In [60]:
# # Merge word dictionaries

# from itertools import chain
# from collections import defaultdict

In [61]:
# vocab = defaultdict(list)
# for word, count in chain(false_counts.items(), partly_counts.items(), true_counts.items()):
#     vocab[word].append(count)

In [62]:
# vocab