### Imports and Reading in Data

In [143]:
import pandas as pd
import numpy as np
import csv
import io

In [144]:
# Training split: tab-separated, read without a header row and with quote
# characters treated as ordinary text (csv.QUOTE_NONE).
data_path = "public_data_multi_fc/train.tsv"

data = pd.read_csv(
    data_path,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    header=None,
)


In [145]:
# Name the 13 positional columns of the headerless frame.
data.columns = [
    'claimID', 'claim', 'label', 'claimURL', 'reason',
    'categories', 'speaker', 'checker', 'tags',
    'articleTitle', 'publishDate', 'claimDate', 'entities',
]
# Print the row count, then preview the first six rows.
print(len(data))
data.head(6)

27940


Unnamed: 0,claimID,claim,label,claimURL,reason,categories,speaker,checker,tags,articleTitle,publishDate,claimDate,entities
0,pomt-03627,"""Six out of 10 of the highest unemployment rat...",half-true,/ohio/statements/2013/may/06/chris-redfern/ohi...,When a couple of Statehouse Republicans prepar...,,Chris Redfern,,,,2013-05-06T06:00:00,2013-04-30,['None']
1,pomt-09611,"""No Democratic campaign for (Fla.) governor ha...",true,/florida/statements/2010/jan/15/alex-sink/flor...,Florida's leading Republican candidate for gov...,,Alex Sink,,,,2010-01-15T13:59:00,2010-01-06,['None']
2,tron-00214,Forward an email for Jasmine,fiction!,https://www.truthorfiction.com/jasmine/,,9-11-attack,,,,Forward an email for Jasmine,"Mar 17, 2015",,['None']
3,snes-04484,Pope Francis endorsed Donald Trump for president.,false,https://www.snopes.com/fact-check/pope-francis...,,Junk News,,Dan Evon,,"Pope Francis Shocks World, Endorses Donald Tru...",10 July 2016,,['None']
4,pomt-06704,Says Ron Paul insisted FEMA should be shut down.,true,/texas/statements/2011/sep/03/maureen-dowd/mau...,Commenting on the federal response to Hurrican...,,Maureen Dowd,,,,2011-09-03T06:00:00,2011-08-30,['None']
5,pomt-15232,"Says Hillary Clinton has ""been in office and i...",mostly false,/truth-o-meter/statements/2015/aug/07/marco-ru...,"Sen. Marco Rubio, R-Fla., says the electorate ...",,Marco Rubio,,,,2015-08-07T00:55:38,2015-08-06,['None']


### Preprocessing Examples for BERT

Here we prepare each (claim, ranked_doc_snippet) pair as a single `[CLS] claim [SEP] snippet` string, as a preliminary test to see what the BERT encodings will look like.

In [160]:
# Build one "[CLS] claim [SEP] snippet" string per (claim, snippet) pair.
# Snippets for a claim are read from public_data_multi_fc/snippets/<claimID>;
# each line is split on tabs and the field at index 2 is used as the snippet text.
pre_instances = []
# Number of claims for which no snippet file was found.
count = 0
# Walk the two columns in lockstep. The previous version rebuilt
# list(data.claim) and list(data.claimID) on every iteration, which made the
# loop quadratic in the number of claims.
for claim, claimID in zip(data.claim, data.claimID):
    snippet_path = "public_data_multi_fc/snippets/{claimID}".format(claimID=claimID)
    try:
        # The context manager closes the handle even if a line fails to parse;
        # the previous version never closed the file (one leaked handle per claim).
        with open(snippet_path, "r") as f:
            for line in f:
                split = line.split("\t")
                pre_instance = "[CLS] " + claim + " [SEP] " + split[2]
                pre_instances.append(pre_instance)
    except FileNotFoundError:
        # Claim has no snippet file: count it and move on.
        count += 1
    

In [161]:
# Inspect the first prepared "[CLS] claim [SEP] snippet" string.
pre_instances[0]

'[CLS] "Six out of 10 of the highest unemployment rates are also in so-called right to work states." [SEP] May 8, 2013 ... Ron Maag and Kristina Roegner, claiming that "six out of 10 of the highest  unemployment rates are also in so-called right to work states.'

### Current Issues, Questions, and Thoughts for Later On

In [None]:
'''
ISSUES
(1) some claims don't have supporting evidence
    3755 (13.4%) of claims don't have evidence
        
QUESTIONS
(1) where/how do we incorporate labels
(2) max sentence length 

STUFF FOR LATER
(1)instead of (claim,snippet) encodings, we could have claim encoded + dictionary
of key(claim), value(array of snippets) to make concatenation simpler?

CURRENT GOAL
trying to translate prep_instance from tokenization.py for our purposes
'''

### Implementing BERT Tokenizer

In [203]:
from transformers import InputFeatures, BertTokenizer

In [198]:
# Load the pretrained 'bert-base-uncased' tokenizer with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
# Display the tokenizer's padding token.
tokenizer.pad_token

I0405 17:40:39.496809 4524725696 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/Aren/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


'[PAD]'

In [201]:
# NOTE(review): defined but not yet passed to encode_plus below; the maximum
# length to use is still undecided, so no truncation is applied here.
MAX_SENTENCE_LENGTH = 20

# Each pre_instances entry already embeds literal "[CLS]" / "[SEP]" markers.
# Encoding that string with add_special_tokens=True makes the tokenizer add its
# own markers on top of them (a doubled [CLS]) and puts every token in segment 0.
# Recover the two halves and pass them as a text pair instead, so the tokenizer
# inserts the markers exactly once and assigns distinct token_type_ids to the
# claim and the snippet.
first_claim, _, first_snippet = pre_instances[0].partition(" [SEP] ")
if first_claim.startswith("[CLS] "):
    first_claim = first_claim[len("[CLS] "):]

encodings = tokenizer.encode_plus(first_claim, first_snippet, add_special_tokens=True)


In [None]:
# Show the encode_plus result computed for the first prepared instance.
encodings