### Imports and Reading in Data

In [21]:
import pandas as pd
import numpy as np
import csv
import io

In [22]:
# Tab-separated training split. It is read without a header row, so pandas
# assigns integer column labels until they are renamed.
data_path = "public_data_multi_fc/train.tsv"

# csv.QUOTE_NONE makes the parser treat quote characters as ordinary text
# rather than as field quoting.
data = pd.read_csv(
    data_path,
    header=None,
    sep='\t',
    quoting=csv.QUOTE_NONE,
)


In [80]:
# Give the 13 positional columns of the TSV readable names.
data.columns = [
    'claimID',
    'claim',
    'label',
    'claimURL',
    'reason',
    'categories',
    'speaker',
    'checker',
    'tags',
    'articleTitle',
    'publishDate',
    'claimDate',
    'entities',
]
# Print the number of rows, then show the first few as the cell output.
print(len(data))
data.head()

27940


Unnamed: 0,claimID,claim,label,claimURL,reason,categories,speaker,checker,tags,articleTitle,publishDate,claimDate,entities
0,pomt-03627,"""Six out of 10 of the highest unemployment rat...",half-true,/ohio/statements/2013/may/06/chris-redfern/ohi...,When a couple of Statehouse Republicans prepar...,,Chris Redfern,,,,2013-05-06T06:00:00,2013-04-30,['None']
1,pomt-09611,"""No Democratic campaign for (Fla.) governor ha...",true,/florida/statements/2010/jan/15/alex-sink/flor...,Florida's leading Republican candidate for gov...,,Alex Sink,,,,2010-01-15T13:59:00,2010-01-06,['None']
2,tron-00214,Forward an email for Jasmine,fiction!,https://www.truthorfiction.com/jasmine/,,9-11-attack,,,,Forward an email for Jasmine,"Mar 17, 2015",,['None']
3,snes-04484,Pope Francis endorsed Donald Trump for president.,false,https://www.snopes.com/fact-check/pope-francis...,,Junk News,,Dan Evon,,"Pope Francis Shocks World, Endorses Donald Tru...",10 July 2016,,['None']
4,pomt-06704,Says Ron Paul insisted FEMA should be shut down.,true,/texas/statements/2011/sep/03/maureen-dowd/mau...,Commenting on the federal response to Hurrican...,,Maureen Dowd,,,,2011-09-03T06:00:00,2011-08-30,['None']


### Pre Processing Examples for BERT

Here we are preparing each (claim,ranked_doc_snippet) pair as a preliminary test to see what the BERT encodings will look like. 

The `if` in the `except` clause skips claimID `bove-00197`, which has no claim text in the original train.tsv file. It is a one-off case.

In [66]:
# Build one "[CLS] claim [SEP] snippet" string per (claim, snippet) pair.
# Each claimID names a tab-separated file under snippets/; the third field of
# every line in that file is used as the snippet text.
pre_instances = []
# Walk the two columns in lockstep instead of rebuilding list(data.claim) and
# list(data.claimID) on every iteration.
for claim, claimID in zip(data.claim, data.claimID):
    try:
        # The context manager closes the snippets file, including when a line
        # fails to parse.
        with open("public_data_multi_fc/snippets/{claimID}".format(claimID=claimID), "r") as f:
            for line in f:
                split = line.split("\t")
                pre_instance = "[CLS] "+ claim +" [SEP] "+split[2]
                pre_instances.append(pre_instance)
    except FileNotFoundError:
        # No snippets file for this claim: fall back to the claim on its own.
        # "bove-00197" is deliberately left out of the fallback.
        if claimID != "bove-00197":
            pre_instance = "[CLS] "+ claim
            pre_instances.append(pre_instance)
    

In [73]:
# Build one ["[CLS] claim [SEP] snippet", label] pair per (claim, snippet).
# Each claimID names a tab-separated file under snippets/; the third field of
# every line in that file is used as the snippet text.
pre_instances = []
# Walk the three columns in lockstep instead of rebuilding list(data.claim),
# list(data.claimID) and list(data.label) on every iteration.
for claim, claimID, label in zip(data.claim, data.claimID, data.label):
    try:
        # The context manager closes the snippets file, including when a line
        # fails to parse.
        with open("public_data_multi_fc/snippets/{claimID}".format(claimID=claimID), "r") as f:
            for line in f:
                split = line.split("\t")
                pre_instance = "[CLS] "+ claim +" [SEP] "+split[2]
                pre_instances.append([pre_instance,label])
    except FileNotFoundError:
        # No snippets file for this claim: fall back to the claim on its own.
        # "bove-00197" is deliberately left out of the fallback.
        if claimID != "bove-00197":
            pre_instance = "[CLS] "+ claim
            pre_instances.append([pre_instance,label])
    

In [79]:
# Show the most recently appended instance as a sanity check.
pre_instances[-1]

['[CLS] "Obama Announces Bid To Become UN Secretary General" [SEP] Feb 28, 2018 ... Former President Barack Obama never campaigned to become U.N. secretary- general, despite a false story claiming he had announced a bid\xa0...',
 'pants on fire!']

### Current Issues, Questions, and Thoughts for Later On

In [None]:
# Free-form project notes kept in an unassigned string literal; nothing below is used by other cells.
'''
ISSUES
(1) some claims dont have supporting evidence
    3755 (13.4%) of claims don't have evidence
        
QUESTIONS
(1) where/how do we incorporate labels
(2) max sentence length 

STUFF FOR LATER
(1)instead of (claim,snippet) encodings, we could have claim encoded + dictionary
of key(claim), value(array of snippets) to make concatenation simpler?

CURRENT GOAL
trying to translate prep_instance from tokenization.py for our purposes
'''

### Implementing BERT Tokenizer on Training Data

In [71]:
# NOTE(review): BertTokenizer is not imported in the visible import cell --
# presumably it comes from the `transformers` package via a cell that is no
# longer in the notebook; confirm and add it to the import cell so that
# Restart & Run All works.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
# Display the tokenizer's padding token as the cell output.
tokenizer.pad_token

I0407 16:42:03.654283 4629784000 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/Aren/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


'[PAD]'

The following is adapted from https://github.com/DAlkemade/bert-for-fever/tree/master/bert_for_fever/tokenization

In [89]:
# Fixed length that every encoded example is padded (or clamped) to.
MAX_SENTENCE_LENGTH= 512
# Segment id written into the padded positions of token_type_ids.
PADDING_TOKEN_TYPE_ID = 0
# Vocabulary id of the tokenizer's padding token.
pad_token = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]

# NOTE(review): InputFeatures is not imported in the visible import cell --
# presumably it comes from `transformers`; confirm and add it to the import cell.
# NOTE(review): instance[0] already contains literal "[CLS]" / "[SEP]" markers
# (added when pre_instances was built) and add_special_tokens=True asks the
# tokenizer to add its own -- confirm the encoded sequence does not end up with
# duplicated special tokens.
# NOTE(review): instance[1] is passed through as the raw label value -- confirm
# whether it needs mapping to an integer class id before training.

# One InputFeatures per [text, label] pair in pre_instances.
features = []
for instance in pre_instances:
    encodings = tokenizer.encode_plus(instance[0], add_special_tokens=True,max_length=MAX_SENTENCE_LENGTH)
    # Clamp explicitly. If encode_plus ever returns more than
    # MAX_SENTENCE_LENGTH ids, padding_length below becomes negative,
    # `[x] * negative` is an empty list, and the example would pass through
    # unpadded and over-length. Slicing is a no-op when the tokenizer has
    # already truncated (whether it does presumably depends on the installed
    # transformers version -- confirm).
    input_ids = encodings["input_ids"][:MAX_SENTENCE_LENGTH]
    token_type_ids = encodings["token_type_ids"][:MAX_SENTENCE_LENGTH]

    # Real tokens get attention 1; padding appended below gets 0.
    attention_mask = [1] * len(input_ids)
    # Pad on the right
    padding_length = MAX_SENTENCE_LENGTH - len(input_ids)
    # The next 3 lines are taken from the example at https://github.com/huggingface/transformers/blob/0cb163865a4c761c226b151283309eedb2b1ca4d/transformers/data/processors/glue.py#L30
    input_ids = input_ids + ([pad_token] * padding_length)
    # We mask padding with 0
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([PADDING_TOKEN_TYPE_ID] * padding_length)
    feature = InputFeatures(input_ids=input_ids, attention_mask=attention_mask,
                         token_type_ids=token_type_ids, label=instance[1])
    features.append(feature)


W0407 17:10:20.279006 4629784000 tokenization_utils.py:677] Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors
W0407 17:16:49.113275 4629784000 tokenization_utils.py:677] Token indices sequence length is longer than the specified maximum sequence length for this model (1245 > 512). Running this sequence through the model will result in indexing errors
W0407 17:18:52.421155 4629784000 tokenization_utils.py:677] Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors


In [90]:
# Inspect the first encoded example to check the padded ids and masks.
features[0]

{
  "attention_mask": [
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
   