In [1]:
import pickle
# Load the pre-processed FEVER records produced by an earlier preprocessing step.
# NOTE(security): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("../fever_processed.pickle", "rb") as _pickle_file:
    processed_data = pickle.load(_pickle_file)

### Data prep for Evidence Accumulation

* The significance of a given statement as evidence to a claim/question is modelled as a classification problem
* Any length of text (such as a sentence from a document) is appended to the claim/question in the usual sentence-pair format: "[CLS]" <claim/question tokens> "[SEP]" <potential evidence tokens> "[SEP]"
* A class is awarded to the combined string based on the following:
    - Class 0, if the evidence tokens do not contribute to answering the question
    - Class 1, if the evidence tokens partially answer the question
    - Class 2, if the evidence tokens completely answer the question
* len(claim) + len(evidence) + 3 should be <= 96 (the 3 accounts for the one "[CLS]" and two "[SEP]" tokens; 96 is the `max_len` set below)

In [2]:
# Fixed length, in tokens, of every model input built by make_data; this count
# includes the [CLS]/[SEP] markers and any [PAD] filler.
max_len = 96

In [3]:
# Parallel accumulators filled by the loop below: lines[i] holds the
# (token ids, segment ids) pair returned by make_data for example i, and
# classes[i] holds that example's integer label.
lines = []
classes = []
import numpy as np
from pytorch_pretrained_bert import BertTokenizer
# Tokenizer whose vocabulary is used in make_data to map token strings to ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def make_data(claim, evidence):
    """Build one fixed-length sentence-pair input from a claim and a candidate evidence.

    The token layout is ``[CLS] claim [SEP] evidence [SEP]`` followed by
    ``[PAD]`` up to ``max_len`` positions.

    Args:
        claim: iterable of token strings for the claim/question
            (presumably already tokenised by the preprocessing step -- verify).
        evidence: iterable of token strings for the candidate evidence text.

    Returns:
        A ``(tokens, segments)`` tuple. ``tokens`` is the result of
        ``tokenizer.convert_tokens_to_ids`` on the ``max_len`` tokens.
        ``segments`` is a float numpy array of shape ``(max_len,)`` that is 0
        over ``[CLS]``, the claim and the first ``[SEP]``, and 1 everywhere
        else (padding positions included).
    """
    claim = list(claim)
    evidence = list(evidence)

    # Three positions are reserved for [CLS] and the two [SEP] markers.
    # Over-long pairs are trimmed here (evidence first, then the claim) so that
    # both separators always survive; slicing the assembled sequence instead
    # would silently cut off the closing [SEP]. Inputs that already fit are
    # left untouched.
    budget = max(max_len - 3, 0)
    claim = claim[:budget]
    evidence = evidence[:budget - len(claim)]

    _ftokens = ["[CLS]"] + claim + ["[SEP]"] + evidence + ["[SEP]"]
    _ftokens.extend(["[PAD]"] * (max_len - len(_ftokens)))
    # Only reachable when max_len < 3; keeps the output length fixed regardless.
    _ftokens = _ftokens[:max_len]

    segments = np.ones((max_len,))
    # Segment 0 covers [CLS] + claim + first [SEP].
    segments[:len(claim) + 2] = 0
    tokens = tokenizer.convert_tokens_to_ids(_ftokens)
    return (tokens, segments)
    
# Expand every record into labelled (claim, sentence) examples, appending the
# encoded pair to `lines` and its label to `classes`.
# NOTE(review): the markdown above describes class 1 as "partially answers" and
# class 2 as "completely answers", yet here a record's only evidentiary sentence
# is labelled 1 and sentences from multi-evidence records are labelled 2 --
# confirm this mapping is the intended one.
for line in processed_data:
    record = line["processed"]
    supporting = record["evidentiary"]
    positive_label = 1 if len(supporting) == 1 else 2
    for evidence in supporting:
        lines.append(make_data(record["claim"], evidence))
        classes.append(positive_label)
    # Every non-evidentiary sentence is a class-0 example for the same claim.
    for evidence in record["non_evidentiary"]:
        lines.append(make_data(record["claim"], evidence))
        classes.append(0)


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [4]:
# Split the examples by position: the trailing slice is held out for testing
# and everything before it is used for training (no shuffling is done here).
# -len(classes)//10 parses as (-len(classes))//10, i.e. a negative offset whose
# magnitude is one tenth of the data rounded up.
_holdout_start = -len(classes)//10

print("Total data points = ", len(classes))
print("Of which evidentiary:", np.count_nonzero(classes))

training_lines = lines[:_holdout_start]
training_classes = classes[:_holdout_start]

print("Total training data points = ", len(training_classes))
print("Of which evidentiary:", np.count_nonzero(training_classes))

# Positions (into the training lists) of evidentiary (label > 0) and
# non-evidentiary (label == 0) examples; getTrainingBatch samples from these.
training_evidentiary_indices = [
    position for position, label in enumerate(training_classes) if label > 0
]
training_nonevidentiary_indices = [
    position for position, label in enumerate(training_classes) if label == 0
]

testing_lines = lines[_holdout_start:]
testing_classes = classes[_holdout_start:]

print("Total testing data points = ", len(testing_classes))
print("Of which evidentiary:", np.count_nonzero(testing_classes))

Total data points =  1605875
Of which evidentiary: 201060
Total training data points =  1445287
Of which evidentiary: 180997
Total testing data points =  160588
Of which evidentiary: 20063


### Relevant Fact Extraction (ReFE)


In [5]:
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import BertModel

In [6]:
# Aim below is to present every training batch with a fixed class mix:
# bs // 8 evidentiary examples, the remainder non-evidentiary.

def getTrainingBatch(bs = 64):
    """Sample one shuffled training batch with a fixed evidentiary share.

    ``bs // 8`` examples are drawn (with replacement) from the evidentiary
    training indices and the remaining ``bs - bs // 8`` from the
    non-evidentiary ones; the combined rows are then shuffled.

    Args:
        bs: total number of examples in the batch.

    Returns:
        ``(tokens, segments, att_mask, classes)``, all on the GPU.
        ``tokens``, ``segments`` and ``classes`` are LongTensors built from the
        matching entries of ``training_lines`` / ``training_classes``;
        ``att_mask`` is ``tokens != 0``.
    """
    n_evidentiary = bs // 8
    n_other = bs - n_evidentiary

    # Evidentiary rows first, then non-evidentiary ones, both sampled with replacement.
    draw = np.random.randint(0, len(training_evidentiary_indices), size=n_evidentiary)
    chosen = list(np.asarray(training_evidentiary_indices)[draw])

    draw = np.random.randint(0, len(training_nonevidentiary_indices), size=n_other)
    chosen.extend(np.asarray(training_nonevidentiary_indices)[draw])

    # Shuffle so the two groups are interleaved within the batch.
    order = list(range(bs))
    np.random.shuffle(order)
    batch_rows = [chosen[position] for position in order]

    tokens = torch.LongTensor([training_lines[row][0] for row in batch_rows]).cuda()
    segments = torch.LongTensor([training_lines[row][1] for row in batch_rows]).cuda()
    classes = torch.LongTensor([training_classes[row] for row in batch_rows]).cuda()
    # Positions whose token id is 0 are excluded from attention.
    att_mask = tokens != 0

    return tokens, segments, att_mask, classes
    
# Draw one sample batch with the default batch size.
# NOTE: this rebinds the module-level name `classes` (the label list built
# during data prep) to the batch's label tensor, so cells that read the full
# `classes` list must not be re-run after this one without rebuilding it.
tokens, segments, att_mask, classes = getTrainingBatch()

In [7]:
class ReFE(torch.nn.Module):
    """Relevant Fact Extraction classifier: BERT encoder plus a 3-way linear head.

    The head reads the encoder output at the first sequence position, applies
    dropout (p=0.1) and projects the 768-dimensional vector to 3 logits.
    """

    def __init__(self,
                 num_bert_layers=1,
                 backprop_thru_bert=False,
                 internal_dim = 256
                 ):
        """Create the encoder and the classification head.

        NOTE: ``num_bert_layers``, ``backprop_thru_bert`` and ``internal_dim``
        are accepted but not used anywhere in this class.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.out = torch.nn.Linear(768,3)
        self.dropout = torch.nn.Dropout(0.1)

    def forward(self, inputs, segments, attention_masks):
        """Return unnormalised 3-class logits, one row per input sequence.

        Args:
            inputs: token ids passed to the encoder.
            segments: segment ids, forwarded as ``token_type_ids``.
            attention_masks: mask forwarded as ``attention_mask``.
        """
        encoded, _ = self.bert(
            inputs,
            token_type_ids=segments,
            attention_mask=attention_masks,
            output_all_encoded_layers=False,
        )
        # Encoder output at position 0 (the [CLS] slot in make_data's layout).
        first_position = encoded[:, 0, :]
        logits = self.out(self.dropout(first_position))
        return logits

In [8]:
# Instantiate the classifier with its default constructor arguments.
network = ReFE()

* Friends don't let friends use batch sizes > 32

In [None]:
# Negative log-likelihood loss; Train feeds it log-softmax outputs of the model.
lossFn = torch.nn.NLLLoss()
# NOTE: the optimizer is bound to the parameters of the module-level `network`,
# not to the `network` argument of Train, so Train only updates the model it is
# given when both refer to the same object.
optimizer = torch.optim.Adam(network.parameters(), lr=3e-5)
def _subset_nll(log_probs, targets, mask):
    """NLL loss over the rows selected by ``mask``; zero when no row is selected."""
    if not bool(mask.any()):
        # A mean-reduced loss over zero rows would be NaN and poison the
        # weighted sum (and the update); contribute nothing instead.
        return log_probs.new_zeros(())
    return lossFn(log_probs[mask], targets[mask])


def Train(network, bs = 24, epochs=30, batches_per_epoch=10000):
    """Train ``network`` on randomly sampled batches and report the running loss.

    Each step draws a batch from ``getTrainingBatch``, computes separate NLL
    losses for evidentiary (label >= 1) and non-evidentiary (label == 0) rows
    and combines them as ``0.75 * evidentiary + 0.25 * non_evidentiary``.
    Uses the module-level ``optimizer`` and ``lossFn``.

    NOTE(review): the train/eval mode of ``network`` is left exactly as the
    caller set it; this function does not call ``network.train()``.

    Args:
        network: model called as ``network(tokens, segments, att_mask)`` and
            returning per-class logits.
        bs: batch size passed to ``getTrainingBatch``.
        epochs: number of epochs.
        batches_per_epoch: number of sampled batches per epoch.

    Returns:
        List with the mean batch loss of every epoch.
    """
    epoch_losses = []
    for k in range(epochs):
        batch_losses = []
        for i in range(batches_per_epoch):
            tokens, segments, att_mask, classes = getTrainingBatch(bs=bs)
            # Call the module itself (not .forward) so registered hooks run.
            y_ = network(tokens, segments, att_mask)
            # log-softmax is row-wise, so computing it once before masking is
            # equivalent to computing it on each subset.
            log_probs = F.log_softmax(y_, dim=-1)
            evidences = classes >= 1
            non_evidences = classes == 0
            optimizer.zero_grad()
            loss1 = _subset_nll(log_probs, classes, evidences)
            loss2 = _subset_nll(log_probs, classes, non_evidences)
            f_loss = 0.75*loss1 + 0.25*loss2
            batch_losses.append(f_loss.item())
            print("Epoch:", k+1,
                  "Batch:", i+1,
                  "Loss:", np.round(np.mean(batch_losses),5),
                  end="\r")
            f_loss.backward()
            optimizer.step()
        epoch_losses.append(np.mean(batch_losses))
    # Previously computed and then discarded; returned so callers can inspect it.
    return epoch_losses

# Move the model to the GPU and start training with batches of 24 examples.
Train(network.to("cuda"), bs=24)

Epoch: 1 Batch: 2219 Loss: 0.63763