In [1]:
import pickle
# Load the pre-processed FEVER records. The file is opened in a context
# manager so the handle is closed as soon as unpickling finishes.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("../fever_processed.pickle", "rb") as pickle_file:
    processed_data = pickle.load(pickle_file)

### Data prep for Evidence Accumulation

* The significance of a given statement as evidence to a claim/question is modelled as a classification problem
* Any length of text (such as a sentence from a document) is appended to the claim/question in the usual BERT sentence-pair format: "CLS" <claim/question tokens> "SEP" <potential evidence tokens> "SEP"
* A class is awarded to the combined string based on the following:
    - Class 0, if the evidence tokens do not contribute to answering the question
    - Class 1, if the evidence tokens partially answer the question
    - Class 2, if the evidence tokens completely answer the question
* len(claim) + len(evidence) + 3 should be <= 96 

In [2]:
# Fixed length, in tokens, of every encoded sequence produced by make_data
# (claim + evidence + 3 special tokens, padded or cut to this size).
max_len = 96

In [3]:
# Parallel accumulators filled by the dataset-building loop below:
# `lines` collects the (tokens, segments) tuples returned by make_data and
# `classes` collects the integer label (0, 1 or 2) of the matching entry.
lines = []
classes = []
import numpy as np
from pytorch_pretrained_bert import BertTokenizer
# Tokenizer for the uncased BERT base model; make_data uses it to map tokens to ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def make_data(claim, evidence):
    """Encode a (claim, evidence) pair as fixed-length BERT inputs.

    The sequence layout is ``[CLS] claim [SEP] evidence [SEP]`` followed by
    ``[PAD]`` tokens up to ``max_len`` (module-level constant).

    If the pair does not fit, the evidence (and, if still necessary, the
    claim) is trimmed *before* the special tokens are added, so the closing
    ``[SEP]`` is always present. Inputs that already fit are encoded exactly
    as before.

    Args:
        claim: iterable of token strings for the claim/question.
        evidence: iterable of token strings for the candidate evidence.

    Returns:
        A ``(tokens, segments)`` tuple: ``tokens`` is the list of ``max_len``
        ids returned by ``tokenizer.convert_tokens_to_ids``; ``segments`` is a
        NumPy array of shape ``(max_len,)`` that is 0 over ``[CLS] claim [SEP]``
        and 1 everywhere else (evidence, final ``[SEP]`` and padding).
    """
    # Three positions are reserved for [CLS] and the two [SEP] markers.
    budget = max_len - 3
    claim = list(claim)[:budget]
    evidence = list(evidence)[:budget - len(claim)]

    _ftokens = ["[CLS]"] + claim + ["[SEP]"] + evidence + ["[SEP]"]
    # Right-pad to the fixed length (no-op when the sequence is already full).
    _ftokens.extend(["[PAD]"] * (max_len - len(_ftokens)))

    segments = np.ones((max_len,))
    # Segment 0 covers [CLS], the claim tokens and the first [SEP].
    segments[:len(claim) + 2] = 0
    tokens = tokenizer.convert_tokens_to_ids(_ftokens)
    return (tokens, segments)

# Walk every processed record and emit one encoded sample per candidate
# sentence, keeping `lines` and `classes` aligned index-for-index.
counter = 0
for counter, line in enumerate(processed_data, start=1):
    print(counter, "/", len(processed_data), end="\r")
    processed = line["processed"]
    supporting = processed["evidentiary"]
    # Label shared by all evidentiary sentences of this record.
    # NOTE(review): the notebook text describes class 1 as "partially answers"
    # and class 2 as "completely answers", yet a record with a single evidence
    # sentence is labelled 1 here -- confirm this mapping is intended.
    positive_label = 1 if len(supporting) == 1 else 2
    for evidence in supporting:
        lines.append(make_data(processed["claim"], evidence))
        classes.append(positive_label)
    # Non-evidentiary sentences are always class 0.
    for evidence in processed["non_evidentiary"]:
        lines.append(make_data(processed["claim"], evidence))
        classes.append(0)


108771 / 108771771108771108771108771108771108771108771108771108771108771108771 108771108771108771108771108771108771108771 108771108771108771108771108771108771108771 108771 108771108771108771 108771 108771108771108771108771108771108771108771108771108771108771/ 108771/ 108771108771108771108771108771 108771 108771 108771 108771 108771 108771 108771108771108771108771108771108771108771108771108771108771108771108771108771108771108771108771108771108771108771 108771108771108771108771 108771 108771 / 108771108771108771 108771 108771108771108771108771108771108771108771108771 108771 108771 108771108771108771108771108771108771108771108771 108771108771108771 108771108771108771108771108771108771108771 108771108771108771108771108771108771108771108771108771108771 108771108771108771 108771 / 108771108771108771 108771108771108771108771108771 108771108771108771 108771108771108771108771108771108771108771

In [4]:
# Offset separating the training portion from the held-out tail.
# Unary minus binds tighter than //, so this is (-len(classes)) // 10: a
# negative index whose magnitude is len(classes) / 10 rounded up.
split_at = -len(classes)//10

print("Total data points = ", len(classes))
print("Of which evidentiary:", np.count_nonzero(classes))

# Everything before the offset is used for training ...
training_lines = lines[:split_at]
training_classes = classes[:split_at]

print("Total training data points = ", len(training_classes))
print("Of which evidentiary:", np.count_nonzero(training_classes))

# Positions (relative to the training slice) of each kind of sample.
training_evidentiary_indices = [pos for pos, label in enumerate(training_classes) if label > 0]
training_nonevidentiary_indices = [pos for pos, label in enumerate(training_classes) if label == 0]

# ... and the trailing slice is held out for testing/validation.
testing_lines = lines[split_at:]
testing_classes = classes[split_at:]

# Positions (relative to the testing slice) of each kind of sample.
testing_evidentiary_indices = [pos for pos, label in enumerate(testing_classes) if label > 0]
testing_nonevidentiary_indices = [pos for pos, label in enumerate(testing_classes) if label == 0]

print("Total testing data points = ", len(testing_classes))
print("Of which evidentiary:", np.count_nonzero(testing_classes))


Total data points =  1605875
Of which evidentiary: 201060
Total training data points =  1445287
Of which evidentiary: 180997
Total testing data points =  160588
Of which evidentiary: 20063


### Relevant Fact Extraction (ReFE)

* Going by the paper and the documentation, when fine-tuning BERT for classification tasks, only the output at the CLS token (index 0) needs to be used. We can ignore the rest
* Aim here is to make it look to BERT like an entailment task. Since we are not really worried about the truth value of the claim, all we need to do is decide if a given sentence provides evidence to the claim or not. 

In [5]:
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import BertModel

In [6]:
# Every batch is drawn with a fixed class mix: bs // 8 evidentiary samples
# (classes 1/2) and the remainder non-evidentiary (class 0), so the rarer
# evidentiary examples are present in each batch.

def getTrainingBatch(bs = 64, validation = False):
    """Sample a random batch with a fixed evidentiary / non-evidentiary mix.

    Samples are drawn with replacement from the module-level training split,
    or from the testing split when ``validation`` is true, and then shuffled.

    Args:
        bs: batch size; ``bs // 8`` samples are evidentiary, the rest are not.
        validation: draw from the testing split instead of the training split.

    Returns:
        ``(tokens, segments, att_mask, classes)`` as CUDA tensors: token ids,
        segment ids and labels as long tensors, and ``att_mask`` as the
        boolean mask ``tokens != 0``.
    """
    if validation:
        evidentiary = testing_evidentiary_indices
        non_evidentiary = testing_nonevidentiary_indices
        source = testing_lines
        source_classes = testing_classes
    else:
        evidentiary = training_evidentiary_indices
        non_evidentiary = training_nonevidentiary_indices
        source = training_lines
        source_classes = training_classes

    ev_total = bs // 8
    nev_total = bs - ev_total

    # Index the Python lists directly: wrapping them in np.asarray() on every
    # call would copy the whole index list just to pick a handful of entries.
    picks = np.random.randint(0, len(evidentiary), (ev_total))
    chosen = [evidentiary[p] for p in picks]
    picks = np.random.randint(0, len(non_evidentiary), (nev_total))
    chosen.extend(non_evidentiary[p] for p in picks)

    # Shuffle so the evidentiary samples are not always at the batch front.
    order = [i for i in range(bs)]
    np.random.shuffle(order)
    chosen = [chosen[k] for k in order]

    tokens = torch.LongTensor([source[index][0] for index in chosen]).cuda()
    # Stack the per-sample segment arrays into one integer array first;
    # building a tensor from a list of ndarrays is slow.
    segment_rows = np.asarray([source[index][1] for index in chosen], dtype=np.int64)
    segments = torch.from_numpy(segment_rows).cuda()
    classes = torch.LongTensor([source_classes[index] for index in chosen]).cuda()
    # Token id 0 is treated as padding and masked out of attention.
    att_mask = tokens != 0

    return tokens, segments, att_mask, classes
    
# Smoke test: draw one default-sized batch to check the sampler runs end to end.
# NOTE(review): this rebinds the module-level name `classes` (the label list
# built during data prep) to a batch tensor, so the train/test split cell
# cannot be re-run after this one without rebuilding the dataset first.
tokens, segments, att_mask, classes = getTrainingBatch()

In [7]:
class ReFE(torch.nn.Module):
    """Relevant Fact Extraction classifier: BERT encoder plus a 3-way linear head."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.out = torch.nn.Linear(in_features=768, out_features=3)
        self.dropout = torch.nn.Dropout(p=0.1)

    def forward(self, inputs, segments, attention_masks):
        """Return the 3-class scores computed from the first position of BERT's output.

        Args:
            inputs: token ids passed to BERT.
            segments: segment ids, forwarded as ``token_type_ids``.
            attention_masks: mask forwarded as ``attention_mask``.
        """
        encoded, _unused = self.bert(
            inputs,
            token_type_ids=segments,
            attention_mask=attention_masks,
            output_all_encoded_layers=False,
        )
        # Only the vector at index 0 (the [CLS] position) feeds the classifier.
        first_position = encoded[:, 0, :]
        return self.out(self.dropout(first_position))

In [8]:
# Build the classifier (loads the pretrained bert-base-uncased weights).
network = ReFE()
# Release cached GPU memory that PyTorch is no longer using before training starts.
torch.cuda.empty_cache()

* Friends don't let friends use batch sizes > 64

In [9]:
import json
# Negative log-likelihood loss: it is fed log-probabilities, which is why the
# training and validation code apply F.log_softmax to the network output first.
lossFn = torch.nn.NLLLoss()
# Adam over every parameter of the network (BERT encoder and linear head alike).
optimizer = torch.optim.Adam(network.parameters(), lr=3e-5)

In [10]:
def validate(network, bs=96, num_batches=5):
    """Evaluate ``network`` on batches sampled from the held-out split.

    The network is switched to eval mode for the duration of the call so that
    dropout is disabled (``torch.no_grad`` alone only turns off gradient
    tracking), and its previous train/eval mode is restored afterwards.

    Args:
        network: the model to evaluate.
        bs: size of each validation batch.
        num_batches: number of batches to sample.

    Returns:
        ``(loss, accuracy, evidence_accuracy)`` where ``loss`` is the weighted
        sum ``0.75 * evidentiary_loss + 0.25 * non_evidentiary_loss``,
        ``accuracy`` is the fraction of all sampled items classified correctly
        and ``evidence_accuracy`` is the same fraction over evidentiary items.
    """
    was_training = network.training
    network.eval()
    try:
        classes = torch.LongTensor([]).cuda()
        preds = torch.FloatTensor([]).cuda()
        with torch.no_grad():
            for _ in range(num_batches):
                tokens, segments, att_mask, classes_ = getTrainingBatch(bs=bs, validation=True)
                y_ = network(tokens, segments, att_mask)
                classes = torch.cat([classes, classes_], dim=0)
                preds = torch.cat([preds, y_], dim=0)

            evidences = classes >= 1
            non_evidences = classes == 0
            # Same 0.75 / 0.25 weighting as the training objective.
            loss1 = lossFn(F.log_softmax(preds[evidences], dim=-1), classes[evidences])
            loss2 = lossFn(F.log_softmax(preds[non_evidences], dim=-1), classes[non_evidences])
            f_loss = 0.75*loss1 + 0.25*loss2

            pred = torch.max(preds, dim=-1)[1]
            acc = torch.sum(pred == classes)
            acc = acc.cpu().numpy()/(bs*num_batches)
            positives = torch.sum(pred[evidences] == classes[evidences])
            evid_acc = positives.cpu().numpy()/torch.sum(evidences).cpu().numpy()
            return f_loss.item(), acc, evid_acc
    finally:
        # Hand the model back in whatever mode the caller had it in.
        network.train(was_training)

In [11]:
def Train(network, bs = 24, epochs=30, batches_per_epoch=10000):
    """Fine-tune ``network`` on randomly sampled batches, validating every epoch.

    Uses the module-level ``optimizer`` and ``lossFn``. The objective weights
    the loss on evidentiary samples by 0.75 and on non-evidentiary samples by
    0.25. After each epoch the model is validated; whenever the validation
    loss improves the whole model is saved to ``./ReFE_val_save.h5``, and the
    running history is rewritten to ``./training_cycle.json``.

    Args:
        network: the model to train (expected to already be on the GPU, since
            batches are produced as CUDA tensors).
        bs: training batch size.
        epochs: number of epochs.
        batches_per_epoch: number of sampled batches that make up one epoch.
    """
    epoch_losses = []
    epoch_vals = []
    epoch_accs = []
    epoch_evid = []
    val_min = 1000
    for k in range(epochs):
        # Make sure dropout is active even if an evaluation step left the
        # model in eval mode.
        network.train()
        batch_losses = []
        for i in range(batches_per_epoch):
            tokens, segments, att_mask, classes = getTrainingBatch(bs=bs)
            optimizer.zero_grad()
            y_ = network(tokens, segments, att_mask)
            evidences = classes >= 1
            non_evidences = classes == 0
            loss1 = lossFn(F.log_softmax(y_[evidences], dim=-1), classes[evidences])
            loss2 = lossFn(F.log_softmax(y_[non_evidences], dim=-1), classes[non_evidences])
            f_loss = 0.75*loss1 + 0.25*loss2
            batch_losses.append(f_loss.item())
            # Running mean of the epoch's losses, redrawn on a single line.
            print("Epoch:", k+1,
                  "Batch:", i+1,
                  "Loss:", np.round(np.mean(batch_losses),5),
                  end="\r")
            f_loss.backward()
            optimizer.step()
        epoch_losses.append(np.mean(batch_losses))
        val_loss, acc, evid_acc = validate(network, num_batches=10)

        epoch_vals.append(val_loss)
        epoch_accs.append(acc)
        epoch_evid.append(evid_acc)

        print("\n\tValidation Loss:", np.round(val_loss,5))
        print("\tOverall Validation Accuracy:", np.round(acc,2), "; and for evidence only:", np.round(evid_acc,2))

        if (val_loss < val_min):
            print("\tSaving a better model...")
            torch.save(network, "./ReFE_val_save.h5")
            val_min = val_loss

        # Rewrite the full history every epoch so an interrupted run still
        # leaves usable curves behind; the context manager closes the file.
        with open("./training_cycle.json", "w") as f:
            json.dump(
                {
                    "training_losses":epoch_losses,
                    "validation_losses":epoch_vals,
                    "validation_accuracy":epoch_accs,
                    "evidence_accuracy":epoch_evid
                },
                f,
            )

# Full-scale configuration, kept for reference:
#Train(network.to("cuda"), bs=96, epochs=75, batches_per_epoch=1000)
# Quick smoke run (5 epochs of 10 batches of 10 samples) to check the loop end to end.
Train(network.to("cuda"), bs=10, epochs=5, batches_per_epoch=10)

Epoch: 1 Batch: 10 Loss: 1.07917

  "type " + obj.__name__ + ". It won't be checked "



	Validation Loss: 1.03382
	Overall Validation Accuracy: 0.15 ; and for evidence only: 0.64
Epoch: 2 Batch: 10 Loss: 0.81056
	Validation Loss: 1.20604
	Overall Validation Accuracy: 0.09 ; and for evidence only: 0.62
Epoch: 3 Batch: 10 Loss: 1.43551
	Validation Loss: 1.0093
	Overall Validation Accuracy: 0.19 ; and for evidence only: 0.6
Epoch: 4 Batch: 10 Loss: 1.08076
	Validation Loss: 1.01069
	Overall Validation Accuracy: 0.32 ; and for evidence only: 0.55
Epoch: 5 Batch: 10 Loss: 1.08833
	Validation Loss: 1.00468
	Overall Validation Accuracy: 0.28 ; and for evidence only: 0.56
