In [None]:
import pickle
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load a
# fever_processed.pickle produced by a trusted preprocessing run.
# The context manager closes the file handle, which the bare open() call leaked.
with open("../fever_processed.pickle", "rb") as _pickle_file:
    processed_data = pickle.load(_pickle_file)

### Data prep for Evidence Accumulation

* The significance of a given statement as evidence to a claim/question is modelled as a classification problem
* Any length of text (such as a sentence from a document) is appended to the claim/question in the usual way: "CLS" <claim/question tokens> "SEP" <potential evidence tokens> "SEP"
* A class is awarded to the combined string based on the following:
    - Class 0, if the evidence tokens do not contribute to answering the question
    - Class 1, if the evidence tokens partially answer the question
    - Class 2, if the evidence tokens completely answer the question
* len(claim) + len(evidence) + 3 should be <= 96 

In [None]:
# Fixed lengths (in tokens) that every encoded claim / evidence sequence is
# padded or truncated to.
max_len_claims, max_len_evid = 30, 60

In [None]:
# Parallel accumulators filled by the data-prep loop: one encoded example per
# entry in `lines`, with its class label at the same position in `classes`.
lines, classes = [], []
import numpy as np
from pytorch_pretrained_bert import BertTokenizer
# Tokenizer for 'bert-base-uncased'; make_data uses it to convert tokens to vocabulary ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def make_data(claim, evidence):
    """Encode one (claim, evidence) pair as two fixed-length id sequences.

    Each token list is prefixed with "[CLS]", right-padded with "[PAD]" up to
    its length budget (max_len_claims / max_len_evid), cut to exactly that
    length, and mapped to vocabulary ids with the module-level tokenizer.

    Returns:
        (claim_ids, evidence_ids, evidence_segments), where evidence_segments
        is an all-zero array of length max_len_evid.
    """
    def _fit(tokens, width):
        # Prefix, pad on the right, then clip: the result always has `width` items.
        framed = ["[CLS]", *tokens]
        framed += ["[PAD]"] * (width - len(framed))
        return framed[:width]

    claim_ids = tokenizer.convert_tokens_to_ids(_fit(claim, max_len_claims))
    evidence_ids = tokenizer.convert_tokens_to_ids(_fit(evidence, max_len_evid))
    return (claim_ids, evidence_ids, np.zeros((max_len_evid,)))

# Build one example per (claim, sentence) pair: evidentiary sentences get class
# 1 or 2, non-evidentiary sentences get class 0.
# NOTE(review): the notes above define class 1 as "partially answers" and class 2
# as "completely answers", yet a claim's only evidentiary sentence is labelled 1
# here and sentences from multi-sentence evidence are labelled 2 - confirm that
# this mapping is the intended one.
counter = 0
for counter, line in enumerate(processed_data, start=1):
    print(counter, "/", len(processed_data), end="\r")
    record = line["processed"]
    positives = record["evidentiary"]
    for evidence in positives:
        lines.append(make_data(record["claim"], evidence))
        classes.append(1 if len(positives) == 1 else 2)
    for evidence in record["non_evidentiary"]:
        lines.append(make_data(record["claim"], evidence))
        classes.append(0)


In [None]:
# Hold out the trailing tenth of the examples for testing; everything before it
# is used for training. -len(classes)//10 is a negative offset from the end.
split_at = -len(classes)//10

print("Total data points = ", len(classes))
print("Of which evidentiary:", np.count_nonzero(classes))

training_lines, training_classes = lines[:split_at], classes[:split_at]

print("Total training data points = ", len(training_classes))
print("Of which evidentiary:", np.count_nonzero(training_classes))

# Positions (within the training split) of evidentiary / non-evidentiary examples.
training_evidentiary_indices = [pos for pos, label in enumerate(training_classes) if label > 0]
training_nonevidentiary_indices = [pos for pos, label in enumerate(training_classes) if label == 0]

testing_lines, testing_classes = lines[split_at:], classes[split_at:]

# Same index lists for the held-out split.
testing_evidentiary_indices = [pos for pos, label in enumerate(testing_classes) if label > 0]
testing_nonevidentiary_indices = [pos for pos, label in enumerate(testing_classes) if label == 0]

print("Total testing data points = ", len(testing_classes))
print("Of which evidentiary:", np.count_nonzero(testing_classes))


### Relevant Fact Extraction (ReFE)

* Going by the paper and the documentation, when fine-tuning BERT for classification tasks, only the output of the CLS token (index 0) needs to be used; the rest can be ignored.
* The aim here is to make the task look like an entailment task to BERT. Since we are not really concerned with the truth value of the claim, all we need to decide is whether a given sentence provides evidence for the claim or not.

In [None]:
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import BertModel

In [None]:
# Per-epoch history appended to by Train: mean training loss, validation loss,
# overall validation accuracy and evidence-only validation accuracy.
epoch_losses, epoch_vals, epoch_accs, epoch_evid = [], [], [], []

In [None]:
def getTrainingBatch(bs = 64, validation = False):
    """Sample a shuffled batch with a controlled share of evidentiary examples.

    Args:
        bs: number of examples in the batch.
        validation: draw from the testing split instead of the training split.

    The evidentiary share is bs // (2 + len(epoch_losses) % 10), but never less
    than bs // 4; the remainder of the batch is non-evidentiary. Both groups are
    drawn with np.random.randint, i.e. with replacement.

    Returns:
        (ctokens, csegments, catt_mask, etokens, esegments, eatt_mask, classes),
        all CUDA tensors. The segment tensors are all zeros and the attention
        masks mark positions whose token id is non-zero (presumably "[PAD]"
        maps to id 0 - confirm against the tokenizer vocabulary).
    """
    if validation:
        positive_pool, negative_pool = testing_evidentiary_indices, testing_nonevidentiary_indices
        source, source_classes = testing_lines, testing_classes
    else:
        positive_pool, negative_pool = training_evidentiary_indices, training_nonevidentiary_indices
        source, source_classes = training_lines, training_classes

    # Control the number of positive samples seen by the model, since these
    # will be much rarer in real life. The divisor cycles with the epoch count.
    divisor = 2 + (len(epoch_losses) % 10)
    ev_total = max(bs // divisor, bs // 4)
    nev_total = bs - ev_total

    # Evidentiary picks are drawn first, then non-evidentiary ones.
    drawn = np.random.randint(0, len(positive_pool), ev_total)
    picked = list(np.asarray(positive_pool)[drawn])
    drawn = np.random.randint(0, len(negative_pool), nev_total)
    picked.extend(np.asarray(negative_pool)[drawn])

    # Shuffle so the two groups are interleaved inside the batch.
    order = list(range(bs))
    np.random.shuffle(order)
    shuffled = [picked[slot] for slot in order]

    ctokens = torch.LongTensor([source[example][0] for example in shuffled]).cuda()
    csegments = torch.LongTensor(np.zeros((bs, max_len_claims))).cuda()
    etokens = torch.LongTensor([source[example][1] for example in shuffled]).cuda()
    esegments = torch.LongTensor(np.zeros((bs, max_len_evid))).cuda()
    classes = torch.LongTensor([source_classes[example] for example in shuffled]).cuda()

    return ctokens, csegments, ctokens != 0, etokens, esegments, etokens != 0, classes
    
# Smoke-test one batch. The labels are bound to `batch_classes` rather than
# `classes` so that the module-level `classes` label list built during data prep
# is not overwritten by a batch tensor (re-running the train/test split cell
# afterwards would otherwise slice by the batch size).
ctokens, csegments, catt_mask, etokens, esegments, eatt_mask, batch_classes = getTrainingBatch()

In [None]:
# Sanity check: sizes of the claim-side tensors, then of the evidence-side tensors.
print(*(tensor.size() for tensor in (ctokens, csegments, catt_mask)))
print(*(tensor.size() for tensor in (etokens, esegments, eatt_mask)))

In [None]:
def getLoss(pred, actual, lossFn, e_weight=0.6, ne_weight=0.4):
    """Average the loss of the evidentiary and non-evidentiary examples of a batch.

    Args:
        pred: raw class scores, one row per example (log_softmax is applied here).
        actual: integer class labels; 0 is non-evidentiary, >= 1 is evidentiary.
        lossFn: callable taking (log_probabilities, targets), e.g. torch.nn.NLLLoss().
        e_weight, ne_weight: accepted for interface compatibility but NOT used -
            both groups are weighted equally. NOTE(review): Train passes randomly
            chosen weights here; confirm whether weighting was meant to be active.

    Returns:
        A scalar tensor: the mean of the per-group losses over the groups that
        are present in the batch. With both groups present this is
        (evidence_loss + non_evidence_loss) / 2. For an empty batch a zero that
        is still attached to `pred` is returned.
    """
    # Labels follow the predictions' device instead of being forced onto CUDA,
    # so the function also works with CPU tensors.
    actual = actual.to(pred.device)

    group_losses = []
    for members in (actual >= 1, actual == 0):
        # Skip an absent group: a mean-reduced loss over zero rows is NaN and
        # would poison the whole batch loss.
        if members.any():
            group_losses.append(lossFn(F.log_softmax(pred[members], dim=-1), actual[members]))

    if not group_losses:
        return pred.sum() * 0.0
    return sum(group_losses) / len(group_losses)

In [None]:
#from QA_Attentions import biDAF as biDAF
import torch
from QA_Attentions import *

class ReFE(torch.nn.Module):
    """Relevant Fact Extraction classifier producing three class scores per pair.

    The document and the query are encoded with the same BERT model; the two
    encodings are combined through biDAF and InnerAttention (both imported from
    QA_Attentions) and a final linear layer maps the result to three scores.
    """

    @staticmethod
    def _uniform_parameter(*shape):
        # Trainable tensor initialised from np.random.uniform(0, 1) (NumPy's global RNG).
        return torch.nn.Parameter(torch.FloatTensor(np.random.uniform(0, 1, shape)))

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # Weights handed to biDAF and to the two InnerAttention calls in forward().
        self.wd = self._uniform_parameter(3*768)
        self.innerAttQuery = self._uniform_parameter(768, 256)
        self.innerAttDoc = self._uniform_parameter(768*4, 256)
        # Input width 768*5: presumably 768 from the query summary plus 768*4
        # from the document summary - confirm against QA_Attentions.
        self.out = torch.nn.Linear(768*5, 3)
        # NOTE(review): this dropout layer is registered but never applied in forward().
        self.dropout = torch.nn.Dropout(0.1)

    def forward(self, dt, ds, da, qt, qs, qa):
        """Return the raw (un-normalised) three-class scores for a batch.

        dt, ds, da: document token ids, token-type ids and attention mask.
        qt, qs, qa: the same three tensors for the query.
        """
        # Only the sequence output of BERT is used; the pooled output is discarded.
        queries, _ = self.bert(qt,
                               token_type_ids=qs,
                               attention_mask=qa,
                               output_all_encoded_layers=False)
        documents, _ = self.bert(dt,
                                 token_type_ids=ds,
                                 attention_mask=da,
                                 output_all_encoded_layers=False)

        # biDAF returns three values; only the first one feeds the classifier.
        bdaf, _, _ = biDAF(documents, queries, self.wd)
        query_summary = InnerAttention(queries, self.innerAttQuery)
        document_summary = InnerAttention(bdaf, self.innerAttDoc)
        return self.out(torch.cat([query_summary, document_summary], dim=-1))

In [None]:
# Set to True to resume from the saved "BestValidationLoss" checkpoint and its
# recorded training history instead of starting from a fresh ReFE model.
continue_from_prev = False

In [None]:
import json
network = None
if continue_from_prev:
    # Resume: restore the pickled model and the metric history stored for it.
    # NOTE(review): no visible cell writes "saved_model_training_cycle.json" (the
    # save helpers write "<cause>_training_cycle.json" and "training_cycle.json")
    # - confirm that this path is the intended one.
    network = torch.load("./ReFE_BestValidationLoss_save.h5")
    with open("./saved_model_training_cycle.json", "r") as f:
        tcycle = json.load(f)
    # The last two recorded epochs are dropped from every history list.
    epoch_losses, epoch_vals, epoch_accs, epoch_evid = (
        tcycle[key][:-2]
        for key in ("training_losses", "validation_losses", "validation_accuracy", "evidence_accuracy")
    )
else:
    # Fresh start: the epoch history lists keep whatever they already hold.
    network = ReFE()


* Friends don't let friends use batch sizes > 64

In [None]:
import json
# NLLLoss expects log-probabilities; getLoss applies log_softmax to the raw scores first.
lossFn = torch.nn.NLLLoss()
# NOTE(review): the optimizer is created here, before the later
# Train(network.to("cuda"), ...) call moves the model to the GPU - confirm the
# optimizer still references the parameters that are actually being trained.
optimizer = torch.optim.Adam(network.parameters(), lr=3e-5)

In [None]:
# Only referenced from a disabled (string-literal) snippet inside getLoss;
# no visible code appends to these lists.
e_trg_losses, ne_training_losses = [], []

def _save(cause, network):
    """Persist the model and the metric history under names derived from `cause`.

    Writes "./ReFE_<cause>_save.h5" (torch.save of the whole module) and
    "./<cause>_training_cycle.json" (the four module-level epoch history lists).
    """
    print("\tSaving Model for Cause:", cause)
    torch.save(network, "./ReFE_" + cause + "_save.h5")

    history = {
        "training_losses":epoch_losses,
        "validation_losses":epoch_vals,
        "validation_accuracy":epoch_accs,
        "evidence_accuracy":epoch_evid
    }
    # The with-block closes the file on exit.
    with open("./" + cause + "_training_cycle.json", "w") as f:
        f.write(json.dumps(history))
    
def chooseModelSave(network):
    """Save a checkpoint for every tracked metric whose latest value is its best.

    The most recent epoch is compared against the minimum validation loss, the
    maximum overall accuracy and the maximum evidence accuracy; each match
    (ties included) triggers its own _save call, in that order.
    """
    criteria = (
        ("BestValidationLoss", np.min, epoch_vals),
        ("BestValidationOverallAccuracy", np.max, epoch_accs),
        ("BestValidationEvidentiaryAccuracy", np.max, epoch_evid),
    )
    for cause, best_of, history in criteria:
        if best_of(history) == history[-1]:
            _save(cause, network)

In [None]:
def validate(network, bs=100, num_batches=5):
    """Evaluate `network` on randomly drawn held-out batches.

    Draws `num_batches` batches of `bs` examples with
    getTrainingBatch(validation=True), feeds them to the network with the
    evidence as documents and the claim as query (the same order Train uses),
    and scores the pooled results.

    Returns:
        (loss, accuracy, evidence_accuracy): the getLoss value over all drawn
        examples, the fraction of exactly correct class predictions, and the
        same fraction restricted to examples whose true class is >= 1.
    """
    # torch.no_grad() only disables gradient tracking; dropout layers stay
    # active unless the modules are in eval mode. Each module's flag is recorded
    # so the exact previous train/eval configuration is restored afterwards.
    previous_modes = [(module, module.training) for module in network.modules()]
    network.eval()
    try:
        classes = torch.LongTensor([]).cuda()
        preds = torch.FloatTensor([]).cuda()
        with torch.no_grad():
            for i in range(num_batches):
                ct, cs, ca, et, es, ea, classes_ = getTrainingBatch(bs=bs, validation=True)
                y_ = network.forward(et, es, ea, ct, cs, ca)
                classes = torch.cat([classes, classes_], dim=0)
                preds = torch.cat([preds, y_], dim=0)

            evidences = classes >= 1
            f_loss = getLoss(preds, classes, lossFn)
            # Predicted class = index of the highest score.
            pred = torch.max(preds, dim=-1)[1]
            acc = torch.sum(pred == classes)
            acc = acc.cpu().numpy()/(bs*num_batches)
            positives = torch.sum(pred[evidences] == classes[evidences])

            return f_loss.data.item(), acc, positives.cpu().numpy()/torch.sum(evidences).cpu().numpy()
    finally:
        for module, was_training in previous_modes:
            module.training = was_training

In [None]:
import random
def chooseLossWeights():
    """Randomly pick an (evidence_weight, non_evidence_weight) pair.

    A first draw from random.randint(0, 100) that is divisible by 5 yields
    (1.0, 0.0). Otherwise a second, independent draw that is divisible by 20
    yields (0.25, 0.75). In every other case the result is (0.5, 0.5).
    """
    if not random.randint(0,100) % 5:
        return 1.0, 0.0
    # The second draw only happens when the first one did not match.
    if not random.randint(0,100) % 20:
        return 0.25, 0.75
    return 0.5, 0.5

def Train(network, bs = 24, epochs=30, batches_per_epoch=10000):
    """Run the training loop, validating and checkpointing after every epoch.

    Args:
        network: the model to train; batches are fed with the evidence as
            documents and the claim as query.
        bs: batch size passed to getTrainingBatch.
        epochs: number of epochs to run.
        batches_per_epoch: number of optimizer steps per epoch.

    After each epoch the mean training loss and the validation metrics are
    appended to the module-level history lists, chooseModelSave decides which
    checkpoints to write, and the history is dumped to "./training_cycle.json".
    Uses the module-level `optimizer` and `lossFn`.
    """
    # Best validation loss seen so far; tracked but not used for any decision.
    val_min = np.min(epoch_vals) if continue_from_prev else 1000

    for epoch in range(epochs):
        batch_losses = []
        for step in range(batches_per_epoch):
            ct, cs, ca, et, es, ea, classes = getTrainingBatch(bs=bs)
            scores = network.forward(et, es, ea, ct, cs, ca)
            optimizer.zero_grad()
            e_w, ne_w = chooseLossWeights()
            f_loss = getLoss(scores, classes, lossFn, e_weight=e_w, ne_weight=ne_w)
            batch_losses.append(f_loss.data.item())
            # Running mean of this epoch's losses, rewritten in place on one line.
            print("Epoch:", epoch+1,
                  "Batch:", step+1,
                  "Loss:", np.round(np.mean(batch_losses),5),
                  end="\r")
            f_loss.backward()
            optimizer.step()

        epoch_losses.append(np.mean(batch_losses))
        val_loss, acc, evid_acc = validate(network, num_batches=10)
        epoch_vals.append(val_loss)
        epoch_accs.append(acc)
        epoch_evid.append(evid_acc)

        print("\n\tValidation Loss:", np.round(val_loss,5))
        print("\tOverall Validation Accuracy:", np.round(acc,2), "; and for evidence only:", np.round(evid_acc,2))

        val_min = min(val_min, val_loss)

        chooseModelSave(network)

        # Rewrite the full history after every epoch.
        history = {
            "training_losses":epoch_losses,
            "validation_losses":epoch_vals,
            "validation_accuracy":epoch_accs,
            "evidence_accuracy":epoch_evid
        }
        with open("./training_cycle.json", "w") as f:
            f.write(json.dumps(history))

# Number of epochs for this run.
tot_epochs = 100
# The string literal below is a disabled snippet (it is not executed): it would
# cap a resumed run at 75 epochs in total.
"""
if (continue_from_prev):
    tot_epochs = 75 - len(epoch_losses)
"""
# Move the model to the GPU and start training.
Train(network.to("cuda"), bs=75, epochs=tot_epochs, batches_per_epoch=750)
# Short run, kept for quick debugging:
#Train(network.to("cuda"), bs=75, epochs=5, batches_per_epoch=10)

### Testing the network

In [None]:
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import BertModel

In [None]:
# Reload the checkpoint written for the best validation loss.
# NOTE: the file holds a whole pickled module (it was written with
# torch.save(network, ...)), so the ReFE class must be defined before loading and
# the file must come from a trusted source - unpickling can execute arbitrary code.
network = torch.load("./ReFE_BestValidationLoss_save.h5")

In [None]:
def testingBatch(bs = 100):
    """Draw `bs` examples from the testing split, uniformly and with replacement.

    Unlike getTrainingBatch, the class mix is not controlled: indices are drawn
    over the whole of testing_lines.

    Returns:
        (ctokens, csegments, catt_mask, etokens, esegments, eatt_mask, classes),
        all CUDA tensors. The segment tensors are all zeros and the attention
        masks mark positions whose token id is non-zero.
    """
    picks = np.random.randint(0, len(testing_lines), (bs,))
    claim_ids = [testing_lines[pick][0] for pick in picks]
    evidence_ids = [testing_lines[pick][1] for pick in picks]
    labels = [testing_classes[pick] for pick in picks]

    ctokens = torch.LongTensor(claim_ids).cuda()
    csegments = torch.LongTensor(np.zeros((bs, max_len_claims))).cuda()
    etokens = torch.LongTensor(evidence_ids).cuda()
    esegments = torch.LongTensor(np.zeros((bs, max_len_evid))).cuda()
    classes = torch.LongTensor(labels).cuda()

    return ctokens, csegments, ctokens != 0, etokens, esegments, etokens != 0, classes
    
# Smoke-test: draw one test batch and show the sizes of its first four tensors,
# one per line.
ct, cs, ca, et, es, ea, classes_ = testingBatch()
print(*(tensor.size() for tensor in (ct, cs, ca, et)), sep="\n")

In [None]:
def validate(network, bs=100, num_batches=10):
    """Evaluate `network` on random test batches and return the raw outputs too.

    Draws `num_batches` batches of `bs` examples with testingBatch and feeds them
    to the network with the evidence as documents and the claim as query - the
    argument order used during training.

    Returns:
        (loss, accuracy, evidence_accuracy, preds, classes): the getLoss value,
        the fraction of exactly correct class predictions, the same fraction
        restricted to examples whose true class is >= 1, the raw class scores
        and the true labels of all drawn examples.

    NOTE(review): this redefines `validate` with a five-value return, while
    Train unpacks three values; re-running Train after this cell would fail.
    """
    # torch.no_grad() only disables gradient tracking; dropout layers stay
    # active unless the modules are in eval mode. Each module's flag is recorded
    # so the exact previous train/eval configuration is restored afterwards.
    previous_modes = [(module, module.training) for module in network.modules()]
    network.eval()
    try:
        classes = torch.LongTensor([]).cuda()
        preds = torch.FloatTensor([]).cuda()
        with torch.no_grad():
            for i in range(num_batches):
                ct, cs, ca, et, es, ea, classes_ = testingBatch(bs=bs)
                # forward(dt, ds, da, qt, qs, qa): documents are the evidence
                # tensors, queries the claim tensors. They were passed swapped.
                y_ = network.forward(et, es, ea, ct, cs, ca)
                classes = torch.cat([classes, classes_], dim=0)
                preds = torch.cat([preds, y_], dim=0)

            evidences = classes >= 1
            f_loss = getLoss(preds, classes, lossFn)
            # Predicted class = index of the highest score.
            pred = torch.max(preds, dim=-1)[1]
            acc = torch.sum(pred == classes)
            acc = acc.cpu().numpy()/(bs*num_batches)
            positives = torch.sum(pred[evidences] == classes[evidences])

            return f_loss.data.item(), acc, positives.cpu().numpy()/torch.sum(evidences).cpu().numpy(), preds, classes
    finally:
        for module, was_training in previous_modes:
            module.training = was_training

In [None]:
# Score the reloaded model, then collapse the three classes into
# "evidence" (class >= 1) versus "no evidence" (class 0).
l, acc, p_acc, y_, y = validate(network)
y_ = F.softmax(y_, dim=-1).max(dim=-1)[1]

act_1, act_2 = (y >= 1), (y_ >= 1)
# Number of examples whose evidence/no-evidence decision disagrees with the
# label, followed by the number of truly evidentiary examples.
print(torch.sum(act_1 != act_2), torch.sum(act_1))
print(acc)