In [1]:
import pickle
import numpy as np
import torch
import torch.nn.functional as F
from apex import amp
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

- The first 512 tokens of the concatenated evidences as shown in the dataset are used. Substitute this for a sentence formation. 

In [2]:
all_data = pickle.load(open("./usable_verifiable_fever_data.pickle", "rb"))
data = all_data[:72000]
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(str(len(data)) + " data points.")

72000 data points.


In [3]:
mixed_precision = True
num_layers_to_take_from_bert = 1

In [4]:
def getBERT(claims, evidences, bert, extract_layers=4):
    seq_tokens = torch.zeros(claims.size(), dtype=torch.long).cuda()
    claims_, _ = bert(claims, seq_tokens)
    claims_ = torch.cat(claims_[-extract_layers:], dim=-1)
    evidences_, _ = bert(evidences)
    evidences_ = torch.cat(evidences_[-extract_layers:], dim=-1)
    return claims_, evidences_

In [5]:
def biDAF(claims, evidences, af_vector, output_perceptron):
    #modified the alignment matrix computation because it's more conventient this way
    align = torch.matmul(claims, af_vector)
    align = torch.matmul(align, evidences.transpose(2,1)) #bs, claims_ts, evidences_ts
    
    #context-to-evidence alignment
    att_c2e = F.softmax(align, dim=-1)
    att_c2e = torch.matmul(att_c2e, evidences) #(bs, ts, dim)

    #evidence-to-context alignment
    maxes = torch.max(align, dim=-1)
    att_e2c = torch.matmul(maxes[0], claims)
    att_e2c = torch.sum(att_e2c, dim=-2) #(bs, dim)
    att_e2c = att_e2c.unsqueeze(1).repeat(1, claims.size()[1], 1) #(bs, ts, dim)
    
    #concat and return after push through perceptron
    out_ = torch.cat([claims, att_c2e, att_e2c], dim=-1)
    out_ = output_perceptron(out_)
    return out_

In [6]:
def computePostBiDAF(bidaf_out, lstm, perceptron, out):
    out_, _ = lstm(bidaf_out)
    out_ = torch.tanh(perceptron(out_[:,-1,:]))
    out_ = out(out_)
    return out_.squeeze(dim=-1)

In [7]:
class Verifier(torch.nn.Module):
    def __init__(self,
                 num_layers_to_take_from_bert=4, 
                 biDAF_out = 1024,
                 backprop_thru_bert = False):
        super(Verifier, self).__init__()
        self.num_bert_layers = num_layers_to_take_from_bert
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        if (not backprop_thru_bert):
            self.bert.embeddings.requires_grad = False
        
        self.lstm = torch.nn.LSTM(biDAF_out, biDAF_out, bidirectional=True)
        self.biDAF_perceptron = torch.nn.Linear(num_layers_to_take_from_bert*768*3,biDAF_out)
        self.biDAF_alignment_vector = torch.nn.Parameter(torch.zeros(num_layers_to_take_from_bert*768,num_layers_to_take_from_bert*768))
        self.perceptron = torch.nn.Linear(biDAF_out*2, 256)
        self.out = torch.nn.Linear(256, 1)
        self.dropout = torch.nn.Dropout(0.15)
        print("remember to use BCEWithLogitsLoss since the output is not put through sigmoid")
    
    def forward(self, claims, evidences):
        #put both the claims and the evidences thru BERT; concat the last n layers
        claims_, evidences_ = getBERT(claims, evidences, self.bert, extract_layers=self.num_bert_layers)
        out_ = biDAF(self.dropout(claims_), self.dropout(evidences_), self.biDAF_alignment_vector, self.biDAF_perceptron)
        return computePostBiDAF(out_, self.lstm, self.perceptron, self.out)

In [8]:
loss_fn = torch.nn.BCEWithLogitsLoss()
verifier = Verifier(num_layers_to_take_from_bert=2).cuda()
optimizer = torch.optim.Adam(verifier.parameters(), lr=0.001)
if (mixed_precision):
    verifier, optimizer = amp.initialize(verifier, optimizer, opt_level="O2")

remember to use BCEWithLogitsLoss since the output is not put through sigmoid
Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic


In [9]:
def getBatch(bs = 5, max_len=512, claim_len=30):
    indices = np.random.randint(0, len(data), (bs,))
    batch_evidences = np.asarray([data[index]["evidence"] for index in indices])
    batch_claims = np.asarray([data[index]["claim"] for index in indices])
    batch_evidences = batch_evidences[:,:200]
    batch_claims = batch_claims[:,:10]
    y = [data[index]["class"] for index in indices]
    return batch_claims, batch_evidences, y

In [10]:
from apex.fp16_utils import FP16_Optimizer
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

FP16_Optimizer processing param group 0:
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([1536, 1536])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([30522, 768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([512, 768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([2, 768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([768, 768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([768, 768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([768, 768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([768])
FP16_Optimizer received torch.cuda.FloatTensor with torch.Size([768, 768])
FP16_Op

In [11]:
def train(epochs = 1, batch_size=4, batches_per_epoch=100):
    for k in range(epochs):
        losses = []
        for i in range(batches_per_epoch):            
            claim, evidence, y = getBatch(bs=batch_size)
            claim = torch.LongTensor(claim).cuda()
            evidence = torch.LongTensor(evidence).cuda()
            y = torch.FloatTensor(y).cuda()
            #print(y)
            output = verifier(claim, evidence)
            """
            with torch.no_grad():
                print(F.sigmoid(output))
            """
            loss = loss_fn(output, y)
            losses.append(loss.data.item())
            if (mixed_precision):
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            losses = losses[-100:]
            print("Epoch:", str(k), 
                  "; Batch:", str(i), 
                  "; Average Loss:", str(np.round(np.mean(losses), 7)), 
                  end="\r")

In [12]:
epochs = 10
bpe = 1000
batch_size = 16
if (mixed_precision):
    batch_size = 70
train(epochs= epochs, batches_per_epoch=bpe, batch_size=batch_size)

  self.dropout, self.training, self.bidirectional, self.batch_first)


Epoch: 2 ; Batch: 162 ; Average Loss: 0.6908445

KeyboardInterrupt: 