In [0]:
import numpy as np
import os, tqdm, time, json
import torch
import matplotlib.pyplot as plt
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset 

In [0]:
from tokenization import FullTokenizer
from Bert import *
from scorer import fever_score

In [0]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [0]:
# Locations of the pre-trained BERT checkpoint and its vocabulary, plus the
# file name used when saving the fine-tuned weights.
weights_path = (
    "/content/drive/My Drive/NN-NLP-Project-Data/"
    "uncased_L-12_H-768_A-12/bert_model.ckpt"
)
vocab_file = (
    "/content/drive/My Drive/NN-NLP-Project-Data/"
    "uncased_L-12_H-768_A-12/vocab.txt"
)
model_name = "ClaimVerification"

In [0]:
class SentenceDataset(Dataset):
    """Map-style dataset over five parallel sequences of model inputs.

    Item ``i`` is the tuple
    ``(tok_ip[i], sent_ip[i], pos_ip[i], masks[i], y[i])``.
    """

    def __init__(self, tok_ip, sent_ip, pos_ip, masks, y):
        self.tok_ip, self.sent_ip, self.pos_ip = tok_ip, sent_ip, pos_ip
        self.masks, self.y = masks, y

    def __len__(self):
        # The label sequence defines how many examples there are.
        return len(self.y)

    def __getitem__(self, index):
        fields = (self.tok_ip, self.sent_ip, self.pos_ip, self.masks, self.y)
        return tuple(field[index] for field in fields)

In [0]:
class ClaimVerification(nn.Module):
    """Encoder stack with a 3-way linear classification head.

    The input is embedded, passed through ``config.num_encoders`` encoder
    layers in sequence, and the vector at index 0 of the second axis
    (presumably the [CLS] position — confirm against the tokenisation) is
    projected to three logits.
    """

    def __init__(self, config):
        super().__init__()
        # NOTE: the attribute name (including its spelling) determines the
        # parameter names in the state dict, so it is kept unchanged.
        self.enbedding_layer = EmbeddingLayer(config)
        layers = [EncoderLayer(config) for _ in range(config.num_encoders)]
        self.encoders = nn.ModuleList(layers)
        self.output = nn.Linear(config.emb_dim, 3)

    def forward(self, token_ip, sent_ip, pos_ip, mask=None):
        """Return the class logits for a batch of inputs."""
        hidden = self.enbedding_layer(token_ip, sent_ip, pos_ip)
        for layer in self.encoders:
            hidden = layer(hidden, mask)
        # Classify from the first position of every sequence.
        first_position = hidden[:, 0]
        return self.output(first_position)

In [0]:
def load_data(fname):
    """Read a JSON-lines file of (claim, candidate sentence) pairs.

    Every non-blank line must be a JSON object with the keys ``claim``,
    ``sentence``, ``label``, ``id``, ``doc`` and ``sid``.

    Args:
        fname: Path of the UTF-8 encoded JSON-lines file.

    Returns:
        A 4-tuple of parallel lists:
        ``data`` — ``["[CLS]<claim>[SEP]", "<sentence>[SEP]"]`` per example;
        ``labels`` — integer label (UNK=-1, NOT ENOUGH INFO=0, SUPPORTS=1,
        REFUTES=2);
        ``claim_ids`` — the ``id`` field of each example;
        ``predicted_evidence`` — ``[doc, sid]`` per example.

    Raises:
        KeyError: If a record lacks a required key or has an unknown label.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    label_dict = {
        'UNK': -1,
        'NOT ENOUGH INFO': 0,
        'SUPPORTS': 1,
        'REFUTES': 2,
    }
    data = []
    claim_ids = []
    labels = []
    predicted_evidence = []
    # The context manager closes the file even when a record fails to parse.
    with open(fname, encoding='utf8') as f:
        for raw_line in f:
            # Tolerate blank lines instead of failing in json.loads.
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            data.append([
                "[CLS]" + record['claim'] + "[SEP]",
                record['sentence'] + "[SEP]",
            ])
            labels.append(label_dict[record['label']])
            claim_ids.append(record['id'])
            predicted_evidence.append([record['doc'], record['sid']])

    return data, labels, claim_ids, predicted_evidence

In [0]:
def preprocess(data, max_len=128):
    """Convert sentence pairs into fixed-length model input arrays.

    Args:
        data: Sequence of ``[first_text, second_text]`` pairs, as produced by
            ``load_data`` (special-token markers already embedded in the text).
        max_len: Length every sequence is padded or truncated to. Defaults to
            128, the value this notebook has always used.

    Returns:
        ``(tok_ip, sent_ip, pos_ip, masks)`` where the first three are int16
        arrays of shape ``(len(data), max_len)`` holding token ids, segment
        ids (0 for the first text and for padding, 1 for the second text) and
        position ids ``0..max_len-1``; ``masks`` is a float32 array of shape
        ``(len(data), 1, 1, max_len)`` with 1 on real tokens and 0 on padding.
    """
    tokenizer = FullTokenizer(vocab_file)
    n_examples = len(data)
    # NOTE(review): int16 storage assumes every vocabulary id is < 32768 — confirm.
    tok_ip = np.zeros((n_examples, max_len), dtype="int16")
    sent_ip = np.zeros((n_examples, max_len), dtype="int16")
    pos_ip = np.zeros((n_examples, max_len), dtype="int16")
    masks = np.zeros((n_examples, max_len), dtype="float32")

    # Position ids are identical for every example, so fill them once.
    pos_ip[:] = np.arange(max_len)

    # Wrapping `data` (not `enumerate(data)`) lets the progress bar show a total.
    for pos, text in enumerate(tqdm.tqdm_notebook(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        if len(tok) > max_len:
            # Truncate but keep a closing separator as the last token.
            tok = tok[:max_len - 1] + ["[SEP]"]
        tok_len = len(tok)
        # The first text alone may exceed the limit; clamp its length so the
        # segment boundary never lies past the truncated sequence.
        tok0_len = min(len(tok0), tok_len)

        tok_ip[pos, :tok_len] = tokenizer.convert_tokens_to_ids(tok)
        # Segment ids: previously computed but never stored, which left every
        # example with all-zero segment ids.
        sent_ip[pos, tok0_len:tok_len] = 1
        masks[pos, :tok_len] = 1

    # Add two singleton axes: (N, max_len) -> (N, 1, 1, max_len).
    masks = masks[:, None, None, :]
    return tok_ip, sent_ip, pos_ip, masks

In [0]:
# Raw examples are always re-read: claim ids and predicted evidence are not cached.
data, labels, ids, predicted_evidence = load_data("train-data.jsonl")

# Use the cache only when every array is present. Position ids live in their
# own file (train-pos.npy); a cache directory that lacks it is regenerated, so
# segment ids and position ids can never share train-sent.npy.
if all(os.path.exists("train/train-%s.npy" % part)
       for part in ("tok", "sent", "pos", "masks", "labels")):
    tok_ip = np.load("train/train-tok.npy")
    sent_ip = np.load("train/train-sent.npy")
    pos_ip = np.load("train/train-pos.npy")
    masks = np.load("train/train-masks.npy")
    labels = np.load("train/train-labels.npy")
else:
    tok_ip, sent_ip, pos_ip, masks = preprocess(data)
    labels = np.array(labels)
    # exist_ok: the directory may be left over from an interrupted run.
    os.makedirs("train", exist_ok=True)
    np.save("train/train-tok.npy", tok_ip)
    np.save("train/train-sent.npy", sent_ip)
    np.save("train/train-pos.npy", pos_ip)
    np.save("train/train-masks.npy", masks)
    np.save("train/train-labels.npy", labels)

In [0]:
# Raw examples are always re-read: claim ids and predicted evidence are not cached.
data_dev, labels_dev, ids_dev, predicted_evidence_dev = load_data("dev-data.jsonl")

# Use the cache only when every array is present; otherwise rebuild all of it.
if all(os.path.exists("dev/dev-%s.npy" % part)
       for part in ("tok", "sent", "pos", "masks", "labels")):
    tok_ip_dev = np.load("dev/dev-tok.npy")
    sent_ip_dev = np.load("dev/dev-sent.npy")
    pos_ip_dev = np.load("dev/dev-pos.npy")
    masks_dev = np.load("dev/dev-masks.npy")
    labels_dev = np.load("dev/dev-labels.npy")
else:
    tok_ip_dev, sent_ip_dev, pos_ip_dev, masks_dev = preprocess(data_dev)
    labels_dev = np.array(labels_dev)
    # exist_ok: the directory may be left over from an interrupted run.
    os.makedirs("dev", exist_ok=True)
    np.save("dev/dev-tok.npy", tok_ip_dev)
    np.save("dev/dev-sent.npy", sent_ip_dev)
    np.save("dev/dev-pos.npy", pos_ip_dev)
    np.save("dev/dev-masks.npy", masks_dev)
    np.save("dev/dev-labels.npy", labels_dev)

In [0]:
# Raw examples are always re-read: claim ids and predicted evidence are not cached.
data_test, labels_test, ids_test, predicted_evidence_test = load_data("test-data.jsonl")

# Use the cache only when every array is present; otherwise rebuild all of it.
if all(os.path.exists("test/test-%s.npy" % part)
       for part in ("tok", "sent", "pos", "masks", "labels")):
    tok_ip_test = np.load("test/test-tok.npy")
    sent_ip_test = np.load("test/test-sent.npy")
    pos_ip_test = np.load("test/test-pos.npy")
    masks_test = np.load("test/test-masks.npy")
    labels_test = np.load("test/test-labels.npy")
else:
    tok_ip_test, sent_ip_test, pos_ip_test, masks_test = preprocess(data_test)
    labels_test = np.array(labels_test)
    # exist_ok: the directory may be left over from an interrupted run.
    os.makedirs("test", exist_ok=True)
    np.save("test/test-tok.npy", tok_ip_test)
    np.save("test/test-sent.npy", sent_ip_test)
    np.save("test/test-pos.npy", pos_ip_test)
    np.save("test/test-masks.npy", masks_test)
    np.save("test/test-labels.npy", labels_test)

In [0]:
def train(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader``.

    Puts the model in training mode, takes one optimiser step per batch,
    prints the mean batch loss and returns it.
    """
    model.train()
    running_loss = 0
    for batch in tqdm.tqdm_notebook(loader):
        tokens, segments, positions, attn_mask, targets = batch
        # Index inputs are converted to 64-bit integers before the forward pass.
        tokens = tokens.long().to(device)
        segments = segments.long().to(device)
        positions = positions.long().to(device)
        attn_mask = attn_mask.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(tokens, segments, positions, attn_mask)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    mean_loss = running_loss/len(loader)
    print ("Loss:", mean_loss)

    return mean_loss

In [0]:
def test(model, loader):
    """Predict a class index for every example in ``loader``.

    Args:
        model: Module called as ``model(tok_ip, sent_ip, pos_ip, masks)`` and
            returning one row of class scores per example.
        loader: Iterable of ``(tok_ip, sent_ip, pos_ip, masks, y)`` batches;
            the labels ``y`` are not used.

    Returns:
        1-D numpy array with the arg-max class index of each example, in
        loader order.
    """
    model.eval()
    outputs = []
    # Inference only: no gradients are needed and no optimiser is involved.
    with torch.no_grad():
        for tok_ip, sent_ip, pos_ip, masks, _ in tqdm.tqdm_notebook(loader):
            # Cast index inputs to long, matching what train() does.
            tok_ip = tok_ip.long().to(device)
            sent_ip = sent_ip.long().to(device)
            pos_ip = pos_ip.long().to(device)
            masks = masks.to(device)
            output = model(tok_ip, sent_ip, pos_ip, masks)
            outputs.extend(output.argmax(dim=1).cpu().numpy())

    return np.asarray(outputs)

In [0]:
# Merge predictions for each claim
def _claim_label(stats):
    """Map per-class counts [NEI, Supports, Refutes] to one claim label."""
    # Label Assignment according to rules mentioned in paper
    if stats[1] > 0:
        return "SUPPORTS"
    if stats[2] > 0:
        return "REFUTES"
    return "NOT ENOUGH INFO"


def merge_preds(preds, ids, predicted_evidence):
    """Collapse sentence-level predictions into one prediction per claim.

    Consecutive entries with the same id are treated as one claim. A claim is
    SUPPORTS if any of its sentences is predicted 1, otherwise REFUTES if any
    is predicted 2, otherwise NOT ENOUGH INFO.

    Args:
        preds: Class index (0=NEI, 1=Supports, 2=Refutes) per sentence.
        ids: Claim id per sentence; sentences of one claim must be adjacent.
        predicted_evidence: Evidence entry per sentence.

    Returns:
        ``(merged_preds, merged_evidence)``: the label string of every claim
        and, in parallel, the list of all evidence entries of that claim.
        Both lists are empty when ``ids`` is empty.
    """
    merged_preds = []
    merged_evidence = []
    if len(ids) == 0:
        return merged_preds, merged_evidence

    cur_id = ids[0]
    # Indices represent NEI, Supports, Refutes
    stats = [0, 0, 0]
    evidence_line = []
    for i in range(len(ids)):
        if ids[i] != cur_id:
            # A new claim starts: emit the finished one and reset.
            merged_preds.append(_claim_label(stats))
            merged_evidence.append(evidence_line)
            cur_id = ids[i]
            stats = [0, 0, 0]
            evidence_line = []
        stats[preds[i]] += 1
        # Every sentence contributes its evidence, including the first one of
        # each claim (which used to be dropped).
        evidence_line.append(predicted_evidence[i])

    # Emit the last claim.
    merged_preds.append(_claim_label(stats))
    merged_evidence.append(evidence_line)
    return merged_preds, merged_evidence

In [0]:
# Make final json with id, label, predicted_label, evidence and predicted_evidence
def format_output(in_path, out_path, preds, evidence, dev=True):
    """Write one JSON result per input line and optionally score the results.

    Line ``i`` of ``in_path`` is paired with ``preds[i]`` and ``evidence[i]``.
    With ``dev`` set, the gold ``label`` and ``evidence`` of each input record
    are copied to the output and the FEVER metrics are printed.
    """
    outputs = []
    with open(in_path, 'r', encoding='utf8') as f:
        for idx, raw in enumerate(f):
            record = json.loads(raw.strip())
            row = {'id': record['id']}
            if dev:
                row['label'] = record['label']
            row['predicted_label'] = preds[idx]
            row['predicted_evidence'] = evidence[idx]
            if dev:
                row['evidence'] = record['evidence']
            outputs.append(row)

    # Calculate Fever score for dev set
    if dev:
        print('Dev Set Results')
        fever_sc, label_accuracy, precision, recall, f1 = fever_score(outputs)
        report = (
            ('Fever Score: ', fever_sc),
            ('Label Accuracy: ', label_accuracy),
            ('Precision: ', precision),
            ('Recall: ', recall),
            ('F1 Score: ', f1),
        )
        for caption, value in report:
            print(caption, value)

    # Write final predictions to file
    with open(out_path, 'w', encoding='utf8') as f:
        for row in outputs:
            f.write(json.dumps(row) + "\n")

In [0]:
# Training split: shuffled every epoch, batches of 32.
train_dataset = SentenceDataset(
    tok_ip=tok_ip, sent_ip=sent_ip, pos_ip=pos_ip, masks=masks, y=labels
)
train_loader = DataLoader(
    dataset=train_dataset, batch_size=32, shuffle=True, num_workers=8
)

In [0]:
# Dev split: kept in file order so predictions line up with ids_dev.
dev_dataset = SentenceDataset(
    tok_ip=tok_ip_dev, sent_ip=sent_ip_dev, pos_ip=pos_ip_dev,
    masks=masks_dev, y=labels_dev,
)
dev_loader = DataLoader(
    dataset=dev_dataset, batch_size=32, shuffle=False, num_workers=8
)

In [0]:
# Test split: kept in file order so predictions line up with ids_test.
test_dataset = SentenceDataset(
    tok_ip=tok_ip_test, sent_ip=sent_ip_test, pos_ip=pos_ip_test,
    masks=masks_test, y=labels_test,
)
test_loader = DataLoader(
    dataset=test_dataset, batch_size=32, shuffle=False, num_workers=8
)

In [0]:
# Build the classifier and initialise it from the checkpoint at weights_path.
# (Config and load_model come from the project's Bert module; presumably
# load_model copies the pre-trained BERT weights into the model — confirm.)
config = Config()
model = ClaimVerification(config)
load_model(model, weights_path)
# Three-way classification loss over the logits returned by the model.
criterion = nn.CrossEntropyLoss()
# NOTE(review): lr=1e-3 is far above the rates usually used to fine-tune BERT
# (around 2e-5 to 5e-5) — confirm this value is intended.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Kept as the last expression so the notebook displays the module summary.
model.to(device)

In [0]:
# Train
# Fit on the training split (not the dev split, which is used for evaluation
# below) and checkpoint the weights after every epoch.
for i in range(1):
    x = train(model, train_loader, criterion, optimizer)
    torch.save(model.state_dict(), model_name)

In [0]:
# Dev Set
# Predict per sentence, merge per claim, then score and write the results.
preds = test(model=model, loader=dev_loader)
merged_preds, merged_evidence = merge_preds(
    preds=preds, ids=ids_dev, predicted_evidence=predicted_evidence_dev
)
format_output(
    in_path='dev.jsonl',
    out_path='dev_results.txt',
    preds=merged_preds,
    evidence=merged_evidence,
    dev=True,
)

In [0]:
# Test Set
# Predictions must be merged with the test split's own claim ids and evidence;
# using the dev ones misaligns every claim.
preds = test(model, test_loader)
merged_preds, merged_evidence = merge_preds(preds, ids_test, predicted_evidence_test)
# dev=False: only predictions are written, with no gold-label copy or scoring.
# NOTE(review): assumes test.jsonl carries no gold label/evidence — confirm.
format_output('test.jsonl', 'test_results.txt', merged_preds, merged_evidence, dev=False)