In [1]:
import numpy as np
import os, tqdm, time, json
import torch
import matplotlib.pyplot as plt
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset 

In [2]:
from tokenization import FullTokenizer
from Bert import *

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Use CUDA when PyTorch reports it as available; otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Checkpoint handed to load_model() to initialise the network.
# NOTE(review): the directory name looks like the uncased 12-layer / 768-hidden / 12-head
# BERT release — confirm against the actual download.
weights_path = "../uncased_L-12_H-768_A-12/bert_model.ckpt"
# Vocabulary file handed to FullTokenizer inside preprocess().
vocab_file = "../uncased_L-12_H-768_A-12/vocab.txt"
# File name under which the training cell saves model.state_dict().
model_name = "SentenceRetrieval"

In [5]:
class SentenceDataset(Dataset):
    """Index-aligned view over the per-example model inputs and their labels.

    Every constructor argument is an indexable sequence. Item ``i`` of the dataset is
    the 5-tuple built from element ``i`` of each sequence, in constructor order:
    ``(tok_ip[i], sent_ip[i], pos_ip[i], masks[i], y[i])``.
    """

    def __init__(self, tok_ip, sent_ip, pos_ip, masks, y):
        self.tok_ip, self.sent_ip, self.pos_ip = tok_ip, sent_ip, pos_ip
        self.masks, self.y = masks, y

    def __len__(self):
        # The label sequence defines how many examples the dataset exposes.
        n_examples = len(self.y)
        return n_examples

    def __getitem__(self, index):
        columns = (self.tok_ip, self.sent_ip, self.pos_ip, self.masks, self.y)
        return tuple(column[index] for column in columns)

In [6]:
class SentenceRetrieval(nn.Module):
    """Transformer-encoder classifier that scores a (claim, sentence) pair with 2 logits.

    The network is an ``EmbeddingLayer`` followed by ``config.num_encoders``
    ``nn.TransformerEncoderLayer`` blocks and a linear head applied to the hidden
    state of the first token of every sequence.

    Args:
        config: object exposing ``emb_dim``, ``num_heads``, ``hidden_dim`` and
            ``num_encoders`` (it is also forwarded to ``EmbeddingLayer``).

    NOTE(review): the encoder layers use PyTorch's default activation and LayerNorm
    epsilon; if the loaded checkpoint was trained with different settings the
    outputs will not match the original model exactly — confirm.
    """

    def __init__(self, config):
        super().__init__()
        # The attribute keeps its historical spelling ("enbedding") because
        # load_model() and the state_dict keys refer to it by this name.
        self.enbedding_layer = EmbeddingLayer(config)
        self.encoders = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=config.emb_dim,
                nhead=config.num_heads,
                dim_feedforward=config.hidden_dim,
            )
            for _ in range(config.num_encoders)
        ])
        self.output = nn.Linear(config.emb_dim, 2)

    def forward(self, token_ip, sent_ip, pos_ip, mask=None):
        """Return one pair of logits per sequence.

        Args:
            token_ip, sent_ip, pos_ip: index tensors forwarded unchanged to the
                embedding layer. The embedding output is treated as batch-first,
                (batch, seq, emb), because the first token is read with ``[:, 0]``.
            mask: optional tensor whose first dimension is the batch and whose
                remaining dimensions flatten to the sequence length, e.g.
                (batch, 1, 1, seq) as built by ``preprocess``. Non-zero marks a real
                token, zero marks padding.

        Returns:
            Tensor of shape (batch, 2).
        """
        embeddings = self.enbedding_layer(token_ip, sent_ip, pos_ip)

        # nn.TransformerEncoderLayer expects padding as a (batch, seq) boolean
        # key-padding mask in which True means "ignore this position".
        padding_mask = None
        if mask is not None:
            padding_mask = mask.reshape(mask.shape[0], -1) == 0

        # The encoder layers were built with the default (seq, batch, emb) layout,
        # so transpose on the way in and back again on the way out. Without this
        # the attention would mix examples across the batch.
        hidden = embeddings.transpose(0, 1)
        for encoder in self.encoders:
            hidden = encoder(hidden, src_key_padding_mask=padding_mask)
        hidden = hidden.transpose(0, 1)

        # Classify from the hidden state of the first token of each sequence.
        return self.output(hidden[:, 0])

In [7]:
def load_model(model, checkpoint_file):
    """Populate ``model``'s parameters from the variables stored in ``checkpoint_file``.

    Each parameter tensor is paired with the name of the checkpoint variable it
    should receive. The pairs are handed to ``load_param`` (one variable per tensor)
    and ``load_param_num`` (a list of variables per tensor), both of which come from
    the project's ``Bert`` module.
    NOTE(review): presumably ``load_param_num`` concatenates the query/key/value
    variables into the fused ``in_proj`` tensors — confirm against its implementation.
    The classification head (``model.output``) is not touched here.
    """
    # Embedding tables and their layer norm.
    emb = model.enbedding_layer
    emb_scope = 'bert/embeddings/'
    embedding_map = {
        emb.token_embeddings.weight: emb_scope + "word_embeddings",
        emb.positional_embeddings.weight: emb_scope + "position_embeddings",
        emb.sentence_embeddings.weight: emb_scope + "token_type_embeddings",
        emb.layer_norm.weight: emb_scope + "LayerNorm/gamma",
        emb.layer_norm.bias: emb_scope + "LayerNorm/beta",
    }
    load_param(checkpoint_file, embedding_map)

    # One encoder block per checkpoint scope "bert/encoder/layer_<idx>/".
    for layer_idx, layer in enumerate(model.encoders):
        scope = "bert/encoder/layer_%d/" % layer_idx
        attn = layer.self_attn

        single_variable_map = {
            attn.out_proj.weight: scope + "attention/output/dense/kernel",
            attn.out_proj.bias: scope + "attention/output/dense/bias",
            layer.linear1.weight: scope + "intermediate/dense/kernel",
            layer.linear1.bias: scope + "intermediate/dense/bias",
            layer.linear2.weight: scope + "output/dense/kernel",
            layer.linear2.bias: scope + "output/dense/bias",
            layer.norm1.weight: scope + "attention/output/LayerNorm/gamma",
            layer.norm1.bias: scope + "attention/output/LayerNorm/beta",
            layer.norm2.weight: scope + "output/LayerNorm/gamma",
            layer.norm2.bias: scope + "output/LayerNorm/beta",
        }
        load_param(checkpoint_file, single_variable_map)

        # The fused attention input projection is fed from three checkpoint
        # variables, listed in query / key / value order.
        fused_qkv_map = {
            attn.in_proj_weight: [
                scope + "attention/self/query/kernel",
                scope + "attention/self/key/kernel",
                scope + "attention/self/value/kernel",
            ],
            attn.in_proj_bias: [
                scope + "attention/self/query/bias",
                scope + "attention/self/key/bias",
                scope + "attention/self/value/bias",
            ],
        }
        load_param_num(checkpoint_file, fused_qkv_map)

In [8]:
# Build the classifier from a default-constructed Config and show its module tree.
config = Config()
model = SentenceRetrieval(config)
print(model)

SentenceRetrieval(
  (enbedding_layer): EmbeddingLayer(
    (token_embeddings): Embedding(30522, 768)
    (sentence_embeddings): Embedding(2, 768)
    (positional_embeddings): Embedding(512, 768)
    (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (emb_dropout): Dropout(p=0.1, inplace=False)
  )
  (encoders): ModuleList(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (linear1): Linear(in_features=768, out_features=3072, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=3072, out_features=768, bias=True)
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerEncoderLayer(
      (self_attn): MultiheadAttentio

In [9]:
# List the names of all parameters that will receive gradients.
for trainable_name in (n for n, p in model.named_parameters() if p.requires_grad):
    print(trainable_name)

enbedding_layer.token_embeddings.weight
enbedding_layer.sentence_embeddings.weight
enbedding_layer.positional_embeddings.weight
enbedding_layer.layer_norm.weight
enbedding_layer.layer_norm.bias
encoders.0.self_attn.in_proj_weight
encoders.0.self_attn.in_proj_bias
encoders.0.self_attn.out_proj.weight
encoders.0.self_attn.out_proj.bias
encoders.0.linear1.weight
encoders.0.linear1.bias
encoders.0.linear2.weight
encoders.0.linear2.bias
encoders.0.norm1.weight
encoders.0.norm1.bias
encoders.0.norm2.weight
encoders.0.norm2.bias
encoders.1.self_attn.in_proj_weight
encoders.1.self_attn.in_proj_bias
encoders.1.self_attn.out_proj.weight
encoders.1.self_attn.out_proj.bias
encoders.1.linear1.weight
encoders.1.linear1.bias
encoders.1.linear2.weight
encoders.1.linear2.bias
encoders.1.norm1.weight
encoders.1.norm1.bias
encoders.1.norm2.weight
encoders.1.norm2.bias
encoders.2.self_attn.in_proj_weight
encoders.2.self_attn.in_proj_bias
encoders.2.self_attn.out_proj.weight
encoders.2.self_attn.out_proj.b

In [10]:
# Initialise the freshly built model from the checkpoint at weights_path.
load_model(model, checkpoint_file=weights_path)

In [7]:
# def load_data(fname):
#     f = open(fname, encoding='utf8')
#     data = []
#     labels = []
#     for line in f:
#         line = json.loads(line)
#         sentence = ["[CLS]" + line['claim'] + "[SEP]", line['sentence'] + "[SEP]"]
#         label = line['is_evidence']
#         data.append(sentence)
#         labels.append(label)
#     f.close()
    
#     return data, labels

In [15]:
def load_data(fname):
    """Read a JSON-lines file of (claim, candidate sentence) records.

    Every non-blank line must be a JSON object with the keys ``claim``, ``doc``,
    ``sentence``, ``sid``, ``id``, ``is_evidence`` and ``label``.

    Args:
        fname: path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A 4-tuple of parallel lists, one entry per record:
        * data: ``["[CLS]<claim>[SEP]", "<doc> <sentence>[SEP]"]`` text pairs,
        * labels: the ``is_evidence`` values,
        * claim_ids: the ``id`` values,
        * predicted_evidence: ``[doc, sid, claim, sentence, label]`` lists.

    Raises:
        KeyError: if a record lacks one of the required keys.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data, labels, claim_ids, predicted_evidence = [], [], [], []

    # The context manager closes the file even when a line fails to parse.
    with open(fname, encoding='utf8') as f:
        for raw_line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            claim, doc, sentence = record['claim'], record['doc'], record['sentence']

            data.append(["[CLS]" + claim + "[SEP]", doc + " " + sentence + "[SEP]"])
            labels.append(record['is_evidence'])
            claim_ids.append(record['id'])
            predicted_evidence.append([doc, record['sid'], claim, sentence, record['label']])

    return data, labels, claim_ids, predicted_evidence

In [5]:
def preprocess(data, max_len=512):
    """Tokenise text pairs into fixed-length model inputs.

    Args:
        data: sequence of ``[first_text, second_text]`` pairs.
        max_len: length every example is padded or truncated to (default 512).

    Returns:
        ``(tok_ip, sent_ip, pos_ip, masks)`` as NumPy arrays:
        * tok_ip  (N, max_len) int32 — token ids, zero-padded on the right,
        * sent_ip (N, max_len) int8  — 0 for tokens of the first text and for
          padding, 1 for tokens of the second text,
        * pos_ip  (N, max_len) int16 — positions ``0 .. max_len - 1``,
        * masks   (N, 1, 1, max_len) int8 — 1 for real tokens, 0 for padding.

    NOTE(review): the "[CLS]" / "[SEP]" markers arrive embedded in the raw text;
    whether ``FullTokenizer.tokenize`` keeps them as single tokens depends on the
    project's ``tokenization`` module — confirm.
    """
    tokenizer = FullTokenizer(vocab_file)
    n_examples = len(data)
    tok_ip = np.zeros((n_examples, max_len), dtype="int32")
    sent_ip = np.zeros((n_examples, max_len), dtype="int8")
    # int16 rather than int8: positions run up to max_len - 1 (511 by default),
    # which silently wraps to negative values in an int8 array.
    pos_ip = np.zeros((n_examples, max_len), dtype="int16")
    masks = np.zeros((n_examples, max_len), dtype="int8")
    positions = np.arange(max_len)

    for row, text in tqdm.tqdm_notebook(enumerate(data), total=n_examples):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        if len(tok) > max_len:
            # Truncate, but keep a closing separator as the final token.
            tok = tok[:max_len - 1] + ["[SEP]"]
        tok_len = len(tok)
        # After truncation the first segment cannot be longer than the sequence.
        tok0_len = min(len(tok0), tok_len)

        # The arrays are zero-initialised, so only the non-padding span is written.
        tok_ip[row, :tok_len] = tokenizer.convert_tokens_to_ids(tok)
        sent_ip[row, tok0_len:tok_len] = 1  # previously computed but never stored
        pos_ip[row] = positions
        masks[row, :tok_len] = 1

    # Add two singleton axes: (N, max_len) -> (N, 1, 1, max_len).
    masks = masks[:, None, None, :]
    return tok_ip, sent_ip, pos_ip, masks

In [9]:
data_train, labels_train, ids_train, predicted_evidence_train = load_data("train-data.jsonl")

# One cache file per array. The position ids get their own file: they used to be
# written to (and read back from) train-sent.npy, which overwrote the segment ids.
train_cache = {
    "tok": "train/train-tok.npy",
    "sent": "train/train-sent.npy",
    "pos": "train/train-pos.npy",
    "masks": "train/train-masks.npy",
    "labels": "train/train-labels.npy",
}

# Rebuild the cache unless every file is present, so an interrupted or outdated
# run cannot leave a partial cache behind. Delete the directory to force a rebuild.
if not all(os.path.exists(path) for path in train_cache.values()):
    tok_ip, sent_ip, pos_ip, masks = preprocess(data_train)
    labels = np.array(labels_train)
    os.makedirs("train", exist_ok=True)  # the directory may survive an aborted run
    np.save(train_cache["tok"], tok_ip)
    np.save(train_cache["sent"], sent_ip)
    np.save(train_cache["pos"], pos_ip)
    np.save(train_cache["masks"], masks)
    np.save(train_cache["labels"], labels)
else:
    tok_ip = np.load(train_cache["tok"])
    sent_ip = np.load(train_cache["sent"])
    pos_ip = np.load(train_cache["pos"])
    masks = np.load(train_cache["masks"])
    labels = np.load(train_cache["labels"])

KeyboardInterrupt: 

In [10]:
data_dev, labels_dev, ids_dev, predicted_evidence_dev = load_data("dev-data.jsonl")

# One cache file per array produced by preprocess(), plus the labels.
dev_cache = {
    "tok": "dev/dev-tok.npy",
    "sent": "dev/dev-sent.npy",
    "pos": "dev/dev-pos.npy",
    "masks": "dev/dev-masks.npy",
    "labels": "dev/dev-labels.npy",
}

# Rebuild the cache unless every file is present, so an interrupted run cannot
# leave a partial cache behind. Delete the directory to force a rebuild.
if not all(os.path.exists(path) for path in dev_cache.values()):
    tok_ip_dev, sent_ip_dev, pos_ip_dev, masks_dev = preprocess(data_dev)
    labels_dev = np.array(labels_dev)
    os.makedirs("dev", exist_ok=True)  # the directory may survive an aborted run
    np.save(dev_cache["tok"], tok_ip_dev)
    np.save(dev_cache["sent"], sent_ip_dev)
    np.save(dev_cache["pos"], pos_ip_dev)
    np.save(dev_cache["masks"], masks_dev)
    np.save(dev_cache["labels"], labels_dev)
else:
    tok_ip_dev = np.load(dev_cache["tok"])
    sent_ip_dev = np.load(dev_cache["sent"])
    pos_ip_dev = np.load(dev_cache["pos"])
    masks_dev = np.load(dev_cache["masks"])
    labels_dev = np.load(dev_cache["labels"])




KeyboardInterrupt: 

In [None]:
data_test, labels_test, ids_test, predicted_evidence_test = load_data("test-data.jsonl")

# One cache file per array produced by preprocess(), plus the labels.
test_cache = {
    "tok": "test/test-tok.npy",
    "sent": "test/test-sent.npy",
    "pos": "test/test-pos.npy",
    "masks": "test/test-masks.npy",
    "labels": "test/test-labels.npy",
}

# Rebuild the cache unless every file is present, so an interrupted run cannot
# leave a partial cache behind. Delete the directory to force a rebuild.
if not all(os.path.exists(path) for path in test_cache.values()):
    tok_ip_test, sent_ip_test, pos_ip_test, masks_test = preprocess(data_test)
    labels_test = np.array(labels_test)
    os.makedirs("test", exist_ok=True)  # the directory may survive an aborted run
    np.save(test_cache["tok"], tok_ip_test)
    np.save(test_cache["sent"], sent_ip_test)
    np.save(test_cache["pos"], pos_ip_test)
    np.save(test_cache["masks"], masks_test)
    np.save(test_cache["labels"], labels_test)
else:
    tok_ip_test = np.load(test_cache["tok"])
    sent_ip_test = np.load(test_cache["sent"])
    pos_ip_test = np.load(test_cache["pos"])
    masks_test = np.load(test_cache["masks"])
    labels_test = np.load(test_cache["labels"])

In [None]:
def train(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: network called as ``model(tok_ip, sent_ip, pos_ip, masks)``.
        loader: iterable yielding ``(tok_ip, sent_ip, pos_ip, masks, y)`` batches.
        criterion: loss called as ``criterion(logits, y)``.
        optimizer: optimizer stepped once per batch.

    Returns:
        The average of the per-batch losses as a float, or NaN when ``loader``
        yields no batches. The value is also printed.

    Tensors are moved to the module-level ``device``.
    """
    model.train()
    loss_epoch = 0.0
    num_batches = 0
    for tok_ip, sent_ip, pos_ip, masks, y in tqdm.tqdm_notebook(loader):
        optimizer.zero_grad()
        # Index tensors are cast to int64; the mask is cast to float32.
        tok_ip = tok_ip.to(device=device, dtype=torch.long)
        sent_ip = sent_ip.to(device=device, dtype=torch.long)
        pos_ip = pos_ip.to(device=device, dtype=torch.long)
        masks = masks.to(device=device, dtype=torch.float)
        # Class-index targets must be int64 for nn.CrossEntropyLoss; the labels may
        # come from NumPy as bool or a narrower integer type.
        y = y.to(device=device, dtype=torch.long)

        logits = model(tok_ip, sent_ip, pos_ip, masks)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        loss_epoch += loss.item()
        num_batches += 1

    # Avoid ZeroDivisionError on an empty loader; NaN makes the problem visible.
    mean_loss = loss_epoch / num_batches if num_batches else float("nan")
    print ("Loss:", mean_loss)

    return mean_loss

In [10]:
def test(model, loader):
    """Run ``model`` over ``loader`` in evaluation mode and collect its predictions.

    Args:
        model: network called as ``model(tok_ip, sent_ip, pos_ip, masks)`` and
            returning one row of class logits per example.
        loader: iterable yielding ``(tok_ip, sent_ip, pos_ip, masks, y)`` batches;
            the labels are ignored.

    Returns:
        ``(outputs, scores)`` as NumPy arrays in loader order: ``outputs`` holds the
        arg-max class per example and ``scores`` holds the raw logit of class 1.

    Tensors are moved to the module-level ``device``.
    """
    model.eval()
    outputs = []
    scores = []
    # No gradients are needed for inference, and no optimizer is involved.
    with torch.no_grad():
        for tok_ip, sent_ip, pos_ip, masks, _ in tqdm.tqdm_notebook(loader):
            # Same dtype conversions as in train(): int64 indices, float32 mask.
            tok_ip = tok_ip.to(device=device, dtype=torch.long)
            sent_ip = sent_ip.to(device=device, dtype=torch.long)
            pos_ip = pos_ip.to(device=device, dtype=torch.long)
            masks = masks.to(device=device, dtype=torch.float)

            logits = model(tok_ip, sent_ip, pos_ip, masks).cpu()

            scores.extend(logits[:, 1].numpy())
            outputs.extend(logits.argmax(dim=1).numpy())

    return np.asarray(outputs), np.asarray(scores)

In [12]:
# Merge predictions for each claim
def get_top_5(preds, scores, ids, predicted_evidence, k=5):
    """Keep the ``k`` best-scoring positive predictions for every claim.

    Args:
        preds: predicted class per example; only entries equal to 1 are kept.
        scores: score per example, used for ranking (higher is better).
        ids: claim id per example.
        predicted_evidence: per-example metadata, aligned with ``ids``.
        k: maximum number of entries kept per claim (default 5).

    Returns:
        Dict mapping each claim id that has at least one positive prediction to a
        list of ``(score, predicted_evidence[i])`` tuples, sorted by descending
        score and truncated to ``k`` entries. Ties keep their input order.
    """
    evidence_map = {}
    for i, claim_id in enumerate(ids):
        if preds[i] != 1:
            continue
        # Store this example's own metadata, not the whole metadata list.
        evidence_map.setdefault(claim_id, []).append((scores[i], predicted_evidence[i]))

    return {
        claim_id: sorted(sents, key=lambda pair: pair[0], reverse=True)[:k]
        for claim_id, sents in evidence_map.items()
    }

In [16]:
# Make final json with id, label, predicted_label, evidence and predicted_evidence
def format_output(out_path, top_5_map):
    """Write the selected evidence to ``out_path`` as one JSON object per line.

    Args:
        out_path: destination file; it is created or overwritten (UTF-8).
        top_5_map: dict mapping a claim id to a list of ``(score, meta)`` pairs,
            where ``meta`` is indexed as ``[doc, sid, claim, sentence, label]``.

    Each pair becomes an object with the keys ``id``, ``claim``, ``label``, ``doc``,
    ``sid`` and ``sentence``; the score itself is not written.
    """
    # Build every record up front so the file is only opened once the input has
    # been fully unpacked.
    records = [
        {
            'id': claim_id,
            'claim': meta[2],
            'label': meta[4],
            'doc': meta[0],
            'sid': meta[1],
            'sentence': meta[3],
        }
        for claim_id, ranked in top_5_map.items()
        for _score, meta in ranked
    ]

    with open(out_path, 'w', encoding='utf8') as f:
        for record in records:
            f.write(json.dumps(record))
            f.write("\n")

In [None]:
# Training split: batches of 16, reshuffled on every pass, loaded by 4 worker processes.
train_dataset = SentenceDataset(tok_ip=tok_ip, sent_ip=sent_ip, pos_ip=pos_ip, masks=masks, y=labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)

In [None]:
# Dev split: batches of 16 in file order (no shuffling), loaded by 4 worker processes.
dev_dataset = SentenceDataset(
    tok_ip=tok_ip_dev, sent_ip=sent_ip_dev, pos_ip=pos_ip_dev, masks=masks_dev, y=labels_dev
)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False, num_workers=4)

In [None]:
# Test split: batches of 16 in file order (no shuffling), loaded by 4 worker processes.
test_dataset = SentenceDataset(
    tok_ip=tok_ip_test, sent_ip=sent_ip_test, pos_ip=pos_ip_test, masks=masks_test, y=labels_test
)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4)

In [None]:
# Fresh model initialised from the checkpoint at weights_path.
config = Config()
model = SentenceRetrieval(config)
load_model(model, weights_path)

# Move the model to the target device before constructing the optimizer, as the
# torch.optim documentation recommends, so the optimizer is built from the
# parameters that are actually trained.
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-5)

In [None]:
# Fine-tune on the training split. The dev split stays held out, because it is
# used for evaluation in the cells below; fitting on it would leak into that score.
NUM_EPOCHS = 1
epoch_losses = []
for epoch in range(NUM_EPOCHS):
    epoch_losses.append(train(model, train_loader, criterion, optimizer))
    # Checkpoint after every epoch so an interrupted run keeps its progress.
    torch.save(model.state_dict(), model_name)

In [None]:
# Train Set
# train_loader shuffles its examples, so its predictions would not line up with
# ids_train / predicted_evidence_train. Score the same dataset in file order instead.
train_eval_loader = DataLoader(train_dataset, shuffle=False, batch_size=16, num_workers=4)
preds, scores = test(model, train_eval_loader)
top_5_map = get_top_5(preds, scores, ids_train, predicted_evidence_train)
format_output('train_results.txt', top_5_map)

In [None]:
# Dev Set
preds, scores = test(model, dev_loader)
# get_top_5 ranks by score, so the scores must be passed along with the predictions.
top_5_map = get_top_5(preds, scores, ids_dev, predicted_evidence_dev)
format_output('dev_results.txt', top_5_map)

In [None]:
# Test Set
preds, scores = test(model, test_loader)
# get_top_5 ranks by score, so the scores must be passed along with the predictions.
top_5_map = get_top_5(preds, scores, ids_test, predicted_evidence_test)
format_output('test_results.txt', top_5_map)