## Imports and Parameters

In [15]:
import json
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable
from torch.nn.functional import cross_entropy
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

In [4]:
# Hyper-parameters and data settings shared by the cells below.
params = {
    "epochs": 10,            # number of passes over data_loader in the training loop
    "lr": 1e-5,              # Adam learning rate
    "num_feat": 10,          # input width of ReRanker's first linear layer
    "lin_dim": 10,           # hidden width of ReRanker
    "reg": 0.1,              # weight of the L2 parameter-norm penalty added to the loss
    "batch_size": 32,
    "pairs_max_depth": 10,   # passed to gen_rank_pairs as max_depth
    "pairs_max_num": 100,    # passed to gen_rank_pairs as max_num
    # Probability that the pairs from one input line go to the validation split.
    # gen_subsample reads this key; it was missing, which raised KeyError.
    # NOTE(review): 0.1 is a placeholder default — tune as needed.
    "val_split": 0.1,
    # NOTE(review): 14 feature names are listed here while "num_feat" is 10 — confirm
    # which of the two reflects the intended model input size.
    "features": [
        'sum_span_score', 'sum_doc_score', 'doc_score', 'span_score',
        'min_doc_score', 'max_doc_score', 'avg_doc_score',
        'max_span_score', 'min_span_score', 'avg_span_score',
        'first_occ', 'num_occ', 'context_len', 'question_len',
    ],
    "filenames": ["test1.txt", "test2.txt"],
}

In [4]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"device - {device!s}")

device - cpu


## Dataset

In [16]:
class Data():
    """Builds pairwise ranking examples from files of candidate answers."""

    # Validation fraction used when neither the caller nor the global
    # ``params`` dict provides one.
    DEFAULT_VAL_SPLIT = 0.1

    def __init__(self):
        self.x = 0  # not read by any method of this class

    def gen_rank_pairs(self, data, max_depth, max_num):
        """Pair up items whose ``"target"`` values differ by exactly one.

        Each item at position ``i`` is compared with the first
        ``min(i, max_depth)`` items of ``data``. Whenever the two targets
        differ by exactly 1, the tuple ``(higher, lower)`` is emitted.

        Args:
            data: sequence of dict-like records with a numeric ``"target"``.
            max_depth: how many leading items each record is compared against.
            max_num: stop after this many pairs. ``0`` yields no pairs; a
                negative value never triggers the stop, so nothing is capped.

        Returns:
            List of ``(higher_target_item, lower_target_item)`` tuples.
        """
        pairs = []
        if max_num == 0:
            return pairs
        for i, d1 in enumerate(data):
            for d2 in data[:min(i, max_depth)]:
                diff = d1["target"] - d2["target"]
                if diff == 1:
                    pairs.append((d1, d2))
                elif diff == -1:
                    pairs.append((d2, d1))
                else:
                    continue
                if len(pairs) == max_num:
                    return pairs
        return pairs

    def gen_subsample(self, filenames, val_split=None, max_depth=None, max_num=None):
        """Read JSON-lines files and split the generated pairs into train/val.

        Every non-empty line is parsed as JSON and handed to
        ``gen_rank_pairs``. All pairs from one line go to the same split,
        chosen with a single ``np.random.random()`` draw, so the split is not
        reproducible unless the NumPy global seed is set beforehand.

        Args:
            filenames: iterable of paths, one JSON document per line.
            val_split: probability of sending a line's pairs to validation.
                Defaults to ``params["val_split"]`` when that key exists,
                otherwise ``DEFAULT_VAL_SPLIT``.
            max_depth: defaults to ``params["pairs_max_depth"]``.
            max_num: defaults to ``params["pairs_max_num"]``.

        Returns:
            ``(train, val)``: two lists, each holding one list of pairs per
            input file, in the order of ``filenames``.
        """
        if val_split is None:
            val_split = params.get("val_split", self.DEFAULT_VAL_SPLIT)
        if max_depth is None:
            max_depth = params["pairs_max_depth"]
        if max_num is None:
            max_num = params["pairs_max_num"]

        train = []
        val = []
        for file in filenames:
            file_train = []
            file_val = []
            with open(file, "r", encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        # A blank line is not valid JSON; skip it.
                        continue
                    ans = json.loads(line)
                    if len(ans) == 0:
                        continue
                    # Must go through self: gen_rank_pairs is a method, not a
                    # module-level function.
                    pairs = self.gen_rank_pairs(ans, max_depth, max_num)
                    if np.random.random() < val_split:
                        file_val.extend(pairs)
                    else:
                        file_train.extend(pairs)
            train.append(file_train)
            val.append(file_val)
        return train, val

In [14]:
# Ten records named "a".."j" with alternating targets 0, 1, 0, 1, ...
# used to sanity-check pair generation.
dummy_data = [
    {"target": i % 2, "name": chr(ord("a") + i)}
    for i in range(10)
]

# gen_rank_pairs is a method of Data, so it has to be called on an instance;
# the bare name only resolved through leftover kernel state.
Data().gen_rank_pairs(dummy_data, 2, 10)

[({'target': 1, 'name': 'b'}, {'target': 0, 'name': 'a'}),
 ({'target': 1, 'name': 'b'}, {'target': 0, 'name': 'c'}),
 ({'target': 1, 'name': 'd'}, {'target': 0, 'name': 'a'}),
 ({'target': 1, 'name': 'b'}, {'target': 0, 'name': 'e'}),
 ({'target': 1, 'name': 'f'}, {'target': 0, 'name': 'a'}),
 ({'target': 1, 'name': 'b'}, {'target': 0, 'name': 'g'}),
 ({'target': 1, 'name': 'h'}, {'target': 0, 'name': 'a'}),
 ({'target': 1, 'name': 'b'}, {'target': 0, 'name': 'i'}),
 ({'target': 1, 'name': 'j'}, {'target': 0, 'name': 'a'})]

In [3]:
class rankqaDS(Dataset):
    """Dataset of feature pairs with a label taken from the first element.

    Each input pair ``(d1, d2)`` is stored in either its original or swapped
    order, chosen by an unseeded ``np.random.random()`` draw. The label is the
    ``"target"`` of whichever record ends up in the first slot.
    """

    def __init__(self, sub_data, features=None):
        """Extract features for every pair.

        Args:
            sub_data: iterable of ``(d1, d2)`` record pairs.
            features: object exposing ``get_features(record)``. When omitted,
                a ``Features(params["features"])`` instance is created;
                ``get_features`` is an instance method, so calling it on the
                class itself raised ``TypeError``.
        """
        if features is None:
            features = Features(params["features"])
        self.x1 = []
        self.x2 = []
        self.y = []
        for d1, d2 in sub_data:
            # Randomise which record goes first so the label is not constant.
            if np.random.random() <= 0.5:
                first, second = d1, d2
            else:
                first, second = d2, d1
            self.x1.append(features.get_features(first))
            self.x2.append(features.get_features(second))
            self.y.append(first["target"])

    def __getitem__(self, indx):
        """Return ``(x1, x2, y)`` for the example at ``indx``."""
        return self.x1[indx], self.x2[indx], self.y[indx]

    def __len__(self):
        """Number of stored pairs."""
        return len(self.y)

In [None]:
# gen_subsample is a method of Data and returns (train, val), each holding one
# list of pairs per input file.
train_per_file, val_per_file = Data().gen_subsample(params["filenames"])

# rankqaDS iterates over a flat sequence of (d1, d2) pairs, so merge the
# per-file lists. The validation pairs are kept separately.
sub_data = [pair for file_pairs in train_per_file for pair in file_pairs]
val_sub_data = [pair for file_pairs in val_per_file for pair in file_pairs]

In [None]:
# Wrap the sampled pairs in a Dataset and serve them in mini-batches.
f_data = rankqaDS(sub_data)
data_loader = DataLoader(dataset=f_data, batch_size=params["batch_size"])

## Features

In [None]:
class Features():
    """Turns one candidate-answer record into indicator vectors plus raw features."""

    def __init__(self, features):
        """Store the raw feature names and build the tag-to-index lookups.

        Args:
            features: names of the record fields returned as raw features.
        """
        self.features = features
        self.pos_values = ['NNP', 'JJ', 'NN', 'IN', ',', 'CC', 'DT', 'VBG', 'VB', 'NNS', 'POS', 'VBZ', 'RB', 'TO', 'FW', 'PRP$', 'CD', 'VBN', 'NNPS', 'JJR', 'VBP', ':', 'VBD', 'PRP', '#', 'JJS', '$', 'WRB', '-LRB-', '-RRB-', '.', '``', "''", 'PDT', 'MD', 'WP', 'RP', 'WDT', 'EX', 'UH', 'SYM', 'LS', 'RBS', 'RBR', 'WP$']
        self.pos_dict = {tag: indx for indx, tag in enumerate(self.pos_values)}
        self.ner_values = ['location', 'person', 'organization', 'money', 'percent', 'date', 'time', 'o', 'set', 'duration', 'number', 'ordinal', 'misc']
        self.ner_dict = {tag: indx for indx, tag in enumerate(self.ner_values)}
        self.ques_values = ['what was', 'what is', 'what', 'in what', 'in which', 'in', 'when', 'where', 'who', 'why', 'which', 'is', 'other']
        self.ques_dict = {word: indx for indx, word in enumerate(self.ques_values)}

    def fill_pos_vec(self, data):
        """Multi-hot vector over ``pos_values`` for the tags in ``data["span_pos"]``.

        Raises:
            KeyError: if a tag is not listed in ``pos_values``.
        """
        vec = np.zeros(len(self.pos_values))
        for tag in data["span_pos"]:
            vec[self.pos_dict[tag]] = 1
        return vec

    def fill_ner_vec(self, data):
        """Multi-hot vector over ``ner_values`` for the record's NER tags.

        Tags are lower-cased before lookup because ``ner_values`` is lower-case.
        The previous code read ``span_pos`` and indexed ``pos_dict``.

        Raises:
            KeyError: if a lower-cased tag is not listed in ``ner_values``.
        """
        vec = np.zeros(len(self.ner_values))
        # NOTE(review): assumes NER tags are stored under "span_ner" — confirm
        # against the records in the input files.
        for tag in data["span_ner"]:
            vec[self.ner_dict[tag.lower()]] = 1
        return vec

    def fill_ques_vec(self, data):
        """Indicator vector over ``ques_values`` for how the question starts.

        Both the first word and the first two words (lower-cased) are looked
        up, so a question starting with "what is" sets two entries. If neither
        matches, or the question is empty, the ``"other"`` entry is set.
        """
        vec = np.zeros(len(self.ques_values))
        words = data["question"].split()
        prefixes = []
        if words:
            prefixes.append(words[0].lower())
        if len(words) > 1:
            # Only form the two-word prefix when a second word exists; indexing
            # it unconditionally raised IndexError on one-word questions.
            prefixes.append(words[0].lower() + " " + words[1].lower())
        matched = False
        for prefix in prefixes:
            if prefix in self.ques_dict:
                vec[self.ques_dict[prefix]] = 1
                matched = True
        if not matched:
            vec[self.ques_dict["other"]] = 1
        return vec

    def get_features(self, data):
        """Return ``(pos_vec, ner_vec, ques_vec, raw)`` for one record.

        ``raw`` is an array holding ``data[name]`` for every name in
        ``self.features``, in order. Indexing ``data`` with the whole list
        raised ``TypeError`` for dict records.
        """
        pos = self.fill_pos_vec(data)
        ner = self.fill_ner_vec(data)
        ques = self.fill_ques_vec(data)
        raw = np.array([data[name] for name in self.features])
        return (pos, ner, ques, raw)

## Evaluator

In [None]:
class Evaluator():
    """Compares top-1 accuracy before and after re-ranking candidate answers."""

    def __init__(self, x, y, k, model):
        """
        Args:
            x: per-question lists of candidates, passed as-is to ``model.predict``.
            y: per-question lists of correctness labels, aligned with ``x``.
            k: only the first ``k`` candidates compete for the re-ranked top-1.
            model: default model for ``evaluate`` when none is passed.
        """
        self.x = x
        self.y = y
        self.k = k
        self.model = model

    @staticmethod
    def _as_scores(out):
        """Flatten model output to a 1-D NumPy array, one score per candidate."""
        if hasattr(out, "detach"):
            # Tensor-like output: drop autograd tracking and move to host
            # memory before handing it to NumPy.
            out = out.detach().cpu().numpy()
        return np.asarray(out).ravel()

    def evaluate(self, model=None):
        """Print baseline and re-ranked accuracy; return the failing question indices.

        Questions with no correct candidate are excluded from both accuracies.

        Args:
            model: object with ``predict(candidates)`` returning one score per
                candidate. Defaults to the model given to the constructor.

        Returns:
            Indices of questions that have no correct candidate, plus those
            where the best-scoring candidate over the full list is incorrect.
        """
        if model is None:
            model = self.model
        baseline = 0
        current = 0
        solvable = 0
        unsolvable = []
        for i, candidates in enumerate(self.x):
            labels = self.y[i]
            if not any(int(labels[j]) for j in range(len(candidates))):
                unsolvable.append(i)
                continue
            solvable += 1
            baseline += int(labels[0])  # top answer before re-ranking
            scores = self._as_scores(model.predict(list(candidates)))
            top_k_indx = int(np.argmax(scores[:self.k]))
            current += int(labels[top_k_indx])  # top answer after re-ranking
            best_indx = int(np.argmax(scores))
            if int(labels[best_indx]) == 0:
                unsolvable.append(i)

        if solvable == 0:
            # Nothing to average over; dividing would raise ZeroDivisionError.
            print("No solvable questions - accuracy is undefined")
            return unsolvable

        baseline_acc = baseline / solvable
        current_acc = current / solvable
        # The ".2%" format multiplies by 100, so the fractions are shown as
        # real percentages instead of raw fractions with a "%" suffix.
        print(f"Previous accuracy - {baseline_acc:.2%}, Accuracy after re-ranking - {current_acc:.2%}")
        return unsolvable

## Re-ranking

In [5]:
class ReRanker(nn.Module):
    """Two-layer scorer trained on pairs of candidates.

    ``forward`` scores both inputs with the same network and returns the
    sigmoid of the score difference; ``predict`` returns the raw score.
    """

    def __init__(self, num_feat, lin_dim):
        """
        Args:
            num_feat: size of the input feature vector.
            lin_dim: width of the hidden layer.
        """
        super().__init__()
        self.l1 = nn.Linear(num_feat, lin_dim)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(lin_dim, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x1, x2):
        """Return ``sigmoid(score(x1) - score(x2))``."""
        return self.sig(self.forward_pass(x1) - self.forward_pass(x2))

    def forward_pass(self, x):
        """Score ``x`` with linear -> ReLU -> linear; the last dimension has size 1."""
        return self.l2(self.relu(self.l1(x)))

    def predict(self, x):
        """Score candidates for inference, without tracking gradients.

        Non-tensor input (for example a nested list of numbers) is converted
        to a tensor with the dtype and device of the first layer's weights.
        The result does not require grad, so it can be passed to NumPy.
        """
        if not torch.is_tensor(x):
            ref = self.l1.weight
            x = torch.as_tensor(x, dtype=ref.dtype, device=ref.device)
        with torch.no_grad():
            return self.forward_pass(x)

In [6]:
# Build the re-ranker from the configured sizes, then move it to the chosen device.
model = ReRanker(num_feat=params["num_feat"], lin_dim=params["lin_dim"])
model.to(device)

ReRanker(
  (l1): Linear(in_features=10, out_features=10, bias=True)
  (relu): ReLU()
  (l2): Linear(in_features=10, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [11]:
# Adam over all model parameters; squared error between prediction and label.
opt = optim.Adam(params=model.parameters(), lr=params["lr"])
loss_func = nn.MSELoss()

In [None]:
# torch.save does not create missing folders, so make sure the target exists.
os.makedirs("./checkpoints", exist_ok=True)

for epoch in range(params["epochs"]):
    total_loss = 0
    for batch in tqdm(data_loader):
        # rankqaDS.__getitem__ returns exactly three items: (x1, x2, y).
        x1, x2, target = batch
        # NOTE(review): assumes the DataLoader collates x1/x2 into
        # (batch, num_feat) tensors — confirm against what get_features returns.
        x1 = x1.to(device=device, dtype=torch.float32)
        x2 = x2.to(device=device, dtype=torch.float32)
        # MSELoss needs a floating-point target; integer labels collate to int64.
        target = target.to(device=device, dtype=torch.float32)

        opt.zero_grad()
        y_pred = model(x1, x2)
        loss = loss_func(y_pred[:, 0], target)
        # Penalise the sum of the L2 norms of all parameter tensors.
        reg_l2 = sum(p.norm(2) for p in model.parameters())
        loss = loss + params["reg"] * reg_l2
        loss.backward()
        opt.step()
        total_loss += loss.item()

    # Checkpoint every second epoch and after the final one; each save
    # overwrites the same file.
    if epoch % 2 == 1 or epoch == params["epochs"] - 1:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': opt.state_dict(),
            'loss': total_loss,
        }, "./checkpoints/model.pt")
    print(f"Total Loss - {total_loss}")