In [None]:
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from tqdm.auto import tqdm
# Seed every RNG the notebook draws from so a fresh "Restart & Run All" is
# repeatable: `random` decides the train/validation split and the pair-order
# coin flips, torch initialises the model weights. numpy is seeded as well so
# any later np.random use is covered.
SEED = 69420
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
sns.set_style("darkgrid")
sns.set(rc={'figure.figsize':(11.7,8.27)})

In [None]:
# Hyper-parameters, file locations and evaluation settings read by the cells below.
config = {
    # --- optimisation ---
    "batch_size": 256,
    "epochs": 100,
    # weight of the summed parameter norms added to the training loss
    "reg": 0.00005,
    # width of the hidden layer of the ranking network
    "linearD": 512,
    "learning_rate": 0.0005,

    # --- files ---
    "model_path": './models/',
    "train_file": '/scratch/arjunth2001/t1.jsonl',
    "test_file": '/scratch/arjunth2001/t2.jsonl',

    # --- feature name lists (not referenced by the cells visible in this notebook) ---
    "features": [
        'sum_span_score', 'sum_doc_score', 'doc_sim', 'par_sim',
        'min_doc_score', 'max_doc_score', 'avg_doc_score',
        'max_span_score', 'min_span_score', 'avg_span_score',
        'first_occurence', 'num_occurence', 'par_length',
    ],
    "features2": [
        'sum_span_score', 'sum_doc_score',
        'min_doc_score', 'max_doc_score', 'avg_doc_score',
        'max_span_score', 'min_span_score', 'avg_span_score',
        'first_occurence', 'num_occurence',
    ],
    "features3": ['doc_sim', 'par_sim', 'par_length'],

    # --- pair generation / data split ---
    "maximum_depth": 2,  # not referenced by the cells visible in this notebook
    "maximum_pairs": 10,  # cap on training pairs produced per question
    # NOTE: despite the name, this is the probability that a question's pairs
    # land in the TRAIN split; the remainder goes to validation.
    "validation_set_split": 0.9,

    # --- training control / evaluation ---
    "early_stopping": 20,  # patience, in epochs without a better validation loss
    "cuda": True,  # only used as the DataLoader `pin_memory` flag here
    "top_k": 4,  # at test time only the first top_k candidates are considered
}

In [None]:
def generate_pairs(data, maximum_pairs=None):
    """Build (positive, negative) candidate pairs for one question.

    Every unordered pair of candidates is visited in index order. Pairs whose
    ``'target'`` values are equal are skipped; each kept pair is oriented so
    that the candidate with ``target == 1`` comes first.

    Args:
        data: sequence of candidate dicts, each carrying a ``'target'`` key.
        maximum_pairs: stop as soon as this many pairs have been collected.
            ``None`` (default) falls back to ``config["maximum_pairs"]``.
            The cap is an equality check after each append, so a value
            below 1 never triggers it.

    Returns:
        List of ``(positive, negative)`` tuples of candidate dicts.
    """
    limit = config["maximum_pairs"] if maximum_pairs is None else maximum_pairs
    pairs = []
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            if data[i]['target'] == data[j]['target']:
                continue
            if data[i]['target'] == 1:
                pairs.append((data[i], data[j]))
            else:
                pairs.append((data[j], data[i]))
            # Returning here replaces the original double `break`.
            if len(pairs) == limit:
                return pairs
    return pairs

In [None]:
def generate_subsample():
    """Read the training file and split its candidate pairs into train/validation.

    Each line of ``config["train_file"]`` is parsed as JSON and handed to
    ``generate_pairs``. The split is decided once per line (i.e. per question),
    so all pairs from one line end up in the same split: with probability
    ``config["validation_set_split"]`` they go to the training list, otherwise
    to the validation list.

    Returns:
        ``(train_pairs, valid_pairs)`` - two lists of pairs.
    """
    train_pairs, valid_pairs = [], []
    with open(config["train_file"], 'r') as handle:
        for raw_line in tqdm(handle):
            candidates = json.loads(raw_line)
            if len(candidates) >= 1:
                question_pairs = generate_pairs(candidates)
                # The RNG is only consulted for questions that produced pairs.
                if len(question_pairs) != 0:
                    goes_to_train = random.random() < config["validation_set_split"]
                    bucket = train_pairs if goes_to_train else valid_pairs
                    bucket.extend(question_pairs)
    return train_pairs, valid_pairs

In [None]:
# Tag vocabularies used to build the multi-hot / one-hot feature vectors.
# The position of a tag in its list is the index it occupies in the vector.
# NOTE(review): the POS/NER tags look like the spaCy / OntoNotes label sets - confirm.

# Part-of-speech tags (see fill_pos_vec).
pos = [
    "$", "''", ",", "_SP", "-LRB-", "-RRB-", ".", ":", "ADD", "AFX",
    "CC", "CD", "DT", "EX", "FW", "HYPH", "IN", "JJ", "JJR", "JJS",
    "LS", "MD", "NFP", "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP",
    "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD",
    "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "XX", "``",
]

# Named-entity labels (see fill_ner_vec).
ner = [
    "CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE",
    "LAW", "LOC", "MONEY", "NORP", "ORDINAL", "ORG",
    "PERCENT", "PERSON", "PRODUCT", "QUANTITY", "TIME", "WORK_OF_ART",
]

# Question-type names; a record's "question_type" is an integer index into this list.
qtype = [
    'what was', 'what is', 'what', 'in what', 'in which', 'in',
    'when', 'where', 'who', 'why', 'which', 'is', 'other',
]

In [None]:
def fill_pos_vec(data):
    """Multi-hot encode the POS tags of one candidate record.

    ``data["pos"]`` holds a string that evaluates to an iterable of tag names.
    The result has one slot per entry of the module-level ``pos`` list, set to
    1 for every tag that occurs.

    Raises:
        ValueError: if a tag is not in ``pos``.
    """
    # NOTE(review): eval() executes whatever expression is stored in the data
    # file. If the file is not fully trusted, ast.literal_eval is the safer
    # choice - confirm the stored format before switching.
    tags = eval(data["pos"])
    encoded = np.zeros(len(pos))
    for tag in tags:
        slot = pos.index(tag)
        encoded[slot] = 1
    return encoded

def fill_ner_vec(data):
    """Multi-hot encode the named-entity labels of one candidate record.

    ``data["ner"]`` holds a string that evaluates to an iterable of label
    names. The result has one slot per entry of the module-level ``ner`` list,
    set to 1 for every label that occurs. Labels missing from ``ner`` are
    ignored on purpose (best-effort encoding).
    """
    # NOTE(review): eval() executes whatever expression is stored in the data
    # file. If the file is not fully trusted, ast.literal_eval is the safer
    # choice - confirm the stored format before switching.
    vec = np.zeros(len(ner))
    for label in eval(data["ner"]):
        try:
            vec[ner.index(label)] = 1
        except ValueError:
            # Unknown label: skip it. Only the "not in list" error is
            # swallowed; anything else now propagates instead of being hidden.
            pass
    return vec

def fill_ques_vec(data):
    """One-hot encode the question type of one candidate record.

    ``data["question_type"]`` is used directly as an index into a zero vector
    with one slot per entry of the module-level ``qtype`` list.
    """
    type_index = data["question_type"]
    one_hot = np.zeros(len(qtype))
    one_hot[type_index] = 1
    return one_hot
    
def get_features(data):
    """Build the model input vector for one candidate record.

    Concatenates, in this order, the NER multi-hot vector, the POS multi-hot
    vector and the question-type one-hot vector.

    Args:
        data: candidate dict with the ``"pos"``, ``"ner"`` and
            ``"question_type"`` entries read by the ``fill_*_vec`` helpers.

    Returns:
        1-D ``torch.float32`` tensor.
    """
    # Local names deliberately differ from the module-level `pos` / `ner`
    # vocabularies so they are not shadowed inside this function.
    pos_vec = fill_pos_vec(data)
    ner_vec = fill_ner_vec(data)
    ques_vec = fill_ques_vec(data)
    stacked = np.concatenate((ner_vec, pos_vec, ques_vec), axis=-1)
    return torch.from_numpy(stacked).float()

In [None]:
# Read the training file once and split its candidate pairs into train / validation lists.
train_data, valid_data  = generate_subsample()

In [None]:
# Sanity check: number of training pairs produced by the split.
len(train_data)

In [None]:
class Tester(object):
    """Evaluation harness over the questions in ``config["test_file"]``.

    Each line of the test file is a JSON list of candidate dicts for one
    question. Questions with no candidate whose ``'target'`` equals 1
    ("unsolvable") are dropped from ``X`` / ``y`` / ``types`` / ``questions`` /
    ``answers`` but still count toward ``n`` and ``baseline``.

    Attributes:
        X: per kept question, a list of candidate feature tensors.
        y: per kept question, the list of candidate targets.
        types: per kept question, its integer question-type index.
        questions: per kept question, the ``'q'`` field of its first candidate.
        answers: per kept question, the list of candidate ``'para'`` fields.
        n: number of non-empty questions read (before the solvable filter).
        baseline: fraction of the ``n`` questions whose first candidate is
            labelled correct.
        curr_best: fraction of the ``n`` questions answered correctly by the
            most recent ``test`` call.
        qtype: list of question-type names.
        total_dist / wrong_dist: per question-type name, number of kept
            questions and number of those answered wrongly. After ``__init__``
            they describe the first-candidate baseline, after ``test`` the
            evaluated model.
    """

    def __init__(self):
        with open(config["test_file"], 'r') as f:
            test_data = [json.loads(line) for line in f]

        self.qtype = qtype
        self.total_dist = {k: 0 for k in self.qtype}
        self.wrong_dist = {k: 0 for k in self.qtype}
        self.curr_best = 0
        self.baseline = 0
        self.n = 0
        self.X, self.y, self.types, self.questions, self.answers = [], [], [], [], []

        for candidates in test_data:
            # An empty candidate list has no first candidate to score; skip it
            # (the training reader skips empty lines the same way).
            if not candidates:
                continue
            self.n += 1

            labels = [d['target'] for d in candidates]
            first_is_correct = int(labels[0])
            self.baseline += first_is_correct

            if not any(label == 1 for label in labels):
                continue

            # One type entry per QUESTION so that types[i] lines up with X[i].
            # NOTE(review): takes the first candidate's question_type; assumes
            # all candidates of a question share it - confirm against the data.
            question_type = candidates[0]["question_type"]
            type_name = self.qtype[question_type]
            self.total_dist[type_name] += 1
            if first_is_correct == 0:
                self.wrong_dist[type_name] += 1

            self.X.append([get_features(d) for d in candidates])
            self.y.append(labels)
            self.types.append(question_type)
            self.questions.append(candidates[0]['q'])
            self.answers.append([d['para'] for d in candidates])

        self.baseline = self.baseline / self.n if self.n else 0.0

    def test(self, model):
        """Score every kept question with ``model`` and tally the results.

        For each question the candidates are stacked into one batch, scored
        with ``model.predict``, and the best-scoring candidate among the first
        ``config["top_k"]`` is selected. The call is repeatable: all counters
        are rebuilt from scratch each time.

        Args:
            model: object exposing ``eval()`` and ``predict(batch)``.

        Returns:
            ``(correct, wrong)`` counts over the kept questions.
        """
        correct, wrong = 0, 0
        # Reset BOTH distributions; otherwise total_dist would be counted on
        # top of the values left by __init__ or by a previous test() call.
        self.total_dist = {k: 0 for k in self.qtype}
        self.wrong_dist = {k: 0 for k in self.qtype}
        model.eval()
        with torch.no_grad():
            for i, candidates in enumerate(self.X):
                scores = model.predict(torch.stack(candidates)).detach().cpu()
                best = int(torch.argmax(scores[:config["top_k"]]))
                type_name = self.qtype[self.types[i]]
                self.total_dist[type_name] += 1
                if int(self.y[i][best]) == 0:
                    self.wrong_dist[type_name] += 1
                    wrong += 1
                else:
                    correct += 1
        # Unsolvable questions are part of n, so they count as misses here.
        self.curr_best = correct / self.n if self.n else 0.0
        return correct, wrong

In [None]:
import matplotlib.pyplot as plt
def plot_losses(train_loss, validation_loss,epochs):
    """Plot training and validation loss curves against the epoch number.

    Args:
        train_loss: per-epoch training losses.
        validation_loss: per-epoch validation losses.
        epochs: zero-based index of the last epoch that ran; the x axis runs
            from 1 to ``epochs + 1`` inclusive.
    """
    epoch_axis = list(range(1, epochs + 2))
    loss_axes = sns.lineplot(x=epoch_axis, y=train_loss, label = "Train Loss")
    sns.lineplot(x=list(epoch_axis), y=validation_loss, label = "Val Loss")
    loss_axes.set(xlabel = "Epochs", ylabel = "Loss", title = "Loss over epochs")
    plt.show()

In [None]:
def bar_plot(tester):
    """Draw a bar chart of accuracy per question type.

    Accuracy for a type is ``1 - wrong_dist / total_dist``, read from
    ``tester``; types with a zero total are plotted as 0.
    """
    accuracies = []
    for type_name in tester.qtype:
        total = tester.total_dist[type_name]
        if total != 0:
            accuracies.append(1 - (tester.wrong_dist[type_name] / total))
        else:
            accuracies.append(0)
    axes = sns.barplot(x=tester.qtype , y=accuracies)
    axes.set(xlabel = "Question Type", ylabel = "Accuracies", title = "Accuracy for each question type")
    plt.show()

In [None]:
# Load the test questions, then plot per-question-type accuracy from the counts
# collected in Tester.__init__ (first-listed candidate vs. its label), i.e.
# before any model has been evaluated.
tester = Tester()
bar_plot(tester)

In [None]:
class PairwiseRankingDataSet():
    """In-memory dataset of candidate pairs for pairwise ranking.

    Every input pair is stored in a randomly chosen order; the label is the
    ``'target'`` of whichever candidate ended up in the first slot. Feature
    tensors are computed eagerly in the constructor via ``get_features``.

    Attributes:
        Xa, Xb: feature tensors of the first / second candidate of each pair.
        y: scalar float tensors, the target of the first candidate.
        num_feat: length of a feature vector (taken from the first example).
    """

    def __init__(self, subsampled):
        self.Xa, self.Xb, self.y = [], [], []
        for xa, xb in subsampled:
            # One coin flip per pair decides which candidate goes first.
            keep_order = random.randint(0, 1) == 0
            first, second = (xa, xb) if keep_order else (xb, xa)
            self.Xa.append((get_features(first)))
            self.Xb.append((get_features(second)))
            self.y.append(torch.tensor(float(first['target'])))
        self.num_feat = len(self.Xa[0])

    def __getitem__(self, index):
        example = (self.Xa[index], self.Xb[index], self.y[index])
        return example

    def __len__(self):
        return len(self.y)

In [None]:
# Wrap both splits as pairwise datasets; feature tensors for every pair are
# computed eagerly inside the constructor.
train_dataset = PairwiseRankingDataSet(train_data)
valid_dataset = PairwiseRankingDataSet(valid_data)

In [None]:
def batchify_pair(batch):
    """Collate ``(xa, xb, y)`` examples into three stacked tensors.

    Args:
        batch: list of 3-item examples as returned by the pairwise dataset.

    Returns:
        ``(xa, xb, y)`` where each element stacks the corresponding item of
        every example along a new leading dimension.
    """
    firsts, seconds, labels = [], [], []
    for example in batch:
        firsts.append(example[0])
        seconds.append(example[1])
        labels.append(example[2])
    return torch.stack(firsts), torch.stack(seconds), torch.stack(labels)

def _make_pair_loader(dataset):
    """Build a DataLoader over a pairwise dataset with the shared notebook settings.

    Batches of ``config["batch_size"]`` examples are drawn through a
    ``RandomSampler`` and collated with ``batchify_pair``; ``pin_memory``
    follows ``config["cuda"]``.
    """
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=config["batch_size"],
        sampler=torch.utils.data.sampler.RandomSampler(dataset),
        pin_memory=config["cuda"],
        collate_fn=batchify_pair
    )


# Both splits use identical loader settings.
train_loader = _make_pair_loader(train_dataset)
valid_loader = _make_pair_loader(valid_dataset)

In [None]:
class RankQA(nn.Module):
    """Two-layer feed-forward scorer for answer candidates.

    A linear layer of width ``config["linearD"]`` with ReLU is followed by a
    linear layer producing one score per input row.
    """

    def __init__(self,  feat_size):
        super(RankQA, self).__init__()
        hidden_width = config["linearD"]

        # Layer creation order is kept (l1, then l2) so seeded weight
        # initialisation is unchanged.
        self.l1 = nn.Linear(feat_size, hidden_width)
        self.act = nn.ReLU()
        self.l2 = nn.Linear(hidden_width, 1)

        self.output_sig = nn.Sigmoid()

    def forward(self, inputl, sig=False):
        """Score ``inputl``; the sigmoid is applied only when ``sig == True``."""
        score = self.l2(self.act(self.l1(inputl)))
        return self.output_sig(score) if sig == True else score

    def forward_pairwise(self, input1, input2):
        """Return ``sigmoid(score(input1) - score(input2))``."""
        margin = self.forward(input1) - self.forward(input2)
        return self.output_sig(margin)

    def predict(self, input):
        """Return the raw (pre-sigmoid) scores for ``input``."""
        raw_scores = self.forward(input)
        return raw_scores

In [None]:
def train(data_loader, model):
    """Run one training epoch and return the mean batch loss.

    For every batch the pairwise prediction is compared with the targets via
    the module-level ``loss_func``; ``config["reg"]`` times the sum of the
    2-norms of all model parameters is added before back-propagation, and the
    module-level ``optimizer`` takes one step.

    Args:
        data_loader: iterable yielding ``(input_left, input_right, target)``.
        model: module exposing ``forward_pairwise``.

    Returns:
        Mean of the per-batch losses, or NaN if the loader yielded no batch.
    """
    losses = []
    model.train()
    for input_l, input_r, targets in data_loader:
        model.zero_grad()
        y_pred = model.forward_pairwise(input_l, input_r)
        loss = loss_func(y_pred[:, 0], targets)
        # Regulariser: sum of parameter norms (not squared norms).
        l2_reg = sum(W.norm(2) for W in model.parameters())
        loss = loss + config["reg"] * l2_reg
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    # The return sits AFTER the loop: previously it was indented into the loop
    # body, so every epoch stopped after its first batch.
    return np.mean(losses) if losses else float("nan")
def validate(data_loader, model):
    """Compute the mean pairwise loss over ``data_loader`` without training.

    The model is switched to eval mode and gradients are disabled. Unlike
    ``train``, no regularisation term is added to the loss.

    Args:
        data_loader: iterable yielding ``(input_left, input_right, target)``.
        model: object exposing ``eval()`` and ``forward_pairwise``.

    Returns:
        Mean of the per-batch values of the module-level ``loss_func``.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for left, right, labels in data_loader:
            prediction = model.forward_pairwise(left, right)
            batch_loss = loss_func(prediction[:, 0], labels)
            batch_losses.append(batch_loss.item())
    return np.mean(batch_losses)

In [None]:
# Build the scorer for the feature width seen in the training set, an Adam
# optimizer over its parameters, and the loss used by train()/validate():
# mean squared error between the sigmoid pairwise output and the pair label.
model = RankQA(train_dataset.num_feat)
optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
loss_func = nn.functional.mse_loss

In [None]:
# Train with early stopping on the validation loss and keep the best checkpoint.

# The checkpoint directory is not guaranteed to exist on a fresh checkout, and
# torch.save does not create it.
os.makedirs(config["model_path"], exist_ok=True)
model_save_path = os.path.join(config["model_path"], "model.pt")
best_val_loss = np.inf
best_val_iteration = 0
tl , vl , acc = [], [],[]
for i in range(config["epochs"]):
    print('EPOCH '+str(i))
    train_loss = train(train_loader, model)
    val_loss = validate(valid_loader, model)
    vl.append(val_loss)
    tl.append(train_loss)
    print('Train loss '+ str(train_loss) + ","+'Validation loss '+str(val_loss))

    if best_val_loss > val_loss:
        print('Saving Best Model')
        torch.save(model, model_save_path)
        best_val_loss = val_loss
        best_val_iteration = 0

    # The counter is reset on improvement and then incremented, so training
    # stops after config["early_stopping"] consecutive epochs without a new
    # best validation loss.
    best_val_iteration += 1
    if best_val_iteration > config["early_stopping"]:
        print("Stopping Early..")
        break

# Restore the best checkpoint seen during training.
# NOTE(review): the checkpoint is the whole pickled module. Recent PyTorch
# releases default torch.load to weights_only=True, which refuses such files -
# confirm against the installed version before upgrading.
model = torch.load(model_save_path)
# plot_losses expects the zero-based index of the last epoch; derive it from
# the recorded history instead of the leaked loop variable.
plot_losses(tl, vl, len(tl) - 1)

In [None]:
# Evaluate the restored model on the kept (solvable) test questions; returns the
# number of questions whose selected candidate is labelled correct / incorrect.
correct, wrongs = tester.test(model)

In [None]:
# Baseline: fraction of test questions (counted before the solvable filter)
# whose first-listed candidate is labelled correct.
tester.baseline

In [None]:
# Model score on the same denominator as the baseline (tester.n, which
# includes questions that have no correct candidate).
tester.curr_best

In [None]:
# Accuracy restricted to the questions tester.test iterated over
# (those with at least one correct candidate).
correct/(correct+wrongs)

In [None]:
# Plot 1 - wrong_dist/total_dist per question type, using the counts left on
# `tester` by the tester.test(model) call above.
bar_plot(tester)