In [None]:
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import copy
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

In [None]:
# Hyper-parameters, file locations and switches shared by the cells below.
config = {
    "batch_size": 256,        # examples per mini-batch in the train/validation DataLoaders
    "epochs": 500,            # upper bound on the number of training epochs
    "reg": 0.00005,           # weight of the parameter-norm penalty added to the training loss
    "linearD": 512,           # width of the hidden layer of RankNetModel
    "learning_rate": 0.0005,  # learning rate handed to the Adam optimizer
    "model_path": './models/',  # directory that receives the best checkpoint
    # NOTE(review): absolute, user-specific paths; adjust before running elsewhere.
    "train_file": '/scratch/arjunth2001/t1.jsonl',
    "test_file": '/scratch/arjunth2001/t2.jsonl',
    # NOTE(review): this list is only forwarded to generate_subsample; the
    # feature vectors themselves are built by get_features from the
    # "pos", "ner" and "question_type" fields of each record.
    "features": [
        'sum_span_score', 'sum_doc_score', 'doc_sim', 'par_sim',
        'min_doc_score', 'max_doc_score', 'avg_doc_score',
        'max_span_score', 'min_span_score', 'avg_span_score',
        'first_occurence', 'num_occurence', 'par_length',
    ],
    "maximum_depth": 2,               # how many leading adjacent candidate pairs are considered per question
    "maximum_depth_per_question": 2,  # cap on the pairs emitted per question
    # Probability that a question's pairs go to the TRAINING split
    # (despite the key name); the remainder goes to validation.
    "validation_set_split": 0.9,
    "early_stopping": 8,              # epochs without validation improvement tolerated before stopping
    "cuda": True,                     # only consulted for DataLoader pin_memory in this notebook
    "seed": 42,                       # seed for every random number generator used below
}

# The train/validation split, the left/right ordering of each training pair
# and the weight initialisation are all random; seeding the three generators
# makes a "Restart Kernel -> Run All" reproduce the same run.
random.seed(config["seed"])
np.random.seed(config["seed"])
torch.manual_seed(config["seed"])

In [None]:
def generate_pairs(data):
    """Build ordered training pairs from adjacent candidates of one question.

    Walks over neighbouring entries ``data[k]`` / ``data[k + 1]`` and emits a
    2-tuple for each: the entry at ``k`` comes first when its ``'target'`` is
    1, otherwise the neighbour comes first. Pairs whose two entries carry the
    same target are NOT skipped.

    Args:
        data: indexable sequence of candidate records, each with a
            ``'target'`` key.

    Returns:
        list of ``(first, second)`` tuples. At most
        ``min(len(data) - 1, config["maximum_depth"])`` neighbours are
        visited, and the walk stops once
        ``config["maximum_depth_per_question"]`` pairs have been collected.
    """
    depth = min(len(data) - 1, config["maximum_depth"])
    pairs = []
    for idx in range(depth):
        upper, lower = data[idx], data[idx + 1]
        if upper['target'] == 1:
            pairs.append((upper, lower))
        else:
            pairs.append((lower, upper))
        # The cap is checked after appending, so at least one pair is
        # produced whenever the walk starts at all.
        if len(pairs) >= config["maximum_depth_per_question"]:
            break
    return pairs

In [None]:
def generate_subsample(features):
    """Read the training file and split its pairs into train and validation.

    Each non-blank line of ``config["train_file"]`` is parsed as JSON (one
    question's candidate list) and turned into pairs by ``generate_pairs``.
    All pairs of one question go to the same split: the training list with
    probability ``config["validation_set_split"]``, otherwise the validation
    list.

    Args:
        features: accepted for backward compatibility only; it is not read.
            (The previous implementation deep-copied it and then ignored the
            copy.)

    Returns:
        ``(train, valid)``: two lists of pairs.
    """
    train, valid = [], []
    with open(config["train_file"], 'r') as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not a record and
            # would make json.loads raise.
            if not line.strip():
                continue
            answers = json.loads(line)
            if len(answers) < 1:
                continue
            pairs = generate_pairs(answers)

            if len(pairs) == 0:
                continue

            # One random draw per question that produced pairs.
            if random.random() < config["validation_set_split"]:
                train.extend(pairs)
            else:
                valid.extend(pairs)
    return train, valid

In [None]:
# Part-of-speech tag vocabulary. A tag's position in this list is its slot in
# the vector built by fill_pos_vec, so the order must not change.
pos = [
    "$", "''", ",", "_SP", "-LRB-", "-RRB-", ".", ":", "ADD", "AFX",
    "CC", "CD", "DT", "EX", "FW", "HYPH", "IN", "JJ", "JJR", "JJS",
    "LS", "MD", "NFP", "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP",
    "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD",
    "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "XX", "``",
]
# Named-entity label vocabulary; positions are the slots used by fill_ner_vec.
ner = [
    "CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE",
    "LAW", "LOC", "MONEY", "NORP", "ORDINAL", "ORG",
    "PERCENT", "PERSON", "PRODUCT", "QUANTITY", "TIME", "WORK_OF_ART",
]

In [None]:
def fill_pos_vec(data):
    """Multi-hot encode the part-of-speech tags of one record.

    Args:
        data: mapping whose ``"pos"`` value is a string that evaluates to an
            iterable of tags.

    Returns:
        NumPy float vector of length ``len(pos)`` with a 1 at the position of
        every tag that occurs.

    Raises:
        ValueError: if a tag is not present in the global ``pos`` list.
    """
    # NOTE(review): eval() executes arbitrary expressions taken from the data
    # file. If the field is always a plain literal, ast.literal_eval would be
    # the safe replacement; confirm the data format before switching.
    tags = eval(data["pos"])
    encoded = np.zeros(len(pos))
    for tag in tags:
        slot = pos.index(tag)
        encoded[slot] = 1
    return encoded

def fill_ner_vec(data):
    """Multi-hot encode the named-entity labels of one record.

    Labels that are not in the global ``ner`` list are ignored on purpose.

    Args:
        data: mapping whose ``"ner"`` value is a string that evaluates to an
            iterable of labels.

    Returns:
        NumPy float vector of length ``len(ner)`` with a 1 at the position of
        every known label that occurs.
    """
    vec = np.zeros(len(ner))
    # NOTE(review): eval() executes arbitrary expressions taken from the data
    # file. If the field is always a plain literal, ast.literal_eval would be
    # the safe replacement; confirm the data format before switching.
    for label in eval(data["ner"]):
        try:
            vec[ner.index(label)] = 1
        except ValueError:
            # Unknown label: skip it. Only the "not in list" error is
            # swallowed so that genuine failures (and KeyboardInterrupt) are
            # no longer hidden by a bare except.
            continue
    return vec

def fill_ques_vec(data, num_types=13):
    """One-hot encode the question type of one record.

    Args:
        data: mapping whose ``"question_type"`` value is used directly as an
            index into the vector (assumed to be an int in
            ``[0, num_types)``; confirm against the data).
        num_types: length of the returned vector. Defaults to 13, the size
            that used to be hard-coded.

    Returns:
        NumPy float vector of length ``num_types`` with a single 1.
    """
    vec = np.zeros(num_types)
    vec[data["question_type"]] = 1
    return vec
    
def get_features(data):
    """Build the model input vector for one candidate record.

    The vector is the concatenation, in this order, of the named-entity
    encoding, the part-of-speech encoding and the question-type encoding.

    Args:
        data: record accepted by ``fill_pos_vec``, ``fill_ner_vec`` and
            ``fill_ques_vec``.

    Returns:
        1-D ``torch.float32`` tensor.
    """
    # Local names are suffixed so they do not shadow the global tag lists.
    pos_part = fill_pos_vec(data)
    ner_part = fill_ner_vec(data)
    ques_part = fill_ques_vec(data)
    joined = np.concatenate([ner_part, pos_part, ques_part], axis=-1)
    return torch.from_numpy(joined).float()

In [None]:
# Read the training file and split its pairs into training and validation.
train_data, valid_data = generate_subsample(
    [dict(name=feature) for feature in config["features"]]
)

In [None]:
# Load the evaluation file: one JSON line per question, each holding that
# question's candidate records.
test_data = []
with open(config["test_file"], 'r') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline); they are not records.
        if line.strip():
            test_data.append(json.loads(line))

# Parallel per-question lists: position k in each list describes question k.
#   x[k]         feature tensors of the candidates
#   y[k]         'target' labels of the candidates
#   answers[k]   'para' values of the candidates
#   questions[k] question text, taken from the first candidate
#   types[k]     question type, taken from the first candidate
x, y, types, questions, answers = [], [], [], [], []
for data in test_data:
    # A question without candidates cannot be scored, and adding it to only
    # some of the lists would shift every later question out of alignment.
    if not data:
        continue
    # Evaluator indexes `types` by question, so record it once per question
    # rather than once per candidate.
    questions.append(data[0]['q'])
    types.append(data[0]["question_type"])
    x.append([get_features(d) for d in data])
    y.append([d['target'] for d in data])
    answers.append([d['para'] for d in data])

In [None]:
class Evaluator(object):
    """Top-1 accuracy bookkeeping for a ranking model on a fixed test set.

    All constructor arguments are parallel per-question lists.

    Attributes:
        n: number of questions passed to the constructor. It is the
            denominator of ``base``, ``top`` and ``curr``, including after
            ``initial_params`` has dropped unsolvable questions.
        base: fraction of questions whose first candidate is labelled 1.
        top: fraction of questions with at least one candidate labelled 1.
        curr: fraction of questions answered correctly by the last
            ``evaluate`` call.
        total_dist: question type -> number of solvable questions.
        wrong_dist: question type -> number of questions the last
            ``evaluate`` call got wrong.
    """

    def __init__(self, X, y, types, questions, answers):
        self.X, self.y = X, y
        self.n = len(self.y)
        self.base, self.curr, self.top = 0, 0, 0
        self.types = types
        self.total_dist, self.wrong_dist = {}, {}
        self.questions = questions
        self.answers = answers

    def initial_params(self):
        """Drop unsolvable questions and compute ``base`` and ``top``.

        A question is solvable when at least one of its candidates is
        labelled 1. ``X``, ``y``, ``types``, ``questions`` and ``answers``
        are filtered together so that they stay index-aligned.
        """
        kept_X, kept_y, kept_types = [], [], []
        kept_questions, kept_answers = [], []
        first_hits, solvable_count = 0, 0
        self.total_dist.clear()
        self.wrong_dist.clear()

        for i, candidates in enumerate(self.X):
            labels = self.y[i]
            # Baseline: would picking the first candidate have been right?
            if len(labels) > 0:
                first_hits += int(labels[0])
            if not any(labels[j] == 1 for j in range(len(candidates))):
                continue
            question_type = self.types[i]
            kept_X.append(candidates)
            kept_y.append(labels)
            kept_types.append(question_type)
            kept_questions.append(self.questions[i])
            kept_answers.append(self.answers[i])
            solvable_count += 1
            if question_type not in self.total_dist:
                self.total_dist[question_type] = 0
                self.wrong_dist[question_type] = 0
            self.total_dist[question_type] += 1

        self.X, self.y = kept_X, kept_y
        # `types` has to be filtered too; otherwise evaluate() would look up
        # the type of a different question after the lists were shortened.
        self.types = kept_types
        self.questions, self.answers = kept_questions, kept_answers
        denominator = self.n if self.n else 1  # avoid 0/0 on an empty set
        self.base = first_hits / denominator
        self.top = solvable_count / denominator

    def evaluate(self, model, top_k=10):
        """Score every question with ``model`` and record top-1 hits.

        Args:
            model: object whose ``predict(batch)`` returns one score per row
                of the stacked candidate tensor.
            top_k: only the first ``top_k`` candidates of a question compete
                for the top spot (``None`` means all). Defaults to 10.

        Returns:
            ``(correct, wrong)``: lists of ``[question_index, picked_index]``
            for questions whose picked candidate is / is not labelled 1.
        """
        wrong, correct = [], []
        self.wrong_dist.clear()
        # Counted locally so that a repeated call does not add this run's
        # hits on top of the previous run's fraction.
        hits = 0
        with torch.no_grad():
            for i, candidates in enumerate(self.X):
                scores = model.predict(torch.stack(list(candidates)))
                picked = int(np.argmax(scores[0:top_k]))
                label = int(self.y[i][picked])
                hits += label
                if label == 0:
                    question_type = self.types[i]
                    self.wrong_dist[question_type] = self.wrong_dist.get(question_type, 0) + 1
                    wrong.append([i, picked])
                else:
                    correct.append([i, picked])

        self.curr = hits / (self.n if self.n else 1)
        return correct, wrong

In [None]:
# Wrap the per-question test lists built above in an Evaluator.
evaluator = Evaluator(
    X=x,
    y=y,
    types=types,
    questions=questions,
    answers=answers,
)

In [None]:
class PairwiseRankingDataSet(torch.utils.data.Dataset):
    """Map-style dataset of (left features, right features, label) triples.

    Each input pair is placed in a random left/right order. The label is the
    ``'target'`` of whichever record ended up on the left.

    Attributes:
        Xa: feature tensors of the left records.
        Xb: feature tensors of the right records.
        y: 0-dim float tensors holding the labels.
        num_feat: length of one feature vector, or 0 for an empty dataset.
    """

    def __init__(self, subsampled):
        """Featurise every pair.

        Args:
            subsampled: iterable of ``(xa, xb)`` record pairs, each record
                being accepted by ``get_features`` and having a ``'target'``
                key.
        """
        self.Xa, self.Xb, self.y = [], [], []
        for xa, xb in subsampled:
            # Random orientation so the positive record is not always on the
            # same side.
            if random.randint(1, 2) == 1:
                left, right = xa, xb
            else:
                left, right = xb, xa
            self.Xa.append(get_features(left))
            self.Xb.append(get_features(right))
            self.y.append(torch.tensor(float(left['target'])))
        # An empty split (e.g. no validation pairs) used to raise IndexError.
        self.num_feat = len(self.Xa[0]) if self.Xa else 0

    def __getitem__(self, index):
        return self.Xa[index], self.Xb[index], self.y[index]

    def __len__(self):
        return len(self.y)

In [None]:
# Featurise both splits; the training split is built first, then validation.
train_dataset, valid_dataset = map(PairwiseRankingDataSet, (train_data, valid_data))

In [None]:
def batchify_pair(batch):
    """Collate a list of (left, right, label) examples into batch tensors.

    Args:
        batch: sequence of examples; positions 0, 1 and 2 of each example
            hold the left features, the right features and the label.

    Returns:
        Tuple ``(xa, xb, y)`` where each element stacks the corresponding
        position of every example along a new first dimension.
    """
    stacked = []
    for position in (0, 1, 2):
        column = [example[position] for example in batch]
        stacked.append(torch.stack(column))
    return tuple(stacked)

# Pinned host memory only matters when batches are copied to a CUDA device;
# requesting it on a machine without CUDA just produces warnings.
use_pinned_memory = bool(config["cuda"]) and torch.cuda.is_available()

# Training batches are reshuffled every epoch (shuffle=True makes the
# DataLoader build the same RandomSampler that was passed explicitly before).
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=config["batch_size"],
    shuffle=True,
    pin_memory=use_pinned_memory,
    collate_fn=batchify_pair,
)

# Validation is iterated in a fixed order: the reported validation loss is a
# mean of per-batch means, so random batch membership would add noise to the
# number that drives early stopping.
valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=config["batch_size"],
    shuffle=False,
    pin_memory=use_pinned_memory,
    collate_fn=batchify_pair,
)

In [None]:
class RankNetModel(nn.Module):
    """Scoring network: Linear -> ReLU -> Linear producing one score per row.

    The hidden width is read from ``config["linearD"]``.
    """

    def __init__(self, feat_size):
        """Create the layers for inputs with ``feat_size`` features."""
        super().__init__()
        hidden_width = config["linearD"]

        self.l1 = nn.Linear(feat_size, hidden_width)
        self.act = nn.ReLU()
        self.l2 = nn.Linear(hidden_width, 1)

        self.output_sig = nn.Sigmoid()

    def forward(self, inputl, sig=False):
        """Return the score of each input row.

        When ``sig`` compares equal to ``True`` the score is passed through
        a sigmoid; otherwise the raw score is returned.
        """
        score = self.l2(self.act(self.l1(inputl)))
        # The equality test against True is kept as-is so that only True / 1
        # enable the sigmoid.
        return self.output_sig(score) if sig == True else score

    def forward_pairwise(self, input1, input2):
        """Return sigmoid(score(input1) - score(input2)) for each row pair."""
        margin = self.forward(input1) - self.forward(input2)
        return self.output_sig(margin)

    def predict(self, input):
        """Return the raw (pre-sigmoid) score of each input row."""
        return self.forward(input)

In [None]:
class RankerNet(object):
    """Training and inference wrapper around ``RankNetModel``.

    Owns the network, an Adam optimizer over its parameters, and the loss
    (mean squared error between the pairwise sigmoid output and the label).
    """

    def __init__(self, num_feat):
        """Build the network for ``num_feat`` input features.

        The learning rate is read from ``config["learning_rate"]``.
        """
        self.network = RankNetModel(num_feat)
        self.optimizer = optim.Adam(self.network.parameters(), lr=config["learning_rate"])
        self.loss_func = nn.functional.mse_loss
        self.loss_func_single = nn.functional.mse_loss

    def predict(self, input):
        """Score ``input`` in eval mode and return the scores on the CPU.

        No autograd graph is built; the result is detached.
        """
        self.network.eval()
        with torch.no_grad():
            scores = self.network.predict(input)
        return scores.detach().cpu()

    def eval_pairwise(self, input_l, input_r, targets):
        """Return the pairwise loss of one batch as a float, without training.

        The regularisation term used in ``update_pairwise`` is not included.
        """
        self.network.eval()
        with torch.no_grad():
            y_pred = self.network.forward_pairwise(input_l, input_r)
            # Column 0 of y_pred is compared against the labels.
            loss = self.loss_func_single(y_pred[:, 0], targets)
        return loss.item()

    def update_pairwise(self, input_l, input_r, targets):
        """Run one optimisation step on a batch and return its loss.

        The returned float is the pairwise loss plus ``config["reg"]`` times
        the sum of the 2-norms of all parameter tensors.
        """
        self.network.train()
        self.network.zero_grad()

        y_pred = self.network.forward_pairwise(input_l, input_r)
        loss = self.loss_func_single(y_pred[:, 0], targets)

        # Sum of per-tensor norms (not squared norms).
        l2_reg = sum(W.norm(2) for W in self.network.parameters())

        loss = loss + config["reg"] * l2_reg
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def save(self, path):
        """Write the network weights (its ``state_dict``) to ``path``.

        Missing parent directories are created. Only the weights are stored,
        not the pickled module, so the file can be read back with
        ``torch.load`` under its restrictive ``weights_only`` default.
        """
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(self.network.state_dict(), path)

    def load(self, path):
        """Load weights from ``path`` into the existing network.

        The network object is kept (rather than replaced) so the optimizer
        keeps pointing at live parameters. A checkpoint written by the older
        ``torch.save(network, path)`` format is still accepted when
        ``torch.load`` is able to unpickle it.
        """
        loaded = torch.load(path)
        if isinstance(loaded, nn.Module):
            # Legacy checkpoint holding the whole module.
            loaded = loaded.state_dict()
        self.network.load_state_dict(loaded)

def calculate_loss(data_loader, model, typ):
    """Run one pass over ``data_loader`` and return the mean batch loss.

    Args:
        data_loader: iterable of ``(left, right, target)`` batches.
        model: object exposing ``eval_pairwise`` and ``update_pairwise``.
        typ: ``"val"`` evaluates each batch without training; any other
            value performs a training step per batch.

    Returns:
        ``np.mean`` of the per-batch losses.
    """
    evaluating = typ == "val"
    batch_losses = []
    for inl, inr, target in data_loader:
        if evaluating:
            batch_losses.append(model.eval_pairwise(inl, inr, target))
        else:
            batch_losses.append(model.update_pairwise(inl, inr, target))
    return np.mean(batch_losses)

In [None]:
# The network's input width is the length of one training feature vector.
model = RankerNet(num_feat=train_dataset.num_feat)

In [None]:
print(train_dataset.num_feat)

# Make sure the checkpoint directory exists before the first save; on a
# fresh checkout it does not.
os.makedirs(config["model_path"], exist_ok=True)
model_save_path = os.path.join(config["model_path"], "model.pt")

best_val_loss = float('inf')
best_val_iteration = 0
# Set once this run has written a checkpoint, so the reload after the loop
# never picks up a stale file from an earlier run (or a missing one).
checkpoint_written = False

for i in range(config["epochs"]):
    print('EPOCH '+str(i))
    train_loss = calculate_loss(train_loader, model,'train')
    val_loss = calculate_loss(valid_loader, model,'val')

    print('Train loss '+ str(train_loss))
    print('Validation loss '+str(val_loss))

    if best_val_loss > val_loss:
        print('Got a new best model, SAVING!!')
        model.save(model_save_path)
        checkpoint_written = True
        best_val_loss = val_loss
        best_val_iteration = 0

    # Counts epochs since the last improvement; it is 1 right after a save.
    best_val_iteration += 1

    if best_val_iteration > config["early_stopping"]:
        print("doing early stopping")
        break

# Restore the best weights seen during this run. If the validation loss was
# never below infinity (e.g. it was NaN because the validation set is empty)
# nothing was saved and the in-memory weights are kept.
if checkpoint_written:
    model.load(model_save_path)
else:
    print('No checkpoint was saved in this run; keeping the current weights')

In [None]:
# Filter the test set and compute the baseline statistics, then score it with
# the restored model. `correct` and `wrongs` hold [question_index,
# picked_candidate_index] entries.
evaluator.initial_params()
correct, wrongs = evaluator.evaluate(model=model)