In [15]:
import json
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable
from torch.nn.functional import cross_entropy
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

In [4]:
# Hyper-parameters and data settings read by the cells below.
params = {
    "epochs": 10,              # number of passes over the data loader
    "lr": 1e-5,                # learning rate handed to the Adam optimizer
    "num_feat": 10,            # input width of the ReRanker's first linear layer
    "lin_dim": 10,             # hidden width of the ReRanker
    "reg": 0.1,                # weight of the parameter-norm penalty added to the loss
    "batch_size": 32,          # DataLoader batch size
    "pairs_max_depth": 10,     # max_depth argument passed to gen_rank_pairs
    "pairs_max_num": 100,      # max_num argument passed to gen_rank_pairs
    "filenames": ["test1.txt", "test2.txt"],  # input files read by gen_subsample
    # Probability that the pairs from one input line go to the validation
    # split; gen_subsample reads this key, which was previously missing and
    # caused a KeyError. 0.1 is a placeholder value -- tune as needed.
    "val_split": 0.1,
}

In [5]:
def gen_rank_pairs(data, max_depth, max_num):
    """Build (higher, lower) training pairs from an ordered list of records.

    Every record is compared with the records that precede it, restricted to
    the first ``max_depth`` entries of ``data``. Whenever the two ``"target"``
    values differ by exactly 1, the pair is emitted with the higher-target
    record first.

    Args:
        data: sequence of mappings, each with a numeric ``"target"`` entry.
        max_depth: only the first ``max_depth`` records are used as comparison
            partners for later records.
        max_num: maximum number of pairs to return. A value <= 0 yields an
            empty list (previously the cap was silently ignored in that case).

    Returns:
        A list of ``(higher, lower)`` tuples holding the original record
        objects, at most ``max_num`` long.
    """
    pairs = []
    if max_num <= 0:
        return pairs

    for i, current in enumerate(data):
        # Partners are drawn from the head of the list, never from `current`
        # itself or anything after it.
        for earlier in data[:min(i, max_depth)]:
            if current["target"] - earlier["target"] == 1:
                pairs.append((current, earlier))
            elif earlier["target"] - current["target"] == 1:
                pairs.append((earlier, current))
            else:
                continue
            # Stop as soon as the cap is reached.
            if len(pairs) >= max_num:
                return pairs
    return pairs

In [16]:
def gen_subsample(filenames, val_split=None):
    """Read JSON-lines files and split their ranking pairs into train/val.

    Each non-blank line of every file is parsed as JSON and handed to
    ``gen_rank_pairs`` (depth and count limits come from the module-level
    ``params``). All pairs produced from one line are assigned together,
    at random, to either the validation or the training bucket of that file.

    Args:
        filenames: iterable of paths to JSON-lines files.
        val_split: probability that a line's pairs go to validation. When
            ``None`` the value of ``params["val_split"]`` is used, falling
            back to 0.1 if that key is absent.

    Returns:
        ``(train, val)``: two lists with one inner list of pairs per file,
        in the same order as ``filenames``.
    """
    if val_split is None:
        # The config originally lacked this key, which raised a KeyError here.
        val_split = params.get("val_split", 0.1)
    max_depth = params["pairs_max_depth"]
    max_num = params["pairs_max_num"]

    train = []
    val = []
    for path in filenames:
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with open(path, "r", encoding="utf-8") as f:
            file_train = []
            file_val = []
            train.append(file_train)
            val.append(file_val)
            for line in f:
                # Blank lines (e.g. a trailing newline) are not valid JSON.
                if not line.strip():
                    continue
                ans = json.loads(line)
                if not ans:
                    continue
                pairs = gen_rank_pairs(ans, max_depth, max_num)
                if np.random.random() < val_split:
                    file_val.extend(pairs)
                else:
                    file_train.extend(pairs)
    return train, val

In [14]:
# Ten toy records named 'a'..'j' whose targets alternate 0, 1, 0, 1, ...
dummy_data = [
    {"target": idx % 2, "name": chr(ord("a") + idx)}
    for idx in range(10)
]

# Sanity check: pair generation with a comparison depth of 2 and a cap of 10.
gen_rank_pairs(dummy_data, 2, 10)

[({'target': 1, 'name': 'b'}, {'target': 0, 'name': 'a'}),
 ({'target': 1, 'name': 'b'}, {'target': 0, 'name': 'c'}),
 ({'target': 1, 'name': 'd'}, {'target': 0, 'name': 'a'}),
 ({'target': 1, 'name': 'b'}, {'target': 0, 'name': 'e'}),
 ({'target': 1, 'name': 'f'}, {'target': 0, 'name': 'a'}),
 ({'target': 1, 'name': 'b'}, {'target': 0, 'name': 'g'}),
 ({'target': 1, 'name': 'h'}, {'target': 0, 'name': 'a'}),
 ({'target': 1, 'name': 'b'}, {'target': 0, 'name': 'i'}),
 ({'target': 1, 'name': 'j'}, {'target': 0, 'name': 'a'})]

In [3]:
class rankqaDS(Dataset):
    """Pairwise ranking dataset with randomly ordered pair members.

    Args:
        sub_data: iterable of ``(d1, d2)`` tuples in which ``d1`` is the
            preferred (higher-target) element, which is the order
            ``gen_rank_pairs`` emits.

    Each pair is stored in a random order. The label is 1 when the preferred
    element sits in the first slot and 0 otherwise, so it is a valid binary
    label even when the raw ``"target"`` values are not restricted to 0/1
    (previously the raw target of the first element was used as the label).

    NOTE(review): items are returned exactly as supplied (dicts in this
    notebook); confirm they are converted to feature tensors before being
    fed to the model.
    """

    def __init__(self, sub_data):
        self.x1 = []
        self.x2 = []
        self.y = []
        for preferred, other in sub_data:
            # Randomise the slot of the preferred element so the label is not
            # constant across the dataset.
            if np.random.random() <= 0.5:
                first, second, label = preferred, other, 1
            else:
                first, second, label = other, preferred, 0
            self.x1.append(first)
            self.x2.append(second)
            self.y.append(label)

    def __getitem__(self, indx):
        """Return ``(x1, x2, y)`` for position ``indx``."""
        return self.x1[indx], self.x2[indx], self.y[indx]

    def __len__(self):
        """Number of stored pairs."""
        return len(self.y)

In [None]:
# Parse the configured input files into ranking pairs; the result is the
# (train, val) tuple returned by gen_subsample.
sub_data = gen_subsample(filenames=params["filenames"])

In [None]:
# gen_subsample returns (train, val), each holding one list of pairs per input
# file, whereas rankqaDS expects a flat iterable of (d1, d2) pairs. Flatten the
# training split before wrapping it; the validation split is not used here.
train_per_file, _val_per_file = sub_data
train_pairs = [pair for file_pairs in train_per_file for pair in file_pairs]
f_data = rankqaDS(train_pairs)
data_loader = DataLoader(f_data, batch_size=params["batch_size"])

In [4]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"device - {device}")

device - cpu


In [5]:
class ReRanker(nn.Module):
    """Pairwise re-ranker: scores two inputs with one shared two-layer network.

    Both inputs go through the same ``Linear -> ReLU -> Linear`` stack, and the
    sigmoid of the score difference is returned, so the output lies in [0, 1]
    and grows as the first input's score exceeds the second's.

    Args:
        num_feat: size of the last dimension of each input.
        lin_dim: width of the hidden layer.
    """

    def __init__(self, num_feat, lin_dim):
        super(ReRanker, self).__init__()
        self.l1 = nn.Linear(num_feat, lin_dim)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(lin_dim, 1)
        self.sig = nn.Sigmoid()

    def _score(self, x):
        """Scalar score for each row of ``x`` using the shared layers."""
        return self.l2(self.relu(self.l1(x)))

    def forward(self, x1, x2):
        """Return ``sigmoid(score(x1) - score(x2))``.

        Args:
            x1: tensor whose last dimension is ``num_feat``.
            x2: tensor with the same shape as ``x1``.

        Returns:
            Tensor shaped like the inputs but with a last dimension of 1.
        """
        # `self` was missing from the original signature, so any call raised
        # a TypeError before reaching the layers.
        return self.sig(self._score(x1) - self._score(x2))

In [6]:
# Instantiate the re-ranker from the configured layer sizes.
model = ReRanker(num_feat=params["num_feat"], lin_dim=params["lin_dim"])
# Move it to the selected device; the returned module is the cell's output.
model.to(device)

ReRanker(
  (l1): Linear(in_features=10, out_features=10, bias=True)
  (relu): ReLU()
  (l2): Linear(in_features=10, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [11]:
# Mean-squared error between the model's sigmoid output and the pair label.
loss_func = nn.MSELoss()
# Adam over every model parameter, using the configured learning rate.
opt = optim.Adam(params=model.parameters(), lr=params["lr"])

In [None]:
# Training loop: MSE on the pairwise prediction plus a penalty on the sum of
# the parameters' 2-norms, with a checkpoint after every second epoch and
# after the final one.
for epoch in range(params["epochs"]):
    total_loss = 0
    for batch in tqdm(data_loader):
        # rankqaDS yields (x1, x2, y) triples; the original unpacked a fourth
        # value that does not exist.
        x1, x2, target = batch
        # NOTE(review): the calls below assume x1/x2 arrive as tensors whose
        # last dimension is params["num_feat"]; rankqaDS currently stores the
        # raw records it was given -- confirm feature extraction happens first.
        x1 = x1.to(device)
        x2 = x2.to(device)
        # Labels are collated as integers; MSELoss needs a floating target.
        target = target.to(device).float()
        # Call the module rather than .forward() so registered hooks run.
        y_pred = model(x1, x2)
        loss = loss_func(y_pred[:, 0], target)
        # Fixed typo: the method is `parameters`, not `paramters`.
        reg_l2 = sum(p.norm(2) for p in model.parameters())
        loss += params["reg"] * reg_l2
        loss.backward()
        opt.step()
        opt.zero_grad()
        total_loss += loss.detach().item()

    if epoch % 2 == 1 or epoch == params["epochs"] - 1:
        # torch.save does not create missing parent directories.
        os.makedirs("./checkpoints", exist_ok=True)
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': opt.state_dict(),
            'loss': total_loss,
        }, "./checkpoints/model.pt")
    print(f"Total Loss - {total_loss}")