In [1]:
from losses import rankNet

import glob
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

device = torch.device("cuda:0")

torch.set_float32_matmul_precision('high')

# Data Loader

In [2]:
class PairWiseDataset(Dataset):
    def __init__(self, root_dir="./datasets/istella-letor/train_parquet/*"):
        """
        Arguments:
            root_dir (string): Directory with all the queries.
        """
        self.queries = list()
        for query in glob.glob(root_dir):
            self.queries.append(query)

    def __len__(self):
        return len(self.queries)

    def __getitem__(self, idx):
        df = pd.read_parquet(glob.glob(self.queries[idx] + "/*.parquet")[0])

        labels = torch.tensor(df["label"].to_numpy().reshape(-1))
        length_labels = labels.shape[0]

        # TODO: Make 433 a variable
        labels = F.pad(labels, (0, 433 - length_labels), "constant", -1)

        features = torch.tensor(
            np.array(df["features.values"].values.tolist(), dtype=np.float32)
        )
        features = F.pad(features, (0, 0, 0, 433 - length_labels), "constant", -1)

        return features, labels

In [3]:
train_dataset = PairWiseDataset()
train_dataloader = DataLoader(
    train_dataset,
    num_workers=8,
    batch_size=32,
    shuffle=True,
    prefetch_factor=10,
    pin_memory=True,
)

test_dataset = PairWiseDataset(root_dir="./datasets/istella-letor/test_parquet/*")
test_dataloader = DataLoader(
    test_dataset,
    num_workers=4,
    batch_size=256,
    prefetch_factor=4,
)

In [4]:
# for features, labels in train_dataloader:
#     # Here, anchor, positive, and negative are batches of samples
#     print(f"positive: {features}\n{features.shape}\n\n")
#     print(f"negative: {labels}\n{labels.shape}")
#     break

In [5]:
class DNNLTR(nn.Module):
    def __init__(self, input_size, hidden_sizes=[220, 128, 64, 32], dropout_rate=0.3):
        super(DNNLTR, self).__init__()
        layers = []
        for i in range(len(hidden_sizes)):
            if i == 0:
                layers.append(nn.Linear(input_size, hidden_sizes[i]))
            else:
                layers.append(nn.Linear(hidden_sizes[i-1], hidden_sizes[i]))
            layers.append(nn.BatchNorm1d(hidden_sizes[i]))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
        
        layers.append(nn.Linear(hidden_sizes[-1], 1))
        
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        x = self.layers(x)
        return x

In [6]:
input_size = 220  # Example input size
output_size = 1  # Output size is 1 for ranking scores
learning_rate = 0.001
num_epochs = 100

model = DNNLTR(input_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

In [7]:
model.compile()

In [8]:
from sklearn.metrics import ndcg_score

In [9]:
def train_one_epoch(model, epoch, data_loader, optimizer, scheduler, writer):
    running_loss = 0.0
    ndcgs = list()
    model.train()
    for i, data in enumerate(tqdm(data_loader)):
        features = data[0].to(device)
        label = data[1].float().to(device)

        optimizer.zero_grad()

        # # Forward pass
        output = model(features.reshape(-1, 220)).reshape(-1, 433)
        # print(output)
        # print(output.shape)

        # output2 = model(x2)

        # # Compute the loss
        loss = rankNet(output, label)
        running_loss += loss.item()

        # # Backward pass and optimize
        loss.backward()
        optimizer.step()

        for true, pred in zip(label.to("cpu"), output.to("cpu")):
            ndcgs.append(
                ndcg_score([true[true != -1].float().detach().numpy()], [pred[true != -1].float().detach().numpy()])
            )
    
    scheduler.step()

    avg_loss = running_loss / len(data_loader)
    avg_ndcg = np.mean(ndcgs)
    print("Average Loss train: ", avg_loss)
    writer.add_scalar("Loss/train", avg_loss, epoch)
    writer.add_scalar("ndcg/train", avg_ndcg, epoch)

In [10]:
def validate(model, epoch, data_loader, writer):
    running_loss = 0.0
    ndcgs = list()
    model.eval()
    for i, data in enumerate(tqdm(data_loader)):
        features = data[0].to(device)
        label = data[1].float().to(device)
        with torch.no_grad():
            output = model(features.reshape(-1, 220)).reshape(-1, 433)
            
            loss = rankNet(output, label)
            running_loss += loss.item()

            for true, pred in zip(label.to("cpu"), output.to("cpu")):
                ndcgs.append(
                    ndcg_score([true[true != -1].float().numpy()], [pred[true != -1].float().numpy()])
                )


    avg_loss = running_loss / len(data_loader)
    avg_ndcg = np.mean(ndcgs)
    print("Average Loss test: ", avg_loss)
    writer.add_scalar("Loss/test", avg_loss, epoch)
    writer.add_scalar("ndcg/test", avg_ndcg, epoch)

In [11]:
for epoch in range(10):
    train_one_epoch(model, epoch, train_dataloader, optimizer, scheduler, writer)
    validate(model, epoch, train_dataloader, writer)

writer.flush()

 87%|████████▋ | 635/726 [02:18<00:19,  4.59it/s]


ValueError: Computing NDCG is only meaningful when there is more than 1 document. Got 1 instead.

In [None]:
# from sklearn.metrics import ndcg_score

In [None]:
# true[true != -1].float().numpy()

In [None]:
# # ndcgs = list()
# for true, pred in zip(label, output):
#     ndcgs.append(ndcg_score([true[true != -1].float().numpy()], [pred[true != -1].float().numpy()]))


In [None]:
# np.mean(ndcgs)

In [None]:
# label_2 = label.clone()
# output_2 = output.clone()

In [None]:
# label_2[label == -1] = 0
# output_2[label == -1] = 0

In [None]:
# ndcg_score(label_2, output_2)

In [None]:
# from ndcg import ndcg

In [None]:
# ndcg(label, output).reshape(-1).mean()

In [None]:
# true[true != -1]

100%|██████████| 363/363 [01:16<00:00,  4.77it/s]


Average Loss train:  0.3706666540672628


 10%|▉         | 35/363 [00:10<01:40,  3.26it/s]


KeyboardInterrupt: 

In [None]:
writer.close()