# Offline and Online Learning to Rank 

In [1]:
import os
import json
import itertools
from argparse import Namespace
from collections import OrderedDict
from functools import partial


import torch
import numpy as np
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F
from tqdm.notebook import tqdm, trange
from torch.utils.data import Dataset, DataLoader


import matplotlib.pyplot as plt
from matplotlib.pyplot import cm

import pandas as pd

import dataset
import evaluate


# Offline LTR 

In [2]:
# Fetch and load the dataset through the project-local `dataset` helper module.
# presumably download_dataset() is a no-op when the files already exist -- TODO confirm
dataset.download_dataset()
data = dataset.get_dataset()
# there is only 1 fold for this dataset, so keep the first entry of the fold list
data = data.get_data_folds()[0]
# read in the data; `data` is the module-level fold object the later cells rely on
data.read_data()

In [3]:
# Report the feature count, then the query/document totals of every split.
print(f"Number of features: {data.num_features}")
# print some statistics
for split in ["train", "validation", "test"]:
    print(f"Split: {split}")
    # NOTE: rebinds the loop variable from the split *name* to the split *object*
    split = getattr(data, split)
    print(f"\tNumber of queries {split.num_queries()}")
    print(f"\tNumber of docs {split.num_docs()}")

Number of features: 501
Split: train
	Number of queries 2735
	Number of docs 85227
Split: validation
	Number of queries 403
	Number of docs 12794
Split: test
	Number of queries 949
	Number of docs 29881


In [4]:
# this is a useful class for creating torch DataLoaders; it can be used during training
class LTRData(Dataset):
    """Document-level dataset: one item per row of a split's feature matrix."""

    def __init__(self, data, split):
        """
        Store the features and labels of the requested split as float tensors.

        data: fold object exposing `train`, `validation` and `test` splits
        split: one of "train", "validation" or "test"
        """
        available = {
            "train": data.train,
            "validation": data.validation,
            "test": data.test,
        }
        chosen = available.get(split)
        assert chosen is not None, "Invalid split!"
        self.features = torch.FloatTensor(chosen.feature_matrix)
        self.labels = torch.FloatTensor(chosen.label_vector)

    def __len__(self):
        # One item per feature row.
        return self.features.size(0)

    def __getitem__(self, i):
        # Row i of the feature matrix together with its label.
        row = self.features[i]
        label = self.labels[i]
        return row, label

In [6]:
# this function evaluates a model, on a given split
def evaluate_model(pred_fn, split, batch_size=256, print_results=False, q_level=False):
    """
    Score every document of a split with `pred_fn` and hand the scores and
    labels to `evaluate.evaluate2`.

    pred_fn: callable applied to each feature batch; its output is squeezed
        and converted with `.numpy()`, so it must return a CPU tensor
    split: name of the split to evaluate, passed on to `LTRData`
    batch_size: number of documents scored per `pred_fn` call
    print_results, q_level: forwarded unchanged to `evaluate.evaluate2`

    Returns: whatever `evaluate.evaluate2` returns (the training loops in this
        file call `.items()` on it).
    """
    dl = DataLoader(LTRData(data, split), batch_size=batch_size)
    all_scores = []
    all_labels = []
    for (x, y) in tqdm(dl, desc=f'Eval ({split})', leave=False):
        all_labels.append(y.squeeze().numpy())

        # Inference only: no autograd graph is needed for the scores.
        with torch.no_grad():
            output = pred_fn(x)
            all_scores.append(output.squeeze().numpy())

    # NOTE(review): the per-batch arrays are stacked with np.asarray, not
    # concatenated. When the final batch is shorter the list is ragged, which
    # recent NumPy releases reject -- confirm what evaluate2 expects before
    # changing this.
    results = evaluate.evaluate2(np.asarray(all_scores), np.asarray(all_labels), print_results=print_results, q_level=q_level)

    return results


In [8]:
# use to get reproducible results
def seed(random_seed):
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN flags for determinism."""
    import random

    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    

## Pointwise LTR 

Let $x \in \mathbb{R}^d$ be an input feature vector, containing features for a query-document pair. Let $f: \mathbb{R}^d \rightarrow \mathbb{R} $ be a function that maps this feature vector to a number $f(x)$ - either a relevance score (regression) or label (classification). The data $\{x \}$ are treated as feature vectors and the relevance judgements are treated as the target which we want to predict. 


In [9]:
class NeuralModule(nn.Module):
    """Two-layer feed-forward scorer: Linear -> ReLU -> Linear."""

    def __init__(self, output_dim, input_dim=501, hidden_dim=256):
        """
        output_dim: width of the output layer (1 for regression/ranking
            scores, the number of classes for classification)
        input_dim: number of input features; defaults to the 501 features
            that were previously hard-coded
        hidden_dim: width of the hidden layer; defaults to the previously
            hard-coded 256
        """
        super().__init__()
        # Layer order is unchanged, so state_dict keys (net.0.*, net.2.*) and
        # seeded initialisation stay the same as before.
        self.net = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=output_dim),
        )

    def forward(self, x):
        """
        Takes in an input feature vector and produces the (regression/classification) output
        Input: x: a [N, input_dim] tensor
        Output: a [N, output_dim] tensor
        """
        return self.net(x)

In [10]:
# Five-output network for the classification variant (the width clf_loss asserts on).
point_nn_clf = NeuralModule(5)
# Single-output network for the regression variant (the width reg_loss asserts on).
point_nn_reg = NeuralModule(1)

**Implementation :**
Implement the cross-entropy loss and then the cross-entropy prediction.

In [11]:
def clf_loss(output, target):
    """
    Mean cross-entropy between 5-way logits and relevance labels.

    output: [N, 5] tensor of logits
    target: [N] tensor of labels; cast to integer class indices before use

    Returns: a scalar tensor
    """
    batch = output.size(0)
    assert batch == target.size(0)
    assert output.size(1) == 5
    class_ids = target.long()
    return F.cross_entropy(output, class_ids)


In [13]:
def clf_pred(inp, net):
    """
    Predict a class per row: run `net` on `inp` and return, for each row,
    the index of the largest output along dim 1.
    """
    logits = net(inp)
    _, predicted = torch.max(logits, dim=1)
    return predicted


In [None]:
# Bind the classification network so the predictor takes only the input batch.
clf_pred_fn = partial(clf_pred, net=point_nn_clf)


**Implementation :**
Implement regression loss.

In [None]:
def reg_loss(output, target):
    """
    Mean squared error between predicted scores and relevance labels.

    output: [N, 1] tensor of predicted scores
    target: [N] tensor of labels

    Returns: a scalar tensor
    """
    assert target.dim() == 1
    assert output.size(0) == target.size(0)
    assert output.size(1) == 1
    # Lift target to [N, 1] so it matches output element for element rather
    # than being broadcast against it.
    target = torch.unsqueeze(target, 1)
    loss = nn.MSELoss()
    out = loss(output, target)
    return out

In [None]:
def train_pointwise(net, loss, params):
    """
    Train a pointwise network with Adam, using either the classification
    ("clf") or the regression ("reg") loss.

    net: the neural network to be trained

    loss: one of "clf" or "reg"

    params: config object read for `epochs`, `lr`, `batch_size` and `metrics`

    Returns: a dictionary with "metrics_val" and "metrics_train". Each is a
             list with one entry per epoch of the form
             {"Epoch <k>": {metric: value}}, restricted to the metrics named
             in params.metrics.
    """

    assert loss in {"clf", "reg"}

    val_history = []
    train_history = []
    loader = DataLoader(LTRData(data, "train"), batch_size=params.batch_size, shuffle=True)
    optimiser = Adam(net.parameters(), lr=params.lr)
    net.train()

    use_clf = loss == 'clf'
    for epoch in range(params.epochs):
        for (batch_x, batch_y) in loader:
            optimiser.zero_grad()
            predictions = net(batch_x)
            if use_clf:
                batch_loss = clf_loss(predictions, batch_y)
            else:
                batch_loss = reg_loss(predictions, batch_y)
            batch_loss.backward()
            optimiser.step()

        # Classification scores documents by predicted class index;
        # regression uses the raw network output.
        pred_fn = partial(clf_pred, net=net) if use_clf else net
        tag = f'Epoch {epoch+1}'
        # Train first, then validation -- same evaluation order every epoch.
        for split_name, history in (("train", train_history), ("validation", val_history)):
            all_metrics = evaluate_model(pred_fn, split_name, print_results=False)
            kept = {k: v for (k, v) in all_metrics.items() if k in params.metrics}
            history.append({tag: kept})

    return {
        "metrics_val": val_history,
        "metrics_train": train_history
    }

In [None]:
# Short test configuration for the pointwise trainers: 2 epochs, reporting only "ndcg".
pointwise_test_params = Namespace(epochs=2, 
                    lr=1e-3,
                    batch_size=256,
                   metrics={"ndcg"})
## train a regression model (runs first, so it is the first consumer of RNG state)
met_reg = train_pointwise(point_nn_reg, "reg", pointwise_test_params)
## train a classification model
met_clf = train_pointwise(point_nn_clf, "clf", pointwise_test_params)

## Pairwise LTR

For a given query, consider two documents $D_i$ and $D_j$ with two different ground truth relevance  labels,  with  feature  vectors $x_i$ and $x_j$ respectively.   The  RankNet  model,  just  like  the pointwise model, uses $f$ to predict scores i.e $s_i=f(x_i)$ and $s_j=f(x_j)$, but uses a different loss during  training. $D_i \triangleright D_j$ denotes  the  event  that $D_i$ should  be  ranked  higher  than $D_j$.   The  two outputs $s_i$ and $s_j$ are mapped to a learned probability that $D_i \triangleright D_j$: 


$$        P_{ij} = \frac{1}{1 + e^{-\sigma(s_i - s_j)}} $$
  
where $\sigma$ is a parameter that determines the shape of the sigmoid. The loss of the RankNet model is the cross entropy cost function:

$$        C = - \bar{P}_{ij} \log P_{ij} - (1-\bar{P}_{ij}) \log (1 - P_{ij}) $$

As the name suggests, in the pairwise approach to LTR, we optimize a loss $l$ over pairs of documents. Let $S_{ij} \in \{0, \pm1 \}$ be equal to $1$ if the relevance of document $i$ is greater than document $j$; $-1$ if document $j$ is more relevant than document $i$; and 0 if they have the same relevance. This gives us $\bar{P}_{ij} = \frac{1}{2} (1 + S_{ij})$ so that $\bar{P}_{ij} = 1$ if $D_i \triangleright D_j$; $\bar{P}_{ij} = 0$ if $D_j \triangleright D_i$; and finally $\bar{P}_{ij} = \frac{1}{2}$ if the relevance is identical. This gives us:

$$        C = \frac{1}{2}(1- S_{ij})\sigma(s_i - s_j) + \log(1+ e^{-\sigma(s_i - s_j)}) $$

Now, consider a single query for which $n$ documents have been returned. Let the output scores of the ranker be $s_j$ ; $j=\{1, \dots, n \}$, the model parameters be $w_k \in \mathbb{R}^W$, and let the set of pairs of document indices used for training be $\mathcal{P}$. Then, the total cost is $C_T = \sum_{i,j \in \mathcal{P}} C(s_i; s_j)$. 


In [None]:
class QueryGroupedLTRData(Dataset):
    """Query-level dataset: one item per query, holding all of its documents."""

    def __init__(self, data, split):
        """
        data: fold object exposing `train`, `validation` and `test` splits
        split: one of "train", "validation" or "test"
        """
        by_name = {
            "train": data.train,
            "validation": data.validation,
            "test": data.test,
        }
        self.split = by_name.get(split)
        assert self.split is not None, "Invalid split!"

    def __len__(self):
        # One item per query.
        return self.split.num_queries()

    def __getitem__(self, q_i):
        # Returns (query index, document features, document labels) as float tensors.
        doc_features = torch.FloatTensor(self.split.query_feat(q_i))
        doc_labels = torch.FloatTensor(self.split.query_labels(q_i))
        return q_i, doc_features, doc_labels

def qg_collate_fn(batch):
    """
    Collate query-grouped items without stacking them.

    batch: iterable of (query id, features, labels) triples

    Returns: three parallel lists (query ids, features, labels). Queries are
        kept as separate list entries, so each may have its own number of
        documents.
    """
    qids = []
    features = []
    labels = []

    for (q, f, l) in batch:
        qids.append(q)
        features.append(f)
        labels.append(l)

    return qids, features, labels


# Demo: build the query-grouped loader at module level and show the first query.
# (The loader must be created here, outside the function, for the loop to see it.)
train_dl = DataLoader(QueryGroupedLTRData(data, "train"), batch_size=1, shuffle=True, collate_fn=qg_collate_fn)
for (qids, x, y) in train_dl:
    for q_i, features_i, labels_i in zip(qids, x, y):
        print(f"Query {q_i} has {len(features_i)} query-document pairs")
        print(f"Shape of features for Query {q_i}: {features_i.size()}")
        break
    break
        

In [None]:
def pairwise_loss(scores, labels):
    """
    Compute the RankNet pairwise loss for one query. (assume sigma=1.)

    For every ordered pair (i, j) with i != j:
        C_ij = 0.5 * (1 - S_ij) * (s_i - s_j) + log(1 + exp(-(s_i - s_j)))
    where S_ij is the sign of (label_i - label_j).

    scores: tensor of size [N, 1] (or [N]), the output of a neural network
    labels: tensor of size [N], contains the relevance labels

    Returns: the mean of C_ij over all off-diagonal pairs as a scalar tensor,
        or None when the query has fewer than two documents (no pairs).
    """
    if labels.size(0) < 2:
        return None
    n = scores.size(0)
    if scores.size() != (n, 1):
        scores = torch.unsqueeze(scores, 1)

    # distances[i, j] = s_i - s_j
    distances = scores - scores.t()
    label_col = torch.unsqueeze(labels, 1)
    # S_ij in {-1, 0, +1}: sign of the label difference
    S_ij = torch.sign(label_col - label_col.t())

    # softplus(-d) == log(1 + exp(-d)), but stays finite for large |d|,
    # where the naive form overflows to inf.
    loss_per_pair = 0.5 * (1 - S_ij) * distances + F.softplus(-distances)

    # Drop the (i, i) self-pairs before averaging.
    off_diagonal = ~torch.eye(n, dtype=torch.bool, device=loss_per_pair.device)
    final_loss = torch.mean(loss_per_pair.masked_select(off_diagonal))
    return final_loss

    

In [None]:
def train_pairwise(net, params):
    """
    Train the given network with the pairwise (RankNet) loss, one query per
    optimisation step, using the Adam optimizer.

    Queries for which the loss function returns None (fewer than two
    documents, hence no pairs) are skipped.

    net: the neural network to be trained

    params: config object read for `epochs`, `lr` and `metrics`

    Returns: a dictionary containing: "metrics_val" (a list of dictionaries) and
             "metrics_train" (a list of dictionaries).

             Each list has one entry per epoch of the form
             {"Epoch <k>": {metric: value}}, restricted to params.metrics.
    """

    val_metrics_epoch = []
    train_metrics_epoch = []
    train_dl = DataLoader(QueryGroupedLTRData(data, "train"), batch_size=1, shuffle=True, collate_fn=qg_collate_fn)
    optimiser = Adam(net.parameters(), lr=params.lr)
    net.train()
    for i in range(params.epochs):
        for (qids, x, y) in train_dl:
            for q_i, features_i, labels_i in zip(qids, x, y):
                optimiser.zero_grad()
                outputs = net(features_i)
                loss_value = pairwise_loss(outputs, labels_i)
                # No pairs for this query: nothing to backpropagate.
                if loss_value is None:
                    continue
                loss_value.backward()
                optimiser.step()

        # Evaluate on train first, then validation, after every epoch.
        for split_name, history in (("train", train_metrics_epoch), ("validation", val_metrics_epoch)):
            all_metrics = evaluate_model(net, split_name, print_results=False)
            selected = {k: v for (k, v) in all_metrics.items() if k in params.metrics}
            history.append({f'Epoch {i+1}': selected})

    return {
        "metrics_val": val_metrics_epoch,
        "metrics_train": train_metrics_epoch
    }

## Pairwise: Speed-up RankNet


To speed up training of the previous model, we can consider a sped-up version where, instead of calling `.backward()` on the loss, we call `torch.autograd.backward(scores, lambda_i)` with precomputed gradients $\lambda_i$. 

The derivative of the total cost $C_T$ with respect to the model parameters $w_k$ is:

$$        \frac{\partial C_T}{\partial w_k} = \sum_{(i,j) \in \mathcal{P}} \frac{\partial C(s_i, s_j)}{\partial s_i} \frac{\partial s_i}{\partial w_k} + \frac{\partial C(s_i, s_j)}{\partial s_j} \frac{\partial s_j}{\partial w_k} $$

We can rewrite this sum by considering the set of indices $j$ , for which $\{i,j\}$ is a valid pair, denoted by $\mathcal{P}_i$, and the set of document indices $\mathcal{D}$:

$$
\frac{\partial C_T}{\partial w_k} = \sum_{i \in \mathcal{D}}
\frac{\partial s_i}{\partial w_k} \sum_{j \in \mathcal{P}_i} 
\frac{\partial C(s_i, s_j)}{\partial s_i} 
$$

This sped-up version of the algorithm first computes the scores $s_i$ for all the documents. Then, for each $j = 1, \dots, n$, it computes:

$$
\lambda_{ij} = \frac{\partial C(s_i, s_j)}{\partial s_i} = \sigma \bigg( \frac{1}{2}(1 - S_{ij}) -  \frac{1}{1 + e^{\sigma(s_i - s_j)}} \bigg) \\
\lambda_i = \sum_{j \in \mathcal{P}_i} \frac{\partial C(s_i, s_j)}{\partial s_i} = \sum_{j \in \mathcal{P}_i} \lambda_{ij}
$$

That gives us:

$$
\frac{\partial C_T}{\partial w_k} = \sum_{i \in \mathcal{D}}
\frac{\partial s_i}{\partial w_k} \lambda_i
$$

This can be directly optimized in pytorch using: `torch.autograd.backward(scores, lambda_i)` 
 


In [None]:
def compute_lambda_i(scores, labels):
    r"""
    Compute \lambda_i, the summed pairwise RankNet gradients per document.
    (assume sigma=1.)

    \lambda_{ij} = 0.5 * (1 - S_ij) - 1 / (1 + exp(s_i - s_j)), where S_ij is
    the sign of (label_i - label_j), and \lambda_i = sum_j \lambda_{ij}.
    The j == i term is 0.5 - 0.5 = 0, so including it does not change the sum.

    scores: tensor of size [N, 1] (the output of a neural network), where N = length of <query, document> pairs
    labels: tensor of size [N], contains the relevance labels

    return: \lambda_i, a tensor of shape: [N, 1]
    """
    n = scores.size(0)
    if scores.size() != (n, 1):
        scores = torch.unsqueeze(scores, 1)

    # distances[i, j] = s_i - s_j
    distances = scores - scores.t()
    label_col = torch.unsqueeze(labels, 1)
    # S_ij in {-1, 0, +1}: sign of the label difference
    S_ij = torch.sign(label_col - label_col.t())

    # sigmoid(-d) == 1 / (1 + exp(d))
    lambda_ij = 0.5 * (1 - S_ij) - torch.sigmoid(-distances)
    lambda_i = torch.sum(lambda_ij, dim=1, keepdim=True)
    return lambda_i

In [None]:
def train_pairwise_spedup(net, params):
    """
    Train the given network with the sped-up pairwise loss: per query, the
    precomputed lambda_i values are passed as the gradients of the scores to
    torch.autograd.backward, and Adam then takes a step.

    net: the neural network to be trained

    params: config object read for `epochs`, `lr` and `metrics`

    Returns: a dictionary with "metrics_val" and "metrics_train". Each is a
             list with one entry per epoch of the form
             {"Epoch <k>": {metric: value}}, restricted to params.metrics.
    """

    val_history = []
    train_history = []

    loader = DataLoader(QueryGroupedLTRData(data, "train"), batch_size=1, shuffle=True, collate_fn=qg_collate_fn)
    optimiser = Adam(net.parameters(), lr=params.lr)
    net.train()
    for epoch in range(params.epochs):
        for (query_ids, feature_list, label_list) in loader:
            for _, doc_features, doc_labels in zip(query_ids, feature_list, label_list):
                optimiser.zero_grad()
                doc_scores = net(doc_features)
                score_grads = compute_lambda_i(doc_scores, doc_labels)
                torch.autograd.backward(doc_scores, score_grads)
                optimiser.step()

        tag = f'Epoch {epoch+1}'
        # Train first, then validation -- same evaluation order every epoch.
        for split_name, history in (("train", train_history), ("validation", val_history)):
            all_metrics = evaluate_model(net, split_name, print_results=False)
            kept = {k: v for (k, v) in all_metrics.items() if k in params.metrics}
            history.append({tag: kept})

    return {
        "metrics_val": val_history,
        "metrics_train": train_history
    }

In [None]:
# Single-epoch test run of the sped-up RankNet trainer, reporting "ndcg@10" and "ndcg".
pairwise_spedup_params_test = Namespace(epochs=1, lr=1e-3, batch_size=1, metrics={"ndcg@10","ndcg"})
# Fresh single-output scoring network for this run.
pairwise_net_spedup = NeuralModule(1)
# The returned metrics dictionary is not stored here.
train_pairwise_spedup(pairwise_net_spedup, pairwise_spedup_params_test)

## Listwise LTR
 Given a ranking measure $IRM$, such as $NDCG$ or $ERR$, the lambda function in LambdaRank is defined as:


$$        \frac{\partial C}{\partial s_i} = \sum_{j \in D} \lambda_{ij} \cdot |\bigtriangleup IRM (i,j)| $$

Where $|\bigtriangleup IRM(i,j)|$ is the absolute difference in $IRM$ after swapping the rank positions $r_i$ and $r_j$ while leaving everything else unchanged ($| \cdot |$ denotes the absolute value). Note that we do not backpropagate through $|\bigtriangleup IRM|$; it is treated as a constant that scales the gradients. In this assignment we will use $|\bigtriangleup NDCG|$.

In [None]:
def listwise_loss(scores, labels):
    r"""
    Compute the LambdaRank gradients for one query. (assume sigma=1.)

    For every pair of documents (i, j) the RankNet gradient \lambda_{ij} is
    scaled by |\Delta NDCG(i, j)|, the absolute NDCG change obtained by
    swapping the current rank positions of documents i and j. The scaling
    matrix is indexed by document, in the same order as `scores` and
    `labels`, so it lines up element-wise with \lambda_{ij}.

    scores: tensor of size [N, 1] (the output of a neural network), where N = length of <query, document> pairs
    labels: tensor of size [N], contains the relevance labels

    returns: a tensor of size [N, 1], the gradient with respect to each score
    """
    n = scores.size(0)
    if scores.size() != (n, 1):
        scores = torch.unsqueeze(scores, 1)

    labels_np = labels.detach().cpu().numpy()
    scores_np = scores.detach().cpu().numpy().ravel()

    # Shuffle before sorting so that documents with tied scores receive a
    # random relative order instead of one fixed by their input position.
    shuffle = np.random.permutation(n)
    ranked_docs = shuffle[np.argsort(scores_np[shuffle])[::-1]]  # doc index at each rank
    rank_of_doc = np.empty(n, dtype=np.int64)
    rank_of_doc[ranked_docs] = np.arange(n)  # invert: rank of each doc

    ideal_labels = np.sort(labels_np)[::-1]
    epsilon = 0.00000000001
    # k=0 presumably means "no cutoff" in evaluate.dcg_at_k -- TODO confirm
    idcg = evaluate.dcg_at_k(ideal_labels, 0)
    # We add epsilon so that when idcg = 0 (when all labels are zero) we don't divide by zero
    inv_idcg = float(1.0 / (idcg + epsilon))

    # |delta NDCG| in document order: gain difference times the difference of
    # the discounts at the two documents' current ranks.
    gains = torch.pow(2.0, labels.detach()).unsqueeze(1)
    ranks = torch.from_numpy(rank_of_doc).to(scores.device)
    discounts = (1.0 / torch.log2(ranks + 2.0)).unsqueeze(1)
    delta_ndcg = torch.abs(inv_idcg * (gains - gains.t()) * (discounts - discounts.t()))

    # RankNet lambda_ij; distances[i, j] = s_i - s_j, S_ij = sign(label_i - label_j)
    distances = scores - scores.t()
    label_col = torch.unsqueeze(labels, 1)
    S_ij = torch.sign(label_col - label_col.t())
    lambda_ij = 0.5 * (1 - S_ij) - torch.sigmoid(-distances)

    lambda_r = torch.sum(lambda_ij * delta_ndcg, dim=1, keepdim=True)
    return lambda_r

In [None]:
def train_listwise(net, params):
    """
    Train the given network with the listwise (LambdaRank) loss: per query,
    the output of listwise_loss is passed as the gradients of the scores to
    torch.autograd.backward, and Adam then takes a step.

    net: the neural network to be trained

    params: config object read for `epochs`, `lr` and `metrics`

    Returns: a dictionary with "metrics_val" and "metrics_train". Each is a
             list with one entry per epoch of the form
             {"Epoch <k>": {metric: value}}, restricted to params.metrics.
    """

    val_history = []
    train_history = []
    loader = DataLoader(QueryGroupedLTRData(data, "train"), batch_size=1, shuffle=True, collate_fn=qg_collate_fn)
    optimiser = Adam(net.parameters(), lr=params.lr)
    net.train()
    for epoch in range(params.epochs):
        for (query_ids, feature_list, label_list) in loader:
            for _, doc_features, doc_labels in zip(query_ids, feature_list, label_list):
                optimiser.zero_grad()
                doc_scores = net(doc_features)
                score_grads = listwise_loss(doc_scores, doc_labels)
                torch.autograd.backward(doc_scores, score_grads)
                optimiser.step()

        tag = f'Epoch {epoch+1}'
        # Train first, then validation -- same evaluation order every epoch.
        for split_name, history in (("train", train_history), ("validation", val_history)):
            all_metrics = evaluate_model(net, split_name, print_results=False)
            kept = {k: v for (k, v) in all_metrics.items() if k in params.metrics}
            history.append({tag: kept})

    return {
        "metrics_val": val_history,
        "metrics_train": train_history
    }

In [None]:
# Single-epoch test run of the LambdaRank trainer, reporting only "ndcg".
listwise_params_test = Namespace(epochs=1, lr=1e-3, batch_size=1, metrics={"ndcg"})
# Fresh single-output scoring network for this run.
listwise_net = NeuralModule(1)
# The returned metrics dictionary is not stored here.
train_listwise(listwise_net, listwise_params_test)

