# RankNet

In [1]:
import torch

In [2]:
%%latex
$$C_{ij}=C(o_{ij})=-\bar{P}_{ij}\log(P_{ij})-(1-\bar{P}_{ij})\log(1-P_{ij})$$

<IPython.core.display.Latex object>

In [3]:
%%latex
$$o_{ij}=f(x_i)-f(x_j)$$

<IPython.core.display.Latex object>

In [4]:
%%latex
$$P_{ij}=\frac{e^{o_{ij}}}{1+e^{o_{ij}}}$$

<IPython.core.display.Latex object>

In [5]:
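# torch.sigmoid() computes exactly this P_ij: e^o / (1 + e^o) == 1 / (1 + e^(-o)), the formula shown in the next cell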

In [6]:
%%latex
$$\text{out}_{i} = \frac{1}{1 + e^{-\text{input}_{i}}}$$

<IPython.core.display.Latex object>

In [7]:
class RankNet(torch.nn.Module):
    """Pairwise ranking model built around one shared scoring network.

    Each item is scored by the same small MLP; ``forward`` turns the score
    difference of a pair into a value in (0, 1) with a sigmoid.
    """

    def __init__(self, num_input_features, hidden_dim=10):
        """Build the scoring MLP.

        Args:
            num_input_features: size of every input feature vector.
            hidden_dim: width of the single hidden layer.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        scorer_layers = [
            torch.nn.Linear(num_input_features, hidden_dim),  # input space -> hidden layer
            torch.nn.ReLU(),  # non-linearity
            torch.nn.Linear(hidden_dim, 1),  # hidden layer -> one scalar score
        ]
        self.model = torch.nn.Sequential(*scorer_layers)

        self.out_activation = torch.nn.Sigmoid()

    def predict(self, inp):
        """Return the raw (pre-sigmoid) scores of the shared network for ``inp``."""
        return self.model(inp)

    def forward(self, input_1, input_2):
        """Return ``sigmoid(score(input_1) - score(input_2))`` for each pair."""
        score_gap = self.predict(input_1) - self.predict(input_2)
        return self.out_activation(score_gap)

In [8]:
# Pairwise ranker over 10-dimensional feature vectors; hidden_dim keeps its default.
ranknet_model = RankNet(num_input_features=10)

In [9]:
# Two random batches of items; row k of inp_1 is compared with row k of inp_2.
inp_1, inp_2 = torch.rand(4, 10), torch.rand(4, 10)
# shape of each: batch_size x input_dim
inp_2

tensor([[0.8081, 0.6449, 0.9244, 0.7282, 0.9149, 0.9803, 0.3511, 0.9423, 0.3800,
         0.4350],
        [0.1026, 0.6526, 0.8542, 0.8848, 0.3349, 0.2752, 0.2069, 0.9747, 0.9119,
         0.8857],
        [0.4068, 0.6438, 0.6348, 0.3660, 0.2397, 0.8409, 0.7032, 0.0230, 0.3923,
         0.6201],
        [0.8944, 0.6438, 0.0946, 0.9344, 0.3438, 0.7913, 0.8423, 0.0804, 0.1115,
         0.9901]])

In [10]:
# probability that each item of input 1 is more relevant than the matching item of input 2
preds = ranknet_model(inp_1, inp_2)
preds

tensor([[0.4973],
        [0.4853],
        [0.4976],
        [0.5010]], grad_fn=<SigmoidBackward>)

In [11]:
# Element 0 of the Sequential is the first Linear layer; kept to inspect its gradients below.
first_linear_layer = ranknet_model.model[0]

In [12]:
# the gradients of the network's first layer are not defined yet (no backward pass has run)
first_linear_layer.weight.grad

In [13]:
# loss function: binary cross-entropy
criterion = torch.nn.BCELoss()
loss = criterion(preds, torch.ones_like(preds)) # assume every target is 1, i.e. item i is always more relevant than item j
loss.backward() # compute the gradients

In [14]:
first_linear_layer.weight.grad
# following these gradients we update the network's weights so that its outputs move closer to 1

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.0074,  0.0017,  0.0019, -0.0008,  0.0056,  0.0014,  0.0079, -0.0023,
         -0.0013, -0.0011],
        [ 0.0083,  0.0016, -0.0022, -0.0087,  0.0055, -0.0035,  0.0068, -0.0033,
         -0.0035, -0.0072],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.0300,  0.0193,  0.0107, -0.0103,  0.0253, -0.0059,  0.0270,  0.0096,
          0.0075, -0.0054],
        [-0.0037, -0.0233, -0.0306, -0.0317, -0.0120, -0.0098, -0.0074, -0.0349,
         -0.0326, -0.0317],
        [-0.0010,  0.0019,  0.0022,  0.0053, -0.0019,  0.0039, -0.0020,  0.0018,
          0.0018,  0.0043],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.0000,  

In [15]:
# Clear the gradients accumulated on the model's parameters by the backward pass above.
ranknet_model.zero_grad()

# ListNet

In [16]:
from itertools import combinations
import numpy as np

from lecture2_task import ndcg, num_swapped_pairs

In [17]:
class ListNet(torch.nn.Module):
    """Listwise ranking model: a small MLP giving one raw score per item."""

    def __init__(self, num_input_features, hidden_dim=10):
        """Build the scoring MLP.

        Args:
            num_input_features: size of every input feature vector.
            hidden_dim: width of the single hidden layer.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        scorer_layers = [
            torch.nn.Linear(num_input_features, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 1),
        ]
        self.model = torch.nn.Sequential(*scorer_layers)

    def forward(self, input_1):
        """Return the raw scores of the network for ``input_1`` (no normalisation applied)."""
        return self.model(input_1)


In [18]:
%%latex
$$CE = -\sum_{j=1}^{N} P_y^i(j) \cdot \log(P_z^i(j))$$

<IPython.core.display.Latex object>

In [19]:
%%latex
$$\text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$$

<IPython.core.display.Latex object>

In [20]:
def listnet_ce_loss(y_i, z_i):
    """Cross-entropy between the softmax distributions of labels and predictions.

    Both arguments are raw (unnormalised) scores; each is turned into a
    distribution over the list with a softmax along dim 0.

    Args:
        y_i: (n_i, 1) ground-truth scores.
        z_i: (n_i, 1) predicted scores.

    Returns:
        Scalar tensor ``-sum(P_y * log(P_z))``.
    """
    P_y_i = torch.softmax(y_i, dim=0)  # dim=0: normalise within each column
    # log_softmax rather than log(softmax(...)): for widely spread scores the
    # softmax underflows to 0 and its log becomes -inf, which made the loss inf/nan.
    log_P_z_i = torch.log_softmax(z_i, dim=0)
    return -torch.sum(P_y_i * log_P_z_i)

def listnet_kl_loss(y_i, z_i):
    """KL divergence from the predicted to the label softmax distribution.

    Both arguments are raw (unnormalised) scores; each is turned into a
    distribution over the list with a softmax along dim 0.

    Args:
        y_i: (n_i, 1) ground-truth scores.
        z_i: (n_i, 1) predicted scores.

    Returns:
        Scalar tensor ``sum(P_y * (log P_y - log P_z))``, which equals
        ``-sum(P_y * log(P_z / P_y))``.
    """
    # Work in log-space: forming the ratio P_z / P_y and then taking its log
    # yields -inf (or 0/0 = nan) as soon as a softmax entry underflows to 0.
    log_P_y_i = torch.log_softmax(y_i, dim=0)
    log_P_z_i = torch.log_softmax(z_i, dim=0)
    P_y_i = torch.softmax(y_i, dim=0)
    return torch.sum(P_y_i * (log_P_y_i - log_P_z_i))


def make_dataset(N_train, N_valid, vector_dim, bins=(-1, 0, 1, 2)):
    """Generate a synthetic ranking dataset with graded relevance labels.

    Features are standard-normal noise. A hidden random linear model plus
    unit Gaussian noise gives every item a real-valued score, which is then
    bucketed into integer relevance grades with ``np.digitize``.

    Args:
        N_train: number of training items.
        N_valid: number of validation items.
        vector_dim: number of features per item.
        bins: increasing score boundaries; ``len(bins) + 1`` relevance grades
            are produced. The default gives 5 grades, ``(-1, 1)`` gives 3.

    Returns:
        ``(X_train, X_valid, ys_train_rel, ys_valid_rel)``: feature tensors of
        shape (N, vector_dim) and label tensors of shape (N, 1) holding the
        grades as floats.
    """
    boundaries = list(bins)
    fake_weights = torch.randn(vector_dim, 1)

    X_train = torch.randn(N_train, vector_dim)
    X_valid = torch.randn(N_valid, vector_dim)

    def _relevance(X):
        # Matrix product of the random features with the hidden weights ...
        scores = torch.mm(X, fake_weights)
        # ... plus label noise, so the labels are not an exact function of X.
        scores = scores + torch.randn_like(scores)
        grades = np.digitize(scores.numpy(), bins=boundaries)
        return torch.as_tensor(grades, dtype=torch.get_default_dtype())

    # Train labels are drawn before validation labels to keep the RNG call order stable.
    ys_train_rel = _relevance(X_train)
    ys_valid_rel = _relevance(X_valid)

    return X_train, X_valid, ys_train_rel, ys_valid_rel

In [21]:
# Seed torch's global RNG so the synthetic data, the weight initialisation and
# the per-epoch shuffling are reproducible under Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

N_train = 1000  # number of training items
N_valid = 500  # number of validation items

vector_dim = 100  # features per item
epochs = 2  # passes over the training set

batch_size = 16  # items per list/batch fed to the listwise loss

X_train, X_valid, ys_train, ys_valid = make_dataset(N_train, N_valid, vector_dim)

net = ListNet(num_input_features=vector_dim)
opt = torch.optim.Adam(net.parameters())  # Adam with its default hyper-parameters


In [22]:
# Distinct relevance grades present in the training labels.
torch.unique(ys_train)

tensor([0., 1., 2., 3., 4.])

In [23]:
for epoch in range(epochs):
    # Reshuffle so that the first batch of one epoch differs from the first batch of the next.
    idx = torch.randperm(N_train)

    X_train = X_train[idx]
    ys_train = ys_train[idx]

    cur_batch = 0
    # Only full batches are visited: the trailing N_train % batch_size items of
    # the shuffled data are skipped in this epoch, so every slice below is non-empty.
    for it in range(N_train // batch_size):
        batch_X = X_train[cur_batch: cur_batch + batch_size]
        batch_ys = ys_train[cur_batch: cur_batch + batch_size]
        cur_batch += batch_size

        opt.zero_grad()  # drop gradients left over from the previous step
        batch_pred = net(batch_X)
        # Alternative objective: listnet_ce_loss(batch_ys, batch_pred)
        batch_loss = listnet_kl_loss(batch_ys, batch_pred)
        # Every forward pass builds a fresh autograd graph, so nothing needs to
        # be retained after backward(); retain_graph=True only wasted memory.
        batch_loss.backward()  # compute the gradients
        opt.step()  # take one optimiser step

        if it % 10 == 0:
            # Periodic validation with gradient tracking disabled; both metrics
            # come from the lecture2_task helpers imported earlier.
            with torch.no_grad():
                valid_pred = net(X_valid)
                valid_swapped_pairs = num_swapped_pairs(ys_valid, valid_pred)
                ndcg_score = ndcg(ys_valid, valid_pred)
            print(f"epoch: {epoch + 1}.\tNumber of swapped pairs: "
                  f"{valid_swapped_pairs}/{N_valid * (N_valid - 1) // 2}\t"
                  f"nDCG: {ndcg_score:.4f}")

epoch: 1.	Number of swapped pairs: 40090/124750	nDCG: 0.8440
epoch: 1.	Number of swapped pairs: 37423/124750	nDCG: 0.8625
epoch: 1.	Number of swapped pairs: 34113/124750	nDCG: 0.8813
epoch: 1.	Number of swapped pairs: 31067/124750	nDCG: 0.8976
epoch: 1.	Number of swapped pairs: 28197/124750	nDCG: 0.9128
epoch: 1.	Number of swapped pairs: 25247/124750	nDCG: 0.9255
epoch: 1.	Number of swapped pairs: 22497/124750	nDCG: 0.9368
epoch: 2.	Number of swapped pairs: 22011/124750	nDCG: 0.9389
epoch: 2.	Number of swapped pairs: 19773/124750	nDCG: 0.9477
epoch: 2.	Number of swapped pairs: 17878/124750	nDCG: 0.9554
epoch: 2.	Number of swapped pairs: 16137/124750	nDCG: 0.9613
epoch: 2.	Number of swapped pairs: 14619/124750	nDCG: 0.9669
epoch: 2.	Number of swapped pairs: 13341/124750	nDCG: 0.9709
epoch: 2.	Number of swapped pairs: 12165/124750	nDCG: 0.9740
