# 1. Neural Network Training

In [1]:
import sys
if ".." not in sys.path:
    sys.path.insert(0, "..")

import torch
import torch.nn as nn
import numpy as np
from joblib import dump
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ndcg_score

from RankingAlgorithms.neuralnetwork import NeuralNetwork
from RankingAlgorithms.pwsvm import RankSVM
from DataHandling.train_data import load_data


### 1.1. Load training and test data

In [2]:
dic = {'url_bm25': 108, 'url_idf': 18, 'url_vsm': 103,
       'url_covered_query_term_number': 3, 'url_query_term_ratio': 8, 'url_stream_length': 13, 'url_n_slash':125, 'url_len_url': 126,
       'title_bm25': 107, 'title_idf': 17, 'title_vsm': 102, 
       'title_covered_query_term_number': 2, 'title_query_term_ratio': 7, 'title_stream_length': 12,
       'body_bm25': 105, 'body_idf': 15, 'body_vsm': 100, 'body_covered_query_term_number': 0, 'body_query_term_ratio': 5, 'body_stream_length': 10}

In [3]:
feature_indices = [108, 18, 103, 3, 8, 13, 125, 126, 107, 17, 102, 2, 7, 12, 105, 15, 100, 0, 5, 10]
feature_indices = [108, 18, 103, 3, 8, 126, 107, 17, 102, 2, 7, 12, 105, 15, 100, 0, 5, 10]
feature_indices = [108, 103, 3, 8, 107, 102, 2, 7, 105, 100, 0, 5]

In [4]:
# load training data
X_train, y_train = load_data(path="../../../data/MSLR-WEB10K/Fold1/train.txt", nrows=20000, feature_indices=feature_indices)
#X_test, y_test = load_data(path="../../../data/MSLR-WEB10K/Fold1/test.txt", nrows=10000)


In [5]:
print('label counts', np.unique(y_train, return_counts=True))
n_samples_per_class = np.unique(y_train, return_counts=True)[1][-1]

label counts (array([0., 1., 2., 3., 4.]), array([11633,  5644,  2354,   267,   102]))


In [6]:
# balance dataset
indices = []
for label in range(5):
    indices.append(list(np.random.choice(np.where(y_train == label)[0], n_samples_per_class, replace=False)))
    
indices = np.array(indices).flatten()

X_cut = X_train[indices, :]
y_cut = y_train[indices]

print(X_cut.shape, y_cut.shape)
print('label counts: ', np.unique(y_cut, return_counts=True))

(510, 12) (510,)
label counts:  (array([0., 1., 2., 3., 4.]), array([102, 102, 102, 102, 102]))


In [7]:
y_trans = torch.zeros(len(y_cut), 5)
for i, label in enumerate(y_cut):
    y_trans[i, 0:int(label)+1] = 1 


### 1.2. Create model

In [8]:
model = NeuralNetwork(n_features=len(feature_indices), n_hidden=10, load=False)

In [9]:
### define optimizer and loss fct
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
loss = nn.CrossEntropyLoss()

### 1.3. Train Model

In [10]:
for epoch in range(100):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loss = model.train_loop(torch.Tensor(X_cut), y_trans, loss, optimizer)



Epoch 1
-------------------------------
loss: 5.088414  [   32/  510]
Epoch 2
-------------------------------
loss: 5.362634  [   32/  510]
Epoch 3
-------------------------------
loss: 4.770411  [   32/  510]
Epoch 4
-------------------------------
loss: 4.541931  [   32/  510]
Epoch 5
-------------------------------
loss: 4.392932  [   32/  510]
Epoch 6
-------------------------------
loss: 4.160458  [   32/  510]
Epoch 7
-------------------------------
loss: 4.487248  [   32/  510]
Epoch 8
-------------------------------
loss: 4.796001  [   32/  510]
Epoch 9
-------------------------------
loss: 4.272980  [   32/  510]
Epoch 10
-------------------------------
loss: 5.154402  [   32/  510]
Epoch 11
-------------------------------
loss: 4.523971  [   32/  510]
Epoch 12
-------------------------------
loss: 4.285190  [   32/  510]
Epoch 13
-------------------------------
loss: 4.731416  [   32/  510]
Epoch 14
-------------------------------
loss: 4.317109  [   32/  510]
Epoch 15
------

In [11]:
y_pred = model.evaluate(torch.Tensor(X_cut))
print('accuracy: ', np.sum(np.array(y_pred) == np.array(y_cut)) / len(y_cut))

accuracy:  0.2


In [12]:
np.unique(np.array(y_pred) == np.array(y_cut), return_counts=True)

(array([False,  True]), array([408, 102]))

In [13]:

np.unique(y_pred, return_counts=True)

(array([0]), array([510]))

In [14]:
ndcg = ndcg_score(y_cut.reshape(1, -1), y_pred.reshape(1, -1), k=20)
ndcg

0.49999999999999994

### 1.4. Save Model

In [15]:
# save mdoel
model.save("../models/nn.pth")

# Pairwise SVM Training

In [16]:
brek

NameError: name 'brek' is not defined

In [17]:
print(y_cut.shape)
svm = RankSVM(load=False)
svm.fit(X_cut, y_cut)

(510,)
n_samples after pairwise transform  208080




In [18]:
svm.model.coef_

array([[ 0.01502815,  0.32612297, -0.40985204, -0.18505744,  0.02152919,
        -0.95838216,  0.24080634,  0.52426169,  0.01029894,  0.07783305,
        -0.14635887, -0.21289091]])

In [19]:
preds = svm.predict(X_cut)
svm.save("../models/ranksvm.pkl")
preds

array([ 3.67700432e-01, -2.66612348e-01,  9.30984618e-02,  1.38551213e-01,
       -8.79487123e-02, -4.12904771e-01, -1.41773453e-01,  4.44130485e-01,
       -2.66880706e-03, -3.58039834e-01, -8.44252895e-01, -1.75034810e-01,
       -3.05181127e-01, -4.99293373e-02, -7.23188334e-01, -2.58073899e-02,
       -3.65801300e-01, -4.77305980e-01, -1.44553992e-02, -9.44672396e-02,
        2.73010049e-01, -3.21902281e-01,  5.20510365e-01,  6.79101748e-02,
       -2.29674146e-01, -7.11919955e-01,  0.00000000e+00,  1.16986746e-02,
       -3.03102930e-01,  0.00000000e+00, -4.82464392e-01, -1.15212065e-01,
        1.94942871e-02,  2.40949424e-02,  1.86273482e-01,  1.17665668e-01,
       -1.13881659e+00, -4.86174689e-02,  1.90833539e-02, -5.10502131e-02,
        4.64561930e-01, -8.01217767e-01, -2.75778584e-01, -9.56512820e-02,
       -2.98521939e-01, -2.22161531e-01, -2.76150975e-01, -1.91943354e-01,
       -4.29995385e-01, -4.55238784e-01,  8.28749414e-03,  4.42029925e-01,
        7.89158571e-02, -

In [20]:
ndcg = ndcg_score(y_cut.reshape(1, -1), preds.reshape(1, -1), k=10)
ndcg

0.589554438519665