# 1. Neural Network Training

In [1]:
import sys
if ".." not in sys.path:
    sys.path.insert(0, "..")

import torch
import torch.nn as nn
import numpy as np
from joblib import dump
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ndcg_score

from RankingAlgorithms.neuralnetwork import NeuralNetwork
from RankingAlgorithms.pwsvm import RankSVM
from DataHandling.train_data import load_data


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annavollweiter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1.1. Load training and test data

In [2]:
dic = {'url_bm25': 108, 'url_idf': 18, 'url_vsm': 103,
       'url_covered_query_term_number': 3, 'url_query_term_ratio': 8, 'url_stream_length': 13, 'url_n_slash':125, 'url_len_url': 126,
       'title_bm25': 107, 'title_idf': 17, 'title_vsm': 102, 
       'title_covered_query_term_number': 2, 'title_query_term_ratio': 7, 'title_stream_length': 12,
       'body_bm25': 105, 'body_idf': 15, 'body_vsm': 100, 'body_covered_query_term_number': 0, 'body_query_term_ratio': 5, 'body_stream_length': 10}

In [3]:
feature_indices = [108, 18, 103, 3, 8, 13, 125, 126, 107, 17, 102, 2, 7, 12, 105, 15, 100, 0, 5, 10]
feature_indices = [108, 18, 103, 3, 8, 126, 107, 17, 102, 2, 7, 12, 105, 15, 100, 0, 5, 10]
feature_indices = [108, 103, 3, 8, 107, 102, 2, 7, 105, 100, 0, 5]

In [4]:
# load training data
X_train, y_train = load_data(path="../../../data/MSLR-WEB10K/Fold1/train.txt", nrows=20000, feature_indices=feature_indices)
#X_test, y_test = load_data(path="../../../data/MSLR-WEB10K/Fold1/test.txt", nrows=10000)


In [5]:
print('label counts', np.unique(y_train, return_counts=True))
n_samples_per_class = np.unique(y_train, return_counts=True)[1][-1]

label counts (array([0., 1., 2., 3., 4.]), array([11633,  5644,  2354,   267,   102]))


In [6]:
# balance dataset
indices = []
for label in range(5):
    indices.append(list(np.random.choice(np.where(y_train == label)[0], n_samples_per_class, replace=False)))
    
indices = np.array(indices).flatten()

X_cut = X_train[indices, :]
y_cut = y_train[indices]

print(X_cut.shape, y_cut.shape)
print('label counts: ', np.unique(y_cut, return_counts=True))

(510, 12) (510,)
label counts:  (array([0., 1., 2., 3., 4.]), array([102, 102, 102, 102, 102]))


In [7]:
y_trans = torch.zeros(len(y_cut), 5)
for i, label in enumerate(y_cut):
    y_trans[i, 0:int(label)+1] = 1 


### 1.2. Create model

In [8]:
model = NeuralNetwork(n_features=len(feature_indices), n_hidden=10, load=False)

In [9]:
### define optimizer and loss fct
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
loss = nn.CrossEntropyLoss()

### 1.3. Train Model

In [10]:
for epoch in range(100):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loss = model.train_loop(torch.Tensor(X_cut), y_trans, loss, optimizer)



Epoch 1
-------------------------------
loss: 4.772689  [   32/  510]
Epoch 2
-------------------------------
loss: 4.251239  [   32/  510]
Epoch 3
-------------------------------
loss: 4.534404  [   32/  510]
Epoch 4
-------------------------------
loss: 5.540410  [   32/  510]
Epoch 5
-------------------------------
loss: 4.968125  [   32/  510]
Epoch 6
-------------------------------
loss: 5.109271  [   32/  510]
Epoch 7
-------------------------------
loss: 4.689440  [   32/  510]
Epoch 8
-------------------------------
loss: 5.048555  [   32/  510]
Epoch 9
-------------------------------
loss: 5.334731  [   32/  510]
Epoch 10
-------------------------------
loss: 4.624043  [   32/  510]
Epoch 11
-------------------------------
loss: 4.879647  [   32/  510]
Epoch 12
-------------------------------
loss: 4.397687  [   32/  510]
Epoch 13
-------------------------------
loss: 4.934406  [   32/  510]
Epoch 14
-------------------------------
loss: 4.886223  [   32/  510]
Epoch 15
------

In [11]:
y_pred = model.evaluate(torch.Tensor(X_cut))
print('accuracy: ', np.sum(np.array(y_pred) == np.array(y_cut)) / len(y_cut))

accuracy:  0.2


In [12]:
np.unique(np.array(y_pred) == np.array(y_cut), return_counts=True)

(array([False,  True]), array([408, 102]))

In [13]:

np.unique(y_pred, return_counts=True)

(array([0]), array([510]))

In [14]:
ndcg = ndcg_score(y_cut.reshape(1, -1), y_pred.reshape(1, -1), k=20)
ndcg

0.49999999999999994

### 1.4. Save Model

In [15]:
# save mdoel
model.save("../models/nn.pth")

# Pairwise SVM Training

In [None]:
print(y_cut.shape)
svm = RankSVM(load=False)
svm.fit(X_cut, y_cut)

(510,)
n_samples after pairwise transform  208080




In [None]:
svm.model.coef_

array([[ 0.02883423,  0.21695335, -0.26878167, -0.21214111,  0.02746978,
        -0.78000491,  0.318019  ,  0.03747703,  0.00310763,  0.30371981,
        -0.18882598, -0.16140359]])

In [None]:
preds = svm.predict(X_cut)
svm.save("../models/ranksvm.pkl")
preds

array([ 0.06808976, -0.09105113,  0.1003668 , -0.22147471,  0.06969051,
       -0.03106701, -0.19279618, -0.52288695, -0.31845989,  0.25137458,
        0.25528594,  0.        , -0.63044159,  0.09289614,  0.22631097,
       -0.32727074,  0.        , -0.03801811,  0.        ,  0.119675  ,
        0.        ,  0.0861692 , -0.13905482, -0.08857685, -0.05738022,
       -0.24088161, -0.14889187,  0.27764925, -0.24410914, -0.12673423,
       -0.2273243 ,  0.05447276, -0.24048315,  0.        , -0.11764717,
       -0.13684448,  0.        , -0.32744117,  0.15710861, -0.27015591,
       -0.38543318,  0.03556509, -0.40656977,  0.12848449,  0.53172015,
        0.01824406,  0.03129232,  0.07167787,  0.41373617, -0.39445903,
        0.26874884, -0.18107534, -0.00544969, -0.01074187,  0.25044873,
        0.49648469,  0.        , -0.14226012,  0.05542743,  0.10216847,
        0.        ,  0.        ,  0.        ,  0.        , -0.55837999,
       -0.16905079,  0.25386844, -0.23018993, -0.51728109, -0.16