# 1. Neural Network Training

In [32]:
import sys

if ".." not in sys.path:
    sys.path.insert(0, "..")

import torch
import torch.nn as nn
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer

from RankingAlgorithms.neuralnetwork import NeuralNetwork
from RankingAlgorithms.pwsvm import RankSVM
from DataHandling.train_data import load_data

### 1.1. Load training and test data

In [33]:
dic = {
    "url_bm25": 108,
    "url_idf": 18,
    "url_vsm": 103,
    "url_covered_query_term_number": 3,
    "url_query_term_ratio": 8,
    "url_stream_length": 13,
    "url_n_slash": 125,
    "url_len_url": 126,
    "title_bm25": 107,
    "title_idf": 17,
    "title_vsm": 102,
    "title_covered_query_term_number": 2,
    "title_query_term_ratio": 7,
    "title_stream_length": 12,
    "body_bm25": 105,
    "body_idf": 15,
    "body_vsm": 100,
    "body_covered_query_term_number": 0,
    "body_query_term_ratio": 5,
    "body_stream_length": 10,
}

In [34]:
feature_indices = [108, 103, 3, 8, 107, 102, 2, 7, 105, 100, 0, 5]

In [35]:
# load training data
X_train, y_train = load_data(
    path="../../../data/MSLR-WEB10K/Fold1/train.txt",
    nrows=20000,
    feature_indices=feature_indices,
)
# X_test, y_test = load_data(path="../../../data/MSLR-WEB10K/Fold1/test.txt", nrows=10000)

In [36]:
print("label counts", np.unique(y_train, return_counts=True))
n_samples_per_class = np.unique(y_train, return_counts=True)[1][-1]

label counts (array([0., 1., 2., 3., 4.]), array([11633,  5644,  2354,   267,   102]))


In [37]:
# balance dataset
indices = []
for label in range(5):
    indices.append(
        list(
            np.random.choice(
                np.where(y_train == label)[0], n_samples_per_class, replace=False
            )
        )
    )

indices = np.array(indices).flatten()

X_cut = X_train[indices, :]
y_cut = y_train[indices]

print(X_cut.shape, y_cut.shape)
print("label counts: ", np.unique(y_cut, return_counts=True))

(510, 12) (510,)
label counts:  (array([0., 1., 2., 3., 4.]), array([102, 102, 102, 102, 102]))


In [38]:
y_trans = torch.zeros(len(y_cut), 5)
for i, label in enumerate(y_cut):
    y_trans[i, 0 : int(label) + 1] = 1

### 1.2. Create model

In [39]:
model = NeuralNetwork(n_features=len(feature_indices), n_hidden=10, load=False)

In [40]:
### define optimizer and loss fct
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
loss = nn.CrossEntropyLoss()

### 1.3. Train Model

In [41]:
for epoch in range(100):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loss = model.train_loop(torch.Tensor(X_cut), y_trans, loss, optimizer)

Epoch 1
-------------------------------
loss: 5.053125  [   32/  510]
Epoch 2
-------------------------------
loss: 4.493237  [   32/  510]
Epoch 3
-------------------------------
loss: 5.383163  [   32/  510]
Epoch 4
-------------------------------
loss: 4.882262  [   32/  510]
Epoch 5
-------------------------------
loss: 4.678925  [   32/  510]
Epoch 6
-------------------------------
loss: 5.126726  [   32/  510]
Epoch 7
-------------------------------
loss: 4.173510  [   32/  510]
Epoch 8
-------------------------------
loss: 4.921635  [   32/  510]
Epoch 9
-------------------------------
loss: 5.885022  [   32/  510]
Epoch 10
-------------------------------
loss: 4.118716  [   32/  510]
Epoch 11
-------------------------------
loss: 5.073897  [   32/  510]
Epoch 12
-------------------------------
loss: 5.326819  [   32/  510]
Epoch 13
-------------------------------
loss: 5.017152  [   32/  510]
Epoch 14
-------------------------------
loss: 4.954721  [   32/  510]
Epoch 15
------

In [42]:
y_pred = model.evaluate(torch.Tensor(X_cut))
print("accuracy: ", np.sum(np.array(y_pred) == np.array(y_cut)) / len(y_cut))

accuracy:  0.2


In [43]:
np.unique(np.array(y_pred) == np.array(y_cut), return_counts=True)

(array([False,  True]), array([408, 102]))

In [44]:
np.unique(y_pred, return_counts=True)

(array([0]), array([510]))

### 1.4. Save Model

In [45]:
# save mdoel
model.save("../models/nn.pth")

# 2. Pairwise SVM Training

In [46]:
print(y_cut.shape)
svm = RankSVM(load=False)
svm.fit(X_cut, y_cut)

(510,)
n_samples after pairwise transform  208080




In [47]:
svm.model.coef_

array([[ 1.95995110e-02,  5.48621345e-01, -2.76093827e-01,
        -5.36713077e-02,  3.12683148e-02, -9.44029517e-01,
         3.53660647e-01,  2.08883665e-01, -9.03992095e-04,
         3.09513406e-01, -1.84403838e-01, -4.18891355e-01]])

In [53]:
preds = svm.predict(X_cut)
svm.save("../models/ranksvm.pkl")
preds

array([ 1.38803354e-01, -4.14875045e-01, -1.43879255e-01,  5.41011609e-01,
       -2.38905598e-01,  4.96032265e-01, -9.49175300e-02, -5.16240497e-01,
       -3.94080586e-01,  3.55587383e-01, -6.74226052e-01, -3.20007929e-01,
       -2.68118622e-01, -7.30816202e-01, -4.35178722e-01, -5.93375223e-01,
       -5.07496430e-01, -3.40385826e-01, -1.04951655e-02, -3.74342423e-01,
       -7.20876588e-02, -5.53036042e-01,  0.00000000e+00, -1.32677315e+00,
       -1.65350843e-01, -3.37485859e-01, -6.44301903e-01, -2.52630959e-01,
       -1.06600078e-01, -3.76867047e-01,  1.76459791e-01, -4.46861618e-01,
        2.17634220e-01,  0.00000000e+00, -6.68662784e-01, -3.71070175e-01,
        3.97204229e-01,  2.13293289e-01,  4.23715351e-01,  6.46698084e-01,
       -7.29333630e-02,  0.00000000e+00,  9.05004644e-02, -6.95308835e-01,
       -4.00534267e-01, -6.31000720e-01,  1.67356641e-01, -6.86820252e-01,
        3.55058593e-02, -4.91707362e-01,  6.36003950e-01, -4.93488122e-01,
       -1.20811576e-01, -

# 3. TF-IDF

In [54]:
# load data
titles = np.load("../data/titles.npy", allow_pickle=True)
bodies = np.load("../data/bodies.npy", allow_pickle=True)
urls = np.load("../data/urls.npy", allow_pickle=True)

In [55]:
# fit vectorizer
vectorizer_body = TfidfVectorizer().fit(bodies)
vectorizer_title = TfidfVectorizer().fit(titles)
vectorizer_url = TfidfVectorizer().fit(urls)

In [56]:
# save models
import pickle

with open("../models/vectorizer_body.pkl", "wb") as f:
    pickle.dump(vectorizer_body, f)
with open("../models/vectorizer_title.pkl", "wb") as f:
    pickle.dump(vectorizer_title, f)
with open("../models/vectorizer_url.pkl", "wb") as f:
    pickle.dump(vectorizer_url, f)

In [57]:
title_embedding = vectorizer_title.transform(titles)
body_embedding = vectorizer_body.transform(bodies)
url_embedding = vectorizer_url.transform(urls)

In [58]:
sp.save_npz("../data/title_embedding.npz", sp.csr_matrix(title_embedding))
sp.save_npz("../data/body_embedding.npz", sp.csr_matrix(body_embedding))
sp.save_npz("../data/url_embedding.npz", sp.csr_matrix(url_embedding))