In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from lambdaMART import LambdaMART

In [2]:
import warnings
# Suppress every warning (all categories, all modules) for the rest of the
# kernel session. NOTE(review): this also hides deprecation and numerical
# warnings raised by the cells below - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
def dcg(rel, k=None):
    """Discounted cumulative gain of relevance grades given in ranked order.

    The item at 1-based position ``i`` contributes
    ``(2**rel_i - 1) / log2(i + 1)``.

    Parameters
    ----------
    rel : array-like
        Relevance grades, ordered from the top-ranked item downwards. Any
        one-dimensional sequence NumPy can convert to floats is accepted
        (list, tuple, ndarray, pandas Series).
    k : int, optional
        Cut-off rank: only positions ``1..k`` contribute. ``None`` (default)
        uses every position.

    Returns
    -------
    float
        The (optionally truncated) DCG; ``0.0`` for an empty input.
    """
    # Coerce once to a float ndarray: plain Python sequences do not support
    # `2**rel`, and positional arithmetic must not depend on a pandas index.
    rel = np.asarray(rel, dtype=float)
    positions = np.arange(1, len(rel) + 1)
    gain = (2**rel - 1) / np.log2(positions + 1)
    if k is not None:
        # Keep only the first k ranked positions.
        gain = gain[positions <= k]
    return gain.sum()

In [4]:
def idcg(rel, k=None):
    """Ideal DCG: the DCG obtained when the grades are ranked best-first.

    Parameters
    ----------
    rel : array-like
        Relevance grades in any order; they are sorted in descending order
        before the gain is computed.
    k : int, optional
        Cut-off rank; only the first ``k`` positions of the ideal ordering
        contribute. ``None`` (default) uses every position.

    Returns
    -------
    float
        Sum of ``(2**rel_i - 1) / log2(i + 1)`` over the ideal ordering.
    """
    # np.sort is ascending, so reverse it to put the highest grade first.
    ideal_order = np.sort(rel)[::-1]
    ranks = np.arange(1, len(ideal_order) + 1)
    discounted = (2**ideal_order - 1) / np.log2(ranks + 1)
    if k is None:
        return discounted.sum()
    return discounted[ranks <= k].sum()

In [5]:
def ndcg(rel, k=None):
    """Normalised DCG: ``dcg(rel, k) / idcg(rel, k)``.

    Parameters
    ----------
    rel : array-like
        Relevance grades in ranked order (top-ranked item first).
    k : int, optional
        Cut-off rank forwarded to both ``dcg`` and ``idcg``.

    Returns
    -------
    float or int
        The ratio of actual to ideal DCG, or ``0`` when the ideal DCG is
        zero, which avoids a division by zero.
    """
    best_possible = idcg(rel, k=k)
    if best_possible == 0:
        # Ideal DCG is zero, so the ratio is undefined; report 0.
        return 0
    return dcg(rel, k=k) / best_possible

In [6]:
def ndcg_mean(res_table, k=None):
    """Average per-query NDCG over a table of ranked results.

    Parameters
    ----------
    res_table : pandas.DataFrame
        Must contain a ``'QueryId'`` column and a ``'rel'`` column. Within
        each query the rows must already be in ranked order (best first),
        because ``'rel'`` is passed to ``ndcg`` in row order.
    k : int, optional
        Cut-off rank forwarded to ``ndcg``.

    Returns
    -------
    float
        Sum of the per-query NDCG values divided by the number of queries.

    Raises
    ------
    ZeroDivisionError
        If ``res_table`` contains no queries.
    """
    # One grouped pass instead of building a full-table boolean mask per
    # query. sort=False keeps the groups in first-appearance order and the
    # rows inside each group in their original order.
    per_query = [
        ndcg(query_rel, k=k)
        for _, query_rel in res_table.groupby('QueryId', sort=False)['rel']
    ]
    return sum(per_query) / len(per_query)

In [7]:
# Read the training file in svmlight format. With query_id=True the loader
# returns three values: the feature matrix, the target values and the query
# id of every row.
df, rank, qid = load_svmlight_file('data/train.txt.gz', query_id = True)

In [8]:
# Read the test file with the same number of feature columns as the training
# matrix. Without n_features the width is inferred from the file itself, so a
# test file that never mentions the highest-numbered features would produce a
# narrower matrix than the one the model is trained on. If the test file
# contains MORE features than the training data, the loader raises ValueError
# here instead of failing later at prediction time.
df_test, rank_test, qid_test = load_svmlight_file(
    'data/test.txt.gz', query_id=True, n_features=df.shape[1]
)

In [9]:
# Report how many distinct query ids the training file contains.
print(f'Number of unique queries in the dataset: {len(np.unique(qid))}')

Number of unique queries in the dataset: 19944


In [10]:
# Keep only the rows that belong to a random subset of `sample_size` queries.
sample_size = 1000
sample_seed = 42
# A dedicated, seeded generator makes the subset identical on every
# Restart & Run All and leaves the global `random` state untouched.
sample_rng = random.Random(sample_seed)
sample_queries = sample_rng.sample(list(np.unique(qid)), sample_size)
# Row mask: True where the row's query id is one of the sampled ids. np.isin
# does this in one vectorised pass instead of one full-array comparison per
# sampled query.
qid_from_sample = np.isin(qid, sample_queries)
df_part = df[qid_from_sample]
rank_part = rank[qid_from_sample]
qid_part = qid[qid_from_sample]

In [11]:
# Split at query level (80% train / 20% validation) so that all rows of one
# query land on the same side. random_state pins the shuffle so the split,
# and every metric computed from it, is reproducible across runs.
train_idx, cv_idx = train_test_split(
    np.unique(qid_part), test_size=0.2, random_state=42
)

In [12]:
# Row mask over the sampled data: True where the row's query id was assigned
# to the training side. np.isin replaces the per-id OR loop with a single
# vectorised membership test and also handles an empty id list.
qid_has_train_idx = np.isin(qid_part, train_idx)

In [13]:
# Row mask over the sampled data: True where the row's query id was assigned
# to the validation side. np.isin replaces the per-id OR loop with a single
# vectorised membership test and also handles an empty id list.
qid_has_cv_idx = np.isin(qid_part, cv_idx)

In [14]:
# Apply the two row masks to the features, targets and query ids of the
# sampled data to obtain the training and validation subsets.
df_train, rank_train, qid_train = (
    part[qid_has_train_idx] for part in (df_part, rank_part, qid_part)
)

df_cv, rank_cv, qid_cv = (
    part[qid_has_cv_idx] for part in (df_part, rank_part, qid_part)
)

In [15]:
# Number of entries in the training target array (one per training row).
len(rank_train)

19442

In [16]:
# Configure the ranker and fit it on the training rows; the query ids are
# passed alongside the features and targets.
model = LambdaMART(
    num_trees=100,
    max_depth=4,
    learning_rate=0.125,
)
model.fit(df_train, rank_train, qid_train)

In [17]:
# Score every validation row with the fitted model.
# NOTE(review): the next cell sorts by the negated score, so a higher
# prediction is presumably "more relevant" - confirm against LambdaMART.
cv_predictions = model.predict(df_cv)

In [18]:
# One row per validation document, ordered by query and then by ascending
# negated score, i.e. best-scored document first within each query.
# DocumentId is the 1-based position of the row in the validation subset.
cv_res = pd.DataFrame({
    'neg_pred': -cv_predictions,
    'QueryId': qid_cv,
    'DocumentId': np.arange(1, len(qid_cv)+1),
    'rel': rank_cv,
}).sort_values(by=['QueryId', 'neg_pred'])

In [19]:
# Mean NDCG@5 over the validation queries, using the ranking order
# established by the sort in the previous cell.
ndcg_mean(cv_res, k=5)

0.6356269035048396

In [20]:
# Score every row of the test set with the fitted model.
predictions = model.predict(df_test)

In [21]:
# One row per test document. DocumentId is the 1-based position of the row
# in the test file; the score is stored negated so that an ascending sort
# puts the highest-scored document first.
res = pd.DataFrame({
    'neg_pred': -predictions,
    'QueryId': qid_test,
    'DocumentId': np.arange(1, len(qid_test)+1),
})

In [22]:
# Order rows by query id, then by ascending neg_pred within each query
# (equivalent to descending predicted score).
res = res.sort_values(by=['QueryId', 'neg_pred'])

In [23]:
# Write the ranking: only the QueryId and DocumentId columns, in the current
# (sorted) row order, without the DataFrame index.
res[['QueryId', 'DocumentId']].to_csv('ranking_result.csv', index=False)