# <i>CatBoost learning to rank</i>

In [43]:
from collections import Counter
from copy import deepcopy
import os

from catboost import CatBoostRanker, Pool, MetricVisualizer
import numpy as np
import pandas as pd

In [44]:
# training dataset
# Every line is read as: <label> <index>:<value> ... <skipped token> <query id>

X_train, y_train, queries_train = [], [], []

with open('imat2009_train_new.txt') as file:
    for line in file:
        features = np.zeros(245)
        tokens = line.split()
        # first token is the relevance label, last token is the query id
        y_train.append(float(tokens[0]))
        queries_train.append(int(tokens[-1]))
        # the slice stops two tokens before the end, so the token right before
        # the query id is never parsed (presumably a separator; confirm against the data file)
        for pair in tokens[1:-2]:
            parts = pair.split(':')
            # feature indices in the file are 1-based
            column = int(parts[0]) - 1
            features[column] = float(parts[1])
        X_train.append(features)

# convert the accumulated Python lists into numpy arrays
X_train, y_train, queries_train = (
    np.array(X_train),
    np.array(y_train),
    np.array(queries_train),
)

In [45]:
# test dataset
# Every line is read as: <label> <index>:<value> ... <skipped token> <query id>

X_test, y_test, queries_test = [], [], []

with open('imat2009_test_new.txt') as file:
    for line in file:
        features = np.zeros(245)
        tokens = line.split()
        # first token is the relevance label, last token is the query id
        y_test.append(float(tokens[0]))
        queries_test.append(int(tokens[-1]))
        # the slice stops two tokens before the end, so the token right before
        # the query id is never parsed (presumably a separator; confirm against the data file)
        for pair in tokens[1:-2]:
            parts = pair.split(':')
            # feature indices in the file are 1-based
            column = int(parts[0]) - 1
            features[column] = float(parts[1])
        X_test.append(features)

# convert the accumulated Python lists into numpy arrays
X_test, y_test, queries_test = (
    np.array(X_test),
    np.array(y_test),
    np.array(queries_test),
)

### Dataset analysis

__Number of documents__

In [46]:
# one row of X_train per document
num_documents = len(X_train)
print(num_documents)

77714


__Number of features__

In [47]:
# number of columns in the training feature matrix
X_train.shape[1]

245

In [48]:
# Distribution of relevance labels in the training set: (label, number of documents) pairs.
Counter(y_train).items()

dict_items([(1.0, 20086), (0.0, 25776), (2.0, 24424), (4.0, 952), (3.0, 1744), (0.5, 1982), (1.5, 1033), (0.25, 77), (1.33333, 110), (1.2, 3), (2.37037, 39), (0.666671, 340), (2.33333, 79), (0.333329, 268), (2.16049, 19), (2.5, 337), (2.87037, 26), (1.66667, 107), (2.12037, 4), (2.25, 19), (2.24074, 25), (0.2, 10), (1.6, 6), (0.8, 5), (0.6, 10), (0.875, 1), (2.66667, 31), (3.1625, 2), (1.75, 12), (0.75, 55), (2.61111, 4), (0.222229, 1), (0.4, 5), (1.25, 23), (1.97143, 2), (3.5, 16), (2.24691, 10), (2.16667, 1), (1.95239, 1), (1.4, 4), (3.66667, 5), (3.8, 2), (0.125, 1), (2.05556, 2), (3.33333, 4), (2.2, 5), (2.58025, 2), (1.16667, 2), (2.91358, 1), (2.07407, 3), (2.11729, 1), (3.25, 1), (2.375, 1), (3.21666, 1), (2.74074, 5), (2.12346, 3), (0.166671, 8), (0.833329, 5), (1.14286, 1), (3.53, 1), (3.4, 1), (2.75, 1), (3.58125, 1), (2.40741, 1), (0.583329, 1), (1.8, 1), (2.42857, 1), (2.0463, 1), (1.77143, 1), (3.75, 1), (0.888886, 1)])

In [49]:
# Scale the labels in place by the largest training label.
# The test labels are divided by the same training maximum.
max_relevance = y_train.max()
for labels in (y_train, y_test):
    labels /= max_relevance

__Number of queries__

In [50]:
# count the distinct query ids present in the training set
num_queries = len(np.unique(queries_train))
num_queries

7300

### Creation of CatBoost pools

In [51]:
# Wrap features, labels and per-document query ids (passed as group_id) into CatBoost pools.
train = Pool(data=X_train, label=y_train, group_id=queries_train)
test = Pool(data=X_test, label=y_test, group_id=queries_test)

In [52]:
# Baseline settings shared by every run; fit_model deep-copies this dict
# before adding the loss function and any per-run overrides.
default_parameters = dict(
    iterations=2000,
    custom_metric=['NDCG'],
    verbose=False,
    random_seed=0,
)

# Module-level placeholder; fit_model builds its own local `parameters`,
# and no visible cell reads this one.
parameters = dict()

In [53]:
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    """Train a CatBoostRanker with the given loss and return the fitted model.

    Parameters
    ----------
    loss_function : str
        Stored under 'loss_function' and also used as the default 'train_dir'.
    additional_params : dict, optional
        Extra settings applied last, so they override both `default_parameters`
        and the 'loss_function' / 'train_dir' values set here.
    train_pool, test_pool
        Pools passed to `fit` as the training data and `eval_set`. The defaults
        are the `train` / `test` objects that existed when this cell was run,
        because Python evaluates default arguments once, at definition time.

    Returns
    -------
    The fitted CatBoostRanker instance.
    """
    overrides = {} if additional_params is None else additional_params

    # start from a private copy so the shared defaults are never mutated
    ranker_params = deepcopy(default_parameters)
    ranker_params.update(loss_function=loss_function, train_dir=loss_function)
    ranker_params.update(overrides)

    ranker = CatBoostRanker(**ranker_params)
    ranker.fit(train_pool, eval_set=test_pool, plot=True)
    return ranker

In [54]:
# Baseline run: YetiRank loss with the default parameters (train_dir defaults to 'YetiRank').
model = fit_model('YetiRank')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [56]:
# Same setup with the PairLogit loss (train_dir defaults to 'PairLogit').
model2 = fit_model('PairLogit')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [57]:
# best_score_ of the default PairLogit run
model2.best_score_

{'learn': {'PairLogit': 0.31398503926498234},
 'validation': {'NDCG:type=Base': 0.8943728100892233,
  'PairLogit': 0.5564476911949912}}

In [58]:
# YetiRank with learning_rate=0.3; train_dir is overridden to 'YetiRank-lr-0.3'.
model3 = fit_model('YetiRank', {'train_dir': 'YetiRank-lr-0.3', 'learning_rate': 0.3})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [60]:
# YetiRank again, this time with metric_period=50.
# NOTE(review): no train_dir override, so fit_model falls back to 'YetiRank',
# the same directory the first YetiRank run used. Confirm that reusing it is intended.
model4 = fit_model('YetiRank', {'metric_period': 50})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [62]:
# Visualize the runs logged under the 'YetiRank' and 'YetiRank-lr-0.3' train directories together.
widget = MetricVisualizer(['YetiRank', 'YetiRank-lr-0.3'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [63]:
# PairLogit with learning_rate=0.3; train_dir is overridden to 'PairLogit-lr-0.3'.
model5 = fit_model('PairLogit', {'train_dir': 'PairLogit-lr-0.3', 'learning_rate': 0.3})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [64]:
# best_score_ of the PairLogit run trained with learning_rate=0.3
model5.best_score_

{'learn': {'PairLogit': 0.05314590192738986},
 'validation': {'NDCG:type=Base': 0.8894676950935261,
  'PairLogit': 0.5409311648181809}}

In [65]:
# PairLogit again, this time with metric_period=50.
# NOTE(review): no train_dir override, so fit_model falls back to 'PairLogit',
# the same directory the first PairLogit run used. Confirm that reusing it is intended.
model6 = fit_model('PairLogit', {'metric_period': 50})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [66]:
# best_score_ of the PairLogit run trained with metric_period=50
model6.best_score_

{'learn': {'PairLogit': 0.31398503926498234},
 'validation': {'NDCG:type=Base': 0.8943368098775238,
  'PairLogit': 0.5564569694482013}}

In [67]:
# Visualize the runs logged under the 'PairLogit' and 'PairLogit-lr-0.3' train directories together.
widget = MetricVisualizer(['PairLogit', 'PairLogit-lr-0.3'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [68]:
# Visualize the two loss functions side by side, using the 'PairLogit' and 'YetiRank' train directories.
widget = MetricVisualizer(['PairLogit', 'YetiRank'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))