In [1]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
import numpy as np
import os
import pandas as pd
import numpy as np

In [2]:
# Parse the training split. Each whitespace-separated line is read as:
#   token 0            -> relevance label (float)
#   tokens 1 .. -3     -> "<1-based feature index>:<value>" pairs
#   token -2           -> skipped by the [1:-2] slice
#   token -1           -> query id (int)
# Features absent from a line stay at 0 in the 245-wide dense row.
X_train, queries_train, y_train = [], [], []
with open('imat2009_new_split/imat2009_train_new.txt') as train_file:
  for raw_line in train_file:
    tokens = raw_line.split()
    queries_train.append(int(tokens[-1]))
    y_train.append(float(tokens[0]))
    dense_row = np.zeros(245)
    for pair in tokens[1:-2]:
      pieces = pair.split(":")
      value = float(pieces[1])
      # Indices in the file are 1-based; shift to 0-based columns.
      dense_row[int(pieces[0]) - 1] = value
    X_train.append(dense_row)

In [3]:
# Turn the accumulated Python lists into NumPy arrays (same names are rebound).
X_train, queries_train, y_train = (
    np.array(X_train),
    np.array(queries_train),
    np.array(y_train),
)

In [4]:
# Sanity check: (rows, feature columns) of the training feature matrix.
X_train.shape

(77714, 245)

In [5]:
# Sanity check: the label vector should have one entry per training row.
y_train.shape

(77714,)

In [6]:
# Parse the test split with the same layout as the training file:
#   token 0            -> relevance label (float)
#   tokens 1 .. -3     -> "<1-based feature index>:<value>" pairs
#   token -2           -> skipped by the [1:-2] slice
#   token -1           -> query id (int)
# Features absent from a line stay at 0 in the 245-wide dense row.
X_test, queries_test, y_test = [], [], []
with open('imat2009_new_split/imat2009_test_new.txt') as test_file:
  for raw_line in test_file:
    tokens = raw_line.split()
    queries_test.append(int(tokens[-1]))
    y_test.append(float(tokens[0]))
    dense_row = np.zeros(245)
    for pair in tokens[1:-2]:
      pieces = pair.split(":")
      value = float(pieces[1])
      # Indices in the file are 1-based; shift to 0-based columns.
      dense_row[int(pieces[0]) - 1] = value
    X_test.append(dense_row)

In [7]:
# Turn the accumulated Python lists into NumPy arrays (same names are rebound).
X_test, queries_test, y_test = (
    np.array(X_test),
    np.array(queries_test),
    np.array(y_test),
)

In [8]:
# Count how many training rows carry each distinct relevance label.
# Shown as (label, count) pairs; labels are not limited to integers.
from collections import Counter
Counter(y_train).items()

dict_items([(1.0, 20086), (0.0, 25776), (2.0, 24424), (4.0, 952), (3.0, 1744), (0.5, 1982), (1.5, 1033), (0.25, 77), (1.33333, 110), (1.2, 3), (2.37037, 39), (0.666671, 340), (2.33333, 79), (0.333329, 268), (2.16049, 19), (2.5, 337), (2.87037, 26), (1.66667, 107), (2.12037, 4), (2.25, 19), (2.24074, 25), (0.2, 10), (1.6, 6), (0.8, 5), (0.6, 10), (0.875, 1), (2.66667, 31), (3.1625, 2), (1.75, 12), (0.75, 55), (2.61111, 4), (0.222229, 1), (0.4, 5), (1.25, 23), (1.97143, 2), (3.5, 16), (2.24691, 10), (2.16667, 1), (1.95239, 1), (1.4, 4), (3.66667, 5), (3.8, 2), (0.125, 1), (2.05556, 2), (3.33333, 4), (2.2, 5), (2.58025, 2), (1.16667, 2), (2.91358, 1), (2.07407, 3), (2.11729, 1), (3.25, 1), (2.375, 1), (3.21666, 1), (2.74074, 5), (2.12346, 3), (0.166671, 8), (0.833329, 5), (1.14286, 1), (3.53, 1), (3.4, 1), (2.75, 1), (3.58125, 1), (2.40741, 1), (0.583329, 1), (1.8, 1), (2.42857, 1), (2.0463, 1), (1.77143, 1), (3.75, 1), (0.888886, 1)])

In [9]:
# Largest relevance label in the training split; used below as the scaling divisor.
max_relevance = y_train.max()
max_relevance

4.0

In [10]:
# Rescale train and test labels by the training maximum.
# Caution: the arrays are modified in place, so running this cell again
# without recomputing `max_relevance` divides the labels a second time.
for relevance in (y_train, y_test):
    relevance /= max_relevance

In [11]:
# Number of distinct query ids present in the training split.
num_queries = np.unique(queries_train).size
num_queries

7300

In [12]:
# Wrap features, labels and query ids into CatBoost pools; `group_id` tells
# CatBoost which rows belong to the same query.
# NOTE(review): CatBoost expects rows of one group to be contiguous — confirm
# the input files are ordered by query id.
train = Pool(data=X_train, label=y_train, group_id=queries_train)

test = Pool(data=X_test, label=y_test, group_id=queries_test)

In [13]:
# Baseline CatBoostRanker settings; `fit_model` copies these and layers the
# loss function and any per-experiment overrides on top.
default_parameters = dict(
    iterations=1000,
    custom_metric=['NDCG'],
    verbose=False,
    random_seed=0,
)

# Empty module-level dict; nothing in this cell populates it.
parameters = dict()

In [14]:
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    """Train a CatBoostRanker with the shared defaults plus experiment overrides.

    Args:
        loss_function: Value stored under both 'loss_function' and
            'train_dir', so each loss writes to its own directory unless
            overridden.
        additional_params: Optional overrides applied last; they win over
            the defaults and over the two keys derived from `loss_function`.
        train_pool: Pool to fit on. The default is the `train` object that
            existed when this `def` was executed (defaults bind at definition
            time), so re-run this cell after rebuilding `train`.
        test_pool: Pool passed as `eval_set`; same binding caveat as
            `train_pool`.

    Returns:
        The fitted CatBoostRanker.
    """
    # Deep copy so list values in the shared defaults are never aliased.
    parameters = dict(
        deepcopy(default_parameters),
        loss_function=loss_function,
        train_dir=loss_function,
    )

    if additional_params is not None:
        parameters.update(additional_params)

    ranker = CatBoostRanker(**parameters)
    # plot=True renders the interactive training chart in the notebook.
    ranker.fit(train_pool, eval_set=test_pool, plot=True)

    return ranker

In [17]:
# RMSE baseline; the default metric list is replaced with top-k ranking metrics.
model = fit_model(
    'RMSE',
    additional_params={
        'custom_metric': [
            'NDCG:top=1',
            'PrecisionAt:top=10',
            'RecallAt:top=10',
            'MAP:top=10',
        ],
    },
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [18]:
# Best recorded value of each tracked metric, grouped by dataset
# ('learn' / 'validation' in the output below).
model.best_score_

{'learn': {'MAP:top=10': 0.16913829345665216,
  'RecallAt:top=10': 0.9943030879410262,
  'PrecisionAt:top=10': 0.04481332898456182,
  'RMSE': 0.1515496882339609},
 'validation': {'MAP:top=10': 0.15770294964151993,
  'RecallAt:top=10': 0.9904017982110775,
  'PrecisionAt:top=10': 0.04668528961292121,
  'NDCG:top=1;type=Base': 0.7662902438040171,
  'RMSE': 0.1859884336544538}}

In [19]:
def create_weights(queries, rng=None):
    """Draw one random weight per query and broadcast it to that query's rows.

    Every distinct value in `queries` gets a single weight sampled uniformly
    from [0, 1); all rows sharing that query id receive the same weight.

    Args:
        queries: Array-like of query ids, one per row.
        rng: Optional random source exposing `uniform(size=...)`, e.g.
            `np.random.RandomState(seed)` or `np.random.default_rng(seed)`.
            When omitted, NumPy's global random state is used, so results
            are reproducible only if that state was seeded by the caller.

    Returns:
        Float array with the same shape as `queries` holding per-row weights.
    """
    queries = np.asarray(queries)
    sampler = np.random if rng is None else rng

    # `row_to_query[i]` is the position of row i's id within the sorted unique
    # ids, so a single gather replaces one boolean mask per query.
    query_set, row_to_query = np.unique(queries, return_inverse=True)
    query_weights = sampler.uniform(size=query_set.shape[0])

    # reshape keeps the output aligned with the input shape regardless of how
    # the NumPy version shapes the inverse index.
    return query_weights[row_to_query].reshape(queries.shape)
    

# RMSE experiment with a random weight attached to every query group.
# Weights come from create_weights, so they differ between runs unless the
# random state is fixed beforehand.
train_group_weights = create_weights(queries_train)
train_with_weights = Pool(
    data=X_train,
    label=y_train,
    group_id=queries_train,
    group_weight=train_group_weights,
)

test_group_weights = create_weights(queries_test)
test_with_weights = Pool(
    data=X_test,
    label=y_test,
    group_id=queries_test,
    group_weight=test_group_weights,
)

# Separate train_dir so this run does not overwrite the unweighted RMSE logs.
fit_model('RMSE', {'train_dir': 'RMSE_weigths'}, train_with_weights, test_with_weights)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1f64953e3a0>

In [20]:
def get_best_documents(labels, queries):
    """Flag, for every query, the single row with the highest label.

    Args:
        labels: Array-like of per-row relevance labels.
        queries: Iterable of per-row query ids, aligned with `labels`.

    Returns:
        Float array shaped like `labels` with 1.0 at the top-labelled row of
        each query and 0.0 elsewhere. On ties the earliest row wins, because
        a later row only replaces the current best when strictly greater.
    """
    labels = np.asarray(labels)

    # Row index of the best document seen so far, keyed by query id.
    best_row_by_query = {}
    for row, query in enumerate(queries):
        incumbent = best_row_by_query.get(query)
        if incumbent is None or labels[incumbent] < labels[row]:
            best_row_by_query[query] = row

    binary_best_docs = np.zeros(shape=labels.shape)
    for best_row in best_row_by_query.values():
        binary_best_docs[best_row] = 1.

    return binary_best_docs

In [21]:
# QuerySoftMax experiment: the target is a 0/1 indicator marking the
# top-labelled document of each query, and every query gets a random weight.
best_docs_train = get_best_documents(y_train, queries_train)
best_docs_test = get_best_documents(y_test, queries_test)

train_group_weights = create_weights(queries_train)
train_with_weights = Pool(
    data=X_train,
    label=best_docs_train,
    group_weight=train_group_weights,
    group_id=queries_train,
)

test_group_weights = create_weights(queries_test)
test_with_weights = Pool(
    data=X_test,
    label=best_docs_test,
    group_weight=test_group_weights,
    group_id=queries_test,
)

fit_model(
    'QuerySoftMax',
    {'custom_metric': 'NDCG:top=1'},
    train_with_weights,
    test_with_weights,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1f64996d3a0>

In [22]:
# Train with the QueryRMSE loss on the default (unweighted) pools.
QueryRMSE = fit_model(loss_function='QueryRMSE')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [23]:
# Best recorded metric values of the QueryRMSE model, grouped by dataset.
QueryRMSE.best_score_

{'learn': {'QueryRMSE': 0.14252007241341322},
 'validation': {'NDCG:type=Base': 0.8934050588000731,
  'QueryRMSE': 0.15038545345333232}}

In [24]:
# Train with the PairLogit loss on the default (unweighted) pools.
PairLogit = fit_model(loss_function='PairLogit')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [25]:
# Best recorded metric values of the PairLogit model, grouped by dataset.
PairLogit.best_score_

{'learn': {'PairLogit': 0.3819102937087057},
 'validation': {'NDCG:type=Base': 0.8928623235668288,
  'PairLogit': 0.558288760963724}}

In [36]:
# Train with the YetiRank loss on the default (unweighted) pools.
YetiRank = fit_model(loss_function='YetiRank')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [34]:
# Best recorded metric values of the YetiRank model, grouped by dataset.
# NOTE(review): the saved output below is an empty dict and this cell's
# execution count (34) is lower than that of the YetiRank fit above (36), so
# the output does not reflect that fit. Re-run top to bottom and re-save.
YetiRank.best_score_

{}