In [1]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
from collections import Counter

### Data preparation

In [2]:
train_df = pd.read_table('imat2009_train_new.txt', header=None, sep ='#')

In [3]:
rel = [x.split(' ') for x in train_df[0]] #relevance labels and features
relevance = [float(x[0]) for x in rel]#relevance labels
rele = [x[1:-1] for x in rel]#just features with values without last space
split = [[x.split(':') for x in line] for line in rele] #splitting features vs values
features= np.arange(1, 246, 1)
transposed = [np.array(line).T for line in split] #transposed for convenience
values = [[0 if (str(el) not in line[0]) else float(line[1][np.where(line[0] == str(el))[0][0]]) for el in features] for line in tqdm(transposed)]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77714/77714 [06:41<00:00, 193.46it/s]


In [4]:
X_train = np.array(values)
y_train = np.array(relevance)
queries_train = train_df[1].values

In [5]:
test_df = pd.read_table('imat2009_test_new.txt', header=None, sep ='#')

In [6]:
rel = [x.split(' ') for x in test_df[0]] #relevance labels and features
relevance = [float(x[0]) for x in rel]#relevance labels
rele = [x[1:-1] for x in rel]#just features with values without last space
split = [[x.split(':') for x in line] for line in rele] #splitting features vs values
features= np.arange(1, 246, 1)
transposed = [np.array(line).T for line in split] #transposed for convenience
values = [[0 if (str(el) not in line[0]) else float(line[1][np.where(line[0] == str(el))[0][0]]) for el in features] for line in tqdm(transposed)]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19576/19576 [01:39<00:00, 196.24it/s]


In [7]:
X_test = np.array(values)
y_test = np.array(relevance)
queries_test = test_df[1].values

Scaling into [0,1] range

In [11]:
max_relevance = np.max(y_train)
y_train /= max_relevance
y_test /= max_relevance

### Training

In [13]:
train = Pool(
    data=X_train,
    label=y_train,
    group_id=queries_train
)

test = Pool(
    data=X_test,
    label=y_test,
    group_id=queries_test
)

In [14]:
default_parameters = {
    'iterations': 2000,
    'custom_metric': ['NDCG'],
    'verbose': False,
    'random_seed': 0,
}

parameters = {}

In [80]:
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function
    
    if additional_params is not None:
        parameters.update(additional_params)
        
    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)
    
    return model

In [81]:
fit_model('YetiRank')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1ab8a001720>

In [88]:
fit_model('YetiRank', {'train_dir': 'YetiRank-lr-0.1', 'learning_rate': 0.1})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1ab8ec080a0>

In [84]:
fit_model('YetiRank', {'train_dir': 'YetiRank-lr-0.3', 'learning_rate': 0.3})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1ab8d2b3820>

In [85]:
fit_model('YetiRank', {'train_dir': 'YetiRank-mp-50-lr-0.1', 'metric_period': 50, 'iterations': 700,'learning_rate': 0.1})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1ab89d4aef0>

In [110]:
widget = MetricVisualizer(['YetiRank-lr-0.3', 'YetiRank-lr-0.1', 'YetiRank-mp-50-lr-0.1'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [92]:
fit_model('PairLogit')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1ab8d7af6d0>

In [94]:
fit_model('PairLogit', {'train_dir': 'PairLogit-lr-0.1', 'learning_rate': 0.1})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1ab8d7aec80>

In [95]:
fit_model('PairLogit', {'train_dir': 'PairLogit-lr-0.1-mp-50', 'learning_rate': 0.3, 'metric_period': 50})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1ab89d4b490>

In [105]:
fit_model('PairLogit', {'train_dir': 'PairLogit-lr-0.1-mp-200', 'learning_rate': 0.3, 'metric_period': 200})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1ab930aae60>

In [96]:
widget = MetricVisualizer(['PairLogit', 'YetiRank'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

# Optional task

### Data preparation

In [21]:
doc = pd.read_csv('wikIR/documents.csv')
doc_splt = [x.split(' ') for x in doc['text_right'].tolist()]  
docs = pd.DataFrame(np.array([doc['id_right'].tolist(),[len(x) for x in doc_splt]]).T, columns=['id_right', 'length']) 

In [22]:
df_train = pd.read_csv('wikIR/training/BM25.res', header=None, names=['q'])
df_train[['q', 'n_u', 'd', 'r', 's', 'rn']] = df_train.q.str.split(expand=True)

In [37]:
queries_train = pd.read_csv('wikIR/training/queries.csv')
queries_train_splt = [x.split(' ') for x in queries_train['text_left'].tolist()]  
queries_trains = pd.DataFrame(np.array([queries_train['id_left'].tolist(),
                                        [len(x) for x in queries_train_splt]]).T, columns=['id_left', 'length'])

#### Extracting features

Extracting relevance judgements

In [39]:
qrels_train  = pd.read_table('wikIR/training/qrels', header = None, names = ['id_left', 'n_u', 'id_right', 'label'])
df_train['rl'] = pd.Series([0 if len(np.where((qrels_train['id_left'] == int(df_train['q'][i]))&
                                              (qrels_train['id_right']==int(df_train['d'][i])))[0])== 0 
                            else qrels_train['label'][np.where((qrels_train['id_left'] == int(df_train['q'][i]))&
                                                               (qrels_train['id_right']==int(df_train['d'][i])))[0][0]] 
                            for i in range(len(df_train))])

Extracting features. I used such scores:
query and document length in words, # of matched q/d terms, phrase match (0/1), BM25 score

In [40]:
df_train['ql'] = [queries_trains['length'][np.where(int(df_train['q'][i]) == queries_trains['id_left'])[0][0]] 
                  for i in range(len(df_train))]
df_train['dl'] = [docs['length'][np.where(int(df_train['d'][i]) == docs['id_right'])[0][0]] 
                  for i in range(len(df_train))]
df_train['phm'] = [1 if (queries_train['text_left'][np.where(queries_train['id_left']==int(df_train['q'][i]))[0][0]] 
                         in doc['text_right'][np.where(doc['id_right'] == int(df_train['d'][i]))[0][0]]) 
                   else 0 for i in range(len(df_train))]
df_train['nmq'] = [sum([1 if (queries_train_splt[np.where(queries_train['id_left'] == int(df_train['q'][i]))[0][0]][j] 
                              in doc['text_right'][np.where(doc['id_right'] == int(df_train['d'][i]))[0][0]]) 
                        else 0 
                        for j in range(len(queries_train_splt
                                           [np.where(queries_train['id_left'] == int(df_train['q'][i]))[0][0]]))]) 
                   for i in range(len(df_train))]

In [112]:
queries_train

Unnamed: 0,id_left,text_left
0,123839,yanni
1,188629,k pop
2,13898,venice film festival
3,316959,downtown brooklyn
4,515031,pennsylvania house of representatives
...,...,...
1439,896124,british ceylon
1440,12319,scottish national party
1441,4421,cinema of china
1442,296526,gold mining


Cutting such that we have all relevant documents + up to 10 docs nonrelevant documents for each query

In [41]:
relevant_train = df_train.drop(df_train[df_train.rl== 0].index)
nonrelevant_train = df_train.drop(df_train[df_train.rl!= 0].index)
nonrelevant_train10 = pd.concat(nonrelevant_train.iloc[x] 
                                for x in [np.where(nonrelevant_train['q'] == nonrelevant_train['q'].unique()[j])[0][0:10] 
                                          for j in range(len(nonrelevant_train['q'].unique()))])
df_train_cut = pd.concat([relevant_train, nonrelevant_train10])

In [113]:
relevant_train

Unnamed: 0,q,n_u,d,r,s,rn,rl,ql,dl,phm,nmq
0,123839,Q0,806300,0,20.720094194011075,BM25,1,1,200,1,1
1,123839,Q0,123839,1,19.91782871489318,BM25,2,1,200,1,1
2,123839,Q0,836567,2,18.824522997710037,BM25,1,1,200,1,1
4,123839,Q0,806075,4,17.246712972547066,BM25,1,1,200,1,1
5,123839,Q0,1793430,5,17.246712972547066,BM25,1,1,200,1,1
...,...,...,...,...,...,...,...,...,...,...,...
144252,296526,Q0,1369882,52,14.750173252211724,BM25,1,2,200,0,2
144267,296526,Q0,1438397,67,13.956645218404534,BM25,1,2,200,0,2
144275,296526,Q0,1438101,75,13.59692873773307,BM25,1,2,200,0,2
144278,296526,Q0,1487772,78,13.59692873773307,BM25,1,2,200,0,2


In [99]:
df_train_cut[['q','rl','s','ql','dl','phm','nmq']]

Unnamed: 0,q,rl,s,ql,dl,phm,nmq
0,123839,1,20.720094194011075,1,200,1,1
1,123839,2,19.91782871489318,1,200,1,1
2,123839,1,18.824522997710037,1,200,1,1
4,123839,1,17.246712972547066,1,200,1,1
5,123839,1,17.246712972547066,1,200,1,1
...,...,...,...,...,...,...,...
144305,341793,0,16.31737881723864,4,200,0,2
144306,341793,0,16.150794491115548,4,200,0,2
144307,341793,0,16.122447953779385,4,200,0,3
144308,341793,0,16.122447953779385,4,200,0,3


In [52]:
df_test = pd.read_csv('wikIR/test/BM25.res', header=None, names=['q'])
df_test[['q', 'n_u', 'd', 'r', 's', 'rn']] = df_test.q.str.split(expand=True)
qrels_test  = pd.read_table('wikIR/test/qrels', header = None, names = ['id_left', 'n_u', 'id_right', 'label'])
df_test['rl'] = pd.Series([0 if len(np.where((qrels_test['id_left'] == int(df_test['q'][i]))&
                                             (qrels_test['id_right']==int(df_test['d'][i])))[0])== 0 
                           else qrels_test['label'][np.where((qrels_test['id_left'] == int(df_test['q'][i]))&
                                                             (qrels_test['id_right']==int(df_test['d'][i])))[0][0]] 
                           for i in range(len(df_test))])

In [53]:
queries_test = pd.read_csv('wikIR/test/queries.csv')
queries_test_splt = [x.split(' ') for x in queries_test['text_left'].tolist()]  
queries_tests = pd.DataFrame(np.array([queries_test['id_left'].tolist(),
                                       [len(x) for x in queries_test_splt]]).T, columns=['id_left', 'length'])

In [54]:
df_test['ql'] = [queries_tests['length'][np.where(int(df_test['q'][i]) == queries_tests['id_left'])[0][0]] 
                 for i in range(len(df_test))]
df_test['dl'] = [docs['length'][np.where(int(df_test['d'][i]) == docs['id_right'])[0][0]] 
                 for i in range(len(df_test))]
df_test['phm'] = [1 if (queries_test['text_left'][np.where(queries_test['id_left']==int(df_test['q'][i]))[0][0]] 
                        in doc['text_right'][np.where(doc['id_right'] == int(df_test['d'][i]))[0][0]]) 
                  else 0 for i in range(len(df_test))]
df_test['nmq'] = [sum([1 if (queries_test_splt[np.where(queries_test['id_left'] == int(df_test['q'][i]))[0][0]][j] 
                             in doc['text_right'][np.where(doc['id_right'] == int(df_test['d'][i]))[0][0]]) 
                       else 0 
                       for j in range(len(queries_test_splt
                                          [np.where(queries_test['id_left'] == int(df_test['q'][i]))[0][0]]))]) 
                  for i in range(len(df_test))]

In [55]:
relevant_test = df_test.drop(df_test[df_test.rl== 0].index)
nonrelevant_test = df_test.drop(df_test[df_test.rl!= 0].index)
nonrelevant_test10 = pd.concat(nonrelevant_test.iloc[x] 
                               for x in [np.where(nonrelevant_test['q'] == nonrelevant_test['q'].unique()[j])[0][0:10] 
                                         for j in range(len(nonrelevant_test['q'].unique()))])
df_test_cut = pd.concat([relevant_test, nonrelevant_test10])

In [100]:
df_test_cut[['q','rl','s','ql','dl','phm','nmq']]

Unnamed: 0,q,rl,s,ql,dl,phm,nmq
18,158491,2,13.473990503467475,3,200,0,3
100,5728,2,19.198156123814513,1,200,1,1
107,5728,1,9.290534282498049,1,200,1,1
127,5728,1,9.290534282498049,1,200,1,1
130,5728,1,9.290534282498049,1,200,1,1
...,...,...,...,...,...,...,...
9905,712704,0,10.195791954566438,2,190,0,0
9906,712704,0,9.9875412430656,2,199,0,0
9907,712704,0,9.964926216266763,2,200,0,0
9908,712704,0,9.964926216266763,2,200,0,0


In [59]:
df_train['s'] = [float(x) for x in df_train['s']]
df_test['s'] = [float(x) for x in df_test['s']]

In [60]:
train_df = df_train[['rl','q', 's','ql','dl','phm','nmq']]
test_df = df_test[['rl','q', 's','ql','dl','phm','nmq']]

In [61]:
X_train_opt = train_df.drop(['rl','q'], axis=1).values
y_train_opt = train_df['rl'].values
queries_train_opt = train_df['q'].values

X_test_opt = test_df.drop(['rl','q'], axis=1).values
y_test_opt = test_df['rl'].values
queries_test_opt = test_df['q'].values

Scaling into [0,1] range

In [62]:
max_relevance_opt = np.max(y_train_opt)
y_train_opt = y_train_opt/ max_relevance_opt
y_test_opt = y_test_opt / max_relevance_opt

In [63]:
train_opt = Pool(
    data=X_train_opt,
    label=y_train_opt,
    group_id=queries_train_opt
)

test_opt = Pool(
    data=X_test_opt,
    label=y_test_opt,
    group_id=queries_test_opt
)

### Fitting models

I used only 700 iterations.

In [102]:
fit_model('YetiRank',{'train_dir': 'YetiRank-opt', 'iterations' : 700 },train_pool=train_opt, test_pool=test_opt)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1ab96e17490>

In [103]:
fit_model('PairLogit',{'train_dir': 'PairLogit-opt', 'iterations' : 700 }, train_pool=train_opt, test_pool=test_opt)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x1ab96e19f00>

In [104]:
widget = MetricVisualizer(['PairLogit-opt', 'YetiRank-opt'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))