In [1]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
from collections import Counter
from elasticsearch import Elasticsearch    # elasticsearch will extract features from query-document pairs for us
from elasticsearch.helpers import bulk, parallel_bulk
import ir_measures
from ir_measures import *
import json
import requests
import re
import time

### Connecting to Elasticsearch (used to extract BM25 score)

In [2]:
# Elasticsearch client pointed at http://localhost:9200; the indexing and search cells below go through it.
es = Elasticsearch('http://localhost:9200')

In [3]:
# Name of the Elasticsearch index that the cells below create, fill and query.
index_name = 'wiki'

In [5]:
# Index-level analysis settings: a custom analyzer named 'white' that uses the whitespace tokenizer.
settings = {'analysis': {'analyzer': {'white': {'tokenizer': 'whitespace'}}}}

# Field mapping: a single 'text' field of type text, analyzed with the 'white' analyzer.
mappings = {'properties': {'text': {'type': 'text', 'analyzer': 'white'}}}

# Start from a clean slate: remove the index if it is already there, then (re-)create it.
index_already_exists = es.indices.exists(index=index_name)
if index_already_exists:
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'wiki'})

In [6]:
# Document collection; later cells read its 'id_right' and 'text_right' columns.
doc = pd.read_csv('wikIR/documents.csv')

In [7]:
def create_es_action(index, doc_id, document):
    """Build one bulk-helper action dict.

    Args:
        index: value stored under '_index'.
        doc_id: value stored under '_id'.
        document: value stored under '_source'.

    Returns:
        dict with the keys '_index', '_id' and '_source'.
    """
    return dict(_index=index, _id=doc_id, _source=document)


def es_action_generator(df):
    """Lazily yield one bulk action per row of ``df``, showing a tqdm progress bar.

    Each row's 'text_right' becomes the document's 'text' field and its
    'id_right' becomes the document id; actions target the global ``index_name``.
    """
    progress = tqdm(df.iterrows(), total=df.shape[0], bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}')
    for _, record in progress:
        source = {'text': record['text_right']}
        yield create_es_action(index_name, record['id_right'], source)


# Bulk-index the whole collection with 4 worker threads in chunks of 1000 actions;
# successful items are skipped silently, failed ones are printed.
indexing_results = parallel_bulk(es, es_action_generator(doc), queue_size=4, thread_count=4, chunk_size=1000)
for ok, result in indexing_results:
    if ok:
        continue
    print(result)


100%|██████████████████████████████| 369721/369721 [01:39<00:00, 3727.35it/s]                                                                                                          


In [14]:
def pretty_print_result(search_result, fields=()):
    """Print a short, human-readable summary of a search response.

    Args:
        search_result: mapping with the layout read below:
            ``['hits']['total']['value']`` and a list ``['hits']['hits']`` whose
            items have '_id', '_score' and '_source'.
        fields: iterable of '_source' field names to print for every hit.
            Defaults to an immutable empty tuple (no fields) rather than a
            shared mutable list.

    Returns:
        None. Everything is written to stdout.

    Raises:
        KeyError: if a requested field is missing from a hit's '_source'.
    """
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
    
def search(query, *args):
    """Run ``query`` against the global index (up to 100 hits) and pretty-print the response.

    Any extra positional arguments are forwarded as the '_source' field names to print.
    Returns whatever ``pretty_print_result`` returns.
    """
    response = es.search(index=index_name, query=query, size=100)
    return pretty_print_result(response, args)

def get_doc_by_id(doc_id):
    """Fetch the document with id ``doc_id`` from the global index and return its '_source'."""
    stored = es.get(index=index_name, id=doc_id)
    return stored['_source']

def make_query(text):
    """Build the bool query used for scoring.

    The 'must' clause is a plain match on the 'text' field; the 'should' clause
    is a match_phrase on the same field with a slop of 10.

    Args:
        text: the query string placed in both clauses.

    Returns:
        dict: the query body (the value for the 'query' key of a search request).
    """
    required_match = {'match': {'text': text}}
    optional_phrase = {'match_phrase': {'text': {'query': text, 'slop': 10}}}
    return {'bool': {'must': required_match, 'should': optional_phrase}}

def extract_BM25(index_name, query_text, doc_id, max_retries=None, retry_delay=5):
    """Return the score Elasticsearch assigns to ``doc_id`` for ``query_text``.

    The score is read from the ``_explain`` endpoint using the query built by
    ``make_query``.

    Args:
        index_name: index to query. (Previously ignored: the URL hard-coded 'wiki'.)
        query_text: raw query string.
        doc_id: id of the document to score.
        max_retries: how many failed requests to tolerate before re-raising.
            ``None`` (default) keeps retrying indefinitely, as before.
        retry_delay: seconds to sleep between attempts.

    Returns:
        The top-level explanation value when the document matches the query,
        otherwise 0.

    Raises:
        KeyError: if the response carries no 'matched' field (for example an
            Elasticsearch error body); the message includes the response.
        requests.exceptions.RequestException / ValueError: the last transport or
            JSON-decoding error, once ``max_retries`` is exhausted.
    """
    # request with explain parameter
    headers = {
        'Content-type': 'application/json',
        'Accept': 'application/json',
    }
    json_data = {'query': make_query(query_text)}
    url = f'http://127.0.0.1:9200/{index_name}/_explain/{doc_id}'

    failures = 0
    while True:
        try:
            res = requests.get(url, headers=headers, json=json_data).json()
            break
        # Only transport / decoding problems are retried; a bare `except` would also
        # swallow KeyboardInterrupt and make the loop impossible to stop.
        except (requests.exceptions.RequestException, ValueError):
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(retry_delay)

    if 'matched' not in res:
        raise KeyError(f'Elasticsearch explain response has no "matched" field: {res}')

    total_score = 0
    if res['matched']:
        # BM25 score
        total_score = res['explanation']['value']

    return total_score


### Extracting features (training data)

In [9]:
# Training queries; the cells below read the 'id_left' and 'text_left' columns.
queries_train = pd.read_csv('wikIR/training/queries.csv')

# Split every query on single spaces; the number of pieces is the query-length feature.
queries_train_splt = [text.split(' ') for text in queries_train['text_left'].tolist()]
train_query_ids = queries_train['id_left'].tolist()
train_query_lengths = [len(tokens) for tokens in queries_train_splt]

# Two-column frame (query id, query length); the 2 x N array is transposed to N x 2.
queries_trains = pd.DataFrame(np.array([train_query_ids, train_query_lengths]).T,
                              columns=['id_left', 'length'])

In [8]:
# Split every document on single spaces; the number of pieces is the document-length feature.
doc_splt = [text.split(' ') for text in doc['text_right'].tolist()]
doc_ids = doc['id_right'].tolist()
doc_lengths = [len(tokens) for tokens in doc_splt]

# Two-column frame (document id, document length); the 2 x N array is transposed to N x 2.
docs = pd.DataFrame(np.array([doc_ids, doc_lengths]).T, columns=['id_right', 'length'])

In [10]:
# Training qrels (judged query/document pairs), read as a headerless table.
# Columns: 'q' query id, 'd' document id, 'rl' relevance label; 'n_u' is not read by any cell here
# (presumably an unused qrels column - TODO confirm).
df_train = pd.read_table('wikIR/training/qrels', header = None, names = ['q', 'n_u', 'd', 'rl'])

In [11]:
# Training BM25 run: every line is read as one string, then split on whitespace into six columns.
df_train_run = pd.read_csv('wikIR/training/BM25.res', header=None, names=['q'])
train_run_fields = df_train_run['q'].str.split(expand=True)
df_train_run[['q', 'n_u', 'd', 'r', 's', 'rn']] = train_run_fields

# The split leaves strings everywhere; only the query id is converted to int here.
df_train_run['q'] = list(map(int, df_train_run['q']))

In [12]:
# Relevance judgements: label every (query, document) pair of the BM25 run with its qrels
# label, or 0 when the pair is not judged.
# A dict keyed by (query id, document id) replaces the former per-row scan of the whole qrels
# frame. setdefault keeps the first judgement of a pair, i.e. the row that
# np.where(...)[0][0] used to select.
train_qrel_labels = {}
for qrel_q, qrel_d, qrel_rl in zip(df_train['q'], df_train['d'], df_train['rl']):
    train_qrel_labels.setdefault((qrel_q, qrel_d), qrel_rl)

# int(...) mirrors the original comparison: the run's ids may still be strings.
df_train_run['rl'] = pd.Series([train_qrel_labels.get((int(run_q), int(run_d)), 0)
                                for run_q, run_d in zip(df_train_run['q'], df_train_run['d'])])

In [13]:
# Training pairs = all judged pairs + the first 10 non-relevant run results of every query.
nonrelevant_train = df_train_run.drop(df_train_run[df_train_run.rl != 0].index)

# unique() is computed once instead of on every iteration of the comprehension.
nonrelevant_query_ids = nonrelevant_train['q'].unique()
nonrelevant_train10 = pd.concat(
    # positions of this query's rows, truncated to the first 10
    nonrelevant_train.iloc[np.where(nonrelevant_train['q'] == query_id)[0][0:10]]
    for query_id in nonrelevant_query_ids
)

# Drop the run-only columns so the frame lines up with the qrels columns before stacking.
nonrelevant_train10 = nonrelevant_train10.drop(['r', 's', 'rn'], axis=1)
df_train_cut = pd.concat([df_train, nonrelevant_train10]).sort_values(by=['q']).reset_index(drop=True)

In [15]:
# BM25 scores: one Elasticsearch explain request per (query, document) training pair.
# The query text is looked up in a dict built once instead of scanning queries_train for
# every pair; setdefault keeps the first row of an id, like np.where(...)[0][0] did.
train_query_text_by_id = {}
for lookup_query_id, lookup_query_text in zip(queries_train['id_left'], queries_train['text_left']):
    train_query_text_by_id.setdefault(lookup_query_id, lookup_query_text)

df_train_cut['s'] = [extract_BM25('wiki', train_query_text_by_id[int(pair_q)], pair_d)
                     for pair_q, pair_d in zip(df_train_cut['q'], df_train_cut['d'])]

In [16]:
# Per-pair features for the training set.
# The lookup tables are built once and replace the per-row np.where(...) scans over the query
# and document frames. drop_duplicates keeps the first row of every id, which is the row the
# former np.where(...)[0][0] lookup selected.
train_query_len_by_id = queries_trains.drop_duplicates('id_left').set_index('id_left')['length'].to_dict()
train_query_text_by_id = queries_train.drop_duplicates('id_left').set_index('id_left')['text_left'].to_dict()
doc_len_by_id = docs.drop_duplicates('id_right').set_index('id_right')['length'].to_dict()
doc_text_by_id = doc.drop_duplicates('id_right').set_index('id_right')['text_right'].to_dict()

# Ids are cast with int(...) exactly as before ('d' can hold strings coming from the run file).
train_pair_query_ids = [int(q) for q in df_train_cut['q']]
train_pair_doc_ids = [int(d) for d in df_train_cut['d']]

#query length in words
df_train_cut['ql'] = [train_query_len_by_id[q] for q in train_pair_query_ids]

#document length in words
df_train_cut['dl'] = [doc_len_by_id[d] for d in train_pair_doc_ids]

#phrase matches: 1 when the whole query string occurs verbatim inside the document text
df_train_cut['phm'] = [1 if train_query_text_by_id[q] in doc_text_by_id[d] else 0
                       for q, d in zip(train_pair_query_ids, train_pair_doc_ids)]

#number of matched query terms
# NOTE(review): `in` on strings is a substring test, so a term also counts when it only
# occurs inside a longer word - kept as in the original; confirm this is intended.
df_train_cut['nmq'] = [sum(1 for term in train_query_text_by_id[q].split(' ') if term in doc_text_by_id[d])
                       for q, d in zip(train_pair_query_ids, train_pair_doc_ids)]

In [17]:
# Final training table: label ('rl'), query id ('q') and the five features
# (BM25 score, query length, document length, phrase match, matched-term count).
train = df_train_cut[['rl','q','s','ql','dl','phm', 'nmq']]

In [33]:
# Display the assembled training table as the cell output.
train

Unnamed: 0,rl,q,s,ql,dl,phm,nmq
0,0,79,0.000000,1,200,0,0
1,0,79,0.000000,1,200,0,0
2,1,79,0.000000,1,200,0,0
3,0,79,0.000000,1,200,0,0
4,0,79,0.000000,1,200,0,0
...,...,...,...,...,...,...,...
62134,0,2433785,14.545878,3,200,0,1
62135,0,2433785,14.545878,3,200,0,1
62136,1,2433785,12.717758,3,200,0,1
62137,1,2433785,12.717758,3,200,0,1


### Extracting features (testing data)

In [18]:
# Test queries; the cells below read the 'id_left' and 'text_left' columns.
queries_test = pd.read_csv('wikIR/test/queries.csv')

# Split every query on single spaces; the number of pieces is the query-length feature.
queries_test_splt = [text.split(' ') for text in queries_test['text_left'].tolist()]
test_query_ids_all = queries_test['id_left'].tolist()
test_query_lengths = [len(tokens) for tokens in queries_test_splt]

# Two-column frame (query id, query length); the 2 x N array is transposed to N x 2.
queries_tests = pd.DataFrame(np.array([test_query_ids_all, test_query_lengths]).T,
                             columns=['id_left', 'length'])

In [19]:
# Test BM25 run: every line is read as one string, then split on whitespace into six columns
# (all of them stay strings, including the query id 'q').
df_test = pd.read_csv('wikIR/test/BM25.res', header=None, names=['q'])
df_test[['q', 'n_u', 'd', 'r', 's', 'rn']] = df_test.q.str.split(expand=True)

# Test qrels (judged query/document pairs), read as a headerless table.
qrels_test  = pd.read_table('wikIR/test/qrels', header = None, names = ['id_left', 'n_u', 'id_right', 'label'])

# Relevance judgements: qrels label of every run pair, or 0 when the pair is not judged.
# A dict keyed by (query id, document id) replaces the former per-row scan of the whole qrels
# frame. setdefault keeps the first judgement of a pair, i.e. the row that
# np.where(...)[0][0] used to select.
test_qrel_labels = {}
for qrel_q, qrel_d, qrel_label in zip(qrels_test['id_left'], qrels_test['id_right'], qrels_test['label']):
    test_qrel_labels.setdefault((qrel_q, qrel_d), qrel_label)

# int(...) mirrors the original comparison: the run's ids are strings at this point.
df_test['rl'] = pd.Series([test_qrel_labels.get((int(run_q), int(run_d)), 0)
                           for run_q, run_d in zip(df_test['q'], df_test['d'])])

In [20]:
# Keep only the first 20 run results of every test query, then order the rows by 'q'.
# unique() is computed once instead of on every iteration of the comprehension.
test_run_query_ids = df_test['q'].unique()
df_test20 = pd.concat(
    # positions of this query's rows, truncated to the first 20
    df_test.iloc[np.where(df_test['q'] == query_id)[0][0:20]]
    for query_id in test_run_query_ids
).sort_values(by=['q']).reset_index(drop=True)


In [21]:
# Per-pair features for the test set.
# The lookup tables are built once and replace the per-row np.where(...) scans over the query
# and document frames. drop_duplicates keeps the first row of every id, which is the row the
# former np.where(...)[0][0] lookup selected.
test_query_len_by_id = queries_tests.drop_duplicates('id_left').set_index('id_left')['length'].to_dict()
test_query_text_by_id = queries_test.drop_duplicates('id_left').set_index('id_left')['text_left'].to_dict()
doc_len_by_id = docs.drop_duplicates('id_right').set_index('id_right')['length'].to_dict()
doc_text_by_id = doc.drop_duplicates('id_right').set_index('id_right')['text_right'].to_dict()

# Ids are cast with int(...) exactly as before (the run file's ids are strings).
test_pair_query_ids = [int(q) for q in df_test20['q']]
test_pair_doc_ids = [int(d) for d in df_test20['d']]

#query length in words
df_test20['ql'] = [test_query_len_by_id[q] for q in test_pair_query_ids]

#document length in words
df_test20['dl'] = [doc_len_by_id[d] for d in test_pair_doc_ids]

#phrase matches: 1 when the whole query string occurs verbatim inside the document text
df_test20['phm'] = [1 if test_query_text_by_id[q] in doc_text_by_id[d] else 0
                    for q, d in zip(test_pair_query_ids, test_pair_doc_ids)]

#number of matched query terms
# NOTE(review): `in` on strings is a substring test, so a term also counts when it only
# occurs inside a longer word - kept as in the original; confirm this is intended.
df_test20['nmq'] = [sum(1 for term in test_query_text_by_id[q].split(' ') if term in doc_text_by_id[d])
                    for q, d in zip(test_pair_query_ids, test_pair_doc_ids)]

In [22]:
# BM25 scores: one Elasticsearch explain request per (query, document) test pair; this
# overwrites the string 's' column that came from the run file.
# The query text is looked up in a dict built once instead of scanning queries_test for
# every pair; setdefault keeps the first row of an id, like np.where(...)[0][0] did.
test_query_text_by_id = {}
for lookup_query_id, lookup_query_text in zip(queries_test['id_left'], queries_test['text_left']):
    test_query_text_by_id.setdefault(lookup_query_id, lookup_query_text)

# int(...) because the run file's query ids are strings.
df_test20['s'] = [extract_BM25('wiki', test_query_text_by_id[int(pair_q)], pair_d)
                  for pair_q, pair_d in zip(df_test20['q'], df_test20['d'])]

In [23]:
# Final test table, with the same column layout as `train`: label, query id and the five features.
test = df_test20[['rl','q','s','ql','dl','phm', 'nmq']]

In [34]:
# Display the assembled test table as the cell output.
test

Unnamed: 0,rl,q,s,ql,dl,phm,nmq
0,0,101626,11.455160,2,200,0,1
1,0,101626,11.736632,2,200,0,1
2,0,101626,11.736632,2,200,0,1
3,0,101626,11.841684,2,197,0,1
4,0,101626,11.956985,2,200,0,1
...,...,...,...,...,...,...,...
1995,0,996687,10.664045,2,200,0,1
1996,0,996687,10.664045,2,200,0,1
1997,1,996687,39.583195,2,200,1,2
1998,0,996687,11.489851,2,200,0,1


### Modelling

In [24]:
# Split each table into a feature matrix (everything except label and query id),
# a label vector and a vector of group (query) ids.
non_feature_columns = ['rl', 'q']
X_train_opt, X_test_opt = (frame.drop(columns=non_feature_columns).values for frame in (train, test))
y_train_opt, y_test_opt = (frame['rl'].values for frame in (train, test))
queries_train_opt, queries_test_opt = (frame['q'].values for frame in (train, test))

In [25]:
# Scale both label vectors by the largest training label.
max_relevance_opt = y_train_opt.max()
y_train_opt, y_test_opt = (np.true_divide(labels, max_relevance_opt)
                           for labels in (y_train_opt, y_test_opt))

In [26]:
# CatBoost pools: features, scaled labels and the query id of each row as its group id.
train_opt = Pool(X_train_opt, label=y_train_opt, group_id=queries_train_opt)
test_opt = Pool(X_test_opt, label=y_test_opt, group_id=queries_test_opt)

In [27]:
# Baseline CatBoostRanker settings shared by every run; fit_model copies and extends them.
default_parameters = {
    'iterations': 2000,
    'custom_metric': ['NDCG'],
    'verbose': False,
    'random_seed': 0,
}

def fit_model(loss_function, additional_params=None, train_pool=None, test_pool=None):
    """Train a CatBoostRanker with the given ranking loss and return it.

    Args:
        loss_function: CatBoost loss name; also used as the default 'train_dir'.
        additional_params: optional dict merged over the defaults last, so it can
            override any setting (including 'train_dir' and 'iterations').
        train_pool: training data. Defaults to the global ``train_opt`` pool.
        test_pool: evaluation data. Defaults to the global ``test_opt`` pool.

    Returns:
        The fitted CatBoostRanker.

    Note:
        The defaults used to be the raw ``train`` / ``test`` feature tables, bound
        when the function was defined; those carry neither separate labels nor
        group ids. They are now resolved at call time to the pools built from them.
    """
    if train_pool is None:
        train_pool = train_opt
    if test_pool is None:
        test_pool = test_opt

    # Copy so that per-run overrides never leak into the shared defaults.
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function

    if additional_params is not None:
        parameters.update(additional_params)

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)

    return model

In [35]:
# YetiRank run: 1000 iterations, logs written to 'YetiRank-opt'.
# The returned model is only shown as the cell output; it is not bound to a name.
fit_model('YetiRank',{'train_dir': 'YetiRank-opt', 'iterations' : 1000 },train_pool=train_opt, test_pool=test_opt)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x22bfe03bb80>

In [36]:
# PairLogit run: 1000 iterations, logs written to 'PairLogit-opt'.
# The returned model is only shown as the cell output; it is not bound to a name.
fit_model('PairLogit',{'train_dir': 'PairLogit-opt', 'iterations' : 1000 }, train_pool=train_opt, test_pool=test_opt)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x22bfe039690>

In [37]:
# Visualize the two runs side by side from the train directories they wrote to.
widget = MetricVisualizer(['PairLogit-opt', 'YetiRank-opt'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))