
LambdaMART Model (LM) (25 marks)
-------------------------------------------------
https://www.analyticsvidhya.com/blog/2019/02/flair-nlp-library-python/
https://xgboost.readthedocs.io/en/stable/python/python_intro.html#data-interface

Use the LambdaMART learning to rank algorithm (a variant of LambdaRank we have learned in the class)
from XGBoost gradient boosting library to learn a model that can re-rank passages.

command XGBoost to use LambdaMART algorithm for ranking
by setting the appropriate value to the objective parameter as described in the documentation

carry out hyperparameter tuning in this task
--------------------------------------------------
Report:

    - describe the methodology used in deriving the best performing model.
    - report the performance of your model on the validation data with metrics from eval.py
    
    - Describe:
    
        1. how you perform input processing
        2. the representation/features used as input




In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn import model_selection
from sklearn.metrics import make_scorer

import xgboost as xgb

from icecream import ic
# from xgboostextension import XGBRanker
from xgboostextension.scorer import RankingScorer
from xgboostextension.scorer.metrics import ndcg

from huepy import *
from LogisticRegression import DataLoader
from eval import init_evaluator, eval_per_query, eval_dataframe
from utils import timeit, data_path, queries_embeddings, train_raw_df, load_passages_tensors, train_debug_df, val_raw_df


In [2]:
passage_tensors = load_passages_tensors()
x_df = pd.read_parquet(train_debug_df)
dataloader = DataLoader(x_df, len(x_df), passage_tensors)
_,(train_x, train_y )= [(x, y) for x, y in enumerate(dataloader)][0]
train_df = dataloader.df
del dataloader



100%|██████████| 30/30 [01:37<00:00,  3.25s/it]
ic| self.N: 47771, self.num_batches: 2, self.batch_size: 47771


In [3]:
val_df = pd.read_parquet(val_raw_df)
del val_df['query']
del val_df['passage']
del val_df['pid']
del val_df['p_idx']
x_val = torch.load('./data/val_embeddings.pth')[0].reshape(-1,600)

In [4]:
passage_tensors = load_passages_tensors()

100%|██████████| 30/30 [01:36<00:00,  3.22s/it]


In [5]:
_,counts=np.unique(train_df.qid,return_counts=True)
q_idx = np.arange(0,len(counts)).repeat(counts)


train_df.loc[:,'q_idx']=q_idx
train_df

Unnamed: 0,qid,pid,relevancy,q_idx
4083931,2,4339068,1.0,0
4342040,2,857994,0.0,0
3541247,2,3395119,0.0,0
3403841,2,1159026,0.0,0
2628639,2,4406386,0.0,0
...,...,...,...,...
51778,1102400,1828800,0.0,4589
659754,1102400,3863416,0.0,4589
968945,1102400,991043,0.0,4589
1664691,1102400,4464638,0.0,4589


In [6]:
ranker = xgb.XGBRanker(n_estimators=1, 
                       max_depth=15, 
                       learning_rate=0.1,
                       gamma=1,
                       objective='rank:ndcg')

In [7]:
ranker.fit(train_x, train_y, qid=q_idx)
pred=ranker.predict(x_val)

evaluator=init_evaluator(at=[3, 10, 100], x_val_handler=None,prepare_x=False)
evaluator(pred)

(array([0.01466318, 0.01753948, 0.02140657]),
 array([0.0147436 , 0.02150925, 0.05183143]))

In [8]:
from sklearn.model_selection import ParameterGrid
param_dicts = {

    'max_depth': [10,15,20],
#     'learning_rate ': [0.1, 0.5],
    'n_estimators': [5,10,15],
#     'booster': ['gbtree', 'dart'],
    'gamma': [.5, 1, 2]

}
param_grid = list(ParameterGrid(param_dicts))


In [18]:
from sklearn.model_selection import ParameterGrid
param_dicts = {

    'max_depth': [10,15,20],
#     'learning_rate ': [0.1, 0.5],
    'n_estimators': [5,10,15],
#     'booster': ['gbtree', 'dart'],
    'gamma': [.5, 1, 2]

}
param_grid = list(ParameterGrid(param_dicts))
def cross_val(train_x, train_y, param_grid=param_grid):
    

    n_splits = 5
    cv = model_selection.GroupKFold(n_splits=n_splits)
    at = [100]
    all_ndcg = np.zeros(len(param_grid))
    for count, params in enumerate(param_grid):
        print(count, params, end='')


        for i, (train_index, test_index) in enumerate(cv.split(train_x, train_y, groups=q_idx)):
            ranker = xgb.XGBRanker(learning_rate=0.1, objective='rank:ndcg', **params)
            ranker.fit(train_x[train_index, ...], train_y[train_index, ...], qid=q_idx[train_index, ...])

            # predict
            pred = ranker.predict(train_x[test_index, ...])
            train_df_now = train_df.iloc[test_index, :].copy()

            _, avg_ndcg = eval_dataframe(train_df_now, pred, at)
            all_ndcg[count] += avg_ndcg     

    #         [print(orange(italic(f'NDCG @ {now}: {value}'))) for now, value in zip(at, avg_ndcg)]

        all_ndcg[count] /= n_splits
        print(f'\tNDCG @ 100: {all_ndcg[count] :.5f}')

    best = param_grid[np.argmax(all_ndcg)]
    print(f'final:{best}')
    
    return best



        
    

0 {'gamma': 0.5, 'max_depth': 10, 'n_estimators': 5}	NDCG @ 100: 0.53503
1 {'gamma': 0.5, 'max_depth': 10, 'n_estimators': 10}	NDCG @ 100: 0.54381
2 {'gamma': 0.5, 'max_depth': 10, 'n_estimators': 15}	NDCG @ 100: 0.54933
3 {'gamma': 0.5, 'max_depth': 15, 'n_estimators': 5}	NDCG @ 100: 0.53565
4 {'gamma': 0.5, 'max_depth': 15, 'n_estimators': 10}	NDCG @ 100: 0.54221
5 {'gamma': 0.5, 'max_depth': 15, 'n_estimators': 15}	NDCG @ 100: 0.54315
6 {'gamma': 0.5, 'max_depth': 20, 'n_estimators': 5}	NDCG @ 100: 0.53550
7 {'gamma': 0.5, 'max_depth': 20, 'n_estimators': 10}	NDCG @ 100: 0.54045
8 {'gamma': 0.5, 'max_depth': 20, 'n_estimators': 15}	NDCG @ 100: 0.54284
9 {'gamma': 1, 'max_depth': 10, 'n_estimators': 5}	NDCG @ 100: 0.52726
10 {'gamma': 1, 'max_depth': 10, 'n_estimators': 10}	NDCG @ 100: 0.53677
11 {'gamma': 1, 'max_depth': 10, 'n_estimators': 15}	NDCG @ 100: 0.54370
12 {'gamma': 1, 'max_depth': 15, 'n_estimators': 5}	NDCG @ 100: 0.53964
13 {'gamma': 1, 'max_depth': 15, 'n_estimators':

In [6]:
best={'gamma': 2, 'max_depth': 15, 'n_estimators': 15}



In [None]:
df_raw = pd.read_parquet(train_raw_df)
dataloader = DataLoader(df_raw, len(df_raw)-1, passage_tensors)
_,(x_raw, y_raw) = [(x, y) for x, y in enumerate(dataloader)][0]
row_df = dataloader.df
del dataloader

_, counts = np.unique(row_df.qid,return_counts=True)


row_df.loc[:,'q_idx'] = np.arange(0,len(counts)).repeat(counts)



In [None]:
print('start fit')

best_ranker = xgb.XGBRanker(learning_rate=0.1, objective='rank:ndcg', **best)
best_ranker.fit(x_raw, y_raw, qid=row_df.q_idx.values[:-1])


In [15]:
evaluator=init_evaluator(at=[3, 10, 100], x_val_handler=None,prepare_x=False)
evaluator(best_ranker.predict(x_val))

[34m[3mmAP @ 3: 0.007694541231126596[0m[0m
[34m[3mmAP @ 10: 0.013737486864664566[0m[0m
[34m[3mmAP @ 100: 0.01888022919579131[0m[0m
[33m[3mNDCG @ 3: 0.008845512863987182[0m[0m
[33m[3mNDCG @ 10: 0.02201075316100031[0m[0m
[33m[3mNDCG @ 100: 0.0565087289259376[0m[0m


(array([0.00769454, 0.01373749, 0.01888023]),
 array([0.00884551, 0.02201075, 0.05650873]))