
LambdaMART Model (LM) (25 marks)
-------------------------------------------------
https://www.analyticsvidhya.com/blog/2019/02/flair-nlp-library-python/
https://xgboost.readthedocs.io/en/stable/python/python_intro.html#data-interface

Use the LambdaMART learning to rank algorithm (a variant of LambdaRank we have learned in the class)
from XGBoost gradient boosting library to learn a model that can re-rank passages.

command XGBoost to use LambdaMART algorithm for ranking
by setting the appropriate value to the objective parameter as described in the documentation

carry out hyperparameter tuning in this task
--------------------------------------------------
Report:

    - describe the methodology used in deriving the best performing model.
    - report the performance of your model on the validation data with metrics from eval.py
    
    - Describe:
    
        1. how you perform input processing
        2. the representation/features used as input




In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn import model_selection
from sklearn.metrics import make_scorer

import xgboost as xgb

from icecream import ic

from huepy import *
from eval import init_evaluator, eval_per_query, eval_dataframe
from utils import map_location, queries_embeddings, train_raw_df, load_passages_tensors, train_debug_df, val_raw_df



In [2]:
class DataLoader:
    def __init__(self, dataframe: pd.DataFrame, batch_size, p_tensors):
        self.current_pth = -1
        self.p_tensors = p_tensors
        self.q_tensors = torch.load(queries_embeddings, map_location=map_location)
        self.df = dataframe.sort_values(by=['pid'])[['qid', 'pid', 'relevancy']]
        self.N = len(dataframe)
        self.batch_size = batch_size
        self.num_batches = self.N // self.batch_size + 1
        ic(self.N, self.num_batches, self.batch_size)

    def __len__(self):
        return self.num_batches

    def __iter__(self):
        for start in range(0, self.N, self.batch_size):
            end = min(start + self.batch_size, self.N)
            this_batch_size = end - start
            df = self.df.iloc[start:end]
            df.reset_index(drop=True, inplace=True)

            queries = torch.zeros((this_batch_size, 300))
            passages = torch.zeros((this_batch_size, 300))
            for i, row in df.iterrows():
                queries[i, :] = self.q_tensors[row.qid]
                passages[i, :] = self.p_tensors[row.pid]

            x = torch.stack([queries, passages], dim=2).numpy().reshape(-1, 600)
            y = df.relevancy.values.reshape(-1)
            yield x, y


In [None]:
passage_tensors = load_passages_tensors()
x_df = pd.read_parquet(train_debug_df)


In [22]:
val_df = pd.read_parquet(val_raw_df)
del val_df['query']
del val_df['passage']
del val_df['pid']
del val_df['p_idx']
x_val = torch.load('./data/val_embeddings.pth')[0].reshape(-1, 600)

In [None]:
train_df = pd.read_parquet(train_raw_df)

_, counts = np.unique(train_df.qid, return_counts=True)
q_idx = np.arange(0, len(counts)).repeat(counts)

train_df.loc[:, 'q_idx'] = q_idx
train_df

In [None]:
evaluator = init_evaluator(at=[3, 10, 100], x_val_handler=None, prepare_x=False)

In [None]:
from sklearn.model_selection import ParameterGrid

param_dicts = {

    'max_depth': [10, 15, 20],
    #     'learning_rate ': [0.1, 0.5],
    'n_estimators': [5, 10, 15],
    #     'booster': ['gbtree', 'dart'],
    'gamma': [.5, 1, 2]

}
param_grid = list(ParameterGrid(param_dicts))


In [None]:

def cross_val(train_x, train_y, param_grid=param_grid):
    n_splits = 5
    cv = model_selection.GroupKFold(n_splits=n_splits)
    at = [100]
    all_ndcg = np.zeros(len(param_grid))
    for count, params in enumerate(param_grid):
        print(count, params, end='')

        for i, (train_index, test_index) in enumerate(cv.split(train_x, train_y, groups=q_idx)):
            ranker = xgb.XGBRanker(learning_rate=0.1, objective='rank:ndcg', **params)
            ranker.fit(train_x[train_index, ...], train_y[train_index, ...], qid=q_idx[train_index, ...])

            # predict
            pred = ranker.predict(train_x[test_index, ...])
            train_df_now = train_df.iloc[test_index, :].copy()

            _, avg_ndcg = eval_dataframe(train_df_now, pred, at)
            all_ndcg[count] += avg_ndcg

            #         [print(orange(italic(f'NDCG @ {now}: {value}'))) for now, value in zip(at, avg_ndcg)]

        all_ndcg[count] /= n_splits
        print(f'\tNDCG @ 100: {all_ndcg[count] :.5f}')

    best = param_grid[np.argmax(all_ndcg)]
    print(f'final:{best}')

    return best


In [18]:

dataloader = DataLoader(dataframe=train_df, batch_size=len(train_raw_df), p_tensors=passage_tensors)
_, (x_raw, y_raw) = [(x, y) for x, y in enumerate(dataloader)][0]
row_df = dataloader.df

best = cross_val(x_raw, y_raw, param_grid)

# {'gamma': 2, 'max_depth': 15, 'n_estimators': 15}

In [None]:
df_raw = pd.read_parquet(train_debug_df)
passage_tensors = load_passages_tensors()
dataloader = DataLoader(dataframe=df_raw, batch_size=len(df_raw), p_tensors=passage_tensors)


In [15]:
_, (x_raw, y_raw) = [(x, y) for x, y in enumerate(dataloader)][0]
row_df = dataloader.df

In [16]:
# del dataloader

_, counts = np.unique(row_df.qid, return_counts=True)

row_df.loc[:, 'q_idx'] = np.arange(0, len(counts)).repeat(counts)



In [20]:
print('start fit')
best = {'gamma': 2, 'max_depth': 15, 'n_estimators': 15}
best_ranker = xgb.XGBRanker(learning_rate=0.1, objective='rank:ndcg', **best)
best_ranker.fit(x_raw, y_raw, qid=row_df.q_idx.values)


start fit


In [23]:
evaluator = init_evaluator(at=[3, 10, 100], x_val_handler=None, prepare_x=False)
evaluator(best_ranker.predict(x_val))

(array([0.007259  , 0.01048165, 0.0143232 ]),
 array([0.00829592, 0.01518676, 0.03929316]))

In [25]:
scores = best_ranker.predict(x_val)

In [104]:
def test(pred_callback, model_name):
    p = torch.load('./data/part1/passage.pth', map_location=map_location)
    q = torch.load('./data/part1/query.pth', map_location=map_location)
    df = pd.read_csv('./data/part1/candidate_passages_top1000.tsv', sep='\t', header=None,
                     names=['qid', 'pid', 'query', 'passage'])

    df = df.sort_values(by=['pid'])[['qid', 'pid']]
    N = len(df)

    queries = torch.zeros((N, 300))
    passages = torch.zeros((N, 300))

    for i, row in df.iterrows():
        queries[i, :] = q[row.qid]
        passages[i, :] = p[row.pid]

    x = torch.stack([queries, passages], dim=2).numpy().reshape(-1, 600)

    df['score'] = pred_callback(x)

    group = df.groupby('qid')

    dflist = []
    for name, dff in group:
        dff = dff.sort_values(by=['score'], ascending=False)

        if len(dff) > 100:
            dff = dff.iloc[:100, :]

        dff['rank'] = np.arange(len(dff)) + 1
        dflist.append(dff)

    result = pd.concat(dflist)
    result['A'] = ['A2'] * len(result)
    result['model'] = [model_name] * len(result)
    result = result.reindex(columns=['qid', 'A', 'pid', 'rank', 'score', 'model'])
    #     result.to_csv(f'./data/part1/{model_name}.txt', sep=' ',header=False,index=False)
    return result

