In [None]:
import pandas as pd

from xgboost import XGBRanker
from lightgbm import LGBMRanker, LGBMRegressor, LGBMClassifier

from sklearn.metrics import ndcg_score

import numpy as np

import optuna

In [None]:
# One empty result container per NDCG cutoff label.
ndcg_dict = {
    'k=5': {},
    'k=10': {},
}

In [None]:
# Trial budget handed to every Optuna study, and the NDCG cutoff used for reporting.
n_trials = 100
k = 10

# Functions

In [None]:
def get_ndcg(model, df, true_score, query_id, k=None):
    """Return the mean per-query NDCG@k of ``model``'s predictions on ``df``.

    Args:
        model: Fitted estimator exposing ``predict(df)``.
        df: Feature matrix, one row per (query, document) pair.
        true_score: Relevance label for each row of ``df``, in row order.
        query_id: Query identifier for each row of ``df``, in row order.
        k: Rank cutoff forwarded to ``ndcg_score``; ``None`` uses every document.

    Returns:
        The mean NDCG over all queries that have more than one document, or
        ``nan`` when no query qualifies.

    Raises:
        ValueError: If the three per-row inputs do not have the same length.
    """
    # Convert everything to plain arrays so rows are paired by position.
    # Feeding pandas Series straight into the DataFrame constructor would align
    # them on their index and could silently mis-pair labels with predictions.
    scores = pd.DataFrame({
        'query_id': np.asarray(query_id),
        'true_score': np.asarray(true_score),
        'predicted_score': np.asarray(model.predict(df)),
    })

    # Single pass over the groups; single-document queries are skipped, as
    # ranking one document carries no information.
    per_query = [
        ndcg_score(
            [docs['true_score'].tolist()],
            [docs['predicted_score'].tolist()],
            k=k,
        )
        for _, docs in scores.groupby('query_id', sort=False)
        if len(docs) > 1
    ]

    if not per_query:
        # Avoid numpy's "mean of empty slice" warning; the result is still nan.
        return np.nan
    return np.mean(per_query)


# Reading Data

In [None]:
def read_data(fold, filename, data_dir='D:/datasets/MSLR-WEB10K'):
    """Load one split of the dataset from ``{data_dir}/{fold}/{filename}.txt``.

    The file is parsed as space-separated text without a header. Column 0 is
    taken as the relevance label and column 1 as the query id; every string
    cell has everything up to its first ``:`` stripped.

    Args:
        fold: Sub-directory of ``data_dir`` to read from (e.g. ``'Fold1'``).
        filename: File name without the ``.txt`` extension.
        data_dir: Root directory of the dataset. Defaults to the location the
            notebook was originally written against.

    Returns:
        A tuple ``(features, y, group_size, query_id)`` where ``features`` is a
        float DataFrame, ``y`` and ``query_id`` are per-row Series, and
        ``group_size`` lists the number of rows per query in the order the
        queries appear in the file.
    """
    raw = pd.read_csv(f'{data_dir}/{fold}/{filename}.txt', delimiter=" ", header=None)

    def strip_prefix(cell):
        # "qid:12" -> "12", "7:0.5" -> "0.5"; non-string cells pass through.
        return cell.split(":", 1)[-1] if isinstance(cell, str) else cell

    # `DataFrame.applymap` is deprecated in newer pandas in favour of
    # `DataFrame.map`; use whichever this pandas version provides.
    elementwise = raw.map if hasattr(pd.DataFrame, "map") else raw.applymap
    raw = elementwise(strip_prefix)

    y = raw[0]
    query_id = raw[1]

    # Rankers read group sizes positionally (first n0 rows are query 0, ...),
    # so sizes must follow file order. The default sort=True would order the
    # string ids lexicographically ("10" before "2") and misalign them.
    # Assumes all rows of a query are contiguous in the file.
    group_size = raw.groupby(1, sort=False).size().to_list()

    features = (
        raw.drop([0, 1], axis=1)        # drop label and query id columns
           .dropna(axis=1, how='all')   # e.g. the empty column from a trailing delimiter
           .astype(float)
    )

    return features, y, group_size, query_id

In [None]:
# Load the Fold1 train / validation / test splits. The training query ids are
# not kept (bound to `_`).
(df_train, y_train, group_size_train, _) = read_data('Fold1', 'train')
(df_vali, y_vali, group_size_vali, query_id_vali) = read_data('Fold1', 'vali')
(df_test, y_test, group_size_test, query_id_test) = read_data('Fold1', 'test')

# Feature columns of the training frame; used to select model inputs.
X_columns = df_train.columns

# LGBMRanker

In [None]:
def objective(trial):
    """Optuna objective: fit an ``LGBMRanker`` and return its validation NDCG@k.

    Hyperparameters are sampled from ``trial``. The model is fitted on the
    training split with early stopping on the validation split, and the value
    returned to Optuna is the validation NDCG, so the test split never steers
    the search. The test NDCG is still computed and stored in the trial's user
    attributes under ``"test_ndcg"`` for reporting; the fitted model is stored
    under ``"best_booster"``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean per-query NDCG at the module-level cutoff ``k`` on the validation
        split.
    """
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 1000, step=100),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        # Upper bound is 0.9: with step=0.1 starting at 0.2, 0.95 is not
        # reachable and only triggers an Optuna range-adjustment warning.
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.9, step=0.1
        ),
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero.
        "bagging_freq": 1,
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.9, step=0.1
        ),
    }

    model = LGBMRanker(**param_grid)
    # NOTE(review): fit-time `early_stopping_rounds` / `verbose` are only
    # accepted by older LightGBM releases - confirm the pinned version before
    # upgrading.
    model.fit(
        df_train[X_columns],
        y_train,
        group=group_size_train,
        eval_group=[group_size_vali],
        eval_set=[(df_vali[X_columns], y_vali)],
        early_stopping_rounds=150,
        verbose=False
    )

    # Select on the validation split; the test split is only measured, never
    # optimised against.
    ndcg = get_ndcg(model=model, df=df_vali, true_score=y_vali, query_id=query_id_vali, k=k)
    test_ndcg = get_ndcg(model=model, df=df_test, true_score=y_test, query_id=query_id_test, k=k)
    print(f"vali NDCG@{k}: {ndcg:.5f} | test NDCG@{k}: {test_ndcg:.5f}")

    trial.set_user_attr(key="test_ndcg", value=float(test_ndcg))
    trial.set_user_attr(key="best_booster", value=model)
    return ndcg

def callback(study, trial):
    """Copy the model of a trial onto the study when that trial is the best so far.

    Args:
        study: Object exposing ``best_trial`` and ``set_user_attr``.
        trial: The trial to compare against the study's best trial; its
            ``user_attrs`` must contain ``"best_booster"`` when it is the best.
    """
    is_current_best = trial.number == study.best_trial.number
    if not is_current_best:
        return

    booster = trial.user_attrs["best_booster"]
    study.set_user_attr(key="best_booster", value=booster)


In [None]:
# Search LGBMRanker hyperparameters, maximising the value returned by
# `objective`; `callback` keeps the study's "best_booster" attribute pointing
# at the best trial's model.
func = lambda trial: objective(trial)

study = optuna.create_study(study_name="LGBM Ranker", direction="maximize")
study.optimize(func, callbacks=[callback], n_trials=n_trials)

In [None]:
# Best objective value of the LGBMRanker study and the parameters behind it.
best_value, best_params = study.best_value, study.best_params
print(best_value, best_params)

# LGBMRegressor

In [None]:
# Baseline: an LGBMRegressor with default settings, fitted on the raw
# relevance labels of the training split.
lgbm_regressor = LGBMRegressor()
lgbm_regressor = lgbm_regressor.fit(df_train[X_columns], y_train)

In [None]:
# Test-split NDCG@k of the default regressor baseline.
baseline_ndcg = get_ndcg(
    model=lgbm_regressor,
    df=df_test,
    true_score=y_test,
    query_id=query_id_test,
    k=k,
)
print(baseline_ndcg)

In [None]:
def objective_regressor(trial):
    """Optuna objective: fit an ``LGBMRegressor`` and return its validation NDCG@k.

    Hyperparameters are sampled from ``trial`` and the model is fitted on the
    training split. The value returned to Optuna is the validation NDCG, so the
    test split never steers the search. The test NDCG is still computed and
    stored in the trial's user attributes under ``"test_ndcg"`` for reporting.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean per-query NDCG at the module-level cutoff ``k`` on the validation
        split.
    """
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 1000, step=100),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        # Upper bound is 0.9: with step=0.1 starting at 0.2, 0.95 is not
        # reachable and only triggers an Optuna range-adjustment warning.
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.9, step=0.1
        ),
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero.
        "bagging_freq": 1,
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.9, step=0.1
        ),
    }

    model = LGBMRegressor(**param_grid)
    # NOTE(review): fit-time `verbose` is only accepted by older LightGBM
    # releases - confirm the pinned version before upgrading.
    model.fit(
        df_train[X_columns],
        y_train,
        verbose=False
    )

    # Select on the validation split; the test split is only measured, never
    # optimised against.
    ndcg = get_ndcg(model=model, df=df_vali, true_score=y_vali, query_id=query_id_vali, k=k)
    test_ndcg = get_ndcg(model=model, df=df_test, true_score=y_test, query_id=query_id_test, k=k)
    print(f"vali NDCG@{k}: {ndcg:.5f} | test NDCG@{k}: {test_ndcg:.5f}")

    trial.set_user_attr(key="test_ndcg", value=float(test_ndcg))
    return ndcg
    

In [None]:
# Search LGBMRegressor hyperparameters, maximising the value returned by
# `objective_regressor`.
func = lambda trial: objective_regressor(trial)

study = optuna.create_study(study_name="LGBM Regressor", direction="maximize")
study.optimize(func, n_trials=n_trials)

In [None]:
# Best objective value of the LGBMRegressor study and the parameters behind it.
best_value, best_params = study.best_value, study.best_params
print(best_value, best_params)

# XGBoost

In [None]:
def objective_xgboost(trial):
    """Optuna objective: fit a pairwise ``XGBRanker`` and return its validation NDCG@k.

    Hyperparameters are sampled from ``trial``. The trial parameter names are
    kept identical to the LightGBM studies (``num_leaves``, ``lambda_l1``,
    ``lambda_l2``, ``min_gain_to_split``) and mapped onto the XGBoost names.
    The model is fitted on the training split with early stopping on the
    validation split. The value returned to Optuna is the validation NDCG, so
    the test split never steers the search. The test NDCG is still computed
    and stored in the trial's user attributes under ``"test_ndcg"``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean per-query NDCG at the module-level cutoff ``k`` on the validation
        split.
    """
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_leaves": trial.suggest_int("num_leaves", 0, 300, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "reg_alpha": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "reg_lambda": trial.suggest_int("lambda_l2", 0, 100, step=5),
        # XGBoost's minimum-split-gain parameter is `gamma`;
        # `min_gain_to_split` is the LightGBM spelling and is not an XGBoost
        # parameter, so the sampled value had no effect on the model.
        "gamma": trial.suggest_float("min_gain_to_split", 0, 15),
    }

    model = XGBRanker(objective="rank:pairwise", **param_grid)
    # NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on
    # the estimator rather than in fit() - confirm against the pinned version.
    model.fit(
        df_train[X_columns],
        y_train,
        group=group_size_train,
        eval_group=[group_size_vali],
        eval_set=[(df_vali[X_columns], y_vali)],
        early_stopping_rounds=150,
        verbose=False
    )

    # Select on the validation split; the test split is only measured, never
    # optimised against.
    ndcg = get_ndcg(model=model, df=df_vali, true_score=y_vali, query_id=query_id_vali, k=k)
    test_ndcg = get_ndcg(model=model, df=df_test, true_score=y_test, query_id=query_id_test, k=k)
    print(f"vali NDCG@{k}: {ndcg:.5f} | test NDCG@{k}: {test_ndcg:.5f}")

    trial.set_user_attr(key="test_ndcg", value=float(test_ndcg))
    return ndcg
    

In [None]:
# Search XGBRanker hyperparameters, maximising the value returned by
# `objective_xgboost`.
func = lambda trial: objective_xgboost(trial)

study = optuna.create_study(study_name="XGBRanker", direction="maximize")
study.optimize(func, n_trials=n_trials)

In [None]:
# Best objective value of the XGBRanker study and the parameters behind it.
best_value, best_params = study.best_value, study.best_params
print(best_value, best_params)

In [None]:
# xgb_ranker = XGBRanker(objective="rank:pairwise")

# xgb_ranker = xgb_ranker.fit(
#     df_train[X_columns],
#     y_train,
#     group=group_size_train,
#     eval_group=[group_size_vali],
#     eval_set=[(df_vali[X_columns], y_vali)],
#     early_stopping_rounds=150
# )

In [None]:
# print(get_ndcg(model=xgb_ranker, df=df_test, true_score=y_test, query_id=query_id_test, k=k))