#### Import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import pyltr

import operator
import pickle
import matplotlib.pyplot as plt

#### Read Data 

In [2]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

# Create an XGBoost-compatible metric from Gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

def gini_lgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', 1-gini_score, False

#### LamdaMART

In [3]:
TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(open('./RankLib/train.csv'))
VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(open('./RankLib/val.csv'))
EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(open('./RankLib/test.csv'))

In [4]:
metric = pyltr.metrics.NDCG(k=20000)

# Only needed if you want to perform validation (early stopping & trimming)
monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=250)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=1000,
    learning_rate=0.02,
    max_features=0.8,
    query_subsample=0.8,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=2,
)

model.fit(TX, Ty, Tqids, monitor=monitor)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 


KeyboardInterrupt: 

In [128]:
# Create a submission file
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = np.zeros_like(id_test)

[Fold 1/5]
[0]	train-gini:0.092283	valid-gini:0.092967
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 170 rounds.
[100]	train-gini:0.23907	valid-gini:0.219212
[200]	train-gini:0.279838	valid-gini:0.239939
[300]	train-gini:0.307116	valid-gini:0.251263
[400]	train-gini:0.329763	valid-gini:0.257677
[500]	train-gini:0.348702	valid-gini:0.260066
[600]	train-gini:0.366232	valid-gini:0.261655
[700]	train-gini:0.382267	valid-gini:0.263327
[800]	train-gini:0.397882	valid-gini:0.263074
[900]	train-gini:0.412033	valid-gini:0.263318
Stopping. Best iteration:
[749]	train-gini:0.389925	valid-gini:0.263849

[Fold 1/5 Prediction:]
0.263849
[Fold 2/5]
[0]	train-gini:0.08912	valid-gini:0.086737
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 170 rounds.
[100]	train-gini:0.238951	valid-gini:0.221573
[200]	train-gini:0.276291	valid-gin

KeyboardInterrupt: 

In [60]:
# Create a submission file
sub.to_csv('./submit/v15_xgb_one_hot_standardized_gamma_15_depth_7_colbytree_07_10_fold_val_2869.csv.gz', 
           index=False, compression='gzip')

In [109]:
val_scores = [0.279973, 0.283588, 0.282138, 0.291781, 0.277837]
np.mean(val_scores)

0.28306339999999997

In [11]:
one_hot_standardized_lambda_100_val_scores = [0.282158, 0.282431, 0.286325, 0.294444, 0.275561]
np.mean(one_hot_standardized_lambda_100_val_scores)
one_hot_standardized_lambda_100_poisson_val_scores = [0.282493, 0.283971, 0.286585, 0.294414, 0.276483]
np.mean(one_hot_standardized_lambda_100_poisson_val_scores)
one_hot_and_original_standardized_gamma_17_depth_8_eta_015_val_scores = [0.282456, 0.280899, 0.284503,
                                                                 0.290474, 0.273345]
np.mean(one_hot_and_original_standardized_gamma_17_depth_8_eta_015_val_scores)
one_hot_standardized_gamma_15_depth_7_colbytree_07_10_fold_val_scores = [0.301454, 0.269646, 0.292486,
                                                                 0.27597, 0.294481, 0.27325, 0.301893,
                                                                        0.281695, 0.2969622, 0.281281]
np.mean(one_hot_standardized_gamma_15_depth_7_colbytree_07_10_fold_val_scores)

0.28691181999999993