#####  Lib import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from importlib import reload
import seaborn as sns
%load_ext autoreload
%autoreload 2
%matplotlib inline

from dataprep.dataprep import DataPrep
from rank.ranker import Ranker
from pipeline.pipeline import Pipeline

In [3]:
# Models
import xgboost as xgb
from xgboost import XGBRanker

# Cross-val
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, average_precision_score

# Processing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Feature selection
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectFromModel

##### Data import

In [4]:
test_data = pd.read_csv(r'C:\Users\afragkoulis\PyCharmProjects\sortranking\data\test.csv')
train_data = pd.read_csv(r'C:\Users\afragkoulis\PyCharmProjects\sortranking\data\train.csv')

In [29]:
raw_train_data = train_data.copy()
raw_test_data = test_data.copy()

In [30]:
# from sklearn.preprocessing import Normalizer
# nor = Normalizer(norm='l2', copy=True)
# raw_test_data['srch_visitor_visit_nbr'] = nor.fit_transform(raw_test_data['srch_visitor_visit_nbr'].reset_index())
# # raw_test_data['srch_adults_cnt'].reset_index().shape

In [31]:
train_dataprep = DataPrep(raw_train_data)
test_dataprep = DataPrep(raw_test_data)
df_train = train_dataprep.return_df()
df_test = test_dataprep.return_df().sort_values(['srch_id'])

In [None]:
df_train.to_csv('train_transformed.csv')
df_test.to_csv('test_transformed.csv')
# df_train = pd.read_csv('train_transformed.csv')
# df_test = pd.read_csv('test_transformed.csv').sort_values(['srch_id'])

##### Train

In [33]:
# RandomState
random_state = 1

In [34]:
from sklearn.model_selection import train_test_split
def split_data(df):
    X = df.drop('prop_booking_bool', axis=1).copy()
    y = df['prop_booking_bool'].copy()
    return train_test_split(X, y, test_size=0.2, random_state=random_state)
X_train, X_test, y_train, y_test = split_data(df_train)

In [35]:
# xgtrain = xgb.DMatrix(X_train.values, y_train.values)
# xgtest = xgb.DMatrix(X_test.values)

In [36]:
def train_xgb_model(model, X_train, y_train, X_test, y_test, group, eval_group):
    model.fit(X_train, y_train, group, eval_set=[(X_train, y_train), (X_test, y_test)], eval_group=eval_group)
    y_train_pred = model.predict(X_train)
    res_train = list(model.evals_result['eval_0'].values())[0]
    ndcg_train = round(sum(res_train)/len(res_train), 2)
    print('*'*20 + 'TRAIN' + '*'*20)
    print('train avg precision: %.2f' % (average_precision_score(y_train, y_train_pred, average='weighted')))
    print('train avg ndcg@50: %.2f' % ndcg_train)
    print('*'*20 + 'TEST' + '*'*20)
    y_test_pred = model.predict(X_test)
    res_test = list(model.evals_result['eval_1'].values())[0]
    ndcg_test = round(sum(res_test)/len(res_test), 2)
    print('test avg precision: %.2f' % (average_precision_score(y_test, y_test_pred, average='weighted')))
    print('test avg ndcg@50: %.2f' % ndcg_test)
    return model

In [37]:
# XGBRanker # objective='rank:ndcg'
group = X_train.groupby('srch_id').size().values
eval_group = [X_train.groupby('srch_id').size().values, X_test.groupby('srch_id').size().values]
xgb_model = train_xgb_model(XGBRanker(objective='rank:pairwise', eval_metric='ndcg@50', n_estimators=150, 
                                      learning_rate=0.1, subsample=0.9, random_state=random_state), 
                            X_train, y_train, X_test, y_test, group=group, eval_group=eval_group)

  if getattr(data, 'base', None) is not None and \


********************TRAIN********************
train avg precision: 0.07
train avg ndcg@50: 0.72
********************TEST********************
test avg precision: 0.07
test avg ndcg@50: 0.92


In [38]:
# X_train.xgboost.plot_importance()

In [39]:
# ranker = XGBRanker(n_estimators=150, learning_rate=0.1, subsample=0.9)
# ranker.fit(X_train, y_train, group=X_train.groupby('srch_id').size().values, eval_metric=['ndcg', 'map@5-'])
# y_predict = ranker.predict(X_)

##### Rank & make submission

In [43]:
final_df = df_test.reset_index().copy() 
final_df['rank'] = xgb_model.predict(df_test)

In [44]:
final_df[['srch_id', 'prop_key', 'rank']].head()

Unnamed: 0,srch_id,prop_key,rank
0,-2146499282,401490,0.476926
1,-2146499282,3814956,0.614225
2,-2146499282,426482,0.90762
3,-2146499282,251172,0.926214
4,-2146499282,3255338,0.591575


In [45]:
submission_df = final_df[['srch_id', 'prop_key', 'rank']].sort_values(by=['srch_id', 'rank'], ascending=[False, False])\
                .drop('rank', axis=1).set_index('srch_id')

In [46]:
submission_df.head()

Unnamed: 0_level_0,prop_key
srch_id,Unnamed: 1_level_1
2146811608,263860
2146811608,246294
2146811608,250312
2146811608,582436
2146811608,512551


In [47]:
submission_df.to_csv('submission.csv')