In [26]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle 
import gc
import os
import time

# import the LabelEncoder from sklearn
from sklearn.preprocessing import LabelEncoder

In [27]:
# load data
data_train = pd.read_csv('data/train_cleaned.csv') # cleaned data gebruiken

# print search id column
print(data_train['srch_id'])

0               1.0
1               1.0
2               1.0
3               1.0
4               1.0
             ...   
4958342    332785.0
4958343    332785.0
4958344    332785.0
4958345    332785.0
4958346    332785.0
Name: srch_id, Length: 4958347, dtype: float64


In [28]:
# ceate target variable that combines booking bool and click bool 
# if booking_bool = 1, target = 5
# if click_bool = 1, target = 1
# else target = 0

data_train['target'] = np.where(data_train['click_bool'] == 1, 1, 0)
data_train['target'] = np.where(data_train['booking_bool'] == 1, 5, data_train['target'])	
data_train = data_train.drop(['click_bool', 'booking_bool'], axis=1)

print(data_train['srch_id'])

0               1.0
1               1.0
2               1.0
3               1.0
4               1.0
             ...   
4958342    332785.0
4958343    332785.0
4958344    332785.0
4958345    332785.0
4958346    332785.0
Name: srch_id, Length: 4958347, dtype: float64


In [29]:
# define categorical features
categorical_features = ['date_time', 'site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id', 'prop_brand_bool', 'promotion_flag', 'srch_destination_id', 'srch_saturday_night_bool', 'random_bool']
categorical_index = [data_train.columns.get_loc(col) for col in categorical_features]

labelencoder = LabelEncoder()

for col in categorical_features:
    data_train[col] = labelencoder.fit_transform(data_train[col])

for col in categorical_features:
    data_train[col] = data_train[col].astype('int')

In [30]:
# print all columns and data types
print(data_train.dtypes)

srch_id                        float64
date_time                        int32
site_id                          int32
visitor_location_country_id      int32
prop_country_id                  int32
prop_id                          int32
prop_starrating                float64
prop_review_score              float64
prop_brand_bool                  int32
prop_log_historical_price      float64
position                       float64
price_usd                      float64
promotion_flag                   int32
srch_destination_id              int32
srch_length_of_stay            float64
srch_booking_window            float64
srch_adults_count              float64
srch_children_count            float64
srch_room_count                float64
srch_saturday_night_bool         int32
random_bool                      int32
prop_location_score            float64
comb_rate                      float64
comb_inv                       float64
target                           int32
dtype: object


In [31]:
#data_test = pd.read_csv('data/test.csv')

# only use 10% of the data
# data_train = data_train.sample(frac=0.1, random_state=42)
# data_test = data_train.sample(frac=0.01, random_state=42)
#data_test = data_test.sample(frac=0.1, random_state=42)

In [32]:
# show first 5 rows
data_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_log_historical_price,...,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,prop_location_score,comb_rate,comb_inv,target
0,1.0,0,11,168,162,824,3.0,3.5,1,4.95,...,0.0,4.0,0.0,1.0,1,1,1.4369,0.0,0.0,0
1,1.0,0,11,168,162,9596,4.0,4.0,1,5.03,...,0.0,4.0,0.0,1.0,1,1,1.10745,0.0,1.0,0
2,1.0,0,11,168,162,19602,3.0,4.5,1,4.92,...,0.0,4.0,0.0,1.0,1,1,1.11225,0.0,0.0,0
3,1.0,0,11,168,162,25136,2.0,4.0,1,4.39,...,0.0,4.0,0.0,1.0,1,1,1.42125,-3.0,1.0,0
4,1.0,0,11,168,162,27221,4.0,3.5,1,4.93,...,0.0,4.0,0.0,1.0,1,1,1.38205,0.0,0.0,0


In [33]:
# split train data into train and test data
temp = data_train.sample(frac=0.8, random_state=42)
data_test = data_train.drop(temp.index)
data_train = temp

data_test.head()	

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_log_historical_price,...,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,prop_location_score,comb_rate,comb_inv,target
4,1.0,0,11,168,162,27221,4.0,3.5,1,4.93,...,0.0,4.0,0.0,1.0,1,1,1.38205,0.0,0.0,0
10,1.0,0,11,168,162,54381,3.0,3.964591,1,5.08,...,0.0,4.0,0.0,1.0,1,1,2.2,1.0,0.0,0
16,1.0,0,11,168,162,80796,4.0,4.0,1,5.18,...,0.0,4.0,0.0,1.0,1,1,1.503,1.0,0.0,0
17,1.0,0,11,168,162,80826,3.0,3.0,1,4.8,...,0.0,4.0,0.0,1.0,1,1,0.6969,0.0,0.0,0
21,1.0,0,11,168,162,87388,4.0,3.5,1,4.93,...,0.0,4.0,0.0,1.0,1,1,1.25745,-1.0,1.0,0


In [34]:
# prnt what columns are missing in test data compared to train data
print('Columns missing in test data compared to train data: ', set(data_train.columns) - set(data_test.columns))

Columns missing in test data compared to train data:  set()


In [35]:
# print size of the data
print('Size of training data: ' + str(data_train.shape))
print('Size of testing data: ' + str(data_test.shape))

Size of training data: (3966678, 25)
Size of testing data: (991669, 25)


In [36]:
# drop columns with missing values
data_train = data_train.dropna(axis=1)
data_test = data_test.dropna(axis=1)

In [37]:
# sort on search id and print
data_train = data_train.sort_values(by=['srch_id'])
data_test = data_test.sort_values(by=['srch_id'])
data_train.head()
data_test.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_log_historical_price,...,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,prop_location_score,comb_rate,comb_inv,target
4,1.0,0,11,168,162,27221,4.0,3.5,1,4.93,...,0.0,4.0,0.0,1.0,1,1,1.38205,0.0,0.0,0
10,1.0,0,11,168,162,54381,3.0,3.964591,1,5.08,...,0.0,4.0,0.0,1.0,1,1,2.2,1.0,0.0,0
16,1.0,0,11,168,162,80796,4.0,4.0,1,5.18,...,0.0,4.0,0.0,1.0,1,1,1.503,1.0,0.0,0
17,1.0,0,11,168,162,80826,3.0,3.0,1,4.8,...,0.0,4.0,0.0,1.0,1,1,0.6969,0.0,0.0,0
21,1.0,0,11,168,162,87388,4.0,3.5,1,4.93,...,0.0,4.0,0.0,1.0,1,1,1.25745,-1.0,1.0,0


In [38]:
# define groups
groups = data_train.groupby('srch_id').size().values                                                    # TODO: this does not work
lgb_train = lgb.Dataset(data_train.drop(['target'], axis=1), data_train['target'], group=groups)

In [45]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [5, 10, 20],
    'max_bin': 255,
    'num_leaves': 31,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'is_unbalance': True,
    
}
gbm = lgb.train(params, lgb_train, num_boost_round=100)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2266
[LightGBM] [Info] Number of data points in the train set: 3966678, number of used features: 24


In [46]:
# print columns in train data
print('Columns in train data: ', data_train.columns)

Columns in train data:  Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'prop_country_id', 'prop_id', 'prop_starrating', 'prop_review_score',
       'prop_brand_bool', 'prop_log_historical_price', 'position', 'price_usd',
       'promotion_flag', 'srch_destination_id', 'srch_length_of_stay',
       'srch_booking_window', 'srch_adults_count', 'srch_children_count',
       'srch_room_count', 'srch_saturday_night_bool', 'random_bool',
       'prop_location_score', 'comb_rate', 'comb_inv', 'target'],
      dtype='object')


In [47]:
# print columns in test data
print('Columns in test data: ', data_test.columns)

Columns in test data:  Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'prop_country_id', 'prop_id', 'prop_starrating', 'prop_review_score',
       'prop_brand_bool', 'prop_log_historical_price', 'position', 'price_usd',
       'promotion_flag', 'srch_destination_id', 'srch_length_of_stay',
       'srch_booking_window', 'srch_adults_count', 'srch_children_count',
       'srch_room_count', 'srch_saturday_night_bool', 'random_bool',
       'prop_location_score', 'comb_rate', 'comb_inv', 'target'],
      dtype='object')


In [48]:
# make predictions
y_pred = gbm.predict(data_test.drop(['target'], axis=1))

In [49]:
# rank predictions per search id
data_test['pred'] = y_pred

# sort predictions by search id and prediction
data_test = data_test.sort_values(['srch_id', 'pred'], ascending = [True, False])

In [50]:
df = data_test[['srch_id', 'prop_id', 'target', 'pred']]
if not os.path.exists('results'):
    os.makedirs('results')
df.to_csv('results/submission.csv', index=False)

In [52]:
# evaluate predictions
from sklearn.metrics import ndcg_score
print('NDCG score: ', ndcg_score(data_test['target'].values.reshape(1, -1), data_test['pred'].values.reshape(1, -1), k=5))

NDCG score:  0.834790051318404


In [None]:
# dealing with under