In [90]:
import pandas as pd
import numpy as np
import random
import math
import time

import pyltr
from sklearn.model_selection import train_test_split

In [91]:
# Function to sample queries
def sample_queries(df, frac):
    srch_ids = df["srch_id"].unique()[0:int(len(df)*frac)]
    return df.loc[df.srch_id.isin(srch_ids)]

# Train, validation and test set generated from the train set
_train = pd.read_csv("data/processed/_train_undersampled.csv")
_val = pd.read_csv("data/processed/_val.csv")
_test = pd.read_csv("data/processed/_test.csv")

# The actual test set that will be used to submit our final result
test = pd.read_csv("data/processed/kaggle_test.csv")

# The properties that will be used for submitting the final result
Sprops = test["prop_id"]

# To speed up execution we'll just sample for now
_train = sample_queries(_train, 0.1)
_val = sample_queries(_val, 0.2)
_test = sample_queries(_test, 1)

In [92]:
_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,price_usd_mean,price_usd_median,prop_starrating_mean,prop_starrating_median,prop_review_score_mean,prop_review_score_median
0,1,2013-04-04 08:32:15,12,187,,,219,56880,4,4.0,...,,0,,0,163.718929,139.05,3.071429,3.0,3.482143,3.5
1,1,2013-04-04 08:32:15,12,187,,,219,68914,2,3.0,...,11.0,1,114.29,1,163.718929,139.05,3.071429,3.0,3.482143,3.5
2,4,2012-12-31 08:59:22,5,219,,,219,109185,4,4.5,...,,0,,0,265.5625,263.0,3.09375,3.5,3.90625,4.0
3,4,2012-12-31 08:59:22,5,219,,,219,139893,2,3.0,...,,1,,0,265.5625,263.0,3.09375,3.5,3.90625,4.0
4,6,2013-06-05 12:27:51,14,100,,,100,52376,2,0.0,...,,0,,0,125.47,115.03,1.4,2.0,3.1,4.0


In [93]:
_train.columns

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate'

In [94]:
len(_train.loc[_train.booking_bool == 1]) / len(_train) * 100

31.36258064516129

In [95]:
len(_train.loc[(_train.click_bool == 1) & (_train.booking_bool == 0)]) / len(_train) * 100

19.13290322580645

In [96]:
len(_train.loc[_train.click_bool == 0]) / len(_train) * 100

49.504516129032254

In [97]:
# Generate target label
for df in [_train, _val, _test]:
    df["relevance"] = df.click_bool + df.booking_bool
    df.relevance = df.relevance.map(lambda r: 5 if r > 1 else r)
    df.drop(["click_bool", "booking_bool"], axis=1, inplace=True)

In [99]:
# Generate price difference feature
#for df in [_train, _val, _test, test]:
    #df["price_usd_diff"] = df["price_usd"] - df["price_usd_mean"]
    #df["prop_starrating_diff"] = df["prop_starrating"] - df["prop_starrating_mean"] # Seems to be a bad feature

In [100]:
# Generate month feature
#for df in [_train, _val, _test, test]:
    #df["month"] = 

In [101]:
# Fill missing values (need better methods for each feature)
for df in [_train, _val, _test, test]:
    df.fillna(0, inplace=True)

In [102]:
# Drop training specific data
for df in [_train, _val, _test]:
    df.drop(["gross_bookings_usd", "position"], axis=1, inplace=True)
    
# Drop other features we don't care about
for df in [_train, _val, _test, test]:
    df.drop(["date_time", "visitor_hist_starrating", "visitor_hist_adr_usd", "prop_id", "random_bool"], axis=1, inplace=True)

_train.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,...,comp8_rate,comp8_inv,comp8_rate_percent_diff,price_usd_mean,price_usd_median,prop_starrating_mean,prop_starrating_median,prop_review_score_mean,prop_review_score_median,relevance
0,1,12,187,219,4,4.0,1,2.83,0.1028,5.15,...,0.0,0.0,0.0,163.718929,139.05,3.071429,3.0,3.482143,3.5,0
1,1,12,187,219,2,3.0,1,2.2,0.0206,4.44,...,0.0,0.0,11.0,163.718929,139.05,3.071429,3.0,3.482143,3.5,5
2,4,5,219,219,4,4.5,1,3.18,0.3138,6.08,...,0.0,0.0,0.0,265.5625,263.0,3.09375,3.5,3.90625,4.0,0
3,4,5,219,219,2,3.0,1,1.61,0.0309,5.25,...,0.0,0.0,0.0,265.5625,263.0,3.09375,3.5,3.90625,4.0,1
4,6,14,100,100,2,0.0,1,1.95,0.0,0.0,...,0.0,0.0,0.0,125.47,115.03,1.4,2.0,3.1,4.0,0


In [103]:
def split_X_y_qids(df):
    X = df.drop(["srch_id", "relevance"], axis=1)
    y = df.relevance
    qids = df.srch_id
    return (X, y, qids)
    
TX, Ty, Tqids = split_X_y_qids(_train)
VX, Vy, Vqids = split_X_y_qids(_val)
EX, Ey, Eqids = split_X_y_qids(_test)

# Train model
metric = pyltr.metrics.NDCG(k=38)

monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=100)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=100,
    learning_rate=0.1,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1
)

start = time.time()

model.fit(TX, Ty, Tqids)

print("Time elapsed:", time.time() - start)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.8038       0.1800        7.47m                                         
    2       0.8298       0.0254        6.63m                                         
    3       0.8424       0.0126        6.33m                                         
    4       0.8624       0.0204        6.24m                                         
    5       0.8655       0.0026        6.05m                                         
    6       0.8701       0.0032        5.88m                                         
    7       0.8713       0.0020        5.72m                                         
    8       0.8731       0.0009        5.59m                                         
    9       0.8736       0.0019        5.48m                                         
   10       0.8776       0.0023        5.40m                                         
   15       0.8793       0.0001        5.04m         

In [104]:
# Test model locally
Epred = model.predict(EX)
metric.calc_mean(Eqids, Ey.as_matrix(), Epred)

0.49620279854038296

In [105]:
# Predict final submission order
SX = test.drop("srch_id", axis=1)
Sqids = test["srch_id"]
Spred = model.predict(SX)

In [106]:
# Create submission data frame
result = pd.DataFrame()
result["SearchId"] = Sqids
result["PropertyId"] = Sprops
result["Sort"] = Spred
result = result.sort(["SearchId", "Sort"], ascending=False)
result



Unnamed: 0,SearchId,PropertyId,Sort
6622614,665571,52204,0.884823
6622620,665571,108152,0.823453
6622607,665571,30880,0.766024
6622621,665571,111905,0.638042
6622611,665571,42127,0.325211
6622600,665571,10681,0.139916
6622609,665571,36329,0.093268
6622616,665571,58641,0.059106
6622598,665571,2312,0.040450
6622625,665571,127979,-0.011291


In [107]:
# Write submission to file
result = result.drop("Sort", axis=1)
result.to_csv("submission.csv", index=False)