In [42]:
import pandas as pd
import numpy as np
import random
import math

import pyltr
from sklearn.model_selection import train_test_split

# Function to sample queries
def sample_queries(df, frac):
    srch_ids = df["srch_id"].unique()[0:int(len(df)*frac)]
    return df.loc[df.srch_id.isin(srch_ids)]

# Train, validation and test set generated from the train set
_train = pd.read_csv("data/processed/_train_undersampled.csv")
_val = pd.read_csv("data/processed/_val.csv")
_test = pd.read_csv("data/processed/_test.csv")

# The actual test set that will be used to submit our final result
test = pd.read_csv("data/processed/kaggle_test.csv")

# The properties that will be used for submitting the final result
Sprops = test["prop_id"]

# To speed up execution we'll just sample for now
_train = sample_queries(_train, 0.1)
_val = sample_queries(_val, 0.2)
_test = sample_queries(_test, 1)

In [43]:
_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,price_usd_mean,price_usd_median,prop_starrating_mean,prop_starrating_median,prop_review_score_mean,prop_review_score_median
0,1,2013-04-04 08:32:15,12,187,,,219,50984,2,0.0,...,,0,,0,163.718929,139.05,3.071429,3.0,3.482143,3.5
1,1,2013-04-04 08:32:15,12,187,,,219,68914,2,3.0,...,11.0,1,114.29,1,163.718929,139.05,3.071429,3.0,3.482143,3.5
2,4,2012-12-31 08:59:22,5,219,,,219,64344,4,3.0,...,,0,,0,265.5625,263.0,3.09375,3.5,3.90625,4.0
3,4,2012-12-31 08:59:22,5,219,,,219,139893,2,3.0,...,,1,,0,265.5625,263.0,3.09375,3.5,3.90625,4.0
4,6,2013-06-05 12:27:51,14,100,,,100,10759,0,2.0,...,,0,,0,125.47,115.03,1.4,2.0,3.1,4.0


In [44]:
_train.shape

(77500, 60)

In [45]:
len(_train.loc[_train.booking_bool == 1]) / len(_train) * 100

31.36258064516129

In [46]:
len(_train.loc[(_train.click_bool == 1) & (_train.booking_bool == 0)]) / len(_train) * 100

19.13290322580645

In [47]:
len(_train.loc[_train.click_bool == 0]) / len(_train) * 100

49.504516129032254

In [48]:
# Generate target label
for df in [_train, _val, _test]:
    df["relevance"] = df.click_bool + df.booking_bool
    df.relevance = df.relevance.map(lambda r: 5 if r > 1 else r)

In [49]:
# Generate price difference feature
for df in [_train, _val, _test, test]:
    df["price_usd_diff"] = df["price_usd"] - df["price_usd_mean"]
    df["prop_starrating_diff"] = df["prop_starrating"] - df["prop_starrating_mean"] # Seems to be a bad feature

In [50]:
# Fill missing values (need better methods for each feature)
for df in [_train, _val, _test, test]:
    df.fillna(-1, inplace=True)

In [51]:
# Select properties we care about (kind of arbitrary right now)
features = ["srch_id", "site_id", "prop_country_id", "prop_starrating", "prop_review_score", "prop_brand_bool",
            "prop_location_score1", "prop_location_score2", "price_usd", "promotion_flag", "srch_destination_id",
            "srch_length_of_stay", "srch_adults_count", "srch_children_count", "srch_room_count",
            "prop_starrating_median", "prop_starrating_mean"
            "price_usd_median", "price_usd_mean", "price_usd_diff",
            "relevance"]

_train = _train[features]
_val = _val[features]
_test = _test[features]
test = test[features[0:-1]]

_train.head()

Unnamed: 0,srch_id,site_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,price_usd,promotion_flag,...,srch_adults_count,srch_children_count,srch_room_count,prop_starrating_median,prop_starrating_mean,prop_starrating_diff,price_usd_median,price_usd_mean,price_usd_diff,relevance
0,1,12,219,2,0.0,0,1.61,-1.0,85.37,0,...,4,0,1,3.0,3.071429,-1.071429,139.05,163.718929,-78.348929,0
1,1,12,219,2,3.0,1,2.2,0.0206,100.89,0,...,4,0,1,3.0,3.071429,-1.071429,139.05,163.718929,-62.828929,5
2,4,5,219,4,3.0,1,1.79,-1.0,267.0,0,...,1,0,1,3.5,3.09375,0.90625,263.0,265.5625,1.4375,0
3,4,5,219,2,3.0,1,1.61,0.0309,129.0,0,...,1,0,1,3.5,3.09375,-1.09375,263.0,265.5625,-136.5625,1
4,6,14,100,0,2.0,0,1.95,-1.0,97.63,0,...,2,0,1,2.0,1.4,-1.4,115.03,125.47,-27.84,0


In [52]:
def split_X_y_qids(df):
    X = df.drop(["srch_id", "relevance"], axis=1)
    y = df.relevance
    qids = df.srch_id
    return (X, y, qids)
    
TX, Ty, Tqids = split_X_y_qids(_train)
VX, Vy, Vqids = split_X_y_qids(_val)
EX, Ey, Eqids = split_X_y_qids(_test)

# Train model
metric = pyltr.metrics.NDCG(k=38)

monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=100)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=100,
    learning_rate=0.1,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1
)
model.fit(TX, Ty, Tqids, monitor=monitor)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.8150       0.1889       15.91m      C:      0.3466 B:      0.3466 S:  0
    2       0.8397       0.0265       13.16m      C:      0.3853 B:      0.3853 S:  0
    3       0.8583       0.0169       12.16m      C:      0.4149 B:      0.4149 S:  0
    4       0.8668       0.0089       11.60m      C:      0.4313 B:      0.4313 S:  0
    5       0.8707       0.0041       11.26m      C:      0.4427 B:      0.4427 S:  0
    6       0.8717       0.0002       10.99m      C:      0.4438 B:      0.4438 S:  0
    7       0.8735       0.0015       10.76m      C:      0.4465 B:      0.4465 S:  0
    8       0.8757       0.0004       10.54m      C:      0.4496 B:      0.4496 S:  0
    9       0.8757       0.0019       10.36m      C:      0.4543 B:      0.4543 S:  0
   10       0.8776       0.0015       10.18m      C:      0.4562 B:      0.4562 S:  0
   15       0.8829       0.0008        9.46m      C: 

<pyltr.models.lambdamart.LambdaMART at 0x1ee27a76358>

In [53]:
# Test model locally
Epred = model.predict(EX)
metric.calc_mean(Eqids, Ey.as_matrix(), Epred)

0.49064825747265112

In [54]:
# Predict final submission order
SX = test.drop("srch_id", axis=1)
Sqids = test["srch_id"]
Spred = model.predict(SX)

In [55]:
# Create submission data frame
result = pd.DataFrame()
result["SearchId"] = Sqids
result["PropertyId"] = Sprops
result["Sort"] = Spred
result = result.sort(["SearchId", "Sort"], ascending=False)
result



Unnamed: 0,SearchId,PropertyId,Sort
6622620,665571,108152,1.368215
6622614,665571,52204,0.929147
6622621,665571,111905,0.873768
6622607,665571,30880,0.705605
6622600,665571,10681,0.434237
6622625,665571,127979,0.413386
6622611,665571,42127,0.412679
6622616,665571,58641,0.161567
6622598,665571,2312,0.099100
6622604,665571,27221,-0.068251


In [56]:
# Write submission to file
result = result.drop("Sort", axis=1)
result.to_csv("submission.csv", index=False)