In [11]:
import pandas as pd
import numpy as np
import random
import math
import time

import pyltr
from sklearn.model_selection import train_test_split

In [12]:
# Function to sample queries
def sample_queries(df, frac):
    srch_ids = df["srch_id"].unique()[0:int(len(df)*frac)]
    return df.loc[df.srch_id.isin(srch_ids)]

# Train, validation and test set generated from the train set
_train = pd.read_csv("data/processed/_train_undersampled.csv")
_val = pd.read_csv("data/processed/_val.csv")
_test = pd.read_csv("data/processed/_test.csv")

# The actual test set that will be used to submit our final result
test = pd.read_csv("data/processed/kaggle_test.csv")

# The properties that will be used for submitting the final result
Sprops = test["prop_id"]

# To speed up execution we'll just sample for now
_train = sample_queries(_train, 0.1)
_val = sample_queries(_val, 0.2)
_test = sample_queries(_test, 1)

In [15]:
# Generate price difference feature
#for df in [_train, _val, _test, test]:
    #df["price_usd_diff"] = df["price_usd"] - df["price_usd_mean"]
    #df["prop_starrating_diff"] = df["prop_starrating"] - df["prop_starrating_mean"] # Seems to be a bad feature

In [16]:
# Generate month feature
for df in [_train, _val, _test, test]:
    month = df.date_time.map(lambda d: d.split("-")[1])
    dummies = pd.get_dummies(month, prefix="month")
    df[dummies.columns] = dummies

In [17]:
# Generate target label
for df in [_train, _val, _test]:
    df["relevance"] = df.click_bool + df.booking_bool
    df.relevance = df.relevance.map(lambda r: 5 if r > 1 else r)
    df.drop(["click_bool", "booking_bool"], axis=1, inplace=True)

In [18]:
# Fill missing values (need better methods for each feature)
for df in [_train, _val, _test, test]:
    df.fillna(0, inplace=True)

In [22]:
# Drop training specific data
for df in [_train, _val, _test]:
    df.drop(["gross_bookings_usd", "position"], axis=1, inplace=True)

_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,booking_probability,month_01,month_02,month_03,month_04,month_05,month_06,month_11,month_12,relevance
0,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,56880,4,4.0,...,0.038519,0,0,0,1,0,0,0,0,0
1,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,68914,2,3.0,...,0.02911,0,0,0,1,0,0,0,0,5
2,4,2012-12-31 08:59:22,5,219,0.0,0.0,219,134162,5,4.5,...,0.009585,0,0,0,0,0,0,0,1,0
3,4,2012-12-31 08:59:22,5,219,0.0,0.0,219,139893,2,3.0,...,0.019553,0,0,0,0,0,0,0,1,1
4,6,2013-06-05 12:27:51,14,100,0.0,0.0,100,10759,0,2.0,...,0.0,0,0,0,0,0,1,0,0,0


In [25]:
def split_X_y_qids(df):
    X = df.drop(["srch_id", "relevance"], axis=1)
    y = df.relevance
    qids = df.srch_id
    
    # Remove features we don't want to include
    X = X.drop(["date_time", 
                "visitor_hist_starrating", 
                "visitor_hist_adr_usd", 
                "prop_id", 
                "random_bool"
               ], axis=1)
    
    # Destination stuff doesn't seem to be great, TODO: figure out which ones to keep and which ones to leave in
    dest_cols = [c for c in _train.columns if c.startswith("dest")]
    X = X.drop(dest_cols, axis=1)
    
    return (X, y, qids)
    
TX, Ty, Tqids = split_X_y_qids(_train)
VX, Vy, Vqids = split_X_y_qids(_val)
EX, Ey, Eqids = split_X_y_qids(_test)

# Train model
metric = pyltr.metrics.NDCG(k=38)

monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=100)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=100,
    learning_rate=0.5,
    #max_features=1,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1
)

start = time.time()

model.fit(TX, Ty, Tqids)

print("Time elapsed:", time.time() - start)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.8765       0.2544        7.75m                                         
    2       0.8948       0.0173        7.09m                                         
    3       0.9035       0.0089        6.83m                                         
    4       0.9073       0.0026        6.65m                                         
    5       0.9088       0.0017        6.50m                                         
    6       0.9099       0.0006        6.38m                                         
    7       0.9119       0.0025        6.28m                                         
    8       0.9120       0.0011        6.19m                                         
    9       0.9144       0.0008        6.11m                                         
   10       0.9140       0.0004        6.03m                                         
   15       0.9172       0.0002        5.66m         

In [26]:
# Test model locally
Epred = model.predict(EX)
metric.calc_mean(Eqids, Ey.as_matrix(), Epred)

0.54366469698181297

In [27]:
# Predict final submission order
SX = test.drop("srch_id", axis=1)

# Remove features we don't want to include
SX = SX.drop(["date_time", 
            "visitor_hist_starrating", 
            "visitor_hist_adr_usd", 
            "prop_id", 
            "random_bool"
           ], axis=1)

# Destination stuff doesn't seem to be great, TODO: figure out which ones to keep and which ones to leave in
dest_cols = [c for c in _train.columns if c.startswith("dest")]
SX = SX.drop(dest_cols, axis=1)

Sqids = test["srch_id"]
Spred = model.predict(SX)

In [28]:
# Create submission data frame
result = pd.DataFrame()
result["SearchId"] = Sqids
result["PropertyId"] = Sprops
result["Sort"] = Spred
result = result.sort(["SearchId", "Sort"], ascending=False)
result



Unnamed: 0,SearchId,PropertyId,Sort
6622620,665571,108152,2.268755
6622607,665571,30880,2.139583
6622614,665571,52204,1.224031
6622609,665571,36329,0.865342
6622600,665571,10681,0.372707
6622616,665571,58641,0.363430
6622621,665571,111905,0.333684
6622625,665571,127979,0.221264
6622627,665571,138555,0.124923
6622618,665571,80863,-0.438067


In [29]:
# Write submission to file
result = result.drop("Sort", axis=1)
result.to_csv("submission.csv", index=False)