In [2]:
import pandas as pd
import numpy as np
import random
import math

import pyltr
from sklearn.model_selection import train_test_split

# Function to sample queries
def sample_queries(df, frac):
    srch_ids = df["srch_id"].unique()[0:int(len(df)*frac)]
    return df.loc[df.srch_id.isin(srch_ids)]

# Train, validation and test set generated from the train set
_train = pd.read_csv("data/processed/_train_undersampled.csv")
_val = pd.read_csv("data/processed/_val.csv")
_test = pd.read_csv("data/processed/_test.csv")

# The actual test set that will be used to submit our final result
test = pd.read_csv("data/processed/kaggle_test.csv")

# The properties that will be used for submitting the final result
Sprops = test["prop_id"]

# To speed up execution we'll just sample for now
_train = sample_queries(_train, 0.1)
_val = sample_queries(_val, 0.2)
_test = sample_queries(_test, 1)

In [3]:
_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,price_usd_mean,price_usd_median,prop_starrating_mean,prop_starrating_median,prop_review_score_mean,prop_review_score_median
0,1,2013-04-04 08:32:15,12,187,,,219,68914,2,3.0,...,11.0,1,114.29,1,163.718929,139.05,3.071429,3.0,3.482143,3.5
1,1,2013-04-04 08:32:15,12,187,,,219,88218,4,3.5,...,,0,,0,163.718929,139.05,3.071429,3.0,3.482143,3.5
2,4,2012-12-31 08:59:22,5,219,,,219,65984,2,3.5,...,,0,,0,265.5625,263.0,3.09375,3.5,3.90625,4.0
3,4,2012-12-31 08:59:22,5,219,,,219,139893,2,3.0,...,,1,,0,265.5625,263.0,3.09375,3.5,3.90625,4.0
4,6,2013-06-05 12:27:51,14,100,,,100,22135,0,5.0,...,,0,,0,125.47,115.03,1.4,2.0,3.1,4.0


In [4]:
_train.shape

(77500, 60)

In [5]:
len(_train.loc[_train.booking_bool == 1]) / len(_train) * 100

31.36258064516129

In [6]:
len(_train.loc[(_train.click_bool == 1) & (_train.booking_bool == 0)]) / len(_train) * 100

19.13290322580645

In [7]:
len(_train.loc[_train.click_bool == 0]) / len(_train) * 100

49.504516129032254

In [8]:
# Generate target label
for df in [_train, _val, _test]:
    df["relevance"] = df.click_bool + df.booking_bool
    df.relevance = df.relevance.map(lambda r: 5 if r > 1 else r)

In [9]:
# Generate price difference feature
for df in [_train, _val, _test, test]:
    df["price_usd_diff"] = df["price_usd"] - df["price_usd_mean"]
    df["prop_starrating_diff"] = df["prop_starrating"] - df["prop_starrating_mean"] # Seems to be a bad feature

In [10]:
# Fill missing values (need better methods for each feature)
for df in [_train, _val, _test, test]:
    df.fillna(-1, inplace=True)

In [12]:
# Select properties we care about (kind of arbitrary right now)
features = ["srch_id", "site_id", "prop_country_id", "prop_starrating", "prop_review_score", "prop_brand_bool",
            "prop_location_score1", "prop_location_score2", "price_usd", "promotion_flag", "srch_destination_id",
            "srch_length_of_stay", "srch_adults_count", "srch_children_count", "srch_room_count",
            "prop_starrating_median", "prop_starrating_mean",
            "price_usd_median", "price_usd_mean", "price_usd_diff",
            "relevance"]

_train = _train[features]
_val = _val[features]
_test = _test[features]
test = test[features[0:-1]]

_train.head()

Unnamed: 0,srch_id,site_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,price_usd,promotion_flag,...,srch_length_of_stay,srch_adults_count,srch_children_count,srch_room_count,prop_starrating_median,prop_starrating_mean,price_usd_median,price_usd_mean,price_usd_diff,relevance
0,1,12,219,2,3.0,1,2.2,0.0206,100.89,0,...,1,4,0,1,3.0,3.071429,139.05,163.718929,-62.828929,5
1,1,12,219,4,3.5,1,2.77,0.1266,115.12,0,...,1,4,0,1,3.0,3.071429,139.05,163.718929,-48.598929,0
2,4,5,219,2,3.5,0,3.09,0.223,194.0,0,...,2,1,0,1,3.5,3.09375,263.0,265.5625,-71.5625,0
3,4,5,219,2,3.0,1,1.61,0.0309,129.0,0,...,2,1,0,1,3.5,3.09375,263.0,265.5625,-136.5625,1
4,6,14,100,0,5.0,0,1.95,-1.0,115.03,0,...,1,2,0,1,2.0,1.4,115.03,125.47,-10.44,0


In [19]:
def split_X_y_qids(df):
    X = df.drop(["srch_id", "relevance"], axis=1)
    y = df.relevance
    qids = df.srch_id
    return (X, y, qids)
    
TX, Ty, Tqids = split_X_y_qids(_train)
VX, Vy, Vqids = split_X_y_qids(_val)
EX, Ey, Eqids = split_X_y_qids(_test)

# Train model
metric = pyltr.metrics.NDCG(k=38)

monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=100)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=100,
    learning_rate=0.1,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1
)
model.fit(TX, Ty, Tqids)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.8074       0.1843        6.78m                                         
    2       0.8464       0.0386        7.13m                                         
    3       0.8559       0.0097        6.60m                                         
    4       0.8624       0.0063        6.24m                                         
    5       0.8658       0.0045        5.99m                                         
    6       0.8673       0.0009        5.80m                                         
    7       0.8694       0.0032        5.63m                                         
    8       0.8723       0.0014        5.48m                                         
    9       0.8744       0.0017        5.35m                                         
   10       0.8726       0.0012        5.23m                                         
   15       0.8784       0.0001        4.77m         

<pyltr.models.lambdamart.LambdaMART at 0x15f4a67f0>

In [20]:
# Test model locally
Epred = model.predict(EX)
metric.calc_mean(Eqids, Ey.as_matrix(), Epred)

0.49138120663435486

In [15]:
# Predict final submission order
SX = test.drop("srch_id", axis=1)
Sqids = test["srch_id"]
Spred = model.predict(SX)

In [16]:
# Create submission data frame
result = pd.DataFrame()
result["SearchId"] = Sqids
result["PropertyId"] = Sprops
result["Sort"] = Spred
result = result.sort(["SearchId", "Sort"], ascending=False)
result



Unnamed: 0,SearchId,PropertyId,Sort
6622614,665571,52204,0.910422
6622620,665571,108152,0.901392
6622607,665571,30880,0.678440
6622621,665571,111905,0.661322
6622616,665571,58641,0.411309
6622611,665571,42127,0.379879
6622625,665571,127979,0.306869
6622598,665571,2312,0.226118
6622600,665571,10681,-0.006066
6622609,665571,36329,-0.081917


In [17]:
# Write submission to file
result = result.drop("Sort", axis=1)
result.to_csv("submission.csv", index=False)

NameError: name 'train' is not defined