In [35]:
import pandas as pd
import numpy as np
import random
import math

import pyltr
from sklearn.model_selection import train_test_split

# Function to sample queries
def sample_queries(df, frac):
    srch_ids = df["srch_id"].unique()[0:int(len(df)*frac)]
    return df.loc[df.srch_id.isin(srch_ids)]

# Train, validation and test set generated from the train set
_train = pd.read_csv("data/processed/_train_undersampled.csv")
_val = pd.read_csv("data/processed/_val.csv")
_test = pd.read_csv("data/processed/_test.csv")

# The actual test set that will be used to submit our final result
test = pd.read_csv("data/processed/kaggle_test.csv")

# The properties that will be used for submitting the final result
Sprops = test["prop_id"]

# To speed up execution we'll just sample for now
_train = sample_queries(_train, 0.1)
_val = sample_queries(_val, 0.2)
_test = sample_queries(_test, 1)

In [36]:
# Generate month feature
for df in [_train, _val, _test, test]:
    month = df.date_time.map(lambda d: d.split("-")[1])
    dummies = pd.get_dummies(month, prefix="month")
    df[dummies.columns] = dummies

In [37]:
# Generate target label
for df in [_train, _val, _test]:
    df["relevance"] = df.click_bool + df.booking_bool
    df.relevance = df.relevance.map(lambda r: 5 if r > 1 else r)
    df.drop(["click_bool", "booking_bool"], axis=1, inplace=True)

In [38]:
# Fill missing values (need better methods for each feature)
for df in [_train, _val, _test, test]:
    df.fillna(0, inplace=True)

In [39]:
# Drop training specific data
for df in [_train, _val, _test]:
    df.drop(["gross_bookings_usd", "position"], axis=1, inplace=True)

_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,booking_probability,month_01,month_02,month_03,month_04,month_05,month_06,month_11,month_12,relevance
0,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,68914,2,3.0,...,0.02911,0,0,0,1,0,0,0,0,5
1,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,97247,2,3.5,...,0.011321,0,0,0,1,0,0,0,0,0
2,4,2012-12-31 08:59:22,5,219,0.0,0.0,219,58696,0,4.5,...,0.005952,0,0,0,0,0,0,0,1,0
3,4,2012-12-31 08:59:22,5,219,0.0,0.0,219,139893,2,3.0,...,0.019553,0,0,0,0,0,0,0,1,1
4,6,2013-06-05 12:27:51,14,100,0.0,0.0,100,104251,3,4.0,...,0.045455,0,0,0,0,0,1,0,0,5


In [40]:
def _comp_rate_sum(df):
## add feature: comp_rate_sum
    for i in range(1,9):
        df['comp'+str(i)+'_rate'].fillna(0, inplace=True)
        df['comp_rate_sum'] = df['comp1_rate']
    for i in range(2,9):
        df['comp_rate_sum'] += df['comp'+str(i)+'_rate']

## add feature: comp_rate_sum
    for i in range(1,9):
        df['comp'+str(i)+'_inv'].fillna(0, inplace=True)
        df['comp'+str(i)+'_inv'][df['comp'+str(i)+'_inv']==1] = 10
        df['comp'+str(i)+'_inv'][df['comp'+str(i)+'_inv']==-1] = 1
        df['comp'+str(i)+'_inv'][df['comp'+str(i)+'_inv']==0] = -1
        df['comp'+str(i)+'_inv'][df['comp'+str(i)+'_inv']==10] = 0
        df['comp_inv_sum'] = df['comp1_inv']
    for i in range(2,9):
        df['comp_inv_sum'] += df['comp'+str(i)+'_inv']

In [41]:
_comp_rate_sum(_train)
_comp_rate_sum(_test)
_comp_rate_sum(_val)
_comp_rate_sum(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [42]:
_train.columns

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv

In [49]:
import time
def split_X_y_qids(df):
    X = df.drop(["srch_id", "relevance"], axis=1)
    y = df.relevance
    qids = df.srch_id
    
    # Remove features we don't want to include
    X = X.drop(["date_time", 
                "visitor_hist_starrating", 
                "visitor_hist_adr_usd", 
                "prop_id", 
                "random_bool",
                "comp1_rate_percent_diff","comp1_rate","comp1_inv",
                "comp2_rate_percent_diff","comp2_rate","comp2_inv",
                "comp3_rate_percent_diff","comp3_rate","comp3_inv",
                "comp4_rate_percent_diff","comp4_rate","comp4_inv",
                "comp5_rate_percent_diff","comp5_rate","comp5_inv",
                "comp6_rate_percent_diff","comp6_rate","comp6_inv",
                "comp7_rate_percent_diff","comp7_rate","comp7_inv",
                "comp8_rate_percent_diff","comp8_rate","comp8_inv",
               ], axis=1)
    
    # Destination stuff doesn't seem to be great, TODO: figure out which ones to keep and which ones to leave in
    dest_cols = [c for c in _train.columns if c.startswith("dest")]
    X = X.drop(dest_cols, axis=1)
    
    return (X, y, qids)
    
TX, Ty, Tqids = split_X_y_qids(_train)
VX, Vy, Vqids = split_X_y_qids(_val)
EX, Ey, Eqids = split_X_y_qids(_test)

# Train model
metric = pyltr.metrics.NDCG(k=38)

monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=100)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=500,
    learning_rate=0.5,
    #max_features=1,
    query_subsample=0.6,
    max_leaf_nodes=10,
    min_samples_leaf=1000,
    verbose=1
)

start = time.time()

model.fit(TX, Ty, Tqids)

print("Time elapsed:", time.time() - start)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.8756       0.2516       89.67m                                         
    2       0.8949       0.0183       81.66m                                         
    3       0.9022       0.0064       79.03m                                         
    4       0.9070       0.0036       74.56m                                         
    5       0.9081       0.0028       65.69m                                         
    6       0.9110       0.0017       59.72m                                         
    7       0.9128       0.0004       55.41m                                         
    8       0.9131       0.0011       55.39m                                         
    9       0.9124      -0.0006       53.55m                                         
   10       0.9137       0.0003       51.37m                                         
   15       0.9147       0.0003       44.55m         

In [50]:
# Test model locally
Epred = model.predict(EX)
metric.calc_mean(Eqids, Ey.as_matrix(), Epred)

0.53089331770811277

In [27]:
_train.shape

(77500, 67)

In [28]:
len(_train.loc[_train.booking_bool == 1]) / len(_train) * 100

31.36258064516129

### len(_train.loc[(_train.click_bool == 1) & (_train.booking_bool == 0)]) / len(_train) * 100

In [29]:
len(_train.loc[_train.click_bool == 0]) / len(_train) * 100

49.504516129032254

In [30]:
# Generate target label
for df in [_train, _val, _test]:
    df["relevance"] = df.click_bool + df.booking_bool
    df.relevance = df.relevance.map(lambda r: 5 if r > 1 else r)

In [31]:
# Fill missing values (need better methods for each feature)
for df in [_train, _val, _test, test]:
    df.fillna(-1, inplace=True)

In [34]:
# Generate month feature
for df in [_train, _val, _test, test]:
    month = df.date_time.map(lambda d: d.split("-")[1])
    dummies = pd.get_dummies(month, prefix="month")
    df[dummies.columns] = dummies

AttributeError: 'DataFrame' object has no attribute 'date_time'

In [None]:
_train

In [33]:
# Select properties we care about (kind of arbitrary right now)
features = ["srch_id", "site_id", "prop_country_id", "prop_starrating", "prop_review_score", "prop_brand_bool",
              "prop_location_score1", "prop_location_score2", "price_usd", "promotion_flag", "srch_destination_id",
              "srch_length_of_stay", "srch_adults_count", "srch_children_count", "srch_room_count",
              "relevance","comp_rate_sum","comp_inv_sum"]

_train = _train[features]
_val = _val[features]
_test = _test[features]
test = test[features[0:-1]]

_train.head()

KeyError: "['relevance'] not in index"

In [10]:
def split_X_y_qids(df):
    X = df.drop(["srch_id", "relevance"], axis=1)
    y = df.relevance
    qids = df.srch_id
    return (X, y, qids)
    
TX, Ty, Tqids = split_X_y_qids(_train)
VX, Vy, Vqids = split_X_y_qids(_val)
EX, Ey, Eqids = split_X_y_qids(_test)

# Train model
metric = pyltr.metrics.NDCG(k=38)

monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=100)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=100,
    learning_rate=0.1,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1
)
model.fit(TX, Ty, Tqids, monitor=monitor)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.7993       0.1776       15.67m      C:      0.3518 B:      0.3518 S:  0
    2       0.8317       0.0317       12.98m      C:      0.3841 B:      0.3841 S:  0
    3       0.8368       0.0040       12.03m      C:      0.3905 B:      0.3905 S:  0
    4       0.8472       0.0100       11.48m      C:      0.4031 B:      0.4031 S:  0
    5       0.8526       0.0042       11.10m      C:      0.4096 B:      0.4096 S:  0
    6       0.8560       0.0022       10.83m      C:      0.4157 B:      0.4157 S:  0
    7       0.8628       0.0077       10.60m      C:      0.4306 B:      0.4306 S:  0
    8       0.8684       0.0057       10.40m      C:      0.4459 B:      0.4459 S:  0
    9       0.8703       0.0018       10.21m      C:      0.4479 B:      0.4479 S:  0
   10       0.8723       0.0005       10.05m      C:      0.4490 B:      0.4490 S:  0
   15       0.8776       0.0015        9.36m      C: 

<pyltr.models.lambdamart.LambdaMART at 0x13004665278>

In [11]:
# Test model locally
Epred = model.predict(EX)

query_groups = pyltr.util.group.get_groups(Eqids)

score = 0.0
i = 0
highest_e = 0
fault_count = 0
for qid, a, b in query_groups:
    sorted_y = pyltr.util.sort.get_sorted_y(Ey[a:b].as_matrix(), Epred[a:b])
    e = metric.evaluate(qid, sorted_y)
    highest_e = max(e, highest_e)
    
    if (e > 1):
        fault_count += 1
    else:
        score += e
        i += 1
        
score /= i
        
score, i, highest_e, fault_count
#metric.calc_mean(Eqids, Ey, Epred)

(0.4912118276400273, 19979, 1.0, 0)

In [12]:
# Predict final submission order
SX = test.drop("srch_id", axis=1)
Sqids = test["srch_id"]
Spred = model.predict(SX)

In [13]:
# Create submission data frame
result = pd.DataFrame()
result["SearchId"] = Sqids
result["PropertyId"] = Sprops
result["Sort"] = Spred
result = result.sort(["SearchId", "Sort"], ascending=False)
result



Unnamed: 0,SearchId,PropertyId,Sort
6622614,665571,52204,0.983664
6622620,665571,108152,0.810477
6622621,665571,111905,0.564644
6622607,665571,30880,0.530451
6622611,665571,42127,0.317272
6622625,665571,127979,0.308668
6622616,665571,58641,0.143860
6622598,665571,2312,0.123178
6622609,665571,36329,-0.116911
6622600,665571,10681,-0.118344


In [14]:
# Write submission to file
result = result.drop("Sort", axis=1)
result.to_csv("submission.csv", index=False)