In [1]:
import pandas as pd
import numpy as np
import random
import math
import time
import datetime

import pyltr
from sklearn.model_selection import train_test_split

## Load Data

In [2]:
# Function to sample queries
def sample_queries(df, frac):
    srch_ids = df["srch_id"].unique()[0:int(len(df)*frac)]
    return df.loc[df.srch_id.isin(srch_ids)]

# Train, validation and test set generated from the train set
_train = pd.read_csv("data/processed/_train_undersampled.csv")
_val = pd.read_csv("data/processed/_val.csv")
_test = pd.read_csv("data/processed/_test.csv")

# The actual test set that will be used to submit our final result
test = pd.read_csv("data/processed/kaggle_test.csv")

# The properties that will be used for submitting the final result
Sprops = test["prop_id"]

# To speed up execution we'll just sample for now
_train = sample_queries(_train, 1)
_val = sample_queries(_val, 0.5)
_test = sample_queries(_test, 1)

In [3]:
# Drop training specific data
for df in [_train, _val, _test]:
    df.drop(["gross_bookings_usd", "position"], axis=1, inplace=True)

_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,prop_median_price_usd,prop_std_price_usd,prop_mean_prop_review_score,prop_median_prop_review_score,prop_std_prop_review_score,srch_price_rank,srch_star_rank,srch_review_score_rank,srch_location_score1_rank,srch_location_score2_rank
0,1,2013-04-04 08:32:15,12,187,,,219,68914,2,3.0,...,72.0,231.308865,3.0,3.0,0.0,3.0,4.0,5.0,10.0,14.0
1,1,2013-04-04 08:32:15,12,187,,,219,88218,4,3.5,...,114.885,413.886419,3.5,3.5,0.0,7.5,24.0,11.0,22.5,27.0
2,4,2012-12-31 08:59:22,5,219,,,219,134162,5,4.5,...,548.0,91.382003,4.5,4.5,0.0,31.0,31.0,27.0,18.0,22.0
3,4,2012-12-31 08:59:22,5,219,,,219,139893,2,3.0,...,143.23,18.074398,3.0,3.0,0.0,3.0,7.5,4.0,7.0,10.0
4,6,2013-06-05 12:27:51,14,100,,,100,52376,2,0.0,...,88.08,3.789397,0.0,0.0,0.0,1.0,3.5,1.0,3.0,3.0


## Feature Engineering

In [4]:
# Normalize price_usd according to prop and srch
for df in [_train, _val, _test, test]:
    df["price_norm_srch"] = (df.price_usd - df.srch_mean_price_usd) / df.srch_std_price_usd
    df["price_norm_prop"] = (df.price_usd - df.prop_mean_price_usd) / df.prop_std_price_usd
    
    # There are properties where std = 0 so it would give infinite as result, which is bad
    df.price_norm_prop.loc[~np.isfinite(df.price_norm_prop)] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [5]:
# Generate price difference feature
for df in [_train, _val, _test, test]:
    df["srch_price_usd_diff"] = df["price_usd"] - df["srch_mean_price_usd"]
    #df["prop_starrating_diff"] = df["prop_starrating"] - df["prop_starrating_mean"] # Seems to be a bad feature

In [6]:
# Generate month feature
for df in [_train, _val, _test, test]:
    month = df.date_time.map(lambda d: d.split("-")[1])
    dummies = pd.get_dummies(month, prefix="month")
    df[dummies.columns] = dummies

In [7]:
# Get date_time object from date_time string, more efficient than strptime
def get_date_time(s):
    year = int(s[0:4])
    month = int(s[5:7])
    day = int(s[8:10])
    return datetime.datetime(year, month, day)

# Generate target month feature
def row_to_target_month(row):
    visit_date = get_date_time(row.date_time)
    target_date = visit_date + datetime.timedelta(days=int(row.srch_booking_window + row.srch_length_of_stay/2))
    return target_date.strftime('%m')

for df in [_train, _val, _test, test]:
    if (len(df) > 0): # This is just cause sometimes I use an empty test set to speed things up
        target_month = df.apply(row_to_target_month, axis=1)
        dummies = pd.get_dummies(target_month, prefix="target_month")
        df[dummies.columns] = dummies

In [8]:
# Composite visitor features
for df in [_train, _val, _test, test]:
    df["srch_visitor_count"] = df.srch_adults_count + df.srch_children_count
    df["srch_visitor_per_room_count"] = df.srch_visitor_count / df.srch_room_count
    df["srch_price_per_adult_count"] = df.price_usd / df.srch_adults_count

In [9]:
# Composite history features
for df in [_train, _val, _test, test]:
    df.visitor_hist_starrating.fillna(df.visitor_hist_starrating.median())
    df.visitor_hist_adr_usd.fillna(df.visitor_hist_adr_usd.median())
    
    df["visitor_hist_price_diff"] = abs(df.visitor_hist_adr_usd - df.price_usd)
    df["visitor_hist_star_diff"] = abs(df.visitor_hist_starrating - df.prop_starrating)

In [10]:
# Aggregate competitor information
for df in [_train, _val, _test, test]:
    rate_cols = ['comp' + str(i) + '_rate' for i in range(1,9)]
    inv_cols = ['comp' + str(i) + '_inv' for i in range(1,9)]
    df['comp_rate_sum'] = df[rate_cols].sum(axis=1)
    df['comp_inv_sum'] = df[inv_cols].sum(axis=1)
    df['comp_rate_min'] = df[rate_cols].min(axis=1)
    df['comp_inv_min'] = df[rate_cols].min(axis=1)

In [11]:
# Generate target label
for df in [_train, _val, _test]:
    df["relevance"] = df.click_bool + df.booking_bool
    df.relevance = df.relevance.map(lambda r: 5 if r > 1 else r)
    df.drop(["click_bool", "booking_bool"], axis=1, inplace=True)

In [12]:
# Fill missing values for prop_location_score2
for df in [_train, _val, _test, test]:
    rounded_scores = df.prop_location_score1.map(lambda s: int(s))
    for score1 in rounded_scores.unique():
        median_score2 = df.loc[rounded_scores == score1].prop_location_score2.quantile(0.25)
        df.loc[(rounded_scores == score1) & (df.prop_location_score2.isnull())].prop_location_score2 = median_score2

# Fill remaining missing values (need better methods for each feature)
for df in [_train, _val, _test, test]:
    df.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


## Training and Testing

In [23]:
def split_X_y_qids(df):
    X = df.drop(["srch_id", "relevance"], axis=1)
    y = df.relevance
    qids = df.srch_id
    
    # Remove features we don't want to include
    X = X.drop(["date_time", 
                #"visitor_hist_starrating", 
                #"visitor_hist_adr_usd", 
                "prop_id", 
                "random_bool",
                "price_usd"
               ], axis=1)
    
    return (X, y, qids)
    
TX, Ty, Tqids = split_X_y_qids(_train)
VX, Vy, Vqids = split_X_y_qids(_val)
EX, Ey, Eqids = split_X_y_qids(_test)

# Train model
metric = pyltr.metrics.NDCG(k=38)

monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=100)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=2500,
    #max_features=1,
    query_subsample=0.5,
    max_leaf_nodes=15,
    min_samples_leaf=64,
    verbose=1,
    max_depth=6
)

start = time.time()

model.fit(TX, Ty, Tqids, monitor=monitor)

print("Time elapsed:", (time.time() - start)/60)

# Test model locally
Epred = model.predict(EX)
metric.calc_mean(Eqids, Ey.as_matrix(), Epred)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.8376       0.2168     1287.15m      C:      0.3845 B:      0.3845 S:  0
    2       0.8588       0.0198     1182.25m      C:      0.4276 B:      0.4276 S:  0
    3       0.8605       0.0022     1136.00m      C:      0.4337 B:      0.4337 S:  0
    4       0.8688       0.0079     1110.86m      C:      0.4485 B:      0.4485 S:  0
    5       0.8728       0.0038     1099.43m      C:      0.4581 B:      0.4581 S:  0
    6       0.8750       0.0029     1091.20m      C:      0.4648 B:      0.4648 S:  0
    7       0.8778       0.0013     1083.40m      C:      0.4687 B:      0.4687 S:  0
    8       0.8791       0.0013     1075.62m      C:      0.4751 B:      0.4751 S:  0
    9       0.8797       0.0007     1071.06m      C:      0.4774 B:      0.4774 S:  0
   10       0.8807       0.0008     1065.78m      C:      0.4793 B:      0.4793 S:  0
   15       0.8840       0.0006     1052.47m      C: 

0.51213843221035837

In [24]:
# Feature importances
fi = pd.DataFrame()
fi["Features"] = TX.columns
fi["Importance"] = model.feature_importances_
fi.sort("Importance", ascending=False)



Unnamed: 0,Features,Importance
9,prop_location_score2,0.061817
74,price_norm_srch,0.058210
75,price_norm_prop,0.052900
65,prop_std_price_usd,0.048425
64,prop_median_price_usd,0.046098
76,srch_price_usd_diff,0.045028
10,prop_log_historical_price,0.040693
63,prop_mean_price_usd,0.038673
8,prop_location_score1,0.035619
69,srch_price_rank,0.031208


## Submit Final Result

In [25]:
# Predict final submission order
SX = test[TX.columns]
Sqids = test["srch_id"]
Spred = model.predict(SX)

In [26]:
# Create submission data frame
result = pd.DataFrame()
result["SearchId"] = Sqids
result["PropertyId"] = Sprops
result["Sort"] = Spred
result = result.sort(["SearchId", "Sort"], ascending=[True, False])
result



Unnamed: 0,SearchId,PropertyId,Sort
5,2,25579,0.581687
2,2,7374,0.475623
6,2,26540,-0.026116
8,2,30434,-0.416048
10,2,78858,-0.483851
4,2,12938,-0.505909
1,2,6399,-0.548559
12,2,131173,-0.733897
11,2,91899,-0.764027
3,2,7771,-0.941981


In [27]:
# Write submission to file
result = result.drop("Sort", axis=1)
result.to_csv("submission.csv", index=False)