In [1]:
import pandas as pd
import numpy as np
import random
import math
import time
import datetime

import pyltr
from sklearn.model_selection import train_test_split

## Load Data

In [2]:
# Function to sample queries
def sample_queries(df, frac):
    srch_ids = df["srch_id"].unique()[0:int(len(df)*frac)]
    return df.loc[df.srch_id.isin(srch_ids)]

# Train, validation and test set generated from the train set
_train = pd.read_csv("data/processed/_train_undersampled.csv")
_val = pd.read_csv("data/processed/_val.csv")
_test = pd.read_csv("data/processed/_test.csv")

# The actual test set that will be used to submit our final result
test = pd.read_csv("data/processed/kaggle_test.csv")

# The properties that will be used for submitting the final result
Sprops = test["prop_id"]

# To speed up execution we'll just sample for now
_train = sample_queries(_train, 1)
_val = sample_queries(_val, 1)
_test = sample_queries(_test, 1)

In [3]:
# Drop training specific data
for df in [_train, _val, _test]:
    df.drop(["gross_bookings_usd", "position"], axis=1, inplace=True)

_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,prop_median_price_usd,prop_std_price_usd,prop_mean_prop_review_score,prop_median_prop_review_score,prop_std_prop_review_score,srch_price_rank,srch_star_rank,srch_review_score_rank,srch_location_score1_rank,srch_location_score2_rank
0,1,2013-04-04 08:32:15,12,187,,,219,59526,3,3.5,...,99.325,365.834592,3.5,3.5,0.0,3.0,13.5,11.0,10.0,18.0
1,1,2013-04-04 08:32:15,12,187,,,219,68914,2,3.0,...,72.0,231.308865,3.0,3.0,0.0,3.0,4.0,5.0,10.0,14.0
2,4,2012-12-31 08:59:22,5,219,,,219,137826,2,3.0,...,189.0,50.831769,3.0,3.0,0.0,10.0,7.5,4.0,30.0,25.0
3,4,2012-12-31 08:59:22,5,219,,,219,139893,2,3.0,...,143.23,18.074398,3.0,3.0,0.0,3.0,7.5,4.0,7.0,10.0
4,6,2013-06-05 12:27:51,14,100,,,100,104251,3,4.0,...,132.48,10.382009,4.0,4.0,0.0,4.0,5.0,3.0,3.0,3.0


## Feature Engineering

In [4]:
# Normalize price_usd according to prop and srch
for df in [_train, _val, _test, test]:
    df["price_norm_srch"] = (df.price_usd - df.srch_mean_price_usd) / df.srch_std_price_usd
    df["price_norm_prop"] = (df.price_usd - df.prop_mean_price_usd) / df.prop_std_price_usd
    
    # There are properties where std = 0 so it would give infinite as result, which is bad
    df.price_norm_prop.loc[~np.isfinite(df.price_norm_prop)] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [5]:
# Generate price difference feature
for df in [_train, _val, _test, test]:
    df["srch_price_usd_diff"] = df["price_usd"] - df["srch_mean_price_usd"]
    #df["prop_starrating_diff"] = df["prop_starrating"] - df["prop_starrating_mean"] # Seems to be a bad feature

In [6]:
# Generate month feature
for df in [_train, _val, _test, test]:
    month = df.date_time.map(lambda d: d.split("-")[1])
    dummies = pd.get_dummies(month, prefix="month")
    df[dummies.columns] = dummies

In [7]:
# Get date_time object from date_time string, more efficient than strptime
def get_date_time(s):
    year = int(s[0:4])
    month = int(s[5:7])
    day = int(s[8:10])
    return datetime.datetime(year, month, day)

# Generate target month feature
def row_to_target_month(row):
    visit_date = get_date_time(row.date_time)
    target_date = visit_date + datetime.timedelta(days=int(row.srch_booking_window + row.srch_length_of_stay/2))
    return target_date.strftime('%m')

for df in [_train, _val, _test, test]:
    if (len(df) > 0): # This is just cause sometimes I use an empty test set to speed things up
        target_month = df.apply(row_to_target_month, axis=1)
        dummies = pd.get_dummies(target_month, prefix="target_month")
        df[dummies.columns] = dummies

In [8]:
# Composite visitor features
for df in [_train, _val, _test, test]:
    df["srch_visitor_count"] = df.srch_adults_count + df.srch_children_count
    df["srch_visitor_per_room_count"] = df.srch_visitor_count / df.srch_room_count
    df["srch_price_per_adult_count"] = df.price_usd / df.srch_adults_count

In [9]:
# Composite history features
for df in [_train, _val, _test, test]:
    df.visitor_hist_starrating.fillna(df.visitor_hist_starrating.median())
    df.visitor_hist_adr_usd.fillna(df.visitor_hist_adr_usd.median())
    
    df["visitor_hist_price_diff"] = abs(df.visitor_hist_adr_usd - df.price_usd)
    df["visitor_hist_star_diff"] = abs(df.visitor_hist_starrating - df.prop_starrating)

In [10]:
# Generate target label
for df in [_train, _val, _test]:
    df["relevance"] = df.click_bool + df.booking_bool
    df.relevance = df.relevance.map(lambda r: 5 if r > 1 else r)
    df.drop(["click_bool", "booking_bool"], axis=1, inplace=True)

In [11]:
# Fill missing values for prop_location_score2
for df in [_train, _val, _test, test]:
    rounded_scores = df.prop_location_score1.map(lambda s: int(s))
    for score1 in rounded_scores.unique():
        median_score2 = df.loc[rounded_scores == score1].prop_location_score2.quantile(0.25)
        df.loc[(rounded_scores == score1) & (df.prop_location_score2.isnull())].prop_location_score2 = median_score2

# Fill remaining missing values (need better methods for each feature)
for df in [_train, _val, _test, test]:
    df.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


## Training and Testing

In [15]:
def split_X_y_qids(df):
    X = df.drop(["srch_id", "relevance"], axis=1)
    y = df.relevance
    qids = df.srch_id
    
    # Remove features we don't want to include
    X = X.drop(["date_time", 
                #"visitor_hist_starrating", 
                #"visitor_hist_adr_usd", 
                "prop_id", 
                "random_bool",
                "price_usd"
               ], axis=1)
    
    return (X, y, qids)
    
TX, Ty, Tqids = split_X_y_qids(_train)
VX, Vy, Vqids = split_X_y_qids(_val)
EX, Ey, Eqids = split_X_y_qids(_test)

# Train model
metric = pyltr.metrics.NDCG(k=38)

monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=20)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=25,
    #max_features=1,
    query_subsample=0.5,
    max_leaf_nodes=15,
    min_samples_leaf=64,
    verbose=1,
    max_depth=6
)

start = time.time()

model.fit(TX, Ty, Tqids)

print("Time elapsed:", (time.time() - start)/60)

# Test model locally
Epred = model.predict(EX)
metric.calc_mean(Eqids, Ey.as_matrix(), Epred)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.8362       0.2147       10.78m                                         
    2       0.8543       0.0178        9.68m                                         
    3       0.8631       0.0093        8.98m                                         
    4       0.8685       0.0050        8.40m                                         
    5       0.8719       0.0037        7.87m                                         
    6       0.8763       0.0040        7.41m                                         
    7       0.8772       0.0006        7.00m                                         
    8       0.8795       0.0020        6.59m                                         
    9       0.8806       0.0008        6.19m                                         
   10       0.8811       0.0015        5.78m                                         
   15       0.8853       0.0006        3.81m         

0.49290946376835237

In [13]:
# Feature importances
fi = pd.DataFrame()
fi["Features"] = TX.columns
fi["Importance"] = model.feature_importances_
fi.sort("Importance", ascending=False)



Unnamed: 0,Features,Importance
9,prop_location_score2,0.071497
75,price_norm_prop,0.055463
74,price_norm_srch,0.055071
64,prop_median_price_usd,0.052183
65,prop_std_price_usd,0.046922
76,srch_price_usd_diff,0.045198
10,prop_log_historical_price,0.041018
63,prop_mean_price_usd,0.039822
69,srch_price_rank,0.034903
8,prop_location_score1,0.033765


## Submit Final Result

In [82]:
# Predict final submission order
SX = test[TX.columns]
Sqids = test["srch_id"]
Spred = model.predict(SX)

In [83]:
# Create submission data frame
result = pd.DataFrame()
result["SearchId"] = Sqids
result["PropertyId"] = Sprops
result["Sort"] = Spred
result = result.sort(["SearchId", "Sort"], ascending=False)
result



Unnamed: 0,SearchId,PropertyId,Sort
6622614,665571,52204,1.413508
6622620,665571,108152,1.015146
6622621,665571,111905,0.951610
6622607,665571,30880,0.859494
6622616,665571,58641,0.610257
6622609,665571,36329,0.598544
6622598,665571,2312,0.184159
6622625,665571,127979,0.104297
6622611,665571,42127,0.046426
6622600,665571,10681,-0.089647


In [84]:
# Write submission to file
result = result.drop("Sort", axis=1)
result.to_csv("submission.csv", index=False)

In [18]:
[c for c in _train.columns if c.endswith("_rank")]

['srch_price_rank',
 'srch_star_rank',
 'srch_review_score_rank',
 'srch_location_score1_rank',
 'srch_location_score2_rank']

In [25]:
Smax = SX.max()

In [26]:
Smax.sort_values()

prop_std_prop_review_score     0.000000e+00
srch_query_affinity_score      0.000000e+00
comp3_inv                      1.000000e+00
comp3_rate                     1.000000e+00
month_05                       1.000000e+00
comp4_rate                     1.000000e+00
comp4_inv                      1.000000e+00
month_04                       1.000000e+00
comp5_rate                     1.000000e+00
comp5_inv                      1.000000e+00
month_03                       1.000000e+00
comp6_rate                     1.000000e+00
comp6_inv                      1.000000e+00
month_02                       1.000000e+00
comp7_rate                     1.000000e+00
comp7_inv                      1.000000e+00
month_01                       1.000000e+00
comp8_rate                     1.000000e+00
comp8_inv                      1.000000e+00
month_06                       1.000000e+00
comp2_rate                     1.000000e+00
comp2_inv                      1.000000e+00
comp1_inv                      1

In [29]:
np.finfo(np.float32).max

3.4028235e+38

In [61]:
x = ~np.isfinite(SX)
x

Unnamed: 0,site_id,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,promotion_flag,...,target_month_08,target_month_09,target_month_10,target_month_11,target_month_12,srch_visitor_count,srch_visitor_per_room_count,srch_price_per_adult_count,visitor_hist_price_diff,visitor_hist_star_diff
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [45]:
~ np.isfinite(SX)

Unnamed: 0,site_id,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,promotion_flag,...,target_month_08,target_month_09,target_month_10,target_month_11,target_month_12,srch_visitor_count,srch_visitor_per_room_count,srch_price_per_adult_count,visitor_hist_price_diff,visitor_hist_star_diff


In [58]:
np.any((~np.isfinite(SX)))

True

In [52]:
~x.sum(axis=1)

0         -101
1         -101
2         -101
3         -101
4         -101
5         -101
6         -101
7         -101
8         -101
9         -101
10        -101
11        -101
12        -101
13        -101
14        -101
15        -101
16        -101
17        -101
18        -101
19        -101
20        -101
21        -101
22        -101
23        -101
24        -101
25        -101
26        -101
27        -101
28        -101
29        -101
          ... 
6622599   -101
6622600   -101
6622601   -101
6622602   -101
6622603   -101
6622604   -101
6622605   -101
6622606   -101
6622607   -101
6622608   -101
6622609   -101
6622610   -101
6622611   -101
6622612   -101
6622613   -101
6622614   -101
6622615   -101
6622616   -101
6622617   -101
6622618   -101
6622619   -101
6622620   -101
6622621   -101
6622622   -101
6622623   -101
6622624   -101
6622625   -101
6622626   -101
6622627   -101
6622628   -101
dtype: int64

In [55]:
(~pd.Series([True, False, True])).any()

True

In [57]:
SX.loc[~x.any(axis=1)]

Unnamed: 0,site_id,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,promotion_flag,...,target_month_08,target_month_09,target_month_10,target_month_11,target_month_12,srch_visitor_count,srch_visitor_per_room_count,srch_price_per_adult_count,visitor_hist_price_diff,visitor_hist_star_diff


In [64]:
x.any()

site_id                        False
visitor_location_country_id    False
prop_country_id                False
prop_starrating                False
prop_review_score              False
prop_brand_bool                False
prop_location_score1           False
prop_location_score2           False
prop_log_historical_price      False
promotion_flag                 False
srch_destination_id            False
srch_length_of_stay            False
srch_booking_window            False
srch_adults_count              False
srch_children_count            False
srch_room_count                False
srch_saturday_night_bool       False
srch_query_affinity_score      False
orig_destination_distance      False
comp1_rate                     False
comp1_inv                      False
comp1_rate_percent_diff        False
comp2_rate                     False
comp2_inv                      False
comp2_rate_percent_diff        False
comp3_rate                     False
comp3_inv                      False
c

In [80]:
test.loc[~np.isfinite(test.price_norm_prop)][["price_norm_prop", "price_usd", "prop_std_price_usd", "prop_mean_price_usd"]]

Unnamed: 0,price_norm_prop,price_usd,prop_std_price_usd,prop_mean_price_usd
4199633,-inf,177.55,0.0,177.55
4686415,-inf,177.55,0.0,177.55
5287580,-inf,177.55,0.0,177.55


In [78]:
test.prop_mean_price_usd

0           115.531818
1            41.858125
2           228.067143
3            31.745455
4            41.412143
5            60.160000
6            96.624583
7           174.050392
8            91.724524
9           118.760000
10          159.207973
11           48.868000
12          118.646000
13          337.460418
14         2879.860366
15          521.410899
16          735.404415
17          150.073145
18          177.724464
19          578.480426
20          210.092194
21          656.710286
22          700.954788
23          310.334286
24          197.989326
25         1632.421097
26          224.410655
27          230.576565
28          409.131771
29          278.332398
              ...     
6622599     225.982453
6622600     156.549733
6622601     297.182890
6622602     128.781368
6622603     169.392141
6622604     261.744730
6622605      56.031209
6622606     362.399900
6622607     173.665656
6622608      86.718358
6622609     348.057581
6622610     105.163910
6622611    