In [99]:
import pandas as pd
import numpy as np
import random
import math
import time

import pyltr
from sklearn.model_selection import train_test_split

In [100]:
# Function to sample queries
def sample_queries(df, frac):
    srch_ids = df["srch_id"].unique()[0:int(len(df)*frac)]
    return df.loc[df.srch_id.isin(srch_ids)]

# Train, validation and test set generated from the train set
_train = pd.read_csv("data/processed/_train_undersampled.csv")
_val = pd.read_csv("data/processed/_val.csv")
_test = pd.read_csv("data/processed/_test.csv")

# The actual test set that will be used to submit our final result
test = pd.read_csv("data/processed/kaggle_test.csv")

# The properties that will be used for submitting the final result
Sprops = test["prop_id"]

# To speed up execution we'll just sample for now
_train = sample_queries(_train, 0.1)
_val = sample_queries(_val, 0.2)
_test = sample_queries(_test, 1)

In [101]:
# Generate price difference feature
for df in [_train, _val, _test, test]:
    df["srch_price_usd_diff"] = df["price_usd"] - df["srch_mean_price_usd"]
    #df["prop_starrating_diff"] = df["prop_starrating"] - df["prop_starrating_mean"] # Seems to be a bad feature

In [102]:
# Generate month feature
for df in [_train, _val, _test, test]:
    month = df.date_time.map(lambda d: d.split("-")[1])
    dummies = pd.get_dummies(month, prefix="month")
    df[dummies.columns] = dummies

In [103]:
# Composite visitor features
for df in [_train, _val, _test, test]:
    df["srch_visitor_count"] = df.srch_adults_count + df.srch_children_count
    df["srch_visitor_per_room_count"] = df.srch_visitor_count / df.srch_room_count
    df["srch_price_per_adult_count"] = df.price_usd / df.srch_adults_count

In [104]:
# Composite history features
for df in [_train, _val, _test, test]:
    df.visitor_hist_starrating.fillna(df.visitor_hist_starrating.median())
    df.visitor_hist_adr_usd.fillna(df.visitor_hist_adr_usd.median())
    
    df["visitor_hist_price_diff"] = abs(df.visitor_hist_adr_usd - df.price_usd)
    df["visitor_hist_star_diff"] = abs(df.visitor_hist_starrating - df.prop_starrating)

In [105]:
# Generate target label
for df in [_train, _val, _test]:
    df["relevance"] = df.click_bool + df.booking_bool
    df.relevance = df.relevance.map(lambda r: 5 if r > 1 else r)
    df.drop(["click_bool", "booking_bool"], axis=1, inplace=True)

In [106]:
# Fill missing values for prop_location_score2
for df in [_train, _val, _test, test]:
    rounded_scores = df.prop_location_score1.map(lambda s: int(s))
    for score1 in rounded_scores.unique():
        median_score2 = df.loc[rounded_scores == score1].prop_location_score2.quantile(0.25)
        df.loc[(rounded_scores == score1) & (df.prop_location_score2.isnull())].prop_location_score2 = median_score2

# Fill remaining missing values (need better methods for each feature)
for df in [_train, _val, _test, test]:
    df.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [107]:
# Drop training specific data
for df in [_train, _val, _test]:
    df.drop(["gross_bookings_usd", "position"], axis=1, inplace=True)

_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,month_05,month_06,month_11,month_12,srch_visitor_count,srch_visitor_per_room_count,srch_price_per_adult_count,visitor_hist_price_diff,visitor_hist_star_diff,relevance
0,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,10404,4,4.0,...,0,0,0,0,4,4.0,42.685,0.0,0.0,0
1,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,68914,2,3.0,...,0,0,0,0,4,4.0,25.2225,0.0,0.0,5
2,4,2012-12-31 08:59:22,5,219,0.0,0.0,219,11826,5,4.5,...,0,0,0,1,1,1.0,373.0,0.0,0.0,0
3,4,2012-12-31 08:59:22,5,219,0.0,0.0,219,139893,2,3.0,...,0,0,0,1,1,1.0,129.0,0.0,0.0,1
4,6,2013-06-05 12:27:51,14,100,0.0,0.0,100,22135,0,5.0,...,0,1,0,0,2,2.0,57.515,0.0,0.0,0


In [109]:
def split_X_y_qids(df):
    X = df.drop(["srch_id", "relevance"], axis=1)
    y = df.relevance
    qids = df.srch_id
    
    # Remove features we don't want to include
    X = X.drop(["date_time", 
                "visitor_hist_starrating", 
                "visitor_hist_adr_usd", 
                "prop_id", 
                "random_bool",
                "price_usd"
               ], axis=1)
    
    # Destination stuff doesn't seem to be great, TODO: figure out which ones to keep and which ones to leave in
    dest_cols = [c for c in _train.columns if c.startswith("dest")]
    X = X.drop(dest_cols, axis=1)
    
    return (X, y, qids)
    
TX, Ty, Tqids = split_X_y_qids(_train)
VX, Vy, Vqids = split_X_y_qids(_val)
EX, Ey, Eqids = split_X_y_qids(_test)

# Train model
metric = pyltr.metrics.NDCG(k=38)

monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=100)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=10,
    learning_rate=0.5,
    #max_features=1,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1
)

start = time.time()

model.fit(TX, Ty, Tqids, monitor=monitor)

print("Time elapsed:", time.time() - start)

# Test model locally
Epred = model.predict(EX)
metric.calc_mean(Eqids, Ey.as_matrix(), Epred)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.8343       0.2132        1.25m      C:      0.3802 B:      0.3802 S:  0
    2       0.8629       0.0291       56.82s      C:      0.4321 B:      0.4321 S:  0
    3       0.8711       0.0073       46.94s      C:      0.4478 B:      0.4478 S:  0
    4       0.8780       0.0053       40.16s      C:      0.4645 B:      0.4645 S:  0
    5       0.8793       0.0025       33.43s      C:      0.4706 B:      0.4706 S:  0
    6       0.8831       0.0021       26.63s      C:      0.4746 B:      0.4746 S:  0
    7       0.8858       0.0028       20.35s      C:      0.4789 B:      0.4789 S:  0
    8       0.8861      -0.0002       13.80s      C:      0.4803 B:      0.4803 S:  0
    9       0.8871       0.0013        7.02s      C:      0.4828 B:      0.4828 S:  0
Early termination at iteration  9
Time elapsed: 70.02717518806458


0.48544528967843931

In [93]:
# Feature importances
fi = pd.DataFrame()
fi["Features"] = TX.columns
fi["Importance"] = model.feature_importances_
fi.sort("Importance", ascending=False)



Unnamed: 0,Features,Importance
7,prop_location_score2,0.274553
58,price_norm_srch,0.137536
3,prop_starrating,0.125012
60,price_norm_prop_srch,0.087001
72,visitor_hist_price_diff,0.052063
6,prop_location_score1,0.044023
9,promotion_flag,0.043599
54,prop_std_price_usd,0.038585
74,srch_price_usd_diff,0.035795
55,prop_mean_prop_review_score,0.026412


In [30]:
# Predict final submission order
SX = test[TX.columns]
Sqids = test["srch_id"]
Spred = model.predict(SX)

KeyError: "Index(['month_01', 'month_02', 'month_03', 'month_04', 'month_05', 'month_06',\n       'month_11', 'month_12'],\n      dtype='object') not in index"

In [None]:
# Create submission data frame
result = pd.DataFrame()
result["SearchId"] = Sqids
result["PropertyId"] = Sprops
result["Sort"] = Spred
result = result.sort(["SearchId", "Sort"], ascending=False)
result

In [None]:
# Write submission to file
result = result.drop("Sort", axis=1)
result.to_csv("submission.csv", index=False)

In [46]:
_train.columns

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv

In [None]:
model.feature_importances_

In [55]:
s = pd.Series([0.42, -1.23, 5.3])
s

0    0.42
1   -1.23
2    5.30
dtype: float64

In [56]:
s.sort_values()

1   -1.23
0    0.42
2    5.30
dtype: float64

In [61]:
s.apply(lambda x: x.name)

AttributeError: 'float' object has no attribute 'name'

In [62]:
s.loc[s < 0] = -100

In [63]:
s

0      0.42
1   -100.00
2      5.30
dtype: float64

In [82]:
_train.loc[_train.prop_location_score1.map(lambda s: int(s)) == 6].prop_location_score2.median()

0.2365

In [98]:
_train.prop_location_score1.quantile(0.75)

3.99

In [91]:
_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,month_06,month_11,month_12,srch_visitor_count,srch_visitor_per_room_count,srch_price_per_adult_count,relevance,visitor_hist_price_diff,star_diff,srch_price_usd_diff
0,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,10404,4,4.0,...,0,0,0,4,4.0,42.685,0,170.74,4.0,7.021071
1,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,68914,2,3.0,...,0,0,0,4,4.0,25.2225,5,100.89,2.0,-62.828929
2,4,2012-12-31 08:59:22,5,219,0.0,0.0,219,11826,5,4.5,...,0,0,1,1,1.0,373.0,0,373.0,5.0,107.4375
3,4,2012-12-31 08:59:22,5,219,0.0,0.0,219,139893,2,3.0,...,0,0,1,1,1.0,129.0,1,129.0,2.0,-136.5625
4,6,2013-06-05 12:27:51,14,100,0.0,0.0,100,22135,0,5.0,...,1,0,0,2,2.0,57.515,0,115.03,0.0,-10.44
