In [14]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split

# Function to sample queries
def sample_queries(df, frac):
    """Keep every row of a random fraction of the searches in ``df``.

    Sampling is done on distinct ``srch_id`` values rather than on rows, so a
    selected search keeps all of its rows. The draw uses a fixed seed (42) and
    is therefore repeatable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``srch_id`` column.
    frac : float
        Fraction of distinct search ids to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``srch_id`` was drawn, in their original order.
    """
    unique_ids = pd.Series(df["srch_id"].unique())
    chosen_ids = unique_ids.sample(frac=frac, random_state=42)
    keep_row = df["srch_id"].isin(chosen_ids)
    return df.loc[keep_row]

# Input files: the labelled training log and the Kaggle test set.
train_csv = "data/training_set_VU_DM_2014.csv"
kaggle_test_csv = "data/kaggle_test.csv"

in_train = pd.read_csv(train_csv)
in_test = pd.read_csv(kaggle_test_csv)
# Alternative test inputs (uncomment one to override the line above):
#in_test = pd.read_csv("data/testing_set_VU_DM_2014.csv")
#in_test = in_train.head(0)

# Speed up execution by working on a 10% sample of the searches
#in_train = sample_queries(in_train, 0.1)

in_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0


In [15]:
# Add mean, median and std per grouping factor (search, destination, property) for a variety of numericals
def add_mean_median_std_per_factor(df, factor, numerical, prefix):
    """Add per-group mean, median and std of ``numerical`` to ``df`` in place.

    Rows are grouped by ``factor``; each row receives the statistics of its
    own group in three new columns named ``<prefix>_mean_<numerical>``,
    ``<prefix>_median_<numerical>`` and ``<prefix>_std_<numerical>``
    (added in that order).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; modified in place.
    factor : str
        Name of the column to group by.
    numerical : str
        Name of the numeric column to summarise.
    prefix : str
        Prefix for the new column names.

    Returns
    -------
    None
    """
    # One pass over the groups yields all three statistics, indexed by factor value.
    stats = df.groupby(factor)[numerical].agg(["mean", "median", "std"])

    for stat in ("mean", "median", "std"):
        # Mapping through a Series is a vectorised lookup; a factor value that
        # has no group (e.g. NaN) yields NaN instead of raising KeyError.
        df[prefix + "_" + stat + "_" + numerical] = df[factor].map(stats[stat])
    
# (group-by column, column prefix, numericals to summarise), applied in this
# order so the resulting column order stays search -> destination -> property.
feature_plan = [
    ("srch_id", "srch", ["price_usd", "prop_starrating", "prop_review_score"]),
    # These features don't seem to work well, keeping it in for now though
    ("srch_destination_id", "dest", ["price_usd", "prop_starrating", "prop_review_score"]),
    ("prop_id", "prop", ["price_usd", "prop_review_score"]),
]

# Statistics are computed separately within each frame.
for df in [in_train, in_test]:
    for factor, prefix, numericals in feature_plan:
        for numerical in numericals:
            add_mean_median_std_per_factor(df, factor, numerical, prefix)

in_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,dest_std_prop_starrating,dest_mean_prop_review_score,dest_median_prop_review_score,dest_std_prop_review_score,prop_mean_price_usd,prop_median_price_usd,prop_std_price_usd,prop_mean_prop_review_score,prop_median_prop_review_score,prop_std_prop_review_score
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,0.813616,3.760563,4.0,0.7554,118.758742,118.0,17.778734,3.5,3.5,0.0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,0.813616,3.760563,4.0,0.7554,152.054082,129.0,390.928573,4.0,4.0,0.0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,0.813616,3.760563,4.0,0.7554,168.540871,165.0,345.479493,4.5,4.5,0.0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,0.813616,3.760563,4.0,0.7554,82.59887,65.1,305.765579,4.0,4.0,0.0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,0.813616,3.760563,4.0,0.7554,137.648135,117.0,433.301637,3.5,3.5,0.0


In [16]:
# Normalize price per query
# Replace price_usd by its z-score within the search it belongs to.
for df in [in_train, in_test]:
    prices_by_query = df.groupby("srch_id")["price_usd"]
    centered = df["price_usd"] - prices_by_query.transform("mean")
    spread = prices_by_query.transform("std")

    # A search with a single priced listing (std is NaN) or with identical
    # prices (std == 0) has nothing to scale by; dividing would give NaN/inf.
    # Such listings sit exactly at their search's mean, so their z-score is 0.
    # Rows whose price itself is missing stay NaN.
    degenerate = centered.notna() & ~(spread > 0)
    df["price_usd"] = (centered / spread).mask(degenerate, 0.0)


In [17]:
# Estimate probability of being clicked and booked
# NOTE(review): these rates are estimated on the full training log, i.e. before
# the train/val/test split further down and including each row's own label.
# The resulting features therefore leak target information into the
# validation/test rows; consider estimating them on the training fold only.
grouped = in_train.groupby("prop_id")
click_prob_by_prop = grouped.click_bool.mean().to_dict()
book_prob_by_prop = grouped.booking_bool.mean().to_dict()

# Global rates, used as a fallback for properties absent from the training log.
mean_click_prob = in_train.click_bool.mean()
mean_book_prob = in_train.booking_bool.mean()

for df in [in_train, in_test]:
    # map() with a dict gives NaN for unknown prop_ids; fillna() then applies
    # the global rate. Unlike dict.setdefault this is vectorised and leaves the
    # lookup tables unmodified.
    df["click_probability"] = df.prop_id.map(click_prob_by_prop).fillna(mean_click_prob)
    df["booking_probability"] = df.prop_id.map(book_prob_by_prop).fillna(mean_book_prob)

In [18]:
# Undersample a series so that there are as many non-clicked items as clicked and booked
def downsample_series(s, random_state=None):
    """Balance one group of rows on ``click_bool`` by dropping non-clicked rows.

    All rows with ``click_bool == 1`` are kept. From the rows with
    ``click_bool == 0`` a random subset of the same size is kept (or all of
    them when there are fewer non-clicked than clicked rows). A group without
    any clicked row therefore yields an empty frame.

    Parameters
    ----------
    s : pandas.DataFrame
        Rows to balance; must have a ``click_bool`` column.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The retained rows, sorted by index.
    """
    clicked = s.loc[s.click_bool == 1]
    not_clicked = s.loc[s.click_bool == 0]

    # Draw an exact row count. A fraction of the whole group (len(clicked)/len(s))
    # does not yield one non-clicked row per clicked row.
    n_keep = min(len(clicked), len(not_clicked))
    if n_keep == 0:
        # Nothing to pair up: either there are no non-clicked rows (return the
        # clicked rows unchanged) or no clicked rows (return an empty frame).
        return clicked

    kept = not_clicked.sample(n=n_keep, random_state=random_state)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([clicked, kept]).sort_index()

# Split on search ids
# Splitting on distinct search ids keeps all rows of a search in one fold.
srch_ids = in_train["srch_id"].unique()
# 80% of the searches for training, the remaining 20% halved into test and validation.
srch_ids_train, srch_ids_test_val = train_test_split(srch_ids, test_size=0.2, random_state = 42)
srch_ids_test, srch_ids_val = train_test_split(srch_ids_test_val, test_size=0.5, random_state = 42)

_train = in_train.loc[in_train.srch_id.isin(srch_ids_train)]
_val = in_train.loc[in_train.srch_id.isin(srch_ids_val)]
_test = in_train.loc[in_train.srch_id.isin(srch_ids_test)]

# The split above is seeded but the per-search undersampling was not, so the
# exported training file changed on every run. sample() without a random_state
# draws from NumPy's global RNG, so seeding it here makes the result repeatable.
np.random.seed(42)
_train_undersampled = _train.groupby("srch_id").apply(downsample_series).reset_index(drop=True)

In [19]:
# Save to CSV
# Frames to export, keyed by destination path (written in this order).
# The full training fold is not exported; only its undersampled version is.
#_train.to_csv("data/processed/_train.csv", index=False)
exports = {
    "data/processed/_val.csv": _val,
    "data/processed/_test.csv": _test,
    "data/processed/_train_undersampled.csv": _train_undersampled,
    "data/processed/kaggle_test.csv": in_test,
}
for csv_path, frame in exports.items():
    frame.to_csv(csv_path, index=False)

In [20]:
# Preview the first rows of the test frame, including the feature columns
# added by the earlier cells (last expression, so it renders as the cell output).
in_test.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,dest_median_prop_review_score,dest_std_prop_review_score,prop_mean_price_usd,prop_median_price_usd,prop_std_price_usd,prop_mean_prop_review_score,prop_median_prop_review_score,prop_std_prop_review_score,click_probability,booking_probability
0,2,2012-11-05 21:28:38,15,55,,,98,3105,3,2.0,...,3.0,1.921683,115.531818,102.91,55.242827,2.0,2.0,0.0,0.0,0.0
1,2,2012-11-05 21:28:38,15,55,,,98,6399,3,0.0,...,3.0,1.921683,41.858125,38.06,8.871601,0.0,0.0,0.0,0.0,0.0
2,2,2012-11-05 21:28:38,15,55,,,98,7374,4,3.5,...,3.0,1.921683,228.067143,90.74,676.597142,3.5,3.5,0.0,0.05,0.05
3,2,2012-11-05 21:28:38,15,55,,,98,7771,3,4.5,...,3.0,1.921683,31.745455,27.73,11.559451,4.5,4.5,0.0,0.047619,0.047619
4,2,2012-11-05 21:28:38,15,55,,,98,12938,3,0.0,...,3.0,1.921683,41.412143,42.565,3.559505,0.0,0.0,0.0,0.095238,0.047619
