In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split

# Function to sample queries
def sample_queries(df, frac, random_state=42):
    """Return the rows of ``df`` that belong to a random fraction of its queries.

    Sampling is performed on the unique ``srch_id`` values rather than on
    rows, so every selected query keeps all of its rows and every other
    query is dropped entirely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``srch_id`` column.
    frac : float
        Fraction of the unique ``srch_id`` values to keep.
    random_state : int or None, optional
        Seed forwarded to ``Series.sample``. Defaults to 42, the value that
        used to be hard-coded, so existing callers get the same sample.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``srch_id`` was sampled, in their original order.
    """
    unique_ids = pd.Series(df["srch_id"].unique())
    kept_ids = unique_ids.sample(frac=frac, random_state=random_state)
    return df.loc[df["srch_id"].isin(kept_ids)]

# Load the training set from CSV.
in_train = pd.read_csv("data/training_set_VU_DM_2014.csv")
# Alternative sources for the test frame, currently disabled:
#in_test = pd.read_csv("data/testing_set_VU_DM_2014.csv")
#in_test = pd.read_csv("data/kaggle_test.csv")
# Placeholder test frame: the training columns minus the four dropped columns,
# with zero rows (head(n=0)).
in_test = in_train.drop(["click_bool", "booking_bool", "position", "gross_bookings_usd"], axis=1).head(n=0)

# Speed up execution by keeping only 10% of the queries (currently disabled)
#in_train = sample_queries(in_train, 0.1)

# Preview the first rows of the training frame.
in_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0


In [None]:
# Add mean, median and std per grouping factor (search, destination, property) for a variety of numericals
def add_mean_median_std_per_factor(df, factor, numerical, prefix):
    """Add per-group mean, median and std of ``numerical`` to ``df`` in place.

    Rows are grouped by ``factor``; every row receives the statistics of the
    group it belongs to in three new columns named
    ``<prefix>_mean_<numerical>``, ``<prefix>_median_<numerical>`` and
    ``<prefix>_std_<numerical>``.

    Rows whose ``factor`` value is missing belong to no group (groupby drops
    NaN keys) and get NaN in the new columns instead of raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; modified in place.
    factor : str
        Name of the column to group by.
    numerical : str
        Name of the numeric column to aggregate.
    prefix : str
        Prefix for the names of the new columns.

    Returns
    -------
    None
    """
    grouped = df.groupby(factor)[numerical]
    keys = df[factor]

    # All three aggregates are computed before any column is written, so the
    # new columns can never influence one another.
    stats = (
        ("mean", grouped.mean()),
        ("median", grouped.median()),
        ("std", grouped.std()),
    )
    for stat_name, stat_by_factor in stats:
        # Mapping with the aggregated Series is an index lookup: it avoids a
        # Python-level call per row and yields NaN for keys without a group.
        df[prefix + "_" + stat_name + "_" + numerical] = keys.map(stat_by_factor)
    
# (factor column, column prefix, numerical columns to aggregate).
# The order of the entries fixes the order in which feature columns are added.
_aggregate_specs = [
    ("srch_id", "srch", ["price_usd", "prop_starrating", "prop_review_score"]),
    # These features don't seem to work well, keeping it in for now though
    ("srch_destination_id", "dest", ["price_usd", "prop_starrating", "prop_review_score"]),
    ("prop_id", "prop", ["price_usd", "prop_review_score"]),
]

# Extend both frames in place with the per-factor mean / median / std columns.
for df in [in_train, in_test]:
    for factor, prefix, numericals in _aggregate_specs:
        for numerical in numericals:
            add_mean_median_std_per_factor(df, factor, numerical, prefix)

in_train.head()

In [None]:
# Normalize price per factor
def _standardize(values):
    """Centre ``values`` on their mean and divide by their standard deviation."""
    return (values - values.mean()) / values.std()


for df in [in_train, in_test]:
    # Price relative to the other results of the same search.
    df["price_norm_srch"] = df.groupby("srch_id")["price_usd"].transform(_standardize)
    # Price relative to the other observations of the same property.
    df["price_norm_prop"] = df.groupby("prop_id")["price_usd"].transform(_standardize)
    # Property-normalised price, normalised once more within each search.
    # Grouped afresh because price_norm_prop has only just been added.
    df["price_norm_prop_srch"] = df.groupby("srch_id")["price_norm_prop"].transform(_standardize)


In [2]:
# Split on search ids
# Half of the queries go to train; the other half is split evenly into
# test and validation. Splitting ids (not rows) keeps each query in one set.
srch_ids = in_train["srch_id"].unique()
srch_ids_train, srch_ids_test_val = train_test_split(
    srch_ids, test_size=0.5, random_state=42
)
srch_ids_test, srch_ids_val = train_test_split(
    srch_ids_test_val, test_size=0.5, random_state=42
)

# Select the rows of each subset by membership of their search id.
_train, _val, _test = (
    in_train[in_train["srch_id"].isin(subset_ids)]
    for subset_ids in (srch_ids_train, srch_ids_val, srch_ids_test)
)

In [None]:
# Estimate probability of being clicked and booked
#grouped = _train.groupby("prop_id")    
#click_prob_by_prop = grouped.click_bool.mean().to_dict()
#book_prob_by_prop = grouped.booking_bool.mean().to_dict()

#mean_click_prob = in_train.click_bool.mean()
#mean_book_prob = in_train.booking_bool.mean()
    
#for df in [_train, _val, _test, in_test]:
#    df["click_probability"] = df.prop_id.map(lambda p: click_prob_by_prop.setdefault(p, mean_click_prob))
#    df["booking_probability"] = df.prop_id.map(lambda p: book_prob_by_prop.setdefault(p, mean_book_prob))

In [None]:
# Undersample a series so that there are as many non-clicked items as clicked and booked
def downsample_series(s, random_state=None):
    """Keep all clicked rows of ``s`` and a random subset of the non-clicked rows.

    The non-clicked rows (``click_bool == 0``) are sampled with fraction
    ``min(1, n_clicked / n_total)``; the clicked rows (``click_bool == 1``)
    are always kept. If there are no non-clicked rows, the clicked rows are
    returned as they are.

    Parameters
    ----------
    s : pandas.DataFrame
        Frame with a ``click_bool`` column. Despite the parameter name this is
        used as a frame; the call below passes one ``srch_id`` group at a time.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for a reproducible sample.

    Returns
    -------
    pandas.DataFrame
        Clicked rows plus the sampled non-clicked rows, sorted by index.
    """
    clicked = s.loc[s.click_bool == 1]
    not_clicked = s.loc[s.click_bool == 0]

    if len(not_clicked) == 0:
        return clicked

    keep_frac = min(1, len(clicked) / len(s))
    sampled = not_clicked.sample(frac=keep_frac, random_state=random_state)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([clicked, sampled]).sort_index()

# Undersample every query of the training split separately, then discard the
# index produced by the group-wise apply.
_train_undersampled = (
    _train
    .groupby("srch_id")
    .apply(downsample_series)
    .reset_index(drop=True)
)

In [None]:
# Save to CSV
# Destination path -> frame, written in this order without the index column.
# The full training split is intentionally not exported:
#_train.to_csv("data/processed/_train.csv", index=False)
_csv_outputs = {
    "data/processed/_val.csv": _val,
    "data/processed/_test.csv": _test,
    "data/processed/_train_undersampled.csv": _train_undersampled,
    "data/processed/kaggle_test.csv": in_test,
}
for _csv_path, _csv_frame in _csv_outputs.items():
    _csv_frame.to_csv(_csv_path, index=False)

In [None]:
# Preview the test frame (its first rows, if it has any).
in_test.head()

In [None]:
#pos_by_dest_prop = _train.groupby(["srch_destination_id", "prop_id"]).position.mean().to_dict()
#pos_by_prop = _train.groupby("prop_id").position.mean().to_dict()
#pos_by_dest = _train.groupby("srch_destination_id").position.mean().to_dict()

#total_mean_pos = _train.position.mean()

#def series_add_est_pos(s):
    # TODO: estimate position based on property and destination id

#_val.groupby(["prop_id", "srch_destination_id"]).apply(series_add_est_pos)

4653476    3931
Name: srch_destination_id, dtype: int64
4653476    3931
Name: srch_destination_id, dtype: int64
160862     6475
201194     6475
455818     6475
602985     6475
849798     6475
1245901    6475
1304993    6475
1668432    6475
1941617    6475
2579011    6475
3180218    6475
3516642    6475
4269156    6475
Name: srch_destination_id, dtype: int64
3550611    14215
Name: srch_destination_id, dtype: int64
288598    20983
Name: srch_destination_id, dtype: int64
2614309    1989
Name: srch_destination_id, dtype: int64
1840347    617
Name: srch_destination_id, dtype: int64
1229185    1472
Name: srch_destination_id, dtype: int64
4102948    3480
Name: srch_destination_id, dtype: int64
3052153    10494
Name: srch_destination_id, dtype: int64
3657703    11008
Name: srch_destination_id, dtype: int64
1836100    11381
3222963    11381
4482532    11381
Name: srch_destination_id, dtype: int64
773131    12022
Name: srch_destination_id, dtype: int64
1046820    14385
Name: srch_destination_id,

KeyboardInterrupt: 

In [None]:
# Display the column labels currently present on the training frame.
in_train.columns