In [1]:
import pandas as pd
import numpy as np
import random
import logging
from sklearn.model_selection import train_test_split

#Turn on Logging
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Function to sample queries
def sample_queries(df, frac):
    srch_ids = pd.Series(df["srch_id"].unique()).sample(frac=frac, random_state=42)
    return df.loc[df.srch_id.isin(srch_ids)]

in_train = pd.read_csv("data/training_set_VU_DM_2014.csv")
             
#in_test = pd.read_csv("data/testing_set_VU_DM_2014.csv")
             
in_test = pd.read_csv("data/kaggle_test.csv")
#in_test = in_train.head(0)
logging.info('Loading Training data: %s' % in_train)
logging.info('Loading Test data: %s' % in_test)

# Speed up execution
#in_train = sample_queries(in_train, 0.1)

2017-05-17 00:57:29,786 : Loading Training data:          srch_id            date_time  site_id  visitor_location_country_id  \
0              1  2013-04-04 08:32:15       12                          187   
1              1  2013-04-04 08:32:15       12                          187   
2              1  2013-04-04 08:32:15       12                          187   
3              1  2013-04-04 08:32:15       12                          187   
4              1  2013-04-04 08:32:15       12                          187   
5              1  2013-04-04 08:32:15       12                          187   
6              1  2013-04-04 08:32:15       12                          187   
7              1  2013-04-04 08:32:15       12                          187   
8              1  2013-04-04 08:32:15       12                          187   
9              1  2013-04-04 08:32:15       12                          187   
10             1  2013-04-04 08:32:15       12                          187   
11 

In [2]:
# Add mean, median and std per property for a variety of numericals
def add_mean_median_std_per_factor(df, factor, numerical, prefix):
    grouped = df.groupby(factor)[numerical]
    mean_by_factor = grouped.mean().to_dict()
    median_by_factor = grouped.median().to_dict()
    std_by_factor = grouped.std().to_dict()
    
    df[prefix + "_mean_" + numerical] = df[factor].map(lambda f: mean_by_factor[f])
    df[prefix + "_median_" + numerical] = df[factor].map(lambda f: median_by_factor[f])
    df[prefix + "_std_" + numerical] = df[factor].map(lambda f: std_by_factor[f])
    
for df in [in_train, in_test]:
    f = add_mean_median_std_per_factor
    f(df, "srch_id", "price_usd", "srch")
    f(df, "srch_id", "prop_starrating", "srch")
    f(df, "srch_id", "prop_review_score", "srch")
    
    # These features don't seem to work well, keeping it in for now though
    #f(df, "srch_destination_id", "price_usd", "dest")
    #f(df, "srch_destination_id", "prop_starrating", "dest")
    #f(df, "srch_destination_id", "prop_review_score", "dest")
    
    #f(df, "prop_id", "price_usd", "prop")
    #f(df, "prop_id", "prop_review_score", "prop")
    
in_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,booking_bool,srch_mean_price_usd,srch_median_price_usd,srch_std_price_usd,srch_mean_prop_starrating,srch_median_prop_starrating,srch_std_prop_starrating,srch_mean_prop_review_score,srch_median_prop_review_score,srch_std_prop_review_score
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,0,163.718929,139.05,96.355255,3.071429,3.0,0.766356,3.482143,3.5,1.109572
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,0,163.718929,139.05,96.355255,3.071429,3.0,0.766356,3.482143,3.5,1.109572
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,0,163.718929,139.05,96.355255,3.071429,3.0,0.766356,3.482143,3.5,1.109572
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,0,163.718929,139.05,96.355255,3.071429,3.0,0.766356,3.482143,3.5,1.109572
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,0,163.718929,139.05,96.355255,3.071429,3.0,0.766356,3.482143,3.5,1.109572


In [4]:
# Normalize price per query
for df in [in_train, in_test]:
    df["price_usd"] = df.groupby("srch_id")["price_usd"].transform(lambda x: (x - x.mean()) / x.std())

In [5]:
# Estimate probability of being clicked and booked
grouped = in_train.groupby("prop_id")    
click_prob_by_prop = grouped.click_bool.mean().to_dict()
book_prob_by_prop = grouped.booking_bool.mean().to_dict()

mean_click_prob = in_train.click_bool.mean()
mean_book_prob = in_train.booking_bool.mean()
    
for df in [in_train, in_test]:
    df["click_probability"] = df.prop_id.map(lambda p: click_prob_by_prop.setdefault(p, mean_click_prob))
    df["booking_probability"] = df.prop_id.map(lambda p: book_prob_by_prop.setdefault(p, mean_book_prob))

In [15]:
# Undersample a series so that there are as many non-clicked items as clicked and booked
def downsample_series(s):
    s1 = s.loc[s.click_bool == 1]
    s2 = s.loc[s.click_bool == 0]
    
    if (len(s2) == 0):
        return s1
    
    s2 = s2.sample(frac=min(1, len(s1)/len(s)))
    return s1.append(s2).sort_index()

# Split on search ids
srch_ids = in_train["srch_id"].unique()
srch_ids_train, srch_ids_test_val = train_test_split(srch_ids, test_size=0.2, random_state = 42)
srch_ids_test, srch_ids_val = train_test_split(srch_ids_test_val, test_size=0.5, random_state = 42)

_train = in_train.loc[in_train.srch_id.isin(srch_ids_train)]
_val = in_train.loc[in_train.srch_id.isin(srch_ids_val)]
_test = in_train.loc[in_train.srch_id.isin(srch_ids_test)]
_train_undersampled = _train.groupby("srch_id").apply(downsample_series).reset_index(drop=True)

In [16]:
# Save to CSV
#_train.to_csv("data/processed/_train.csv", index=False)
_val.to_csv("data/processed/_val.csv", index=False)
_test.to_csv("data/processed/_test.csv", index=False)
_train_undersampled.to_csv("data/processed/_train_undersampled.csv", index=False)
in_test.to_csv("data/processed/kaggle_test.csv", index=False)

In [1]:
_val['visitor_hist_starrating']

NameError: name '_val' is not defined

In [4]:
area_map = df.visitor_location_country_id.unique()
area_map.sort()

In [5]:
area_map

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  44,  45,  46,  47,  48,  50,  51,  52,  53,  54,
        55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,
        68,  69,  70,  71,  72,  73,  74,  76,  77,  78,  79,  80,  81,
        82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,
        95,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
       109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122,
       123, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
       137, 138, 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150,
       151, 152, 153, 154, 155, 156, 157, 158, 160, 161, 162, 163, 164,
       166, 167, 168, 169, 170, 171, 172, 173, 174, 176, 177, 178, 179,
       180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 19

In [6]:
grouped = in_train.groupby('visitor_location_country_id')

In [58]:
grouped.get_group['1']

AttributeError: 'dict' object has no attribute 'get_group'

In [7]:
len(grouped)

210

In [8]:
in_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4958347 entries, 0 to 4958346
Data columns (total 60 columns):
srch_id                        int64
date_time                      object
site_id                        int64
visitor_location_country_id    int64
visitor_hist_starrating        float64
visitor_hist_adr_usd           float64
prop_country_id                int64
prop_id                        int64
prop_starrating                int64
prop_review_score              float64
prop_brand_bool                int64
prop_location_score1           float64
prop_location_score2           float64
prop_log_historical_price      float64
position                       int64
price_usd                      float64
promotion_flag                 int64
srch_destination_id            int64
srch_length_of_stay            int64
srch_booking_window            int64
srch_adults_count              int64
srch_children_count            int64
srch_room_count                int64
srch_saturday_night_b

In [64]:
in_train

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,price_usd_mean,price_usd_median,prop_starrating_mean,prop_starrating_median,prop_review_score_mean,prop_review_score_median
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,0,,0,163.718929,139.050,3.071429,3.0,3.482143,3.50
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,0,,0,163.718929,139.050,3.071429,3.0,3.482143,3.50
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,0,,0,163.718929,139.050,3.071429,3.0,3.482143,3.50
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,5.0,0,,0,163.718929,139.050,3.071429,3.0,3.482143,3.50
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,0,,0,163.718929,139.050,3.071429,3.0,3.482143,3.50
5,1,2013-04-04 08:32:15,12,187,,,219,30184,4,4.5,...,7.0,0,,0,163.718929,139.050,3.071429,3.0,3.482143,3.50
6,1,2013-04-04 08:32:15,12,187,,,219,44147,3,3.5,...,,0,,0,163.718929,139.050,3.071429,3.0,3.482143,3.50
7,1,2013-04-04 08:32:15,12,187,,,219,50984,2,0.0,...,,0,,0,163.718929,139.050,3.071429,3.0,3.482143,3.50
8,1,2013-04-04 08:32:15,12,187,,,219,53341,4,4.0,...,6.0,0,,0,163.718929,139.050,3.071429,3.0,3.482143,3.50
9,1,2013-04-04 08:32:15,12,187,,,219,56880,4,4.0,...,,0,,0,163.718929,139.050,3.071429,3.0,3.482143,3.50


In [9]:
trial_df = df[df["srch_destination_id"]== 3]
trial_df.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,price_usd_mean,price_usd_median,prop_starrating_mean,prop_starrating_median,prop_review_score_mean,prop_review_score_median
4164643,279427,2012-11-19 04:57:28,14,190,,,53,1675,4,0.0,...,,1,,0,128.72625,125.465,3.625,4.0,2.0625,1.75
4164644,279427,2012-11-19 04:57:28,14,190,,,53,3080,4,4.5,...,,0,,0,128.72625,125.465,3.625,4.0,2.0625,1.75
4164645,279427,2012-11-19 04:57:28,14,190,,,53,23029,3,3.5,...,,0,,0,128.72625,125.465,3.625,4.0,2.0625,1.75
4164646,279427,2012-11-19 04:57:28,14,190,,,53,25247,4,0.0,...,,0,,0,128.72625,125.465,3.625,4.0,2.0625,1.75
4164647,279427,2012-11-19 04:57:28,14,190,,,53,39801,4,4.5,...,,0,,0,128.72625,125.465,3.625,4.0,2.0625,1.75


In [1]:
# Downsample a series so that there are as many non-clicked items as clicked and booked
def addToFile(file, what):
    f = open(file, 'a').write(what)
def downsample_series(s):
    s1 = s.loc[s.click_bool == 1]
    s2 = s.loc[s.click_bool == 0]
    
    if (len(s2) == 0):
        return s1
    
    s2 = s2.sample(frac=min(1, len(s1)/len(s)))
    return s1.append(s2).sort_index()
def groupby_and_downsample(df):
    x=0
    column_areas1=[1,2,3,4,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,  21,  22,  23,  25,  26,  27,  28,
        29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
        42,  44,  45,  46,  47,  48,  50,  51,  52,  53,  54,  55,  56,
        57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
        70,  71,  72,  73,  74,  76,  77,  78,  79,  80,  81,  82,  83,
        84,  85,  86,  87,  88,  90,  91,  92,  93,  94,  95,  97,  98,
        99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 125, 126, 127,
       128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
       142, 145, 146, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
       158, 160, 161, 162, 163, 164, 166, 167, 168, 169, 170, 172, 173,
       174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
       188, 189, 190, 191, 193, 194, 195, 196, 198, 199, 200, 201, 202,
       203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,
       216, 217, 218, 219, 220, 221, 222, 223, 224, 226, 227, 228, 229,
       230, 231]
    for i in column_areas:
            trial_df = df[df['srch_destination_id'] == column_areas[i]] 
            srch_ids = trial_df["srch_id"].unique()
            srch_ids_train, srch_ids_test_val = train_test_split(srch_ids, test_size=0.2, random_state = 42)
            srch_ids_test, srch_ids_val = train_test_split(srch_ids_test_val, test_size=0.5, random_state = 42)
            _train = trial_df.loc[trial_df.srch_id.isin(srch_ids_train)]
            _val = trial_df.loc[trial_df.srch_id.isin(srch_ids_val)]
            _test = trial_df.loc[trial_df.srch_id.isin(srch_ids_test)]
            _train_undersampled = _train.groupby("srch_id").apply(downsample_series).reset_index(drop=True)
            addToFile("processed/_train.csv",_train)
            addToFile("processed/_val.csv",_val)
            addToFile("processed/_test.csv",_test)
            addToFile("processed/_train_undersampled.csv",_train_undersampled)
#_train = in_train.loc[in_train.srch_id.isin(srch_ids_train)]
#_val = in_train.loc[in_train.srch_id.isin(srch_ids_val)]
#_test = in_train.loc[in_train.srch_id.isin(srch_ids_test)]
#_train_undersampled = _train.groupby("srch_id").apply(downsample_series).reset_index(drop=True)


# Split on search ids
#srch_ids = in_train["srch_id"].unique()
#srch_ids_train, srch_ids_test_val = train_test_split(srch_ids, test_size=0.2, random_state = 42)
#srch_ids_test, srch_ids_val = train_test_split(srch_ids_test_val, test_size=0.5, random_state = 42)



In [2]:
groupby_and_downsample(_train)

NameError: name '_train' is not defined

In [107]:
srch_ids_train

array([ 71, 173, 209,  26, 222, 142, 128, 206, 138, 151,  51,  81,  41,
       191, 111, 145,  10, 216, 203,  70, 164, 188,  78, 166, 226,  50,
        69,  23, 130, 214,  22,  36, 152, 125,  84, 168, 229, 149, 174,
       211, 182,  68, 148, 198,  74, 100,  88, 178, 163, 120, 102, 194,
       201,  53, 204,  65, 132, 199, 134,   9,  99, 116,   7,  92, 106,
       189,  94, 176,  55, 224,  63,  58, 107,  27, 153, 205, 212, 105,
       136,  87, 187, 121,  54, 127, 227,  97, 137,  82,  15,  30,  98,
        25, 110,  38, 103, 180,  56,  57, 161, 160,  17, 129, 230,   6,
        44, 139, 172, 208,  13, 126,  32, 220, 115,  11, 184,  66, 101,
       109,  77, 181,  90, 140,  76,  39, 122,   5, 207, 196,  62, 217,
       162,  16, 131,  33, 190, 150, 179, 123, 117, 113,  73,  42, 200,
        95,  48, 158, 210,  34, 219,  14, 108, 195,  29, 114, 186, 213,
        86, 223, 157,   3,  85, 193,  45,  93,  35,  47, 185,  37], dtype=int64)

In [5]:
# Save to CSV
_train.to_csv("data/processed/_train.csv", index=False)
_val.to_csv("data/processed/_val.csv", index=False)
_test.to_csv("data/processed/_test.csv", index=False)
_train_undersampled.to_csv("data/processed/_train_undersampled.csv", index=False)
in_test.to_csv("data/processed/kaggle_test.csv")