#### Introduction

Airbnb provided 4 years worth of anonymized data to Kaggle. The goal of competition is to predict the booked destination based on historical web session data. Session data is preprocessed and features are engineered based on given data and XGBoost is used for softmax classification.

Based on variable importance analysis, it was determined that month-to-month seasonality led to overfitting. Training using only T-1YR worth of data led to better accuracy. Other ways to improve this model may be to cluster users and destinations or using blended models.

In [35]:
import pdb
import csv
import pandas as pd
import os
from pprint import pprint as pp
from collections import defaultdict, Counter
from operator import itemgetter
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import matplotlib.dates as mpldt
import warnings
import rank_metrics

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
DATASET_PATHS = {'train_users': 'train_users_2.csv',
        'sessions': 'sessions.csv',
        'countries': 'countries.csv',
        'age_gender_bkts': 'age_gender_bkts.csv',
        'test_users': 'test_users.csv'}

DATASETS = {}
DATASETS['training'] = pd.DataFrame.from_csv(DATASET_PATHS['train_users'], header=0)
DATASETS['age_gender_bkts'] = pd.DataFrame.from_csv(DATASET_PATHS['age_gender_bkts'], header=0)
DATASETS['countries'] = pd.DataFrame.from_csv(DATASET_PATHS['countries'], header=0)
DATASETS['sessions'] = pd.DataFrame.from_csv(DATASET_PATHS['sessions'], header=0)
DATASETS['testing'] = pd.DataFrame.from_csv(DATASET_PATHS['test_users'], header=0)

In [37]:
df_train = DATASETS['training'].copy()
df_test = DATASETS['testing'].copy()
df_age_gender_buckets = DATASETS['age_gender_bkts']

df_test_x = df_test.copy()
df_train_y = df_train['country_destination']
df_train_x = df_train.drop('country_destination', axis=1)

#### Preprocess sessions data

In [38]:
def group_datasets(df, group_by_columns, pivot_by, column_length=None, custom_columns=None):
    
    df = df.reset_index().groupby(group_by_columns).sum().reset_index()
    
    if column_length:
         columns, count = zip(*Counter(df[pivot_by]).most_common()[:column_length])
    else:
         columns, count = zip(*Counter(df[pivot_by]).most_common())
        
    columns = list(columns)
    if custom_columns:
        aggregated_df = df.pivot_table('secs_elapsed', 'user_id', pivot_by)[custom_columns] 
    else:
        aggregated_df = df.pivot_table('secs_elapsed', 'user_id', pivot_by)[columns] 
    
    aggregated_df.fillna(-1, inplace=True)
    
    return aggregated_df

In [39]:
def group_datasets_w_device(df, group_by_columns, pivot_by, column_length=None, custom_columns=None):
    
    group_by_columns = group_by_columns + ['device_type']
    df = df.reset_index().groupby(group_by_columns + []).sum().reset_index()
    
    pivot_label = pivot_by + '_device_type'
    df[pivot_label] = df[pivot_by] + '_' + df.device_type
    
    if column_length:
        columns, count = zip(*Counter(df[pivot_label]).most_common()[:column_length])
    else:
        columns, count = zip(*Counter(df[pivot_label]).most_common())
        
    columns = list(columns)
    if custom_columns:
        aggregated_df = df.pivot_table('secs_elapsed', 'user_id', pivot_label)[custom_columns] 
    else:
        aggregated_df = df.pivot_table('secs_elapsed', 'user_id', pivot_label)[columns] 
    
    aggregated_df.fillna(-1, inplace=True)
    
    return aggregated_df

In [40]:
device_summary = group_datasets(DATASETS['sessions'].copy(),
                                ['user_id','device_type'],
                                'device_type')

In [41]:
action_details_summary = group_datasets(DATASETS['sessions'].copy(), 
                                        ['user_id','action_detail'],
                                        'action_detail', column_length=60)

In [42]:
actions_generic_summary = group_datasets(DATASETS['sessions'].copy(), 
                                        ['user_id','action'],
                                        'action', column_length=60)

In [43]:
actions_type_columns = ['-unknown-', 'view', 'data', 'click', 'submit', 'message_post', 
                        'partner_callback', 'booking_request', 'booking_response']
actions_type_summary = group_datasets(DATASETS['sessions'].copy(), 
                                        ['user_id','action_type'],
                                        'action_type', custom_columns=actions_type_columns)

#### Merge all session data

In [44]:
action_summary = pd.merge(action_details_summary.reset_index(), actions_type_summary.reset_index(), on='user_id', how='outer')
action_summary = pd.merge(action_summary.reset_index(), actions_generic_summary.reset_index(), on='user_id', how='outer')
sessions_summary = pd.merge(action_summary.reset_index(), device_summary.reset_index(), on='user_id', how='outer')
sessions_summary.fillna(-1, inplace=True)
sessions_summary.head()

Unnamed: 0,index,index_x,user_id,-unknown-_x,p3,header_userpic_x,view_search_results,wishlist_content_update,user_profile,change_trip_characteristics,...,Android Phone,iPad Tablet,Android App Unknown Phone/Tablet,Tablet,Linux Desktop,Chromebook,Windows Phone,Blackberry,iPodtouch,Opera Phone
0,0,0,00023iyk9l,-1,60596,3198,54791,3515,-1,1447,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1,1,0010k6l0om,123033,229119,585,75951,135661,-1,20110,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,2,2,001wyh0pz8,22181,10639,-1,158796,-1,1510,-1,...,-1,-1,282965,-1,-1,-1,-1,-1,-1,-1
3,3,3,0028jgx1x1,489,1027,-1,199802,-1,87089,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,4,4,002qnbzfs5,738169,56386,10954,493417,-1,125071,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


#### Data preprocessing on the main dataset:

In [45]:
categorical_features = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 
                        'first_device_type', 'first_browser']

In [46]:
# check if feature frequencies are similar between training and testing set
def print_feature_counter(df_train, df_test):
    for f in categorical_features:
        print 'feature: {}'.format(f)
        print 'train set types: {}'.format(set(df_train[f]))
        print 'test set types: {}'.format(set(df_test[f]))
        
        num_training_records = len(df_train)
        num_testing_records = len(df_test)
        
        print 'train set types: {}'.format(Counter(df_train[f]))
        print 'test set types: {} \n'.format(Counter(df_test[f]))
        
# print_feature_counter(df_train_x, df_test_x)

#### Preprocess Age Data

In [47]:
continuous_features = ['age']

In [48]:
def how_many_nan(df, f):
    print '# of records: {}'.format(len(df))
    print '# of records w/ nan in attribute {}: {}'.format(
        f, len(df[np.isnan(df[f])]))
    print '% of records w/ nan in attribute {}: {}%'.format(
        f, round(len(df[np.isnan(df[f])]) / float(len(df[f])) * 100, 1))
    
how_many_nan(df_train_x, 'age')
how_many_nan(df_test_x, 'age') 

# of records: 213451
# of records w/ nan in attribute age: 87990
% of records w/ nan in attribute age: 41.2%
# of records: 62096
# of records w/ nan in attribute age: 28876
% of records w/ nan in attribute age: 46.5%


In [49]:
df_train_x['age'][(df_train_x['age'] <= 18) & (df_train_x['age'] >= 90)] = float('nan') 
df_test_x['age'][(df_test_x['age'] <= 18) & (df_test_x['age'] >= 90)] = float('nan')

df_train_x['is_age_nan'] = 0
df_train_x['is_age_nan'][df_train_x['age'].isnull()] = 1
df_test_x['is_age_nan'] = 0
df_test_x['is_age_nan'][df_test_x['age'].isnull()] = 1

#### Preprocess datetime features

In [50]:
datetime_features = ['date_account_created', 'timestamp_first_active']

In [51]:
import calendar
import math
month_num2abbr = dict((k,v) for k,v in enumerate(calendar.month_abbr))

def get_month_abbr(num):
    if math.isnan(num):
        return float('nan')
    
    return month_num2abbr[num]

In [52]:
import holidays
import datetime
day_of_week_mapper = {0:'M', 1:'T', 2:'W', 3:'R', 4:'F', 5:'S', 6:'U'}
us_holidays = holidays.UnitedStates() 
for dataset in [df_train_x, df_test_x]:
    for f in datetime_features:
        if f == 'timestamp_first_active':
            dataset.loc[:, f] = [datetime.date(int(str(x)[:4]), int(str(x)[4:6]), int(str(x)[6:8])) 
                                 for x in dataset.loc[:, f]]
        else:
            dataset.loc[:, f] = pd.to_datetime(dataset.loc[:, f])
        
        # mapping month as categorical feature to keep seasonality
        dataset[f + '_day_of_week'] = [day_of_week_mapper[x] for x in pd.DatetimeIndex(dataset[f]).dayofweek]
        dataset[f + '_day'] = pd.DatetimeIndex(dataset[f]).day
        dataset[f + '_month'] = map(get_month_abbr, pd.DatetimeIndex(dataset[f]).month)
        dataset[f + '_year'] = pd.DatetimeIndex(dataset[f]).year - min(pd.DatetimeIndex(dataset[f]).year)
        dataset[f + '_is_holiday'] = [x in us_holidays for x in pd.DatetimeIndex(dataset[f])]

####  only select most recent last year 

In [53]:
df_train_y = df_train_y[df_train_x.date_account_created_year == 4]
df_train_x = df_train_x[df_train_x.date_account_created_year == 4]

In [54]:
to_drop_features = ['date_account_created', 'timestamp_first_active', 'date_first_booking']
for dataset in [df_train_x, df_test_x]:
    dataset.drop(to_drop_features, axis=1, inplace=True)

#### Merge user account with sessions data

In [55]:
# need to merge training and testing set to ensure 
# all possible categorical features are encoded

df_train_x['source'] = 'train'
df_test_x['source'] = 'test'
_df = pd.concat([df_train_x, df_test_x])

# merge main dataset with sessions action data
_df = _df.reset_index()
_df = _df.rename(columns = {'id':'user_id'})
_df = pd.merge(_df, sessions_summary, how='left', on=['user_id'])
_df.fillna(-1, inplace=True)
_df = _df.set_index('user_id')

#### Categorical encoding

In [56]:
_df_encoded = pd.get_dummies(_df)
_df_encoded.fillna(-1, inplace=True)

df_train_x_encoded = _df_encoded[_df_encoded['source_train'] == 1]
df_test_x_encoded = _df_encoded[_df_encoded['source_test'] == 1]
    
df_train_x_encoded.drop(['source_train', 'source_test'], axis=1, inplace=True)
df_test_x_encoded.drop(['source_train', 'source_test'], axis=1, inplace=True)

#### Dataframe to values

In [57]:
import sklearn.preprocessing as pp
le = pp.LabelEncoder()
train_y_encoded = le.fit_transform(df_train_y)

x_train = df_train_x_encoded.values
x_test = df_test_x_encoded.values
y_train = train_y_encoded

#### Create custom NDCG scorer 

In [59]:
def score_ndcg(y_train, predicted_probs):
    predicted_labels = []
    for predicted_prob in predicted_probs:
        predicted_labels.append(np.argsort(predicted_prob)[::-1][:5])

    score = 0
    for prediction, correct_label in zip(predicted_labels, y_train):
        ranks = []
        for pred_label in zip(prediction):
            if pred_label == correct_label:
                ranks.append(1)
            else:
                ranks.append(0)

        score += rank_metrics.ndcg_at_k(ranks, 5, method=1)

    return round(score / float(len(y_train)), 4)

#### Train XGBoost

In [61]:
## XGBoost
## with hyperparameter-tuning
## w/ engineered features with all session variables
## best set: alpha=0.2, n_estimators=50, score=0.8319

from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import StratifiedKFold
from itertools import product
import pdb

kfold = StratifiedKFold(y=y_train, n_folds=2, shuffle=True, random_state=None)

params_range = {}
params_range['max_depth'] = [6, 8]
params_range['learning_rate'] = [0.2] 
params_range['n_estimators'] = [20] 
params_range['reg'] = [7.5]
params_range['reg_lambda'] = [1]
params_range['subsample'] = [0.5]
params_range['colsample_bytree'] = [0.75, 0.85]

# grid search using k-holdout cross-validation
for max_depth, alpha, n_estimators, reg, reg_lambda, subsample, colsample_bytree in product(params_range['max_depth'], params_range['learning_rate'], 
                                              params_range['n_estimators'], params_range['reg'], params_range['reg_lambda'], 
                                                   params_range['subsample'], params_range['colsample_bytree']): 
    
    xgb = XGBClassifier(max_depth=max_depth, learning_rate=alpha, n_estimators=n_estimators,
                        objective='multi:softprob', subsample=subsample, colsample_bytree=colsample_bytree, 
                        seed=0, reg_alpha=reg, reg_lambda=reg_lambda, silent=False) 
        
    for k, (train, test) in enumerate(kfold):
        xgb.fit(x_train[train], y_train[train])
            
        print 'Fold: {}, Accuracy: {}, max_depth: {}, learning_rate: {}, n_estimators: {}, reg: {}, reg_lambda:{}, subsample: {}, colsample_bytree: {}'.format(
            k, score_ndcg(y_train[test], xgb.predict_proba(x_train[test])), 
            max_depth, alpha, n_estimators, reg, reg_lambda, subsample, colsample_bytree)

    print '\n'

Fold: 0, Accuracy: 0.8572, max_depth: 6, learning_rate: 0.2, n_estimators: 20, reg: 7.5, reg_lambda:1, subsample: 0.5, colsample_bytree: 0.75
Fold: 1, Accuracy: 0.8563, max_depth: 6, learning_rate: 0.2, n_estimators: 20, reg: 7.5, reg_lambda:1, subsample: 0.5, colsample_bytree: 0.75


Fold: 0, Accuracy: 0.857, max_depth: 6, learning_rate: 0.2, n_estimators: 20, reg: 7.5, reg_lambda:1, subsample: 0.5, colsample_bytree: 0.85
Fold: 1, Accuracy: 0.8563, max_depth: 6, learning_rate: 0.2, n_estimators: 20, reg: 7.5, reg_lambda:1, subsample: 0.5, colsample_bytree: 0.85


Fold: 0, Accuracy: 0.8568, max_depth: 8, learning_rate: 0.2, n_estimators: 20, reg: 7.5, reg_lambda:1, subsample: 0.5, colsample_bytree: 0.75
Fold: 1, Accuracy: 0.8564, max_depth: 8, learning_rate: 0.2, n_estimators: 20, reg: 7.5, reg_lambda:1, subsample: 0.5, colsample_bytree: 0.75


Fold: 0, Accuracy: 0.8565, max_depth: 8, learning_rate: 0.2, n_estimators: 20, reg: 7.5, reg_lambda:1, subsample: 0.5, colsample_bytree: 0.85
F

#### Train classifier with optimal parameters and predict

In [62]:
from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier(max_depth=8, learning_rate=0.2, n_estimators=25, 
                    objective='multi:softprob', subsample=0.5, 
                    colsample_bytree=0.5, seed=0, reg_alpha=7.5)

xgb.fit(x_train, y_train)
predicted_probs = xgb.predict_proba(x_test)
predicted_labels = []
for predicted_prob in predicted_probs:
    predicted_labels.append(le.inverse_transform(np.argsort(predicted_prob))[::-1][:5])

#### Output file

In [64]:
import csv
from collections import defaultdict
solution_set = defaultdict(list)

predicted_labels_list = predicted_labels
with open('results/predictions_xgb_2014.csv', 'wb') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(('id', 'country'))
    for id, predicted_labels in zip(df_test_x_encoded.index.values, predicted_labels_list):
        for predicted_label, rank in zip(predicted_labels, xrange(5)):      
            solution_set[rank].append(predicted_label)
            csv_writer.writerow((id, predicted_label))