# Short description (ranking 60/1463)
'''
An important characteristic of this competition is that there is a time cut-off between training and testing data -- training data end on 7/1/2014, and testing data start on that data. Moreover, the session informations are available only for the data points after 2014. This code used 4 classifiers: (1) XGB trained on all (training) data, (2) RandomForests trained on all data, (3) XGB classifier trained on recent (aka fresh) data only, and (4) RandomForests trained on all data only. The results from each clasifier formed the final prediction via weighted voting.
'''

# Feature_extraction_sessions

In [130]:
import collections
import numpy as np
import pandas as pd
import datetime as dt
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier
# pylint: disable=fixme, no-member

In [17]:
def remove_rare_values_inplace(df_frame, column_list, threshold):
    """ Remove rare values to speed up computation.
    Args:
        df_frame -- A pandas data frame.
        column_list -- A list of columns.
        threshold -- The threshold, below which a value is removed.
    """
    insignificant_population = int(np.floor(threshold * len(df_frame)))
    for cat in column_list:
        freqs = collections.Counter(df_frame[cat])
        other = [i for i in freqs if freqs[i] < insignificant_population]
        for i in other:
            df_frame[cat].replace(i, 'other', inplace=True)

In [23]:
collections.Counter(sessions['action'])

Counter({-1: 79626,
         '10': 3215,
         '11': 716,
         '12': 2209,
         '15': 1053,
         'about_us': 416,
         'accept_decline': 2,
         'account': 9040,
         'acculynk_bin_check_failed': 1,
         'acculynk_bin_check_success': 51,
         'acculynk_load_pin_pad': 50,
         'acculynk_pin_pad_error': 4,
         'acculynk_pin_pad_inactive': 30,
         'acculynk_pin_pad_success': 5,
         'acculynk_session_obtained': 52,
         'active': 188036,
         'add_business_address_colorbox': 9,
         'add_guest_colorbox': 7,
         'add_guests': 60,
         'add_note': 961,
         'agree_terms_check': 10938,
         'agree_terms_uncheck': 598,
         'airbnb_picks': 278,
         'airbrb': 3,
         'ajax_check_dates': 52517,
         'ajax_get_referrals_amt': 11306,
         'ajax_get_results': 369,
         'ajax_google_translate': 290,
         'ajax_google_translate_description': 933,
         'ajax_google_translate_reviews': 95

In [18]:
INDEX_COLUMN = 'user_id'
SECS_ELAPSED_NUMERICAL = 'secs_elapsed'
CATEGORICAL_FEATURES = ['action', 'action_type', 'action_detail', 'device_type']
SESSSIONS_CSV_FILE = 'data/airbnb/sessions.csv'
OUTPUT_TO_CSV_FILE = 'data/airbnb/session_features.csv'  # Results will be saved here.

# A parameter to speed-up computation. Categorical values that appear
# less than the threshold will be removed.
VALUE_THRESHOLD = 0.005

In [19]:
def extract_frequency_counts(pd_frame, column_list):
    """ Extract frequency counts from pd_frame.

    For each index (that correspond to a user) this method will count the
    number of times that C == Ci, where C is a column in column_list, and Ci
    is a unique value of that column. The arg olumn_list is assumed
    to contain categorical columns.

    Args:
        df_frame -- A pandas data frame.
        column_list -- A list of columns.

    Returns:
        A pandas DataFrame, containing frequency counts.
    """
    df_extracted_sessions = []
    for col in column_list:
        for val in set(pd_frame[col]):
            print 'Extracting frequency counts for (%s == %s)' % (col, val)
            tmp_df = pd_frame.groupby(pd_frame.index).apply(
                lambda group, x=col, y=val: np.sum(group[x] == y))
            tmp_df.name = '%s=%s' % (col, val)
            df_extracted_sessions.append(tmp_df)
    frequency_counts = pd.concat(df_extracted_sessions, axis=1)
    return frequency_counts

In [49]:
sessions.head()

Unnamed: 0_level_0,action,action_type,action_detail,device_type,secs_elapsed
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
d1mm9tcy42,lookup,-1,-1,Windows Desktop,319.0
d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
d1mm9tcy42,lookup,-1,-1,Windows Desktop,301.0
d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
d1mm9tcy42,lookup,-1,-1,Windows Desktop,435.0


In [48]:
set(sessions['device_type'])

{'-unknown-',
 'Android App Unknown Phone/Tablet',
 'Android Phone',
 'Mac Desktop',
 'Tablet',
 'Windows Desktop',
 'iPad Tablet',
 'iPhone',
 'other'}

In [65]:
A = list(sessions.groupby(sessions.index))
A[0]

('00023iyk9l',
                            action   action_type                action_detail  \
 user_id                                                                        
 00023iyk9l                  index          view          view_search_results   
 00023iyk9l              dashboard          view                    dashboard   
 00023iyk9l         header_userpic          data               header_userpic   
 00023iyk9l              dashboard          view                    dashboard   
 00023iyk9l                  other         other                        other   
 00023iyk9l                     -1  message_post                 message_post   
 00023iyk9l                  other         other                        other   
 00023iyk9l              requested          view                        other   
 00023iyk9l              requested          view                        other   
 00023iyk9l              requested          view                        other   
 00023iyk9l  

In [71]:
A[0][1].head()

Unnamed: 0_level_0,action,action_type,action_detail,device_type,secs_elapsed
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00023iyk9l,index,view,view_search_results,Mac Desktop,20438.0
00023iyk9l,dashboard,view,dashboard,Mac Desktop,787.0
00023iyk9l,header_userpic,data,header_userpic,Mac Desktop,850.0
00023iyk9l,dashboard,view,dashboard,Mac Desktop,934.0
00023iyk9l,other,other,other,Mac Desktop,-1.0


In [69]:
np.sum(A[0][1]['device_type']=='Mac Desktop')

36

In [20]:
def extract_distribution_stats(pd_frame, numerical_col):
    """ Extract simple distribution statistics from a numerical column.

    Args:
        df_frame -- A pandas data frame.
        numerical_col -- A column in pd_frame that contains numerical values.

    Returns:
        A pandas DataFrame, containing simple satistics for col_name.
    """
    tmp_df = pd_frame[numerical_col].groupby(pd_frame.index).aggregate(
        [np.mean, np.std, np.median, stats.skew])
    tmp_df.columns = ['%s_%s'% (numerical_col, i) for i in tmp_df.columns]
    return tmp_df

In [72]:
SECS_ELAPSED_NUMERICAL

'secs_elapsed'

In [21]:
# Load basic training and testing data, from CSV file.
sessions = pd.read_csv(SESSSIONS_CSV_FILE)
sessions.set_index(INDEX_COLUMN, inplace=True)
sessions.fillna(-1, inplace=True)

In [24]:
# Extract features from sessions.
remove_rare_values_inplace(sessions, CATEGORICAL_FEATURES, VALUE_THRESHOLD)
frequency_counts = extract_frequency_counts(sessions, CATEGORICAL_FEATURES)
simple_stats = extract_distribution_stats(sessions, SECS_ELAPSED_NUMERICAL)

Extracting frequency counts for (action == show)
Extracting frequency counts for (action == similar_listings)
Extracting frequency counts for (action == index)
Extracting frequency counts for (action == search_results)
Extracting frequency counts for (action == confirm_email)
Extracting frequency counts for (action == create)
Extracting frequency counts for (action == header_userpic)
Extracting frequency counts for (action == lookup)
Extracting frequency counts for (action == collections)
Extracting frequency counts for (action == requested)
Extracting frequency counts for (action == qt2)
Extracting frequency counts for (action == personalize)
Extracting frequency counts for (action == update)
Extracting frequency counts for (action == track_page_view)
Extracting frequency counts for (action == notifications)
Extracting frequency counts for (action == active)
Extracting frequency counts for (action == similar_listings_v2)
Extracting frequency counts for (action == identity)
Extracting 

In [25]:
# Save new data.
session_data = pd.concat((frequency_counts, simple_stats), axis=1)
session_data.fillna(-1, inplace=True)
#session_data.to_csv(OUTPUT_TO_CSV_FILE)

In [75]:
session_data.head(10)

Unnamed: 0_level_0,action=show,action=similar_listings,action=index,action=search_results,action=confirm_email,action=create,action=header_userpic,action=lookup,action=collections,action=requested,...,device_type=Android App Unknown Phone/Tablet,device_type=Mac Desktop,device_type=other,device_type=iPhone,device_type=iPad Tablet,device_type=Android Phone,secs_elapsed_mean,secs_elapsed_std,secs_elapsed_median,secs_elapsed_skew
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00023iyk9l,9,3,4,1,1,0,2,0,0,5,...,0,36,0,4,0,0,21697.375,91120.253083,843.0,5.604319
0010k6l0om,20,0,5,8,0,0,1,0,0,0,...,0,63,0,0,0,0,9310.190476,22598.464587,847.0,3.550714
001wyh0pz8,6,0,2,0,0,1,0,0,4,0,...,90,0,0,0,0,0,3144.044444,6541.232094,1104.0,4.913634
0028jgx1x1,15,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,9580.935484,17791.437472,489.0,2.864292
002qnbzfs5,232,0,115,0,1,21,2,0,0,0,...,0,0,0,775,0,0,8221.899873,58074.501937,498.0,19.436206
0031awlkjq,0,0,2,0,0,0,1,0,0,0,...,0,0,0,8,0,0,4946.625,8167.051416,1197.0,1.722653
0035hobuyj,47,12,23,180,1,1,2,148,0,0,...,0,489,0,0,0,0,11706.889571,61264.809869,865.0,16.056027
00378ocvlh,7,0,0,0,0,1,1,0,0,0,...,0,75,0,0,0,0,24362.173333,91144.4388,2157.0,5.531566
00389675gq,33,17,21,6,0,2,2,0,3,0,...,0,0,0,0,0,17,14721.525424,41217.016461,959.0,5.361414
003iamz20l,60,0,0,0,0,1,0,0,0,0,...,0,0,0,0,163,0,9756.042945,29152.050769,378.0,7.411174


In [76]:
session_data.shape

(135483, 71)

# Data_preparation

In [84]:
LABEL = 'country_destination'
CATEGORICAL_FEATURES = ['affiliate_channel', 'affiliate_provider',
                        'first_affiliate_tracked', 'first_browser',
                        'first_device_type', 'gender', 'language', 'signup_app',
                        'signup_method', 'signup_flow']

DATE_FORMAT = '%Y-%m-%d'                # Expected format for date.
ACCOUNT_DATE = 'date_account_created'   # Date column that will be exploited.
ACCOUNT_DATE_YEAR = '%s_%s' % (ACCOUNT_DATE, 'year')
ACCOUNT_DATE_MONTH = '%s_%s' % (ACCOUNT_DATE, 'month')
UNUSED_DATE_COLUMNS = ['timestamp_first_active', 'date_first_booking']

TRAIN_DATA_BASIC = 'data/airbnb/train_users.csv'
TEST_DATA_BASIC = 'data/airbnb/test_users.csv'
SESSION_DATA = 'data/airbnb/session_features.csv'
TRAINING_FINAL_CSV_FILE = 'data/airbnb/training_features.csv'
TESTING_FINAL_CSV_FILE = 'data/airbnb/testing_features.csv'
LABELS_FINAL_CSV_FILE = 'data/airbnb/labels.csv'

# A parameter to speed-up computation. Categorical values that appear
# less than the threshold will be removed.
VALUE_THRESHOLD = 0.001

In [85]:
def _parse_date(date_str, format_str):
    """ Extract features from the data_account_creted column.
    Warning: There is strong dependency between this method and the method
    replace_dates_inplace.
    Args:
        date_str -- A string containing a date value.
        str_format -- The format of the string date.
    Returns:
        A list of 4 values containing the extracted [year, month, day, weekend].
    """
    time_dt = dt.datetime.strptime(date_str, format_str)
    return [time_dt.year, time_dt.month, time_dt.day, time_dt.weekday()]


def extract_dates_inplace(features, date_column):
    """ Extract from the date-columns, year, month, and other numericals.
    Warning: There is strong dependency between this method and _parse_date.
    """
    extracted_vals = np.vstack(features[date_column].apply(
        (lambda x: _parse_date(x, DATE_FORMAT))))
    for i, period in enumerate(['year', 'month', 'day', 'weekday']):
        features['%s_%s' % (date_column, period)] = extracted_vals[:, i]
    features.drop(date_column, inplace=True, axis=1)

In [86]:
def apply_one_hot_encoding(pd_frame, column_list):
    """ Apply One-Hot-Encoding to pd_frame's categorical columns.
    Args:
        df_frame -- A pandas data frame.
        column_list -- A list of categorical columns, in df_frame.
    Returns:
        A pandas dataframe where the colums in column_list have been replaced
            by one-hot-encoded-columns.
    """
    new_column_list = []
    for col in column_list:
        tmp = pd.get_dummies(pd_frame[col], prefix=col)
        new_column_list.append(tmp)
    new_pd_frame = pd.concat(new_column_list+[pd_frame], axis=1)
    new_pd_frame.drop(column_list, inplace=True, axis=1)
    return new_pd_frame

In [113]:
new_column_list = []
for col in CATEGORICAL_FEATURES:
    tmp = pd.get_dummies(features[col], prefix=col)
    print tmp.head()
    print tmp.dtypes
    print type(tmp)
    new_column_list.append(tmp)
new_pd_frame = pd.concat(new_column_list+[features], axis=1)
new_pd_frame.drop(CATEGORICAL_FEATURES, inplace=True, axis=1)
new_pd_frame.head()

            affiliate_channel_api  affiliate_channel_content  \
id                                                             
gxn3p5htnn                      0                          0   
820tgsjxq7                      0                          0   
4ft3gnwmtx                      0                          0   
bjjt8pjhuk                      0                          0   
87mebub9p4                      0                          0   

            affiliate_channel_direct  affiliate_channel_other  \
id                                                              
gxn3p5htnn                         1                        0   
820tgsjxq7                         0                        0   
4ft3gnwmtx                         1                        0   
bjjt8pjhuk                         1                        0   
87mebub9p4                         1                        0   

            affiliate_channel_remarketing  affiliate_channel_sem-brand  \
id                   

Unnamed: 0_level_0,affiliate_channel_api,affiliate_channel_content,affiliate_channel_direct,affiliate_channel_other,affiliate_channel_remarketing,affiliate_channel_sem-brand,affiliate_channel_sem-non-brand,affiliate_channel_seo,affiliate_provider_bing,affiliate_provider_craigslist,...,signup_flow_8,signup_flow_12,signup_flow_23,signup_flow_24,signup_flow_25,signup_flow_other,date_account_created,timestamp_first_active,date_first_booking,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gxn3p5htnn,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2010-06-28,20090319043255,-1,-1.0
820tgsjxq7,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,2011-05-25,20090523174809,-1,38.0
4ft3gnwmtx,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2010-09-28,20090609231247,2010-08-02,56.0
bjjt8pjhuk,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2011-12-05,20091031060129,2012-09-08,42.0
87mebub9p4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2010-09-14,20091208061105,2010-02-18,41.0


In [120]:
for i, period in enumerate(['year', 'month', 'day', 'weekday']):
    print i ,period

0 year
1 month
2 day
3 weekday


In [116]:
extracted_vals = np.vstack(features[ACCOUNT_DATE].apply(
    (lambda x: _parse_date(x, DATE_FORMAT))))
for i, period in enumerate(['year', 'month', 'day', 'weekday']):
    features['%s_%s' % (ACCOUNT_DATE, period)] = extracted_vals[:, i]
features.drop(ACCOUNT_DATE, axis=1)

Unnamed: 0_level_0,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,date_account_created_year,date_account_created_month,date_account_created_day,date_account_created_weekday
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
gxn3p5htnn,20090319043255,-1,-unknown-,-1.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,6,28,0
820tgsjxq7,20090523174809,-1,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,2011,5,25,2
4ft3gnwmtx,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,2010,9,28,1
bjjt8pjhuk,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,2011,12,5,0
87mebub9p4,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,9,14,1
osr2jwljor,20100101215619,2010-01-02,-unknown-,-1.0,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,2010,1,1,4
lsw9q7uk0j,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,2010,1,2,5
0d01nltbrs,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,2010,1,3,6
a1vcnhxeij,20100104004211,2010-07-29,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,2010,1,4,0
6uh8zyj2gn,20100104023758,2010-01-04,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,2010,1,4,0


In [123]:
extracted_vals

array([[2010,    6,   28,    0],
       [2011,    5,   25,    2],
       [2010,    9,   28,    1],
       ..., 
       [2014,    9,   30,    1],
       [2014,    9,   30,    1],
       [2014,    9,   30,    1]])

In [87]:
def get_basic_train_test_data():
    """ Load the basic data in a pandas dataframe, and pre-process them. """
    training = pd.read_csv(TRAIN_DATA_BASIC, index_col=0)
    testing = pd.read_csv(TEST_DATA_BASIC, index_col=0)
    labels = training[LABEL].copy()
    training.drop(LABEL, inplace=True, axis=1)
    features = pd.concat((training, testing), axis=0)
    features.fillna(-1, inplace=True)

    # Process all features by removing rare values, appling one-hot-encoding to
    # those that are categorical and extracting numericals from ACCOUNT_DATE.

    remove_rare_values_inplace(features, CATEGORICAL_FEATURES, VALUE_THRESHOLD)
    features = apply_one_hot_encoding(features, CATEGORICAL_FEATURES)
    extract_dates_inplace(features, ACCOUNT_DATE)
    features.drop(UNUSED_DATE_COLUMNS, inplace=True, axis=1)
    return features, labels, training.index, testing.index

In [88]:
features, labels, training_ids, testing_ids = get_basic_train_test_data()
sessions = pd.read_csv(SESSION_DATA, index_col=0)
features = pd.concat((features, sessions), axis=1)
features.fillna(-1, inplace=True)
# Save data training and testing data.
training = features.ix[training_ids]
testing = features.ix[testing_ids]

# Warning: When saving the data, it's important that the header is True,
# because labels is of type pandas.core.series.Series, while training is of
# type pandas.core.frame.DataFrame, and they have different default values
# for the header argument.

assert set(training.index) == set(labels.index)
training.to_csv(TRAINING_FINAL_CSV_FILE, header=True)
testing.to_csv(TESTING_FINAL_CSV_FILE, header=True)
labels.to_csv(LABELS_FINAL_CSV_FILE, header=True)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [124]:
features

Unnamed: 0_level_0,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,date_account_created_year,date_account_created_month,date_account_created_day,date_account_created_weekday
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
gxn3p5htnn,2010-06-28,20090319043255,-1,-unknown-,-1.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,6,28,0
820tgsjxq7,2011-05-25,20090523174809,-1,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,2011,5,25,2
4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,2010,9,28,1
bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,2011,12,5,0
87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,9,14,1
osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,-1.0,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,2010,1,1,4
lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,2010,1,2,5
0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,2010,1,3,6
a1vcnhxeij,2010-01-04,20100104004211,2010-07-29,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,2010,1,4,0
6uh8zyj2gn,2010-01-04,20100104023758,2010-01-04,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,2010,1,4,0


In [126]:
training.head()

Unnamed: 0_level_0,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome


In [129]:
labels.head()

id
gxn3p5htnn      NDF
820tgsjxq7      NDF
4ft3gnwmtx       US
bjjt8pjhuk    other
87mebub9p4       US
Name: country_destination, dtype: object

# Prediction

In [131]:
DEPTH_XGB, ESTIMATORS_XGB, LEARNING_XGB, SUBSAMPLE_XGB, COLSAMPLE_XGB = (
    7, 60, 0.2, 0.7, 0.6)                # XGBoost parameters.

ESTIMATORS_RF, CRITERION_RF, DEPTH_RF, MIN_LEAF_RF, JOBS_RF = (
    500, 'gini', 20, 8, 30)              # RandomForestClassifier parameters.
FRESH_DATA_YEAR = 2014                   # Year when data is considered fresh.
SUBMISSION_CSV = 'data/airbnb/final_prediction.csv'  # Where to store the predictions.

# Tunning ensemble members. The votes show the importnce of each classfier
# in the final prediction.

XGB_ALL_VOTE, RF_ALL_VOTE, XGB_FRESH_VOTE, RF_FRESH_VOTE = (5, 2, 10, 4)

In [132]:
def perform_prediction(training, labels, testing, xgb_votes, rf_votes):
    """ Perform prediction using a combination of XGB and RandomForests. """
    predictions = np.zeros((len(testing), len(set(labels))))
    # Predictions using xgboost.
    for i in range(xgb_votes):
        print 'XGB vote %d' % i
        xgb = XGBClassifier(
            max_depth=DEPTH_XGB, learning_rate=LEARNING_XGB,
            n_estimators=ESTIMATORS_XGB, objective='multi:softprob',
            subsample=SUBSAMPLE_XGB, colsample_bytree=COLSAMPLE_XGB)
        xgb.fit(training, labels)
        predictions += xgb.predict_proba(testing)
    # Predictions using RandomForestClassifier.
    for i in range(rf_votes):
        print 'RandomForest vote %d' % i
        rand_forest = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF, criterion=CRITERION_RF, n_jobs=JOBS_RF,
            max_depth=DEPTH_RF, min_samples_leaf=MIN_LEAF_RF, bootstrap=True)
        rand_forest.fit(training, labels)
        predictions += rand_forest.predict_proba(testing)
    return predictions

In [133]:
train_df = pd.read_csv(TRAINING_FINAL_CSV_FILE, index_col=0)
labels_df = pd.read_csv(LABELS_FINAL_CSV_FILE, index_col=0)
test_df = pd.read_csv(TESTING_FINAL_CSV_FILE, index_col=0)
assert set(train_df.index) == set(labels_df.index)

encoder = LabelEncoder()
encoder.fit(labels_df[LABEL])
predictions = np.zeros((len(test_df), len(encoder.classes_)))

# Use the full data set for the prediction.
labels = encoder.transform(labels_df[LABEL])
predictions += perform_prediction(
    train_df, labels, test_df, XGB_ALL_VOTE, RF_ALL_VOTE)

# Use only "fresh" data for prediction. Fresh data, are considered those
# that are an ACCOUNT_DATE_YEAR equal or higher than FRESH_DATA_YEAR.

train_fresh = train_df[train_df[ACCOUNT_DATE_YEAR] >= FRESH_DATA_YEAR]
labels_fresh = encoder.transform(labels_df.ix[train_fresh.index][LABEL])
predictions += perform_prediction(
    train_fresh, labels_fresh, test_df, XGB_FRESH_VOTE, RF_FRESH_VOTE)

# Use the 5 classes with highest scores.
ids, countries = ([], [])
for i in range(len(test_df)):
    idx = test_df.index[i]
    ids += [idx] * 5
    countries += encoder.inverse_transform(
        np.argsort(predictions[i])[::-1])[:5].tolist()

# Save prediction in CSV file.
sub = pd.DataFrame(
    np.column_stack((ids, countries)), columns=['id', 'country'])
sub.to_csv(SUBMISSION_CSV, index=False)

XGB vote 0
XGB vote 1
XGB vote 2
XGB vote 3
XGB vote 4
RandomForest vote 0
RandomForest vote 1


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


XGB vote 0
XGB vote 1
XGB vote 2
XGB vote 3
XGB vote 4
XGB vote 5
XGB vote 6
XGB vote 7
XGB vote 8
XGB vote 9
RandomForest vote 0
RandomForest vote 1
RandomForest vote 2
RandomForest vote 3


In [135]:
sub.head()

Unnamed: 0,id,country
0,5uwns89zht,NDF
1,5uwns89zht,US
2,5uwns89zht,other
3,5uwns89zht,IT
4,5uwns89zht,FR
