File descriptions

train_users.csv - the training set of users

test_users.csv - the test set of users

id: user id

date_account_created: the date of account creation

timestamp_first_active: timestamp of the first activity, note that it can be earlier than date_account_created or
date_first_booking because a user can search before signing up

date_first_booking: date of first booking

gender

age

signup_method

signup_flow: the page a user came to signup up from

language: international language preference

affiliate_channel: what kind of paid marketing

affiliate_provider: where the marketing is e.g. google, craigslist, other

first_affiliate_tracked: whats the first marketing the user interacted with before the signing up

signup_app

first_device_type

first_browser

country_destination: this is the target variable you are to predict


In [35]:
# AirBNB script
import sys
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
import numpy as np 
from pandasql import sqldf
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier
np.random.seed(0)

Reading in and joining the train and test user datasets

In [36]:
# Reading user datasets into pandas. Want to predict destination country (target feature)
orig_train_users = pd.read_csv('/Users/nicholaslipanovich/Documents/airbnb_kaggle/train_users_2.csv')
orig_test_users = pd.read_csv('/Users/nicholaslipanovich/Documents/airbnb_kaggle/test_users.csv')
# getting the labels from the training set into a numpy array
labels = orig_train_users['country_destination'].values
# dropping the labels column from the training set
orig_train_users = orig_train_users.drop(['country_destination'], axis=1)
# getting the ids from the test set into a numpy array
id_test = orig_test_users['id']
# getting the number of rows in the training set to use when splitting the dataset back into test and train
piv_train = orig_train_users.shape[0]
orig_train_users.head(n=10)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome
6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari
7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari
8,a1vcnhxeij,2010-01-04,20100104004211,2010-07-29,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari
9,6uh8zyj2gn,2010-01-04,20100104023758,2010-01-04,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox


In [37]:
# Creating a dataframe with train+test data
train_plus_test = pd.concat((orig_train_users, orig_test_users), axis=0, ignore_index=True)
# Removing data_first_booking... why? This column is all NaNs in the test set.
train_plus_test = train_plus_test.drop(['date_first_booking'], axis=1)
# Filling NaNs in with negative ones
train_plus_test = train_plus_test.fillna(-1)

Feature Engineering

In [38]:
# making date_account_created into features that can be used in modeling
# splitting each date by year, month, and day. Turning them into a Series of int lists, then into a numpy array, and 
# finally vertically stacking the array
dac = np.vstack(train_plus_test.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
# making new year, month, and day features
train_plus_test['dac_year'] = dac[:,0]
train_plus_test['dac_month'] = dac[:,1]
train_plus_test['dac_day'] = dac[:,2]
# dropping the original date_account_created feature, since we got what we needed (new features) out of it
train_plus_test = train_plus_test.drop(['date_account_created'], axis=1)
print dac[:2,:]
train_plus_test.head()

[[2010    6   28]
 [2011    5   25]]


Unnamed: 0,id,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,dac_year,dac_month,dac_day
0,gxn3p5htnn,20090319043255,-unknown-,-1,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,6,28
1,820tgsjxq7,20090523174809,MALE,38,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,2011,5,25
2,4ft3gnwmtx,20090609231247,FEMALE,56,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,2010,9,28
3,bjjt8pjhuk,20091031060129,FEMALE,42,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,2011,12,5
4,87mebub9p4,20091208061105,-unknown-,41,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,9,14


In [39]:
# making timestamp_first_active into features that can be used in modeling
# splitting each timestamp by year, month, day, hour, minute, and second and then putting year, month, and day into a stacked
# numpy array. Not using day, hour, or minute as features.
tfa = np.vstack(train_plus_test.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
train_plus_test['tfa_year'] = tfa[:,0]
train_plus_test['tfa_month'] = tfa[:,1]
train_plus_test['tfa_day'] = tfa[:,2]
train_plus_test['tfa_hour'] = tfa[:,3]
train_plus_test = train_plus_test.drop(['timestamp_first_active'], axis=1)
train_plus_test.head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day,tfa_hour
0,gxn3p5htnn,-unknown-,-1,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,6,28,2009,3,19,4
1,820tgsjxq7,MALE,38,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,2011,5,25,2009,5,23,17
2,4ft3gnwmtx,FEMALE,56,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,2010,9,28,2009,6,9,23
3,bjjt8pjhuk,FEMALE,42,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,2011,12,5,2009,10,31,6
4,87mebub9p4,-unknown-,41,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,9,14,2009,12,8,6


In [40]:
# Making outlier age values equal to negative one
av = train_plus_test.age.values
train_plus_test['age'] = np.where(np.logical_or(av<18, av>100), -1, av)

In [41]:
#One-hot-encoding features (making indicator variables)
# getting all categorical feature names
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
# for every categorical feature
for f in ohe_feats:
    # create an indicator variable for each unique categorical level
    train_plus_test_indicator_vars = pd.get_dummies(train_plus_test[f], prefix=f)
    # drop the categorical feature from the dataset
    train_plus_test = train_plus_test.drop([f], axis=1)
    # concatenate the train_plus_test dataset with the indicator feature dataset for that category
    # excluding the first indicator variable, so that I don't fall into the "dummy variable trap"
    train_plus_test = pd.concat((train_plus_test, train_plus_test_indicator_vars.ix[:,1:]), axis=1)
    

Joining the users and sessions datasets

In [42]:
# reading in the web sessions dataset
orig_sessions = pd.read_csv('/Users/nicholaslipanovich/Documents/airbnb_kaggle/sessions.csv')
orig_sessions.head(n=10)

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753
2,d1mm9tcy42,lookup,,,Windows Desktop,301
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141
4,d1mm9tcy42,lookup,,,Windows Desktop,435
5,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,7703
6,d1mm9tcy42,lookup,,,Windows Desktop,115
7,d1mm9tcy42,personalize,data,wishlist_content_update,Windows Desktop,831
8,d1mm9tcy42,index,view,view_search_results,Windows Desktop,20842
9,d1mm9tcy42,lookup,,,Windows Desktop,683


In [32]:
#print orig_sessions.info()
# looking at how many unique values each column has
print(orig_sessions.apply(lambda x: x.nunique(),axis=0))

user_id          135483
action              359
action_type          10
action_detail       155
device_type          14
secs_elapsed     337661
dtype: int64


In [9]:
# grouping by user and summing their seconds elapsed
grpby = orig_sessions.groupby(['user_id'])['secs_elapsed'].sum().reset_index()
grpby.columns = ['user_id','secs_elapsed']

In [10]:
# making a pivot table that has the counts for each action_types for each user (each action type is a column)
#print orig_sessions.action_type.value_counts()
#print(orig_sessions.groupby(['action_type'])['user_id'].nunique().reset_index())
action_type = pd.pivot_table(orig_sessions, index = ['user_id'],columns = ['action_type'],values = 'action',aggfunc=len,fill_value=0).reset_index()
# dropping the booking_request column since it had only 4 total counts
action_type = action_type.drop(['booking_response'],axis=1)
print(action_type.head())

action_type     user_id  -unknown-  booking_request  click  data  \
0            00023iyk9l          0                1      4     9   
1            0010k6l0om          5                0     16     9   
2            001wyh0pz8          6                0     66     2   
3            0028jgx1x1          1                0      9     5   
4            002qnbzfs5        184                1    140   140   

action_type  message_post  modify  partner_callback  submit  view  
0                       1       0                 1       0    21  
1                       0       0                 1       0    17  
2                       0       0                 0       3     8  
3                       0       0                 0       1    15  
4                      16       0                 0      15   216  


In [11]:
# making a pivot table that has the counts for each device_types for each user (each device type is a column)
#print(sessions.groupby(['device_type'])['user_id'].nunique().reset_index())
#print(sessions.groupby(['user_id'])['device_type'].nunique().reset_index())
device_type = pd.pivot_table(orig_sessions, index = ['user_id'],columns = ['device_type'],values = 'action',aggfunc=len,fill_value=0).reset_index()
# dropping device types that had less than 10,000 counts
device_type = device_type.drop(['Blackberry','Opera Phone','iPodtouch','Windows Phone'],axis=1)
#device_type = device_type.replace(device_type.iloc[:,1:]>0,1)
print(device_type.head())

device_type     user_id  -unknown-  Android App Unknown Phone/Tablet  \
0            00023iyk9l          0                                 0   
1            0010k6l0om          0                                 0   
2            001wyh0pz8          0                                90   
3            0028jgx1x1         30                                 0   
4            002qnbzfs5         14                                 0   

device_type  Android Phone  Chromebook  Linux Desktop  Mac Desktop  Tablet  \
0                        0           0              0           36       0   
1                        0           0              0           63       0   
2                        0           0              0            0       0   
3                        1           0              0            0       0   
4                        0           0              0            0       0   

device_type  Windows Desktop  iPad Tablet  iPhone  
0                          0            0     

In [12]:
# combining the counts of the action types and device types
action_device_counts = pd.merge(action_type,device_type,on='user_id',how='inner')
action_device_counts = pd.merge(action_device_counts,grpby,on='user_id',how='inner')
print(action_device_counts.head())

      user_id  -unknown-_x  booking_request  click  data  message_post  \
0  00023iyk9l            0                1      4     9             1   
1  0010k6l0om            5                0     16     9             0   
2  001wyh0pz8            6                0     66     2             0   
3  0028jgx1x1            1                0      9     5             0   
4  002qnbzfs5          184                1    140   140            16   

   modify  partner_callback  submit  view      ...       \
0       0                 1       0    21      ...        
1       0                 1       0    17      ...        
2       0                 0       3     8      ...        
3       0                 0       1    15      ...        
4       0                 0      15   216      ...        

   Android App Unknown Phone/Tablet  Android Phone  Chromebook  Linux Desktop  \
0                                 0              0           0              0   
1                                 0   

Looking at the demographics data

In [13]:
# reading in users' summary statistics (age, gender, country of destination)
orig_user_summary = pd.read_csv('/Users/nicholaslipanovich/Documents/airbnb_kaggle/age_gender_bkts.csv')
orig_user_summary.tail(n=10)

# could get proportions for choosing to travel each country based on age bucket and gender
# get a proportion for each age_bucket and for each gender by dividing its population by the total population

# first get the test set's distribution of those with a country destination and those with NDF or other
# multiply the found proportions by the proportion of records in the test set that are NOT other or NDF.

Unnamed: 0,age_bucket,country_destination,gender,population_in_thousands,year
410,35-39,US,female,10352,2015
411,30-34,US,female,10863,2015
412,25-29,US,female,11011,2015
413,20-24,US,female,11094,2015
414,100+,US,male,13,2015
415,95-99,US,male,115,2015
416,90-94,US,male,541,2015
417,15-19,US,female,10570,2015
418,85-89,US,male,1441,2015
419,80-84,US,male,2442,2015


In [14]:
# getting proportion of users in training set that do NOT have the destination of NDF or other
r = np.unique(labels, return_counts=True)
counts = r[1]
prop_not_ndf_other = float(counts.sum() - counts[7] - counts[11])/(counts.sum())
print prop_not_ndf_other
prop_ndf = float(counts[7])/counts.sum()
prop_other = float(counts[11])/counts.sum()
print prop_ndf, prop_other

0.369236967735
0.583473490403 0.0472895418621


In [15]:
prop_not_ndf_other + prop_ndf + prop_other

1.0

In [16]:
# getting proportions of each age_bucket that went to each country in the user summary set
age_propor = pd.pivot_table(orig_user_summary, index = ['age_bucket'],
                            columns = ['country_destination'], values = 'population_in_thousands',
                            aggfunc=np.sum,fill_value=0).reset_index()

In [17]:
# summing the rows to get total counts for each age
# using only country columns to sum
propor_countries = age_propor.ix[:,1:]
total_age_counts = age_propor.sum(axis=1)
# add the counts to the main table
age_propor['total_count'] = total_age_counts

In [18]:
age_propor.head()

country_destination,age_bucket,AU,CA,DE,ES,FR,GB,IT,NL,PT,US,total_count
0,0-4,1605,2036,3524,2470,3973,3869,2851,900,464,21094,42786
1,10-14,1450,1912,3692,2313,3879,3461,2856,1010,554,21117,42244
2,100+,5,8,17,12,22,16,18,2,1,74,175
3,15-19,1505,2063,4050,2114,3932,3647,2904,994,543,21595,43347
4,20-24,1607,2362,4427,2281,3987,4018,3115,1031,560,22695,46083


In [19]:
# dividing each country row by the total counts and multiplying by the proportion of people in the test set that went to 
# one of these ten countries to get the proportions
country_feats = ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PT', 'US']
# for every country feature
for f in country_feats:
    # divide by total counts and put into new column
    new_name = f + '_propors'
    age_propor[new_name] = (age_propor[f]/age_propor['total_count'])*prop_not_ndf_other
    
# if your ages 0-4 then you get these values for each age-country-proportion feature

In [20]:
# getting only the proportions into a dataframe
age_prop = age_propor.ix[:,12:]
age_prop.shape

(21, 10)

In [21]:
from multiprocessing import Pool
import pandas.util.testing as pdt

def add_props(a_series):
       # got rid of all ages under 18 and over 100
    if a_series.ix[1] in (15,16,17,18,19):
        a_series = a_series.append(age_prop.ix[3,:])
    elif a_series.ix[1] in (20,21,22,23,24):
        a_series = a_series.append(age_prop.ix[4,:])
    elif a_series.ix[1] in (25,26,27,28,29):
        a_series = a_series.append(age_prop.ix[5,:])
    elif a_series.ix[1] in (30,31,32,33,34):
        a_series = a_series.append(age_prop.ix[6,:])
    elif a_series.ix[1] in (35,36,37,38,39):
        a_series = a_series.append(age_prop.ix[7,:])
    elif a_series.ix[1] in (40,41,42,43,44):
        a_series = a_series.append(age_prop.ix[8,:])
    elif a_series.ix[1] in (45,46,47,48,49):
        a_series = a_series.append(age_prop.ix[9,:])
    elif a_series.ix[1] in (50,51,52,53,54):
        a_series = a_series.append(age_prop.ix[11,:])
    elif a_series.ix[1] in (55,56,57,58,59):
        a_series = a_series.append(age_prop.ix[12,:])
    elif a_series.ix[1] in (60,61,62,63,64):
        a_series = a_series.append(age_prop.ix[13,:])
    elif a_series.ix[1] in (65,66,67,68,69):
        a_series = a_series.append(age_prop.ix[14,:])
    elif a_series.ix[1] in (70,71,72,73,74):
        a_series = a_series.append(age_prop.ix[15,:])
    elif a_series.ix[1] in (75,76,77,78,79):
        a_series = a_series.append(age_prop.ix[16,:])
    elif a_series.ix[1] in (80,81,82,83,84):
        a_series = a_series.append(age_prop.ix[17,:])
    elif a_series.ix[1] in (85,86,87,88,89):
        a_series = a_series.append(age_prop.ix[18,:])
    elif a_series.ix[1] in (90,91,92,93,94):
        a_series = a_series.append(age_prop.ix[19,:])
    elif a_series.ix[1] in (95,96,97,98,99):
        a_series = a_series.append(age_prop.ix[20,:])
    else:
        a_series = a_series.append(pd.Series({'AU_propors':np.nan,'CA_propors':np.nan,'DE_propors':np.nan,'ES_propors':np.nan,
                                              'FR_propors':np.nan,'GB_propors':np.nan,'IT_propors':np.nan,'NL_propors':np.nan,
                                              'PT_propors':np.nan,'US_propors':np.nan}))
    return a_series

def process(df):
    res = df.apply(add_props, axis=1) # apply to each row
    return res

pool = Pool(processes=8) # process per core
split_dfs = np.array_split(train_plus_test, 8)
pool_results = pool.map(process, split_dfs)
pool.close()
pool.join()

# merging parts processed by different processes
parts = pd.concat(pool_results, axis=0)
parts.head()
# merging newly calculated parts to big_df
#big_df = pd.concat([big_df, parts], axis=1)

    # checking if the dfs were merged correctly
#pdt.assert_series_equal(parts['id'], big_df['id'])

Unnamed: 0,id,age,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day,tfa_hour,gender_FEMALE,...,AU_propors,CA_propors,DE_propors,ES_propors,FR_propors,GB_propors,IT_propors,NL_propors,PT_propors,US_propors
0,gxn3p5htnn,-1,2010,6,28,2009,3,19,4,0,...,,,,,,,,,,
1,820tgsjxq7,38,2011,5,25,2009,5,23,17,0,...,0.012487,0.018605,0.037897,0.031904,0.028896,0.03081,0.032873,0.007658,0.006509,0.161599
2,4ft3gnwmtx,56,2010,9,28,2009,6,9,23,1,...,0.010975,0.019579,0.045163,0.022736,0.030698,0.02958,0.030554,0.00877,0.005356,0.165825
3,bjjt8pjhuk,42,2011,12,5,2009,10,31,6,1,...,0.012451,0.017632,0.038989,0.029857,0.033364,0.0319,0.036338,0.008748,0.006263,0.153695
4,87mebub9p4,41,2010,9,14,2009,12,8,6,0,...,0.012451,0.017632,0.038989,0.029857,0.033364,0.0319,0.036338,0.008748,0.006263,0.153695


In [22]:
# checking that all column names, except new ones are the same
#print all(parts.columns.values[:153] == train_plus_test.columns.values)
# checking that all the dataframes are exactly the same except for the new columns
#all(train_plus_test.ix[:, :] == parts.ix[:, :153])
# setting train_plus_test equal to the new dataframe 
train_plus_test = parts
train_plus_test.columns

Index([u'id', u'age', u'dac_year', u'dac_month', u'dac_day', u'tfa_year',
       u'tfa_month', u'tfa_day', u'tfa_hour', u'gender_FEMALE',
       ...
       u'AU_propors', u'CA_propors', u'DE_propors', u'ES_propors',
       u'FR_propors', u'GB_propors', u'IT_propors', u'NL_propors',
       u'PT_propors', u'US_propors'],
      dtype='object', length=163)

In [43]:
# combining the table of actions and counts with the main table
users_combined_df = pd.merge(train_plus_test, action_device_counts, left_on='id', right_on='user_id', how='left')

In [44]:
# Dropping both the ids columns
users_combined_df = users_combined_df.drop(['user_id', 'id'], axis=1)
# fill NaNs with zeros
users_combined_df = users_combined_df.fillna(0)

In [45]:
from sklearn import preprocessing
# feature scaling
users_combined_df = preprocessing.scale(users_combined_df)

In [46]:
type(users_combined_df)

numpy.ndarray

In [47]:
#Splitting train and test
# getting the values from the train_plus_test set into a numpy array
vals = users_combined_df # users_combined_df.values
# getting a numpy array of the training set
X = vals[:piv_train]
# getting a numpy array of the labels for the training set and converting them from strings into ints
le = LabelEncoder()
y = le.fit_transform(labels)
# getting a numpy array of the testing set
X_test = vals[piv_train:]

In [26]:
######################## Doing cross-validation to this to pick best parameters #######################
#dtrain = xgb.DMatrix('../data/agaricus.txt.train')
#param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
#num_round = 2
#xgb.cv(param, dtrain, num_round, nfold=5,
 #      metrics={'error'}, seed = 0, show_stdv = False)
from sklearn.grid_search import GridSearchCV
# max_depth=6, learning_rate=0.3, n_estimators=25, subsample=0.5, colsample_bytree=0.5,

In [36]:
#from sklearn.grid_search import RandomizedSearchCV
#help(RandomizedSearchCV)

In [48]:
from sklearn.grid_search import GridSearchCV
#Classifier
# Specifying the classifier and its parameters
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, nthread=-1, objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0) 
# Doing crossvalidation
# the parameters to try
parameters = {
    'max_depth':[3, 6, 9],
    'learning_rate':[0.1, 0.3, 0.5]
}
# doing 3-fold cross-validation
#clf = GridSearchCV(xgb, parameters, cv=3, n_jobs=-1)
# fitting the model to do cross-validation
xgb.fit(X, y)
# getting the best parameters and scores
#best_params, score, _=max(clf.grid_scores_, key=lambda x: x[1])
#for param_name in sorted(best_params.keys()):
 #   print("%s: %r" % (param_name, best_params[param_name]))
  

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=25, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.5)

In [None]:
from sklearn.grid_search import RandomizedSearchCV
# Doing crossvalidation
# the parameters to try
parameters = {
    'max_depth':[3, 4, 5, 6, 7, 8, 9, 10],
    'learning_rate':[0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5],
}
xgb = XGBClassifier(n_estimators=25, nthread=8, objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0) 
rsearch = RandomizedSearchCV(estimator=xgb, param_distributions=parameters, n_iter=16, n_jobs=-1)
# fitting the model to do cross-validation
rsearch.fit(X, y)
# getting the best parameters and scores
best_params, score, _=max(rsearch.grid_scores_, key=lambda x: x[1])
for param_name in sorted(best_params.keys()):
    print("%s: %r" % (param_name, best_params[param_name]))

In [49]:
# predicting the testing set labels from the model
y_pred = xgb.predict_proba(X_test)

In [235]:
#help(XGBClassifier)
#help(GridSearchCV)

In [50]:
# the prediction output consists of numpy arrays of probabilities that the example (row) belongs
# to each classification label (level).
y_pred[0]

array([ 0.0033883 ,  0.0074479 ,  0.00348055,  0.00733136,  0.01258055,
        0.00702718,  0.01137972,  0.65903759,  0.00394648,  0.00242723,
        0.22397436,  0.05797875], dtype=float32)

In [51]:
#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
# for each id
for i in range(len(id_test)):
    idx = id_test[i] # get id
    ids += [idx] * 5 # make 5 copies of it (one for each class of the five highest probabilities' classes)
    # sorts in decending order and gets the 5 largest probabilities, gets the class labels of these probabilities, and 
    # stores these top 5 most probable class labels in a list, which is then added to the entire list of predicted 
    # classes (countries)
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

In [52]:
% cd ~/Documents/airbnb_kaggle/
% pwd

/Users/nicholaslipanovich/Documents/airbnb_kaggle


u'/Users/nicholaslipanovich/Documents/airbnb_kaggle'

In [53]:
#Generate submission
# get a 2D column numpy array of the ids and each ids' top predicted countries, putting this into a pandas dataframe
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
# saving it to a csv
sub.to_csv('sub1.csv',index=False)

In [None]:
#action
#print orig_sessions.action_type.value_counts()
#print(orig_sessions.groupby(['action_type'])['user_id'].nunique().reset_index())
#action_type = action_type.drop(['booking_response'],axis=1) # since there were only 4 total 'booking' action types
print(action_type.head())

sessions.csv - web sessions log for users
user_id: to be joined with the column 'id' in users table
action
action_type
action_detail
device_type
secs_elapsed

countries.csv - summary statistics of destination countries in this dataset and their locations
age_gender_bkts.csv - summary statistics of users' age group, gender, country of destination
sample_submission.csv - correct format for submitting your predictions

In [173]:
# destination countries' summary statistics
orig_countries_summary = pd.read_csv('/Users/nicholaslipanovich/Documents/airbnb_kaggle/countries.csv')
orig_countries_summary.head(n=10)

# could make a indicator variable, english or not

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022,deu,72.61
3,ES,39.896027,-2.487694,7730.724,505370,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801,fra,92.06
5,GB,54.63322,-3.432277,6883.659,243610,eng,0.0
6,IT,41.87399,12.564167,8636.631,301340,ita,89.4
7,NL,52.133057,5.29525,7524.3203,41543,nld,63.22
8,PT,39.553444,-7.839319,7355.2534,92090,por,95.45
9,US,36.966427,-95.84403,0.0,9826675,eng,0.0


In [None]:
print orig_countries_summary.info()
#print(orig_countries_summary.apply(lambda x: x.nunique(),axis=0))

In [255]:
# Similar to 'str(dataframe)' in R
orig_sessions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10567737 entries, 0 to 10567736
Data columns (total 6 columns):
user_id          object
action           object
action_type      object
action_detail    object
device_type      object
secs_elapsed     float64
dtypes: float64(1), object(5)
memory usage: 564.4+ MB


In [258]:
orig_sessions.apply(lambda x: x.nunique(), axis=0)

user_id          135483
action              359
action_type          10
action_detail       155
device_type          14
secs_elapsed     337661
dtype: int64

In [263]:
grpby = orig_sessions.groupby(['user_id'])['secs_elapsed'].sum().reset_index()
grpby.head()

Unnamed: 0,user_id,secs_elapsed
0,00023iyk9l,867896
1,0010k6l0om,586543
2,001wyh0pz8,282965
3,0028jgx1x1,297010
4,002qnbzfs5,6487080


In [272]:

orig_sessions['action_type'].astype('category').head()

0      NaN
1    click
2      NaN
3    click
4      NaN
Name: action_type, dtype: category
Categories (10, object): [-unknown-, booking_request, booking_response, click, ..., modify, partner_callback, submit, view]

In [265]:
action_type = pd.pivot_table(orig_sessions, index = ['user_id'], 
                             columns = ['action_type'], values = 'action',
                            aggfunc = len,fill_value=0).reset_index()
action_type.head()

action_type,user_id,-unknown-,booking_request,booking_response,click,data,message_post,modify,partner_callback,submit,view
0,00023iyk9l,0,1,0,4,9,1,0,1,0,21
1,0010k6l0om,5,0,0,16,9,0,0,1,0,17
2,001wyh0pz8,6,0,0,66,2,0,0,0,3,8
3,0028jgx1x1,1,0,0,9,5,0,0,0,1,15
4,002qnbzfs5,184,1,0,140,140,16,0,0,15,216


In [None]:
action_type = action_type.drop(['booking_response'], axis=1)
action_type.head()

In [10]:
# Looking at the datasets
orig_countries_sum_stats.head(n=20)

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022,deu,72.61
3,ES,39.896027,-2.487694,7730.724,505370,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801,fra,92.06
5,GB,54.63322,-3.432277,6883.659,243610,eng,0.0
6,IT,41.87399,12.564167,8636.631,301340,ita,89.4
7,NL,52.133057,5.29525,7524.3203,41543,nld,63.22
8,PT,39.553444,-7.839319,7355.2534,92090,por,95.45
9,US,36.966427,-95.84403,0.0,9826675,eng,0.0


In [12]:
# Looking at the datasets
orig_users_sum_stats.head(n=10)

Unnamed: 0,age_bucket,country_destination,gender,population_in_thousands,year
0,100+,AU,male,1,2015
1,95-99,AU,male,9,2015
2,90-94,AU,male,47,2015
3,85-89,AU,male,118,2015
4,80-84,AU,male,199,2015
5,75-79,AU,male,298,2015
6,70-74,AU,male,415,2015
7,65-69,AU,male,574,2015
8,60-64,AU,male,636,2015
9,55-59,AU,male,714,2015


In [16]:
# Looking at the datasets
orig_sessions.head(n=20)

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753
2,d1mm9tcy42,lookup,,,Windows Desktop,301
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141
4,d1mm9tcy42,lookup,,,Windows Desktop,435
5,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,7703
6,d1mm9tcy42,lookup,,,Windows Desktop,115
7,d1mm9tcy42,personalize,data,wishlist_content_update,Windows Desktop,831
8,d1mm9tcy42,index,view,view_search_results,Windows Desktop,20842
9,d1mm9tcy42,lookup,,,Windows Desktop,683


In [22]:
## changing categorical features into Category types
# for user sum. stats dataset
orig_users_sum_stats['age_bucket'] = orig_users_sum_stats['age_bucket'].astype('category')
orig_users_sum_stats['country_destination'] = orig_users_sum_stats['country_destination'].astype('category')
orig_users_sum_stats['gender'] = orig_users_sum_stats['gender'].astype('category')
orig_users_sum_stats['year'] = orig_users_sum_stats['year'].astype('category')
# for sessions dataset
orig_sessions['action'] = orig_sessions['action'].astype('category')
orig_sessions['action_type'] = orig_sessions['action_type'].astype('category')
orig_sessions['action_detail'] = orig_sessions['action_detail'].astype('category')
orig_sessions['device_type'] = orig_sessions['device_type'].astype('category')
# for country sum. stats dataset
orig_countries_sum_stats['country_destination'] = orig_countries_sum_stats['country_destination'].astype('category')
orig_countries_sum_stats['destination_language '] = orig_countries_sum_stats['destination_language '].astype('category')
# for train user datasets
orig_train_users['gender'] = orig_train_users['gender'].astype('category')
orig_train_users['signup_method'] = orig_train_users['signup_method'].astype('category')
orig_train_users['language'] = orig_train_users['language'].astype('category')
orig_train_users['affiliate_channel'] = orig_train_users['affiliate_channel'].astype('category')
orig_train_users['affiliate_provider'] = orig_train_users['affiliate_provider'].astype('category')
orig_train_users['first_affiliate_tracked'] = orig_train_users['first_affiliate_tracked'].astype('category')
orig_train_users['signup_app'] = orig_train_users['signup_app'].astype('category')
orig_train_users['first_device_type'] = orig_train_users['first_device_type'].astype('category')
orig_train_users['first_browser'] = orig_train_users['first_browser'].astype('category')
orig_train_users['country_destination'] = orig_train_users['country_destination'].astype('category')
# for test user datasets
orig_test_users['gender'] = orig_test_users['gender'].astype('category')
orig_test_users['signup_method'] = orig_test_users['signup_method'].astype('category')
orig_test_users['language'] = orig_test_users['language'].astype('category')
orig_test_users['affiliate_channel'] = orig_test_users['affiliate_channel'].astype('category')
orig_test_users['affiliate_provider'] = orig_test_users['affiliate_provider'].astype('category')
orig_test_users['first_affiliate_tracked'] = orig_test_users['first_affiliate_tracked'].astype('category')
orig_test_users['signup_app'] = orig_test_users['signup_app'].astype('category')
orig_test_users['first_device_type'] = orig_test_users['first_device_type'].astype('category')
orig_test_users['first_browser'] = orig_test_users['first_browser'].astype('category')

In [83]:
## Looking at the levels of the categories
# the test dataset
orig_test_users['gender'].head() # [-unknown-, FEMALE, MALE, OTHER]
orig_test_users['signup_method'].head() # [basic, facebook, google, weibo]
orig_test_users['language'].head() # Categories (24, object): [-unknown-, ca, cs, da, ..., sv, th, tr, zh]
orig_test_users['affiliate_channel'].head() # [content, direct, other, remarketing, sem-brand, sem-non-brand, seo]
orig_test_users['affiliate_provider'].head() # Categories (17, object): [baidu, bing, craigslist, daum, ..., padmapper, vast, yahoo, yandex]
orig_test_users['first_affiliate_tracked'].head() # [linked, local ops, marketing, omg, product, tracked-other, untracked]
orig_test_users['signup_app'].head() # [Android, Moweb, Web, iOS]
orig_test_users['first_device_type'].head() # Categories (9, object): [Android Phone, Android Tablet, Desktop (Other), Mac Desktop, ..., SmartPhone (Other), Windows Desktop, iPad, iPhone]
orig_test_users['first_browser'].head() # Categories (31, object): [-unknown-, AOL Explorer, Android Browser, Apple Mail, ..., Sogou Explorer, UC Browser, Yandex.Browser, wOSBrowser]
# the train dataset's country_destination target variable
orig_train_users['country_destination'].head() # Categories (12, object): [AU, CA, DE, ES, ..., NL, PT, US, other]
# the sessions dataset
orig_sessions['action'].head() # Categories (359, object): [10, 11, 12, 15, ..., why_host, widget, wishlists, zendesk_login_jwt]
orig_sessions['action_type'].head() # Categories (10, object): [-unknown-, booking_request, booking_response, click, ..., modify, partner_callback, submit, view]
orig_sessions['action_detail'].head() # Categories (155, object): [-unknown-, account_notification_settings, account_payment_methods, account_payout_preferences, ..., wishlist_note, your_listings, your_reservations, your_trips]
orig_sessions['device_type'].head() # Categories (14, object): [-unknown-, Android App Unknown Phone/Tablet, Android Phone, Blackberry, ..., Windows Phone, iPad Tablet, iPhone, iPodtouch]
# the countries sum stats dataset
orig_countries_sum_stats['country_destination'].head() # Categories (10, object): [AU, CA, DE, ES, ..., IT, NL, PT, US]
orig_countries_sum_stats['destination_language '].head() # [deu, eng, fra, ita, nld, por, spa]
# the users sum stats dataset
orig_users_sum_stats['age_bucket'].head() # Categories (21, object): [0-4, 10-14, 100+, 15-19, ..., 80-84, 85-89, 90-94, 95-99]
orig_users_sum_stats['country_destination'].head() # Categories (10, object): [AU, CA, DE, ES, ..., IT, NL, PT, US]
orig_users_sum_stats['gender'].head() # [female, male]
orig_users_sum_stats['year'].head() # Categories (1, float64): [2015]    

0    2015
1    2015
2    2015
3    2015
4    2015
Name: year, dtype: category
Categories (1, float64): [2015]

In [45]:
## changing date and timestamp features into datetime types
# for test user datasets
orig_test_users['date_account_created'] = pd.to_datetime(orig_test_users['date_account_created'])
orig_test_users['date_first_booking'] = pd.to_datetime(orig_test_users['date_first_booking'])
orig_test_users['timestamp_first_active'] = pd.to_datetime(orig_test_users['timestamp_first_active'], format='%Y%m%d%H%M%S')
orig_train_users['date_account_created'] = pd.to_datetime(orig_train_users['date_account_created'])
orig_train_users['date_first_booking'] = pd.to_datetime(orig_train_users['date_first_booking'])
orig_train_users['timestamp_first_active'] = pd.to_datetime(orig_train_users['timestamp_first_active'], format='%Y%m%d%H%M%S')

In [91]:
print ('We have ' + str(orig_train_users.shape[0]) + ' users in the training set and ' + 
      str(orig_test_users.shape[0]) + ' in the test set.')
print ('In total we have ' + str(orig_train_users.shape[0] + orig_test_users.shape[0]) + ' users.')

We have 213451 users in the training set and 62096 in the test set.
In total we have 275547 users.


In [93]:
orig_train_users.dtypes

id                                 object
date_account_created       datetime64[ns]
timestamp_first_active     datetime64[ns]
date_first_booking         datetime64[ns]
gender                           category
age                               float64
signup_method                    category
signup_flow                         int64
language                         category
affiliate_channel                category
affiliate_provider               category
first_affiliate_tracked          category
signup_app                       category
first_device_type                category
first_browser                    category
country_destination              category
dtype: object

In [94]:
orig_test_users.dtypes

id                                 object
date_account_created       datetime64[ns]
timestamp_first_active     datetime64[ns]
date_first_booking         datetime64[ns]
gender                           category
age                               float64
signup_method                    category
signup_flow                         int64
language                         category
affiliate_channel                category
affiliate_provider               category
first_affiliate_tracked          category
signup_app                       category
first_device_type                category
first_browser                    category
dtype: object

In [98]:
#sqldf('select * from df;', locals())
# Merge train and test users
users = pd.concat((orig_train_users, orig_test_users), axis=0, ignore_index=True)

ValueError: incompatible categories in categorical concat

In [None]:
# Creating new version of the dataset to manipulate
new_train = orig_train
new_test = orig_test
# Making triptype, weekday, departmentdescription, visitnumber, and finelinenumber categories
new_train.dtypes