AirBnB recruiting kaggle
------

https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings

## Load libraries

In [1]:
from __future__ import print_function
import datetime as DT
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

In [2]:
class MultiColumnLabelEncoder:
    ''' Create a class that encodes
        labels for a matrix of data
    '''
    def __init__(self, columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here
    
    def get_params(self, deep=True):
        out = dict()
        if self.columns: out['columns'] = columns
        return out

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). 
        '''
        numerics = [np.float16, np.float32, np.float64]
        ints = [np.int16, np.int32, np.int64]
        output = X.copy()
        '''
        if self.columns is not None:
            for col in self.columns:
                if col.dtype not in numerics+ints:
                    output[col] = LabelEncoder().fit_transform(output[col])
                elif col.dtype not in ints:
                    output[col] = scale(output[col])
        else:
        '''
        try:
            for colname,col in output.iteritems():
                if col.dtype not in numerics+ints:
                    # Turn text columns into ints
                    output[colname] = LabelEncoder().fit_transform(output[colname])
                elif col.dtype in numerics:
                    # handle floats with scaling
                    # output[colname] = scale(output[colname])
                    pass 
                elif col.dtype in ints:
                    pass # leave integers alone
        except:
            output = LabelEncoder().fit_transform(output)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

## Declare Args

In [3]:
## Files ## 
AGE_GENDER_BUCKETS_FILE = 'Data/age_gender_bkts.csv'
COUNTRIES_FILE = 'Data/countries.csv'
SAMPLE_SUBMISSION_FILE = 'Data/sample_submission.csv'
SESSIONS_FILE = 'Data/sessions.csv'
TEST_DATA_FINAL_FILE = 'Data/test_users.csv'
TRAIN_DATA_FILE = 'Data/train_users.csv'

## Model args ##
TEST_N = 20000

## Fields ## 
USER_COLUMNS = ['id',
 'date_account_created',
 'timestamp_first_active',
 'date_first_booking',
 'gender',
 'age',
 'signup_method',
 'signup_flow',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'first_affiliate_tracked',
 'signup_app',
 'first_device_type',
 'first_browser']
TARGET_COLUMN = ['country_destination']

SESSION_COLUMNS = ['user_id',
 'action',
 'action_type',
 'action_detail',
 'device_type',
 'secs_elapsed']

AGE_BUCKET_COLUMNS = ['age_bucket',
 'country_destination',
 'gender',
 'population_in_thousands',
 'year']

## Read data


In [394]:
## Read user data ## 
train_full = pd.read_csv(TRAIN_DATA_FILE)
train_set, train_target = train_full[TEST_N:][USER_COLUMNS+TARGET_COLUMN],\
    train_full[TEST_N:][TARGET_COLUMN]
test_set, test_target = train_full[:TEST_N][USER_COLUMNS+TARGET_COLUMN],\
    train_full[:TEST_N][TARGET_COLUMN]

In [395]:
## Read supplemental datasets ## 
countries = pd.read_csv(COUNTRIES_FILE)
age_buckets = pd.read_csv(AGE_GENDER_BUCKETS_FILE)

In [None]:
## Read session data ##
sessions = pd.read_csv(SESSIONS_FILE)

## Explore data

#### Sessions

In [5]:
sessions.shape

(5600850, 6)

In [6]:
sessions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,ailzdefy6o,similar_listings,data,similar_listings,Windows Desktop,255
1,ailzdefy6o,similar_listings,data,similar_listings,Windows Desktop,183
2,ailzdefy6o,ajax_refresh_subtotal,click,change_trip_characteristics,Windows Desktop,175570
3,ailzdefy6o,show,,,Windows Desktop,86
4,ailzdefy6o,personalize,data,wishlist_content_update,Windows Desktop,1535


In [7]:
cf = ['action','action_type','action_detail','device_type']
s = sessions[cf].copy().fillna('missing')
mcl = MultiColumnLabelEncoder()
ohe = OneHotEncoder()
x = ohe.fit_transform(
    mcl.fit_transform(s)
).todense()

In [8]:
n = 100
loops = sessions.shape[0]//n*np.arange(n)

o = []
start_time = DT.datetime.now()
for i,l in enumerate(loops):
    try:
        a,b = loops[i],loops[i+1]
    except:
        a,b = loops[i],sessions.shape[0]
    
    sessions_new = pd.DataFrame(np.concatenate(\
        (
            sessions[['user_id']][a:b]\
            , x[a:b]\
            , sessions[['secs_elapsed']][a:b]
        )
        , axis=1
    ))
    sessions_grouped = sessions_new.groupby([0]).sum()
    o.append(sessions_grouped)
    
    if i%10==0:
        this_time = DT.datetime.now()
        time_change = (this_time - start_time).seconds
        per_second = b*1.0 / (time_change)
        total_time = sessions.shape[0] / per_second / 60
        print('finished {}%, {} mins est. total time'\
                .format(round(b*100.0/sessions.shape[0],2)\
                        ,total_time))

finished 1.0%, 86.6674403657 mins est. total time
finished 11.0%, 96.6675296386 mins est. total time
finished 21.0%, 102.699329518 mins est. total time
finished 31.0%, 105.753632258 mins est. total time
finished 41.0%, 105.976555832 mins est. total time
finished 51.0%, 103.399615881 mins est. total time
finished 61.0%, 101.339702496 mins est. total time
finished 71.0%, 100.165213445 mins est. total time
finished 81.0%, 99.5893664221 mins est. total time
finished 91.0%, 98.8287210932 mins est. total time


In [9]:
sessions_new = pd.concat(o,ignore_index=True)
sessions_new['user_id'] = pd.concat(o).index
sessions_new = sessions_new.groupby('user_id').sum()

#### User data

In [400]:
train_set.shape

(151239, 16)

In [401]:
train_set.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
20000,t9e8wr7dsl,2012-03-28,20120328191328,,-unknown-,,basic,1,en,direct,direct,,Web,Other/Unknown,-unknown-,NDF
20001,dekuxqk9lk,2012-03-28,20120328194449,2012-04-02,FEMALE,30.0,facebook,0,en,direct,direct,untracked,iOS,iPad,Mobile Safari,US
20002,dvsvhyf3od,2012-03-28,20120328195154,2012-04-10,FEMALE,27.0,basic,0,en,sem-brand,google,tracked-other,Web,Mac Desktop,Safari,US
20003,zusk44ltvs,2012-03-28,20120328195655,2012-03-31,-unknown-,67.0,basic,0,en,sem-non-brand,google,omg,Web,Mac Desktop,Firefox,FR
20004,4ci9oqtl8q,2012-03-28,20120328200645,2012-04-02,MALE,34.0,facebook,0,en,sem-non-brand,google,untracked,Web,iPhone,Mobile Safari,US


In [402]:
train_set.index = train_set['id']

In [403]:
train_set['gender'].value_counts()

-unknown-    66422
FEMALE       45658
MALE         38954
OTHER          205
Name: gender, dtype: int64

In [404]:
train_set.loc[train_set['gender']=='-unknown-',['gender']] = np.nan

In [405]:
train_set.head()

Unnamed: 0_level_0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
t9e8wr7dsl,t9e8wr7dsl,2012-03-28,20120328191328,,,,basic,1,en,direct,direct,,Web,Other/Unknown,-unknown-,NDF
dekuxqk9lk,dekuxqk9lk,2012-03-28,20120328194449,2012-04-02,FEMALE,30.0,facebook,0,en,direct,direct,untracked,iOS,iPad,Mobile Safari,US
dvsvhyf3od,dvsvhyf3od,2012-03-28,20120328195154,2012-04-10,FEMALE,27.0,basic,0,en,sem-brand,google,tracked-other,Web,Mac Desktop,Safari,US
zusk44ltvs,zusk44ltvs,2012-03-28,20120328195655,2012-03-31,,67.0,basic,0,en,sem-non-brand,google,omg,Web,Mac Desktop,Firefox,FR
4ci9oqtl8q,4ci9oqtl8q,2012-03-28,20120328200645,2012-04-02,MALE,34.0,facebook,0,en,sem-non-brand,google,untracked,Web,iPhone,Mobile Safari,US


In [406]:
train_set['age'].value_counts()

30      4367
28      4251
31      4215
29      4212
27      4104
32      4101
33      3822
26      3582
34      3546
35      3316
25      3112
36      2858
37      2486
38      2347
24      2321
39      2055
40      1905
41      1783
23      1696
42      1531
44      1513
45      1505
43      1438
46      1307
22      1227
47      1147
48      1054
50       982
51       931
49       913
        ... 
108       11
97        10
92        10
107       10
89         9
115        7
88         6
2          5
15         5
91         4
1932       3
14         2
1931       2
113        2
1925       2
112        2
111        2
1928       1
1927       1
1933       1
1934       1
1935       1
1936       1
1942       1
1953       1
1995       1
1919       1
150        1
132        1
1          1
Name: age, dtype: int64

In [407]:
train_set.loc[train_set['age']>110,['age']] = np.nan

In [408]:
train_set['date_created'] = pd.to_datetime(train_set['date_account_created'])
train_set['date_first_booking'] = pd.to_datetime(train_set['date_first_booking'])
train_set['year_created'] = train_set['date_created'].dt.year
train_set['month_created'] = train_set['date_created'].dt.month
train_set['year_first_booking'] = train_set['date_first_booking'].dt.year
train_set['month_first_booking'] = train_set['date_first_booking'].dt.month
train_set['days_to_first_booking'] = train_set['date_first_booking']-train_set['date_created']

In [409]:
train_set.head()

Unnamed: 0_level_0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,...,signup_app,first_device_type,first_browser,country_destination,date_created,year_created,month_created,year_first_booking,month_first_booking,days_to_first_booking
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
t9e8wr7dsl,t9e8wr7dsl,2012-03-28,20120328191328,NaT,,,basic,1,en,direct,...,Web,Other/Unknown,-unknown-,NDF,2012-03-28,2012,3,,,NaT
dekuxqk9lk,dekuxqk9lk,2012-03-28,20120328194449,2012-04-02,FEMALE,30.0,facebook,0,en,direct,...,iOS,iPad,Mobile Safari,US,2012-03-28,2012,3,2012.0,4.0,5 days
dvsvhyf3od,dvsvhyf3od,2012-03-28,20120328195154,2012-04-10,FEMALE,27.0,basic,0,en,sem-brand,...,Web,Mac Desktop,Safari,US,2012-03-28,2012,3,2012.0,4.0,13 days
zusk44ltvs,zusk44ltvs,2012-03-28,20120328195655,2012-03-31,,67.0,basic,0,en,sem-non-brand,...,Web,Mac Desktop,Firefox,FR,2012-03-28,2012,3,2012.0,3.0,3 days
4ci9oqtl8q,4ci9oqtl8q,2012-03-28,20120328200645,2012-04-02,MALE,34.0,facebook,0,en,sem-non-brand,...,Web,iPhone,Mobile Safari,US,2012-03-28,2012,3,2012.0,4.0,5 days


In [410]:
train_set['days_to_first_booking'].value_counts()

0 days      14385
1 days      10027
2 days       4416
3 days       2729
4 days       1984
5 days       1540
6 days       1235
7 days       1154
8 days        912
9 days        734
10 days       623
11 days       557
12 days       488
13 days       474
14 days       456
15 days       404
16 days       374
17 days       308
21 days       279
19 days       277
18 days       266
20 days       262
23 days       210
29 days       205
24 days       204
28 days       204
25 days       201
22 days       198
27 days       190
30 days       167
            ...  
305 days       30
201 days       30
193 days       29
275 days       29
266 days       29
311 days       28
246 days       28
180 days       27
143 days       27
288 days       27
207 days       27
321 days       26
298 days       26
319 days       26
219 days       25
251 days       25
255 days       25
242 days       25
227 days       25
363 days       24
264 days       23
252 days       23
341 days       22
300 days       19
-85 days  

In [411]:
train_set.loc[train_set['days_to_first_booking']<pd.Timedelta(0)\
              ,['days_to_first_booking']] = np.nan

#### merge age buckets

In [412]:
age_buckets.head()

Unnamed: 0,age_bucket,country_destination,gender,population_in_thousands,year
0,100+,AU,male,1,2015
1,95-99,AU,male,9,2015
2,90-94,AU,male,47,2015
3,85-89,AU,male,118,2015
4,80-84,AU,male,199,2015


In [413]:
age_buckets['age_merge'] = (np.floor(\
                  np.array([int(re.split(r'[-+]',str(x))[0]) \
                  for x in age_buckets['age_bucket']]\
            )/10)*10).astype('int')

In [414]:
age_buckets.index = age_buckets['age_merge'].astype('string') \
            +'-'+age_buckets['country_destination'] \
            +'-'+age_buckets['gender'].str.lower()

In [415]:
for c in set(countries['country_destination']):
    train_set['age_merge'+'-'+c] = (
                        np.floor(\
                            train_set['age']/10)*10\
                        )\
                            .fillna(0)\
                            .astype('int')\
                            .astype('string') \
                        +'-'+c \
                        +'-'+train_set['gender'].str.lower()

In [416]:
age_buckets = age_buckets[[
        'age_merge' \
        ,'country_destination' \
        ,'gender' \
        ,'population_in_thousands']] \
    .groupby(['age_merge','country_destination','gender']).sum()

In [417]:
age_buckets.index = pd.Series([ str(i[0])+'-'+i[1]+'-'+i[2] for i in age_buckets.index])

In [418]:
for c in set(countries['country_destination']):
    train_set = pd.merge(
        train_set \
         , age_buckets \
         , left_on=['age_merge'+'-'+c] \
         , right_index=True \
         , how='outer' \
         , suffixes=(c,c)
    )
print(train_set.shape)

(153219, 42)


In [421]:
train_set = train_set.drop_duplicates(['id'])

In [422]:
train_set['population_estimate'] = 0
for c in set(countries['country_destination']):
    try:
        train_set.loc[:,'population_estimate'] = \
            train_set.loc[:,'population_estimate']\
            +np.nansum(train_set.loc[:,'population_in_thousands'+c]\
                       ,axis=1)
    except KeyError:
        pass

In [427]:
cols = [
 'gender',
 'age',
 'signup_method',
 'signup_flow',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'first_affiliate_tracked',
 'signup_app',
 'first_device_type',
 'first_browser',
 'year_created' ,
 'month_created' ,
 'year_first_booking' ,
 'month_first_booking' ,
 'days_to_first_booking',
 'population_estimate'
]
print(train_set[cols].shape)

(151240, 17)


#### Encode training set 

In [428]:
train_set[cols].head()

Unnamed: 0_level_0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,year_created,month_created,year_first_booking,month_first_booking,days_to_first_booking,population_estimate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
t9e8wr7dsl,,,basic,1,en,direct,direct,,Web,Other/Unknown,-unknown-,2012,3,,,NaT,0
zusk44ltvs,,67.0,basic,0,en,sem-non-brand,google,omg,Web,Mac Desktop,Firefox,2012,3,2012.0,3.0,3 days,0
0n455wucwl,,34.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,2012,3,2012.0,3.0,3 days,0
8e07eppnbv,,,basic,0,en,direct,direct,untracked,Web,Windows Desktop,Chrome,2012,3,2012.0,4.0,10 days,0
qu1b3jfqm6,,,basic,0,en,other,craigslist,untracked,Web,Windows Desktop,Firefox,2012,3,2012.0,4.0,5 days,0


In [429]:
mcl2 = MultiColumnLabelEncoder()
ohe2 = OneHotEncoder()
ss = StandardScaler(with_mean=False)
ii = Imputer(strategy='most_frequent')
p = Pipeline([('mcl',mcl2),('ii',ii), ('ohe',ohe2),('ss', ss)])
train_set_new = p.fit_transform(train_set[cols])

In [430]:
print(train_set_new[:,:].shape)

(151240, 665)


In [439]:
train_target = train_set['country_destination'].fillna('unknown')
print(train_target.shape)

(151240,)


In [441]:
lda = LDA()
l = lda.fit_transform(train_set_new.toarray(), np.array(train_target))



In [443]:
l.shape

(151240, 12)

### Test prediction model 