AirBnB recruiting kaggle
------

https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings

## Load libraries

In [79]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [16]:
class MultiColumnLabelEncoder:
    ''' Create a class that encodes
        labels for a matrix of data
    '''
    def __init__(self, columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here
    
    def get_params(self, deep=True):
        out = dict()
        if self.columns: out['columns'] = columns
        return out

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). 
        '''
        numerics = [np.float16, np.float32, np.float64]
        ints = [np.int16, np.int32, np.int64]
        output = X.copy()
        '''
        if self.columns is not None:
            for col in self.columns:
                if col.dtype not in numerics+ints:
                    output[col] = LabelEncoder().fit_transform(output[col])
                elif col.dtype not in ints:
                    output[col] = scale(output[col])
        else:
        '''
        try:
            for colname,col in output.iteritems():
                if col.dtype not in numerics+ints:
                    # Turn text columns into ints
                    output[colname] = LabelEncoder().fit_transform(output[colname])
                elif col.dtype in numerics:
                    # handle floats with scaling
                    # output[colname] = scale(output[colname])
                    pass 
                elif col.dtype in ints:
                    pass # leave integers alone
        except:
            output = LabelEncoder().fit_transform(output)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

## Declare Args

In [17]:
## Files ## 
AGE_GENDER_BUCKETS_FILE = 'Data/age_gender_bkts.csv'
COUNTRIES_FILE = 'Data/countries.csv'
SAMPLE_SUBMISSION_FILE = 'Data/sample_submission.csv'
SESSIONS_FILE = 'Data/sessions.csv'
TEST_DATA_FINAL_FILE = 'Data/test_users.csv'
TRAIN_DATA_FILE = 'Data/train_users.csv'

## Model args ##
TEST_N = 20000

## Fields ## 
USER_COLUMNS = ['id',
 'date_account_created',
 'timestamp_first_active',
 'date_first_booking',
 'gender',
 'age',
 'signup_method',
 'signup_flow',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'first_affiliate_tracked',
 'signup_app',
 'first_device_type',
 'first_browser']
TARGET_COLUMN = ['country_destination']

SESSION_COLUMNS = ['user_id',
 'action',
 'action_type',
 'action_detail',
 'device_type',
 'secs_elapsed']

AGE_BUCKET_COLUMNS = ['age_bucket',
 'country_destination',
 'gender',
 'population_in_thousands',
 'year']

## Read data


In [180]:
## Read user data ## 
train_full = pd.read_csv(TRAIN_DATA_FILE)
train_set, train_target = train_full[TEST_N:][USER_COLUMNS],\
    train_full[TEST_N:][TARGET_COLUMN]
test_set, test_target = train_full[:TEST_N][USER_COLUMNS],\
    train_full[:TEST_N][TARGET_COLUMN]

## Read session data ##
sessions = pd.read_csv(SESSIONS_FILE)

## Read supplemental datasets ## 
countries = pd.read_csv(COUNTRIES_FILE)
age_buckets = pd.read_csv(AGE_GENDER_BUCKETS_FILE)

## Explore data

#### Sessions

In [19]:
sessions.shape

(5600850, 6)

In [20]:
sessions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,ailzdefy6o,similar_listings,data,similar_listings,Windows Desktop,255
1,ailzdefy6o,similar_listings,data,similar_listings,Windows Desktop,183
2,ailzdefy6o,ajax_refresh_subtotal,click,change_trip_characteristics,Windows Desktop,175570
3,ailzdefy6o,show,,,Windows Desktop,86
4,ailzdefy6o,personalize,data,wishlist_content_update,Windows Desktop,1535


In [21]:
cf = ['action','action_type','action_detail','device_type']
s = sessions[cf].copy().fillna('missing')
mcl = MultiColumnLabelEncoder()
ohe = OneHotEncoder()
x = ohe.fit_transform(
    mcl.fit_transform(s)
).todense()

In [22]:
loops = sessions.shape[0]//100*np.arange(100)

o = []
for i,l in enumerate(loops):
    try:
        a,b = loops[i],loops[i+1]
    except:
        a,b = loops[i],sessions.shape[0]
    sessions_new = pd.DataFrame(np.concatenate(
        (
            sessions[['user_id']][a:b]\
            , x[a:b]\
            , sessions[['secs_elapsed']][a:b]
        )
        , axis=1
    ))
    sessions_grouped = sessions_new.groupby([0]).sum()
    o.append(sessions_grouped)

KeyError: 0

In [47]:
sessions_new = pd.concat(o,ignore_index=True)
sessions_new['user_id'] = pd.concat(o).index
sessions_new = sessions_new.groupby('user_id').sum()

#### User data

In [77]:
train_set.shape

(151239, 15)

In [78]:
train_set.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
20000,t9e8wr7dsl,2012-03-28,20120328191328,,-unknown-,,basic,1,en,direct,direct,,Web,Other/Unknown,-unknown-
20001,dekuxqk9lk,2012-03-28,20120328194449,2012-04-02,FEMALE,30.0,facebook,0,en,direct,direct,untracked,iOS,iPad,Mobile Safari
20002,dvsvhyf3od,2012-03-28,20120328195154,2012-04-10,FEMALE,27.0,basic,0,en,sem-brand,google,tracked-other,Web,Mac Desktop,Safari
20003,zusk44ltvs,2012-03-28,20120328195655,2012-03-31,-unknown-,67.0,basic,0,en,sem-non-brand,google,omg,Web,Mac Desktop,Firefox
20004,4ci9oqtl8q,2012-03-28,20120328200645,2012-04-02,MALE,34.0,facebook,0,en,sem-non-brand,google,untracked,Web,iPhone,Mobile Safari


In [158]:
train_set.index = train_set['id']

In [119]:
train_set['gender'].value_counts()

FEMALE    45658
MALE      38954
OTHER       205
Name: gender, dtype: int64

In [159]:
train_set.loc[train_set['gender']=='-unknown-',['gender']] = np.nan

In [120]:
train_set.head()

Unnamed: 0_level_0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
t9e8wr7dsl,t9e8wr7dsl,2012-03-28,20120328191328,,,,basic,1,en,direct,direct,,Web,Other/Unknown,-unknown-
dekuxqk9lk,dekuxqk9lk,2012-03-28,20120328194449,2012-04-02,FEMALE,30.0,facebook,0,en,direct,direct,untracked,iOS,iPad,Mobile Safari
dvsvhyf3od,dvsvhyf3od,2012-03-28,20120328195154,2012-04-10,FEMALE,27.0,basic,0,en,sem-brand,google,tracked-other,Web,Mac Desktop,Safari
zusk44ltvs,zusk44ltvs,2012-03-28,20120328195655,2012-03-31,,67.0,basic,0,en,sem-non-brand,google,omg,Web,Mac Desktop,Firefox
4ci9oqtl8q,4ci9oqtl8q,2012-03-28,20120328200645,2012-04-02,MALE,34.0,facebook,0,en,sem-non-brand,google,untracked,Web,iPhone,Mobile Safari


In [124]:
train_set['age'].value_counts()

30     4367
28     4251
31     4215
29     4212
27     4104
32     4101
33     3822
26     3582
34     3546
35     3316
25     3112
36     2858
37     2486
38     2347
24     2321
39     2055
40     1905
41     1783
23     1696
42     1531
44     1513
45     1505
43     1438
46     1307
22     1227
47     1147
48     1054
50      982
51      931
49      913
       ... 
102      29
85       24
103      21
16       20
86       19
87       19
81       19
100      19
96       18
101      18
84       17
99       16
90       16
82       15
106      14
93       13
98       12
83       12
94       11
108      11
97       10
92       10
107      10
89        9
88        6
15        5
2         5
91        4
14        2
1         1
Name: age, dtype: int64

In [160]:
train_set.loc[train_set['age']>110,['age']] = np.nan

In [164]:
train_set['date_created'] = pd.to_datetime(train_set['date_account_created'])
train_set['date_first_booking'] = pd.to_datetime(train_set['date_first_booking'])
train_set['year_created'] = train_set['date_created'].dt.year
train_set['month_created'] = train_set['date_created'].dt.month
train_set['year_first_booking'] = train_set['date_first_booking'].dt.year
train_set['month_first_booking'] = train_set['date_first_booking'].dt.month
train_set['days_before_booking'] = train_set['date_first_booking']-train_set['date_created']

In [179]:
train_set.head()

Unnamed: 0_level_0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,...,first_affiliate_tracked,signup_app,first_device_type,first_browser,date_created,year_created,month_created,year_first_booking,month_first_booking,days_before_booking
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
t9e8wr7dsl,t9e8wr7dsl,2012-03-28,20120328191328,missing,missing,missing,basic,1,en,direct,...,missing,Web,Other/Unknown,-unknown-,2012-03-28 00:00:00,2012,3,missing,missing,missing
dekuxqk9lk,dekuxqk9lk,2012-03-28,20120328194449,2012-04-02 00:00:00,FEMALE,30,facebook,0,en,direct,...,untracked,iOS,iPad,Mobile Safari,2012-03-28 00:00:00,2012,3,2012,4,5 days 00:00:00
dvsvhyf3od,dvsvhyf3od,2012-03-28,20120328195154,2012-04-10 00:00:00,FEMALE,27,basic,0,en,sem-brand,...,tracked-other,Web,Mac Desktop,Safari,2012-03-28 00:00:00,2012,3,2012,4,13 days 00:00:00
zusk44ltvs,zusk44ltvs,2012-03-28,20120328195655,2012-03-31 00:00:00,missing,67,basic,0,en,sem-non-brand,...,omg,Web,Mac Desktop,Firefox,2012-03-28 00:00:00,2012,3,2012,3,3 days 00:00:00
4ci9oqtl8q,4ci9oqtl8q,2012-03-28,20120328200645,2012-04-02 00:00:00,MALE,34,facebook,0,en,sem-non-brand,...,untracked,Web,iPhone,Mobile Safari,2012-03-28 00:00:00,2012,3,2012,4,5 days 00:00:00


In [168]:
train_set['days_before_booking'].value_counts()

0 days      14385
1 days      10027
2 days       4416
3 days       2729
4 days       1984
5 days       1540
6 days       1235
7 days       1154
8 days        912
9 days        734
10 days       623
11 days       557
12 days       488
13 days       474
14 days       456
15 days       404
16 days       374
17 days       308
21 days       279
19 days       277
18 days       266
20 days       262
23 days       210
29 days       205
24 days       204
28 days       204
25 days       201
22 days       198
27 days       190
30 days       167
            ...  
305 days       30
201 days       30
193 days       29
275 days       29
266 days       29
311 days       28
246 days       28
180 days       27
143 days       27
288 days       27
207 days       27
321 days       26
298 days       26
319 days       26
219 days       25
251 days       25
255 days       25
242 days       25
227 days       25
363 days       24
264 days       23
252 days       23
341 days       22
300 days       19
-85 days  

In [176]:
train_set.loc[train_set['days_before_booking']<pd.Timedelta(0)\
              ,['days_before_booking']] = np.nan

In [178]:
train_set = train_set.fillna('missing')

In [143]:
mcl2 = MultiColumnLabelEncoder()
ohe2 = OneHotEncoder()
t = mcl2.fit_transform(train_set)