In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os, gc
import seaborn as sns

# Raise pandas' display limits so long/wide frames are printed without truncation.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Global seaborn look for every plot in this notebook (white grid, 2x fonts).
sns.set(context='notebook', style='whitegrid', palette='deep', font='sans-serif',
        font_scale=2, color_codes=True, rc=None)

In [2]:
# Read the four input files in one pass: train set, test set, sample
# submission and the data dictionary.
train_df, test_df, sub_df, info_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/test.csv',
        'data/sample_submission.csv',
        'data/data_dict.csv',
    )
)

# Cell output: (rows, columns) of every frame, in the order loaded above.
tuple(frame.shape for frame in (train_df, test_df, sub_df, info_df))

((341424, 24), (146765, 23), (146765, 2), (24, 2))

In [3]:
# Number of distinct non-null values in every training column.
train_df.nunique()

reservation_id                        341424
booking_date                            1621
checkin_date                            1532
checkout_date                           1531
channel_code                               3
main_product_code                          5
numberofadults                            28
numberofchildren                          12
persontravellingid                         6
resort_region_code                         3
resort_type_code                           7
room_type_booked_code                      6
roomnights                                44
season_holidayed_code                      4
state_code_residence                      37
state_code_resort                         11
total_pax                                 18
member_age_buckets                        10
booking_type_code                          2
memberid                              101327
cluster_code                               6
reservationstatusid_code                   4
resort_id 

# Important points:

- Only the `season_holidayed_code` and `state_code_residence` columns contain `NaN` values.
- All variables are categorical except the `ids` (which are hash codes) and `amount_spent_per_room_night_scaled` (a `float`, which is the target).
- The train set has `101327` unique `memberid` values across `341424` rows; the test set has `43496` unique `memberid` values across `146765` rows.


In [4]:
# Tag every row with its origin (1 = train, 0 = test) so the combined frame
# can be split back apart later.
train_df['tr_flag'] = 1
test_df['tr_flag']  = 0
train_len = train_df.shape[0]

# Stack train on top of test. The two frames do not have identical columns
# (the shapes loaded above differ by one column), so pandas has to align them.
# `sort=True` states the alphabetical column ordering explicitly instead of
# relying on a version-dependent default, and silences the FutureWarning.
train_test = pd.concat([train_df, test_df], axis=0, sort=True).reset_index(drop=True)
print("Size: ", train_test.shape)

# The separate frames are no longer needed; release their memory.
del train_df, test_df
gc.collect()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


Size:  (488189, 25)


32

In [5]:
# Columns treated as categorical features, one per line for easy diffing.
cat_cols = [
    'channel_code',
    'main_product_code',
    'numberofadults',
    'numberofchildren',
    'persontravellingid',
    'resort_region_code',
    'resort_type_code',
    'room_type_booked_code',
    'roomnights',
    'season_holidayed_code',
    'member_age_buckets',
    'total_pax',
    'state_code_residence',
    'state_code_resort',
    'booking_type_code',
    'cluster_code',
    'reservationstatusid_code',
    'resort_id',
]
# Cell output: how many categorical columns were listed.
len(cat_cols)

18

# Data Cleaning
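- `numberofadults`: values `> 8` or `< 1` are set to `9`
- `roomnights`: values `> 12` or `< 1` are set to `13` (**suspicious** — this rule is currently disabled in the cell below)
- `total_pax`: values `> 7` or `< 1` are set to `8`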

In [6]:
# Fold out-of-range party sizes into a single overflow bucket per column.
# The analogous rule for `roomnights` (> 12 or < 1 -> 13) is intentionally
# not applied here.

# numberofadults: anything above 8 or below 1 goes to bucket 9
adults_out_of_range = (train_test['numberofadults'] > 8) | (train_test['numberofadults'] < 1)
train_test.loc[adults_out_of_range, 'numberofadults'] = 9

# total_pax: anything above 7 or below 1 goes to bucket 8
pax_out_of_range = (train_test['total_pax'] > 7) | (train_test['total_pax'] < 1)
train_test.loc[pax_out_of_range, 'total_pax'] = 8



# My understanding about data:
### Candidate keys for aggregation-based features
- memberid
- resort_id
- cluster_code
- state_code_residence
- state_code_resort
- resort_region_code
- resort_type_code

In [7]:
# Convert the three date columns from text to datetime64; the source strings
# are day-first.
for date_col in ('booking_date', 'checkin_date', 'checkout_date'):
    train_test[date_col] = pd.to_datetime(train_test[date_col], dayfirst=True)

In [8]:
def _iso_week(dates):
    """Return the ISO week number for each timestamp in the datetime Series `dates`.

    `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0.
    Prefer `Series.dt.isocalendar().week` when the installed pandas provides
    it, and fall back to `.dt.week` on older versions.
    """
    accessor = dates.dt
    if hasattr(accessor, 'isocalendar'):
        week = accessor.isocalendar().week
        # isocalendar() returns a nullable UInt32 column; convert it to the
        # plain numpy dtype `.dt.week` used to give (float only if NaT exists).
        return week.astype('float64') if week.isna().any() else week.astype('int64')
    return accessor.week


# Length of stay as a timedelta (check-out minus check-in).
train_test['days_diff'] = train_test['checkout_date'] - train_test['checkin_date']

# Calendar parts of the booking date.
train_test['booking_week']  = _iso_week(train_test['booking_date'])
train_test['booking_month'] = train_test['booking_date'].dt.month
train_test['booking_year']  = train_test['booking_date'].dt.year
train_test['booking_dow']   = train_test['booking_date'].dt.dayofweek

# Calendar parts of the check-in date.
train_test['checkin_week']  = _iso_week(train_test['checkin_date'])
train_test['checkin_month'] = train_test['checkin_date'].dt.month
train_test['checkin_year']  = train_test['checkin_date'].dt.year
train_test['checkin_dow']   = train_test['checkin_date'].dt.dayofweek

# The same four parts for `checkout_date` are deliberately not generated.

In [9]:
# Length of stay in whole days. Recomputed from the two date columns rather
# than converting `days_diff` in place, so this cell can be re-run safely
# (calling `.dt.days` on an already-integer column raises).
train_test['days_diff'] = (train_test['checkout_date'] - train_test['checkin_date']).dt.days

# days_diff: stays longer than 7 days share a single overflow bucket, 8
train_test.loc[train_test['days_diff'] > 7, 'days_diff'] = 8


In [10]:
# `reservation_id` is removed before encoding. A non-inplace drop with
# errors='ignore' makes re-running this cell a no-op instead of a KeyError
# once the column is already gone.
train_test = train_test.drop(columns=['reservation_id'], errors='ignore')

# Replace the `resort_id` and `memberid` values with ordinal integer codes.
label_enc_cols = ['resort_id','memberid']
import category_encoders as ce

enc = ce.OrdinalEncoder(cols=label_enc_cols, drop_invariant=True)
train_test = enc.fit_transform(train_test)



In [11]:
# Preview the first five rows of the combined, encoded frame.
train_test.head(n=5)

Unnamed: 0,amount_spent_per_room_night_scaled,booking_date,booking_type_code,channel_code,checkin_date,checkout_date,cluster_code,main_product_code,member_age_buckets,memberid,numberofadults,numberofchildren,persontravellingid,reservationstatusid_code,resort_id,resort_region_code,resort_type_code,room_type_booked_code,roomnights,season_holidayed_code,state_code_residence,state_code_resort,total_pax,tr_flag,days_diff,booking_week,booking_month,booking_year,booking_dow,checkin_week,checkin_month,checkin_year,checkin_dow
0,7.706428,2018-04-05,1,3,2018-04-05,2018-04-06,F,1,F,1,2,0,46,C,1,3,3,3,1,2.0,7.0,3,3,1,1,14,4,2018,3,14,4,2018,3
1,6.662563,2015-01-23,1,1,2015-04-11,2015-04-16,F,1,F,1,2,0,46,A,2,3,3,4,5,2.0,7.0,5,2,1,5,4,1,2015,4,15,4,2015,5
2,7.871602,2015-01-28,1,1,2015-02-01,2015-02-05,E,1,F,1,2,0,47,A,3,1,5,4,4,2.0,7.0,1,2,1,4,5,1,2015,2,5,2,2015,6
3,5.344943,2015-05-02,1,1,2015-06-11,2015-06-16,D,1,F,1,2,2,46,A,4,2,2,3,5,2.0,7.0,2,2,1,5,18,5,2015,5,24,6,2015,3
4,7.059346,2015-09-02,1,1,2015-12-14,2015-12-19,D,1,F,1,2,0,46,A,4,2,2,4,5,2.0,7.0,2,2,1,5,36,9,2015,2,51,12,2015,0


In [12]:
print("Shape: ", train_test.shape, " ==> ", end=" ")

# (group-by keys, column prefix) for each family of `roomnights` statistics.
# The order is significant: it fixes the order of the appended columns.
roomnights_stat_specs = [
    (['memberid', 'resort_region_code'],    'mem_resortRegion_'),
    (['memberid', 'resort_type_code'],      'mem_resortType_'),
    (['memberid', 'state_code_residence'],  'mem_staResidence_'),
    (['memberid', 'state_code_resort'],     'mem_stResort_'),
    (['resort_id', 'state_code_residence'], 'res_staResidence_'),
    (['resort_id', 'state_code_resort'],    'res_stResort_'),
    (['resort_id', 'cluster_code'],         'res_cluster_'),
    (['resort_id', 'resort_type_code'],     'res_resortType_'),
]

for group_keys, prefix in roomnights_stat_specs:
    # median / count / max of `roomnights` within each key combination ...
    gps = train_test.groupby(group_keys)['roomnights'].aggregate(['median', 'count', 'max'])
    gps.columns = [prefix + stat_name for stat_name in gps.columns]
    # ... joined back onto every row that shares those keys.
    train_test = pd.merge(train_test, gps, on=group_keys, how='left')

print(train_test.shape)


Shape:  (488189, 33)  ==>  (488189, 57)


In [13]:
print("Shape: ", train_test.shape, " ==> ", end=" ")

# Second key paired with `days_diff` -> prefix of the resulting count column.
# The order is significant: it fixes the order of the appended columns.
days_diff_count_specs = [
    ('resort_region_code',   'dayDiff_resortRegion_'),
    ('resort_type_code',     'dayDiff_resortType_'),
    ('state_code_residence', 'dayDiff_staResidence_'),
    ('state_code_resort',    'dayDiff_stResort_'),
    ('cluster_code',         'dayDiff_cluster_'),
]

for partner_col, prefix in days_diff_count_specs:
    group_keys = ['days_diff', partner_col]
    # Count of `roomnights` values per (days_diff, partner) combination ...
    gps = train_test.groupby(group_keys)['roomnights'].aggregate(['count'])
    gps.columns = [prefix + stat_name for stat_name in gps.columns]
    # ... joined back onto every row that shares those keys.
    train_test = pd.merge(train_test, gps, on=group_keys, how='left')

print(train_test.shape)


Shape:  (488189, 57)  ==>  (488189, 62)


In [14]:
# Persist the engineered feature table (without the index) for later use.
features_path = 'data/train_test.csv'
train_test.to_csv(features_path, index=False)
print("saved features")

saved features
