## Idea:
Given $k$, build a model to predict the number of visitors $k$ days ahead using the following features:
1. (holiday flag, day_of_week, is_closed, #visitors) for the past n weeks.
2. store_id, genre, area

Apply only label encoding to the categorical variables.

In [1]:
import os
import numpy as np
import pandas as pd
import platform
import pickle
from pathlib import Path
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%qtconsole

In [3]:
# Root folder of the competition data: one location on the Linux box,
# another one everywhere else (the Windows machine).
data_dir = (
    '/home/alin/Data/Recruit_Holding'
    if platform.system() == 'Linux'
    else 'C:/Users/alin/Documents/Data/Recruit_Holding'
)

In [4]:
# Restore the data frames that were pickled into DFS.p (the message below
# points to the EDA1 notebook as the producer of that dump).
DFS_dump = data_dir + '/DFS.p'
if Path(DFS_dump).is_file():
    print('load previous dump')
    # NOTE: unpickling can execute arbitrary code -- only load dumps you created yourself.
    # The context manager closes the file handle even if unpickling fails.
    with open(DFS_dump, 'rb') as dump_file:
        DFS = pickle.load(dump_file)
    # Unpack the dictionary into notebook-level names used by the cells below.
    air_reserve = DFS['air_reserve']
    air_reserve_day = DFS['air_reserve_day']
    hpg_reserve = DFS['hpg_reserve']
    hpg_reserve_day = DFS['hpg_reserve_day']
    air_visit_hist = DFS['air_visit_hist']
    date_info = DFS['date_info']
    test = DFS['test']
    air_store_info = DFS['air_store_info']
    hpg_store_info = DFS['hpg_store_info']
    store_id_relation = DFS['store_id_relation']
else:
    # Nothing is bound in this branch; the cells below need the names above.
    print('run EDA1 first')

load previous dump


### Build the training and testing datasets before label encoding

### step 1: add dates when a store is closed.

In [5]:
from itertools import product

In [6]:
def get_grid(k = 3):
    '''
    Build a dense (store x day) table for the last k weeks of air_visit_hist.

    Reads the notebook-level frames air_visit_hist, date_info and air_store_info.

    input:
    k -- number of weeks to keep, counted back from the largest day_ind in air_visit_hist

    output:
    DataFrame with one row per (air_store_id, day_ind) pair, where the stores are those
    that appear at least once inside the window. Visit, calendar and store columns are
    merged in. A (store, day) pair without a visit record gets closed = 1, and every
    missing value (including its visitors) is filled with 0.
    '''
    last_train_day = max(air_visit_hist.day_ind)
    first_train_day = last_train_day - k * 7 + 1

    # filter into the desired time frame
    in_window = (air_visit_hist.day_ind >= first_train_day) & (air_visit_hist.day_ind <= last_train_day)
    hist1 = air_visit_hist[in_window].copy()
    all_stores = hist1.air_store_id.unique()
    all_days = range(first_train_day, last_train_day + 1)

    # create the store x day grid; building the frame from tuples keeps day_ind numeric,
    # so no string round trip / row-wise int() conversion is needed
    grid = pd.DataFrame(list(product(all_stores, all_days)), columns=['air_store_id', 'day_ind'])

    # add visit information (left join: missing visits stay NaN for now)
    all_data = grid.merge(hist1, how='left', on=['air_store_id', 'day_ind'])

    # add date type information; on a name clash the left copy gets the '_l' suffix and is dropped.
    # Match the whole suffix so that an unrelated column that merely ends in 'l' is kept.
    all_data = all_data.merge(date_info, on='day_ind', suffixes=['_l', ''])
    all_data = all_data.drop([col for col in all_data.columns if col.endswith('_l')], axis=1)

    # add store information, again preferring the right-hand copy of clashing columns
    all_data = all_data.merge(air_store_info, on='air_store_id', suffixes=['_l', ''])
    drop_columns = [col for col in all_data.columns if col.endswith('_l')]
    drop_columns += ['calendar_date', 'date', 'latitude', 'longitude', 'hpg_store_id']
    all_data = all_data.drop(drop_columns, axis=1)

    # for the dates on which the visit information of a store is missing,
    # assume that it was closed and had 0 visitors
    all_data['closed'] = all_data['visitors'].isnull().astype('int64')
    all_data = all_data.fillna(0)
    return all_data
        
    
    

In [7]:
# Build the store x day grid for the last two weeks (get_grid keeps the last k weeks).
grid = get_grid(k=2)

### 2.  create data frames with lag information

Given a gap, create a training set with the lagged features lag_gap, lag_(gap+1), ...

In [59]:
def append_lag(grid, lag_begin, lag_length):
    '''
    Attach lagged copies of the daily columns to grid and derive a train and a test frame.

    For a row with day_ind = D the columns <name>_lag_s hold the values the same store
    had on day D - s, for s = lag_begin, ..., lag_begin + lag_length - 1.
    Example: lag_begin = 7 and lag_length = 8 add lag_7, lag_8, ..., lag_14.

    The result is meant for a model that forecasts the visitors lag_begin days ahead.

    output:
    grid_train -- rows whose oldest lag is available and whose own closed flag is not 1,
                  without the day_ind, month_ind and closed columns
    grid_test  -- one row per store built from the most recent lag_length days of grid,
                  named with the same *_lag_s columns, without day_ind and month_ind
    '''
    key_cols = ['air_store_id', 'day_ind']
    lagged_cols = ['visitors', 'day_of_week', 'holiday_flg', 'holiday_eve', 'closed']
    lag_end = lag_begin + lag_length - 1

    def lag_names(shift):
        # mapping old name -> '<name>_lag_<shift>' for the columns that get lagged
        return {col: '{}_lag_{}'.format(col, shift) for col in lagged_cols}

    untouched = grid.copy()

    # ---- training frame: join the grid with itself, shifted forward in time
    merged = grid
    for shift in range(lag_begin, lag_end + 1):
        print('train day:', shift)
        shifted = merged[key_cols + lagged_cols].copy()
        shifted['day_ind'] = shifted['day_ind'] + shift
        shifted = shifted.rename(columns=lag_names(shift))
        merged = pd.merge(merged, shifted, on=key_cols, how='left')

    oldest_lag_known = merged['visitors_lag_' + str(lag_end)].notnull()
    not_closed = merged['closed'] != 1
    grid_train = merged[oldest_lag_known & not_closed].drop(['day_ind', 'month_ind', 'closed'], axis=1)

    # ---- test frame: the latest day plays the role of lag_begin, older days follow
    newest_day = np.max(merged.day_ind)
    grid_test = untouched[untouched.day_ind == newest_day].rename(columns=lag_names(lag_begin))
    for shift in range(lag_begin + 1, lag_end + 1):
        print('test day:', shift)
        source_day = newest_day - shift + lag_begin
        day_slice = untouched.loc[untouched.day_ind == source_day, ['air_store_id'] + lagged_cols]
        day_slice = day_slice.rename(columns=lag_names(shift))
        grid_test = pd.merge(grid_test, day_slice, on='air_store_id')
    grid_test = grid_test.drop(['day_ind', 'month_ind'], axis=1)

    return grid_train, grid_test

### Label encode the categorical variables

In [60]:
from sklearn.base import BaseEstimator, TransformerMixin


In [61]:
from sklearn.preprocessing import LabelEncoder

In [62]:
class CatLabler(BaseEstimator, TransformerMixin):
    '''
    Transformer that label-encodes a given list of columns.

    fit learns one LabelEncoder per column; transform returns a copy of the
    input in which those columns are replaced by their integer codes.
    '''
    def __init__(self, cat_cols):
        # names of the columns to encode
        self.cat_cols = cat_cols

    def fit(self, X, y=None):
        # one independently fitted encoder per categorical column (y is ignored)
        self.encoders = {name: LabelEncoder().fit(X[name]) for name in self.cat_cols}
        return self

    def transform(self, X, y=None):
        # work on a copy so that the caller's frame is left unchanged
        encoded = X.copy()
        for name in self.cat_cols:
            fitted = self.encoders[name]
            encoded[name] = fitted.transform(X[name])
        return encoded

### Put all together to create final dataset

In [90]:
def create_train_test(k_grid=15, lag_length = 21, lag_begin0=1, lag_begin1=39):
    '''
    Build one label-encoded train/test data set per forecast horizon.

    input:
    k_grid -- number of weeks, counted back from the last day in air_visit_hist,
        covered by the store x day grid (passed to get_grid)
    lag_length -- number of consecutive lagged days used as features
    lag_begin0, lag_begin1 -- first and last (inclusive) forecast horizon LAG, in days

    Example: with k_grid = 15, lag_length = 21 and 477 as the last day_ind in air_visit_hist,
        the grid has every store with day_ind from 477 - 7*15 + 1 = 373 to 477.
        The training data for LAG = 2 then contains:
        (i) day_ind = 477 as y, day_ind = 475, 474, ..., 475 - 21 + 1 = 455 as X
        (ii) day_ind = 476 as y, day_ind = 474, 473, ..., 454 as X
        (iii) day_ind = 475 as y, day_ind = 473, 472, ..., 453 as X
         ...
        (last) day_ind = 395 as y, day_ind = 393, 392, ..., 373 as X

        The corresponding test/predict set is
        y_test for day_ind = 477 + LAG = 479
        X_test consisting of day_ind = 477, 476, ..., 457

    output:
    dict mapping each LAG in lag_begin0..lag_begin1 to the tuple
    (y_train, X_train, X_test, air_store_id of the test rows before encoding)
    '''
    grid = get_grid(k_grid)

    Data = {}
    for lag_begin in range(lag_begin0, lag_begin1 + 1):
        print('lag_begin=', lag_begin)
        lag_end = lag_begin + lag_length - 1
        gtrain, gtest = append_lag(grid, lag_begin, lag_length)
        # categorical features: the store attributes plus every lagged day_of_week column
        cat_columns = ['air_store_id', 'air_genre_name', 'air_area_name'] + [
            'day_of_week_lag_' + str(lag) for lag in range(lag_begin, lag_end + 1)]
        catLabler = CatLabler(cat_columns)

        y_train = gtrain.visitors
        # restrict the training features to exactly the columns available at prediction time
        X_train0 = gtrain[gtest.columns]
        X_test0 = gtest

        # the encoders are learned on the training rows only and then applied to both sets
        catLabler.fit(X_train0)
        X_train1 = catLabler.transform(X_train0)
        X_test1 = catLabler.transform(X_test0)

        Data[lag_begin] = (y_train, X_train1, X_test1, X_test0.air_store_id)
    return Data

## Submission 2.1  grid = 15, lag_length = 21, Random Forest

In [104]:
#Data = create_train_test()

In [103]:
# Cache the per-horizon data sets on disk; the context manager closes (and
# flushes) the file even if pickling fails part-way.
with open(data_dir + '/SubData15_21.p', 'wb') as dump_file:
    pickle.dump(Data, dump_file)

In [108]:
from sklearn.ensemble import RandomForestRegressor

In [111]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [166]:
# Fit one random forest per forecast horizon i (days after the last training
# day) and collect the predictions as (id, visitors) frames in Results.
last_train_date = np.max(air_visit_hist.day_ind)
Results = []
rf = RandomForestRegressor(n_estimators=300)
for i in range(1, 40):
    print(i)
    y_train, X_train, X_test, stores = Data[i]
    rf.fit(X_train, y_train)
    y_predict = rf.predict(X_test)
    rdf = pd.DataFrame({'id': stores, 'visitors': y_predict})
    dt = date_info[date_info.day_ind == (last_train_date + i)]
    # dt is a one-row frame: take its calendar_date as a scalar and build the
    # '<store>_<date>' ids with one vectorised concatenation. .iloc[0] fails
    # loudly if the target day is missing from date_info.
    rdf['id'] = rdf['id'] + '_' + dt.calendar_date.iloc[0]
    Results.append(rdf)
    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [183]:
# Stack the per-horizon prediction frames.
Result_rf_df = pd.concat(Results)

# Load the sample submission and remember its row order in 'ord'.
sample_submission = pd.read_csv(data_dir + '/sample_submission.csv.zip')
sample_submission['ord'] = sample_submission.index

# Join on id; the predicted 'visitors' keeps its name, the sample's copy gets '_r'.
output_rf = pd.merge(Result_rf_df, sample_submission, on='id', suffixes=['','_r'])
output_rf = output_rf[['id', 'visitors', 'ord']]

# Write the predictions in the row order of the sample submission.
output = output_rf.sort_values('ord', axis=0)
output = output[['id', 'visitors']]
output.to_csv(data_dir + '/submission2_1.csv', index=False)

## Submission 2.2, grid = 15, lag_length = 21, ExtraTree

In [201]:
from sklearn.ensemble import ExtraTreesRegressor

In [211]:
# Fit one extra-trees model per forecast horizon i (days after the last
# training day) and collect the predictions as (id, visitors) frames in Results.
last_train_date = np.max(air_visit_hist.day_ind)
Results = []
et = ExtraTreesRegressor(n_estimators=300)
for i in range(1, 40):
    print(i)
    y_train, X_train, X_test, stores = Data[i]
    et.fit(X_train, y_train)
    y_predict = et.predict(X_test)
    rdf = pd.DataFrame({'id': stores, 'visitors': y_predict})
    dt = date_info[date_info.day_ind == (last_train_date + i)]
    # dt is a one-row frame: take its calendar_date as a scalar and build the
    # '<store>_<date>' ids with one vectorised concatenation. .iloc[0] fails
    # loudly if the target day is missing from date_info.
    rdf['id'] = rdf['id'] + '_' + dt.calendar_date.iloc[0]
    Results.append(rdf)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [212]:
# Stack the per-horizon prediction frames.
Result_rf_df = pd.concat(Results)

# Load the sample submission and remember its row order in 'ord'.
sample_submission = pd.read_csv(data_dir + '/sample_submission.csv.zip')
sample_submission['ord'] = sample_submission.index

# Join on id; the predicted 'visitors' keeps its name, the sample's copy gets '_r'.
output_rf = pd.merge(Result_rf_df, sample_submission, on='id', suffixes=['','_r'])
output_rf = output_rf[['id', 'visitors', 'ord']]

# Write the predictions in the row order of the sample submission.
output = output_rf.sort_values('ord', axis=0)
output = output[['id', 'visitors']]
output.to_csv(data_dir + '/submission2_2.csv', index=False)

## Submission 2.3 Xgboost

In [29]:
# Reload the cached per-horizon data sets; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load dumps you created yourself.
with open(data_dir + '/SubData15_21.p', 'rb') as dump_file:
    Data = pickle.load(dump_file)

In [None]:
import xgboost as xgb

In [69]:
# Train one XGBoost model per forecast horizon i on log1p(visitors) and
# collect the back-transformed predictions as (id, visitors) frames in Results.
last_train_date = np.max(air_visit_hist.day_ind)
Results = []
xgb_params = {
    'eta': 0.15,
    'max_depth': 5,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

for i in range(1, 40):
    print(i)
    y_train, X_train, X_test, stores = Data[i]
    # log1p / expm1 are the numerically accurate forms of log(y + 1) / exp(p) - 1
    ly_train = np.log1p(y_train)
    xgtrain = xgb.DMatrix(X_train.values, ly_train)
    xgtest = xgb.DMatrix(X_test.values)
    # cross-validate with early stopping to pick the number of boosting rounds
    cvresult = xgb.cv(xgb_params, xgtrain, num_boost_round=1000, nfold=5, metrics='rmse', early_stopping_rounds=50, verbose_eval=False)
    # NOTE(review): 55 extra rounds are added on top of the CV result -- presumably
    # because the final model is trained on all rows; confirm the intent.
    num_rounds = cvresult.shape[0] + 55
    model = xgb.train(xgb_params, xgtrain, num_boost_round=num_rounds, evals = [(xgtrain, 'train')], verbose_eval=50)
    # make prediction and undo the log transform
    ly_predict = model.predict(xgtest)
    y_predict = np.expm1(ly_predict)
    rdf = pd.DataFrame({'id': stores, 'visitors': y_predict})
    dt = date_info[date_info.day_ind == (last_train_date + i)]
    # dt is a one-row frame: take its calendar_date as a scalar and build the
    # '<store>_<date>' ids with one vectorised concatenation. .iloc[0] fails
    # loudly if the target day is missing from date_info.
    rdf['id'] = rdf['id'] + '_' + dt.calendar_date.iloc[0]
    Results.append(rdf)

1
[0]	train-rmse:2.10166
[50]	train-rmse:0.498601
[100]	train-rmse:0.484249
[150]	train-rmse:0.472824
2
[0]	train-rmse:2.10117
[50]	train-rmse:0.500068
[100]	train-rmse:0.484594
[150]	train-rmse:0.473195
3
[0]	train-rmse:2.10354
[50]	train-rmse:0.49982
[100]	train-rmse:0.485468
[150]	train-rmse:0.473023
[200]	train-rmse:0.462507
4
[0]	train-rmse:2.10573
[50]	train-rmse:0.499252
[100]	train-rmse:0.484961
[150]	train-rmse:0.473032
5
[0]	train-rmse:2.10725
[50]	train-rmse:0.499533
[100]	train-rmse:0.484865
[150]	train-rmse:0.472543
[200]	train-rmse:0.461715
6
[0]	train-rmse:2.10987
[50]	train-rmse:0.499754
[100]	train-rmse:0.484306
[150]	train-rmse:0.472144
7
[0]	train-rmse:2.1105
[50]	train-rmse:0.501647
[100]	train-rmse:0.486622
[150]	train-rmse:0.474381
[200]	train-rmse:0.463868
8
[0]	train-rmse:2.10816
[50]	train-rmse:0.505435
[100]	train-rmse:0.489964
[150]	train-rmse:0.477365
9
[0]	train-rmse:2.1083
[50]	train-rmse:0.506427
[100]	train-rmse:0.491326
[150]	train-rmse:0.477573
[200]	t

In [72]:
# Stack the per-horizon prediction frames.
Result_df = pd.concat(Results)

# Load the sample submission and remember its row order in 'ord'.
sample_submission = pd.read_csv(data_dir + '/sample_submission.csv.zip')
sample_submission['ord'] = sample_submission.index

# Join on id; the predicted 'visitors' keeps its name, the sample's copy gets '_r'.
output_df = pd.merge(Result_df, sample_submission, on='id', suffixes=['','_r'])
output_df = output_df[['id', 'visitors', 'ord']]

# Write the predictions in the row order of the sample submission.
output = output_df.sort_values('ord', axis=0)
output = output[['id', 'visitors']]
output.to_csv(data_dir + '/submission2_3.csv', index=False)

## Submission 2.4 lightgbm

In [6]:
import lightgbm as lgb

In [43]:
# Train one LightGBM model per forecast horizon i on log1p(visitors) and
# collect the back-transformed predictions as (id, visitors) frames in Results.
last_train_date = np.max(air_visit_hist.day_ind)
Results = []
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': {'rmse'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9
    }

for i in range(1, 40):
    print(i)
    y_train, X_train, X_test, stores = Data[i]
    # log1p / expm1 are the numerically accurate forms of log(y + 1) / exp(p) - 1
    ly_train = np.log1p(y_train)
    lgb_train = lgb.Dataset(X_train, ly_train)
    # cross-validate with early stopping to pick the number of boosting rounds
    cv_lgb = lgb.cv(params, lgb_train, num_boost_round = 1000, nfold=5, stratified=False, early_stopping_rounds = 50,
                verbose_eval = False)
    # NOTE(review): 50 extra rounds are added on top of the CV result -- presumably
    # because the final model is trained on all rows; confirm the intent.
    num_rounds = len(cv_lgb['rmse-mean']) + 50
    gbm = lgb.train(params,lgb_train,num_boost_round=num_rounds)
    # make prediction and undo the log transform
    ly_predict = gbm.predict(X_test)
    y_predict = np.expm1(ly_predict)
    rdf = pd.DataFrame({'id': stores, 'visitors': y_predict})
    dt = date_info[date_info.day_ind == (last_train_date + i)]
    # dt is a one-row frame: take its calendar_date as a scalar and build the
    # '<store>_<date>' ids with one vectorised concatenation. .iloc[0] fails
    # loudly if the target day is missing from date_info.
    rdf['id'] = rdf['id'] + '_' + dt.calendar_date.iloc[0]
    Results.append(rdf)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [44]:
# Stack the per-horizon prediction frames.
Result_df = pd.concat(Results)

# Load the sample submission and remember its row order in 'ord'.
sample_submission = pd.read_csv(data_dir + '/sample_submission.csv.zip')
sample_submission['ord'] = sample_submission.index

# Join on id; the predicted 'visitors' keeps its name, the sample's copy gets '_r'.
output_df = pd.merge(Result_df, sample_submission, on='id', suffixes=['','_r'])
output_df = output_df[['id', 'visitors', 'ord']]

# Write the predictions in the row order of the sample submission.
output = output_df.sort_values('ord', axis=0)
output = output[['id', 'visitors']]
output.to_csv(data_dir + '/submission2_4.csv', index=False)

## Submission 2.5  Average of 2.3 and 2.4

In [4]:
# Reload submission 2.1; sub1 is not referenced by any of the cells visible below.
sub1 = pd.read_csv(data_dir + '/submission2_1.csv')

In [7]:
# Reload the submission files 2.2, 2.3 and 2.4 for averaging.
sub2, sub3, sub4 = [
    pd.read_csv(data_dir + file_name)
    for file_name in ('/submission2_2.csv', '/submission2_3.csv', '/submission2_4.csv')
]

In [9]:
# Put the 2.3 and 2.4 predictions side by side as visitors_3 / visitors_4.
sub34 = sub3.merge(sub4, on='id', suffixes=['_3', '_4'])

In [12]:
# Plain average of the two predictions, computed column-wise instead of with
# a Python-level function call per row.
sub34['visitors'] = (sub34['visitors_3'] + sub34['visitors_4']) / 2

In [14]:
# Remove the two source columns in place; only id and the averaged visitors remain.
for source_col in ('visitors_3', 'visitors_4'):
    del sub34[source_col]

In [17]:
# Write the averaged 2.3/2.4 predictions without the index column.
sub34.to_csv(data_dir + '/submission2_34.csv', index=False)

## Submission 2.6 Average 2.2, 2.3 and 2.4

In [18]:
# Rebuild the side-by-side frame with the visitors_3 / visitors_4 columns.
sub34 = sub3.merge(sub4, on='id', suffixes=['_3', '_4'])

In [19]:
# Add the 2.2 predictions, which keep the plain column name 'visitors'.
sub234 = sub2.merge(sub34, on='id')

In [23]:
# Plain average of the three predictions (same left-to-right sum, then divide by 3).
sub234['visitors'] = sub234['visitors'].add(sub234['visitors_3']).add(sub234['visitors_4']).div(3)

In [26]:
# Remove the two source columns in place; only id and the averaged visitors remain.
for source_col in ('visitors_3', 'visitors_4'):
    del sub234[source_col]

In [28]:
# Write the averaged 2.2/2.3/2.4 predictions without the index column.
sub234.to_csv(data_dir + '/submission2_234.csv', index=False)