## Idea:
Given $k$, build a model to predict the number of visitors after k days using the following features:
1. (holidayflag, day_of_week, is_closed, #visitors) for the past n weeks.
2. store_id, genre, area

Try one-hot encoding for several of the categorical variables.

In [1]:
import os
import numpy as np
import pandas as pd
import platform
import pickle
from pathlib import Path
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%qtconsole

In [3]:
# Root folder of the competition data; the location depends on the machine's OS.
data_dir = (
    '/home/alin/Data/Recruit_Holding'
    if platform.system() == 'Linux'
    else 'C:/Users/alin/Documents/Data/Recruit_Holding'
)

In [4]:
# Load the data frames that the EDA1 notebook pickled into a single dict.
DFS_dump = data_dir + '/DFS.p'
if Path(DFS_dump).is_file():
    print('load previous dump')
    # NOTE: unpickling executes code stored in the file; only load dumps you created yourself.
    # The context manager closes the file handle as soon as the dump is read.
    with open(DFS_dump, 'rb') as dump_file:
        DFS = pickle.load(dump_file)
    air_reserve = DFS['air_reserve']
    air_reserve_day = DFS['air_reserve_day']
    hpg_reserve = DFS['hpg_reserve']
    hpg_reserve_day = DFS['hpg_reserve_day']
    air_visit_hist = DFS['air_visit_hist']
    date_info = DFS['date_info']
    test = DFS['test']
    air_store_info = DFS['air_store_info']
    hpg_store_info = DFS['hpg_store_info']
    store_id_relation = DFS['store_id_relation']
else:
    # Without the dump none of the frames above exist; the cells below will fail.
    print('run EDA1 first')

load previous dump


### Build the training and testing datasets before label encoding

### step 1: add dates when a store is closed.

In [5]:
from itertools import product

In [6]:
def get_grid(k = 3):
    '''
    Build a dense (store x day) grid covering the last k weeks of air_visit_hist.

    Every store that appears at least once in the window gets one row for each day
    of the window.  A day on which a store has no visit record is treated as a day
    on which the store was closed: it gets visitors = 0 and closed = 1.  The
    columns of date_info (joined on day_ind) and of air_store_info (joined on
    air_store_id) are attached to every row.

    input:
    k -- number of weeks to keep, counted back from the largest day_ind in air_visit_hist

    output:
    DataFrame with one row per (air_store_id, day_ind) pair; all remaining missing
    values are filled with 0
    '''
    last_train_day = max(air_visit_hist.day_ind)
    first_train_day = last_train_day - k * 7 + 1

    # filter into the desired time frame
    in_window = (air_visit_hist.day_ind >= first_train_day) & (air_visit_hist.day_ind <= last_train_day)
    hist1 = air_visit_hist[in_window].copy()
    all_stores = hist1.air_store_id.unique()
    all_days = range(first_train_day, last_train_day + 1)

    # create the store x day grid; building it from tuples keeps day_ind an integer
    # column, so no round trip through a string array is needed
    grid = pd.DataFrame(list(product(all_stores, all_days)), columns=['air_store_id', 'day_ind'])

    # add visit information (stays NaN where the store has no record for that day)
    all_data = grid.merge(hist1, how='left', on=['air_store_id', 'day_ind'])

    # add date type information; on a name clash the right-hand (date_info) column wins
    # and the '_l'-suffixed left-hand duplicate is dropped
    all_data = all_data.merge(date_info, on='day_ind', suffixes=('_l', ''))
    drop_columns = [col for col in all_data.columns if col.endswith('_l')]
    all_data = all_data.drop(drop_columns, axis=1)

    # add store information, again keeping the right-hand column on a name clash
    all_data = all_data.merge(air_store_info, on='air_store_id', suffixes=('_l', ''))
    drop_columns = [col for col in all_data.columns if col.endswith('_l')] \
        + ['calendar_date', 'date', 'latitude', 'longitude', 'hpg_store_id']
    all_data = all_data.drop(drop_columns, axis=1)

    # for those dates on which the visit information of a store is missing, assume that
    # it was closed and had 0 visitors; the flag must be set before the NaNs are filled
    all_data['closed'] = all_data['visitors'].isnull().astype(int)
    all_data = all_data.fillna(0)
    return all_data
        
    
    

In [7]:
# Quick check: build the store x day grid over the last 2 weeks of visit history.
grid = get_grid(k=2)

### 2.  create data frames with lag information

Given a gap (the forecast horizon in days), create a training set whose features are the lagged values lag_gap, lag_(gap+1), ...

In [17]:
def span_lag(field, lag_begin, lag_length):
    '''
    Return the names of the lag columns of `field`:
    field_lag_<lag_begin>, ..., field_lag_<lag_begin + lag_length - 1>
    '''
    names = []
    for offset in range(lag_length):
        names.append(field + '_lag_' + str(lag_begin + offset))
    return names


In [31]:
def append_lag(grid, lag_begin, lag_length ):
    '''
    Add lag information to grid to create a training set and a prediction set.

    With lag_end = lag_begin + lag_length - 1, a row with day_ind = D receives, for every
    shift s in lag_begin .. lag_end, the columns visitors_lag_s, day_of_week_lag_s,
    holiday_flg_lag_s, holiday_eve_lag_s and closed_lag_s holding the values the same
    store had on day D - s.
    Example: lag_begin = 7, lag_length = 8 adds lag_7, lag_8, ..., lag_14 to each row.

    This is used to train a model that forecasts the visitors lag_begin days in the future.

    input:
    grid -- store x day frame (as built by get_grid); it is not modified
    lag_begin -- smallest lag, i.e. the forecast horizon in days
    lag_length -- number of consecutive lags to attach

    output:
    grid_train -- rows of grid whose largest lag is available and whose target day is not
                  marked closed, without the day_ind, month_ind and closed columns
    grid_test -- one row per store for the target day max(day_ind) + lag_begin, with the same
                 lag columns filled from the last lag_length days of grid
    num_cols -- names of the numeric feature columns
    wday_lag_cols -- names of the lagged day_of_week columns
    '''
    index_cols = ['air_store_id', 'day_ind']
    cols_to_rename = ['visitors', 'day_of_week', 'holiday_flg', 'holiday_eve', 'closed']

    # untouched copy of the input; the test rows are taken from it further down
    grid_cp = grid.copy()
    lag_end = lag_begin + lag_length - 1
    for day_shift in range(lag_begin, lag_end + 1):

        print('train day:', day_shift)
        grid_shift = grid[index_cols + cols_to_rename].copy()
        # moving day_ind forward by day_shift lines the values of day D - day_shift up with day D
        grid_shift['day_ind'] = grid_shift['day_ind'] + day_shift
        foo = lambda x: '{}_lag_{}'.format(x, day_shift) if x in cols_to_rename else x
        grid_shift = grid_shift.rename(columns=foo)
        # left merge: days whose lagged source day is outside the grid get NaN lag values
        grid = pd.merge(grid, grid_shift, on=index_cols, how='left')
        del grid_shift
    # only the largest lag is checked for availability
    grid_train = grid[~pd.isnull(grid['visitors_lag_' + str(lag_end)])].copy()
    # days on which the store is marked closed are not used as training targets
    grid_train = grid_train[grid_train['closed'] != 1]
    grid_train.drop(['day_ind', 'month_ind', 'closed'], axis=1, inplace=True)

    # the test rows start from the last day of the grid, whose own values become the lag_begin features
    max_day_ind = np.max(grid.day_ind)
    grid_test = grid_cp[grid_cp.day_ind == max_day_ind]

    f = lambda x: '{}_lag_{}'.format(x, str(lag_begin)) if x in cols_to_rename else x
    grid_test = grid_test.rename(columns=f)
    grid_test['target_day_ind'] = grid_test['day_ind'] + lag_begin
    for day_shift in range(lag_begin + 1, lag_end + 1):
        print('test day:', day_shift)
        # the lag day_shift of the target day is the day (max_day_ind + lag_begin) - day_shift
        grid_shift = grid_cp[grid_cp.day_ind == (max_day_ind - day_shift + lag_begin)][['air_store_id'] + cols_to_rename].copy()
        f = lambda x: '{}_lag_{}'.format(x, day_shift) if x in cols_to_rename else x
        grid_shift = grid_shift.rename(columns=f)
        grid_test = pd.merge(grid_test, grid_shift, on='air_store_id')
        del grid_shift
    grid_test.drop(['day_ind', 'month_ind'], axis=1, inplace=True)
    # attach the date_info columns of the target day itself, then drop the ones that are not features
    grid_test = pd.merge(grid_test, date_info, left_on='target_day_ind', right_on = 'day_ind')
    grid_test.drop(['target_day_ind', 'calendar_date', 'date', 'day_ind', 'month_ind'], axis=1, inplace=True)

    # numeric features: every lagged numeric column plus the holiday flags of the target day
    num_cols = span_lag('visitors', lag_begin, lag_length) + span_lag('holiday_flg', lag_begin, lag_length) \
                + span_lag('closed', lag_begin, lag_length) + span_lag('holiday_eve', lag_begin, lag_length) \
                + ['holiday_flg', 'holiday_eve']
    wday_lag_cols = span_lag('day_of_week', lag_begin, lag_length)
    return grid_train, grid_test, num_cols, wday_lag_cols

### Label encode the categorical variables

In [10]:
#from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer

In [49]:
# The seven weekday names; both weekday encoders are fitted on this fixed list.
wdays = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# One-hot encoders: weekday of the target day and store genre.
wday_onehotter = LabelBinarizer()
wday_onehotter.fit(wdays)
genre_onehotter = LabelBinarizer()
genre_onehotter.fit(np.unique(air_store_info.air_genre_name))

# Integer label encoders: lagged weekdays, store id and area name.
wday_labler = LabelEncoder()
wday_labler.fit(wdays)
store_labler = LabelEncoder()
store_labler.fit(np.unique(air_store_info.air_store_id))
area_labler = LabelEncoder()
area_labler.fit(np.unique(air_store_info.air_area_name))

LabelEncoder()

In [47]:
def encode_cat(X, num_cols, wday_lag_cols):
    '''
    Encode the categorical variables of X and return one numeric matrix.

    Columns of the result, from left to right: one-hot day_of_week, one-hot air_genre_name,
    label-encoded air_store_id, label-encoded air_area_name, the label-encoded columns
    listed in wday_lag_cols, and finally the columns listed in num_cols unchanged.
    '''
    # lagged weekday names -> integer labels, one column at a time
    lagged_wdays = X[wday_lag_cols].copy()
    for col in wday_lag_cols:
        lagged_wdays[col] = wday_labler.transform(X[col])

    pieces = (
        wday_onehotter.transform(X['day_of_week']),
        genre_onehotter.transform(X['air_genre_name']),
        store_labler.transform(X['air_store_id']),
        area_labler.transform(X['air_area_name']),
        lagged_wdays,
        X[num_cols].copy(),
    )
    # np.c_ stacks the pieces side by side, treating 1-d arrays as single columns
    return np.c_[pieces]

In [14]:
# Small trial run: grid over the last 5 weeks of visit history.
grid = get_grid(5)

In [32]:
# Trial run with forecast horizon 2 and three lagged days (lag_2 .. lag_4).
gtrain, gtest, num_cols, wday_lag_cols = append_lag(grid, 2, 3)

train day: 2
train day: 3
train day: 4
test day: 3
test day: 4


In [45]:
# Restrict the training frame to exactly the columns available at prediction time.
X_train0 = gtrain[gtest.columns]
# Target: the raw visitor count of the training rows.
y_train = gtrain['visitors']
X_test0 = gtest

In [50]:
# Encode the trial training frame into a numeric matrix.
X_train = encode_cat(X_train0, num_cols, wday_lag_cols)

In [66]:
# Encode the trial prediction frame with the same encoders and column lists.
X_test = encode_cat(X_test0, num_cols, wday_lag_cols)

### Put all together to create final dataset

In [69]:
def create_train_test(k_grid=15, lag_length = 21, lag_begin0=1, lag_begin1=39):
    '''
    Build one encoded training set and one encoded prediction set per forecast horizon.

    input:
    k_grid -- number of weeks in the store x day grid, counted back from the last day
        in air_visit_hist
    lag_length -- number of consecutive lagged days used as features for every horizon
    lag_begin0, lag_begin1 -- first and last forecast horizon LAG in days (both inclusive)

    Example: if k_grid = 15 and lag_length = 21, and the last day_ind in air_visit_hist is 477,
        then the grid will have every store with day_ind from 477 - 7*15 + 1 = 373 to 477.
        The training data with LAG = 2 uses lag_2 .. lag_22 and will have the following:
        (i) day_ind = 477 as y, day_ind = 475, 474, ..., 475 - 21 + 1 = 455 as X
        (ii) day_ind = 476 as y, day_ind = 474, 473, ..., 454 as X
        (iii) day_ind = 475 as y, day_ind = 473, 472, ..., 453 as X
         ....
        (last) day_ind = 395 as y, day_ind = 393, 392, ..., 373 as X

        The corresponding prediction set is for day_ind = 477 + LAG = 479,
        and its X consists of day_ind = 477, 476, ..., 457.

    Training rows only exist for a horizon LAG when the grid is longer than
    LAG + lag_length - 1 days, so 7 * k_grid must exceed lag_begin1 + lag_length - 1.

    output:
    dict mapping every LAG between lag_begin0 and lag_begin1 to the tuple
    (y_train, X_train, X_test, air_store_id of the X_test rows)
    '''
    # the grid does not depend on the horizon, so it is built once and shared
    grid = get_grid(k_grid)

    Data = {}
    for lag_begin in range(lag_begin0, lag_begin1 + 1):
        print('lag_begin=', lag_begin)
        gtrain, X_test0, num_cols, wday_lag_cols = append_lag(grid, lag_begin, lag_length)

        y_train = gtrain.visitors
        # keep only the columns that are also available at prediction time
        X_train0 = gtrain[X_test0.columns]

        X_train = encode_cat(X_train0, num_cols, wday_lag_cols)
        X_test = encode_cat(X_test0, num_cols, wday_lag_cols)

        Data[lag_begin] = (y_train, X_train, X_test, X_test0.air_store_id)
    return Data

In [84]:
# Build the data sets for horizons 1..39 with the defaults (15-week grid, 21 lagged days).
Data = create_train_test()

lag_begin= 1
train day: 1
train day: 2
train day: 3
train day: 4
train day: 5
train day: 6
train day: 7
train day: 8
train day: 9
train day: 10
train day: 11
train day: 12
train day: 13
train day: 14
train day: 15
train day: 16
train day: 17
train day: 18
train day: 19
train day: 20
train day: 21
test day: 2
test day: 3
test day: 4
test day: 5
test day: 6
test day: 7
test day: 8
test day: 9
test day: 10
test day: 11
test day: 12
test day: 13
test day: 14
test day: 15
test day: 16
test day: 17
test day: 18
test day: 19
test day: 20
test day: 21
lag_begin= 2
train day: 2
train day: 3
train day: 4
train day: 5
train day: 6
train day: 7
train day: 8
train day: 9
train day: 10
train day: 11
train day: 12
train day: 13
train day: 14
train day: 15
train day: 16
train day: 17
train day: 18
train day: 19
train day: 20
train day: 21
train day: 22
test day: 3
test day: 4
test day: 5
test day: 6
test day: 7
test day: 8
test day: 9
test day: 10
test day: 11
test day: 12
test day: 13
test day: 14
te

test day: 33
test day: 34
test day: 35
lag_begin= 16
train day: 16
train day: 17
train day: 18
train day: 19
train day: 20
train day: 21
train day: 22
train day: 23
train day: 24
train day: 25
train day: 26
train day: 27
train day: 28
train day: 29
train day: 30
train day: 31
train day: 32
train day: 33
train day: 34
train day: 35
train day: 36
test day: 17
test day: 18
test day: 19
test day: 20
test day: 21
test day: 22
test day: 23
test day: 24
test day: 25
test day: 26
test day: 27
test day: 28
test day: 29
test day: 30
test day: 31
test day: 32
test day: 33
test day: 34
test day: 35
test day: 36
lag_begin= 17
train day: 17
train day: 18
train day: 19
train day: 20
train day: 21
train day: 22
train day: 23
train day: 24
train day: 25
train day: 26
train day: 27
train day: 28
train day: 29
train day: 30
train day: 31
train day: 32
train day: 33
train day: 34
train day: 35
train day: 36
train day: 37
test day: 18
test day: 19
test day: 20
test day: 21
test day: 22
test day: 23
test da

train day: 44
train day: 45
train day: 46
train day: 47
train day: 48
train day: 49
train day: 50
test day: 31
test day: 32
test day: 33
test day: 34
test day: 35
test day: 36
test day: 37
test day: 38
test day: 39
test day: 40
test day: 41
test day: 42
test day: 43
test day: 44
test day: 45
test day: 46
test day: 47
test day: 48
test day: 49
test day: 50
lag_begin= 31
train day: 31
train day: 32
train day: 33
train day: 34
train day: 35
train day: 36
train day: 37
train day: 38
train day: 39
train day: 40
train day: 41
train day: 42
train day: 43
train day: 44
train day: 45
train day: 46
train day: 47
train day: 48
train day: 49
train day: 50
train day: 51
test day: 32
test day: 33
test day: 34
test day: 35
test day: 36
test day: 37
test day: 38
test day: 39
test day: 40
test day: 41
test day: 42
test day: 43
test day: 44
test day: 45
test day: 46
test day: 47
test day: 48
test day: 49
test day: 50
test day: 51
lag_begin= 32
train day: 32
train day: 33
train day: 34
train day: 35
trai

In [85]:
# Persist the per-horizon data sets so the modelling cells can be rerun without rebuilding them.
# The context manager guarantees the file is flushed and closed once the dump is written.
with open(data_dir + '/SubData15_21_c.p', 'wb') as dump_file:
    pickle.dump(Data, dump_file)

## Submission 4.1 Xgboost

In [29]:
#Data = pickle.load(open(data_dir + '/SubData15_21_b.p', 'rb'))

In [86]:
import xgboost as xgb



In [87]:
# Parameter dict handed to xgb.cv and xgb.train in the following cell.
# NOTE(review): 'reg:linear' and 'silent' appear to have been renamed in later XGBoost
# releases ('reg:squarederror', 'verbosity') — confirm against the installed version.
xgb_params = {
    'eta': 0.15,
    'max_depth': 5,
     'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [90]:
# Hold-out check of the XGBoost setup on the data set for forecast horizon 3.
y_train, X_train, X_test, store = Data[3]

# The model is fitted on log(1 + visitors).
ly_train = np.log(y_train + 1)
# Keep 33% of the training rows aside as a validation set (fixed seed for repeatability).
X_train0, X_test0, ly_train0, ly_test0 = train_test_split(X_train, ly_train, test_size=0.33, random_state=42) 

xgtrain = xgb.DMatrix(X_train0, ly_train0)
xgtest = xgb.DMatrix(X_test0)

# 5-fold CV with early stopping; its result is only used to choose the number of boosting rounds.
cvresult = xgb.cv(xgb_params, xgtrain, num_boost_round=1000, nfold=5, metrics='rmse', early_stopping_rounds=50, verbose_eval=False)

# presumably cvresult has one row per retained boosting round; 55 extra rounds are added on top
num_rounds = cvresult.shape[0] + 55
model = xgb.train(xgb_params, xgtrain, num_boost_round=num_rounds, evals = [(xgtrain, 'train')], verbose_eval=50)
    # make prediction for the held-out rows (still on the log scale)
ly_predict = model.predict(xgtest)

# Mean squared error on the log scale. The arguments are passed as (prediction, truth);
# squared error is symmetric, so the value is unaffected.
mean_squared_error(ly_predict, ly_test0)

[0]	train-rmse:2.1044
[50]	train-rmse:0.491612
[100]	train-rmse:0.473289
[150]	train-rmse:0.456899


0.26815484035457643

In [91]:
# Train one XGBoost model per forecast horizon and collect the predictions in Results.
last_train_date = np.max(air_visit_hist.day_ind)
Results = []
xgb_params = {
    'eta': 0.15,
    'max_depth': 5,
     'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Data is keyed by the horizon i (days after the last training day); iterating over its
# keys keeps this loop in step with whatever range create_train_test was called with.
for i in sorted(Data):
    print(i)
    y_train, X_train, X_test, stores = Data[i]
    # the model is fitted on log(1 + visitors)
    ly_train = np.log(y_train + 1)
    xgtrain = xgb.DMatrix(X_train, ly_train)
    xgtest = xgb.DMatrix(X_test)
    # cross-validation with early stopping is only used to pick the number of boosting rounds
    cvresult = xgb.cv(xgb_params, xgtrain, num_boost_round=1000, nfold=5, metrics='rmse', early_stopping_rounds=50, verbose_eval=False)
    num_rounds = cvresult.shape[0] + 55
    model = xgb.train(xgb_params, xgtrain, num_boost_round=num_rounds, evals = [(xgtrain, 'train')], verbose_eval=50)
    # make prediction and map it back from the log scale
    ly_predict = model.predict(xgtest)
    y_predict = np.exp(ly_predict) - 1
    rdf = pd.DataFrame({'id': stores, 'visitors': y_predict})
    dt = date_info[date_info.day_ind == (last_train_date + i)]
    # The id is "<air_store_id>_<calendar_date>" of the target day. Take the date as a scalar
    # and concatenate column-wise instead of adding a one-row Series to every row.
    # .iloc[0] raises IndexError if date_info has no row for the target day.
    rdf['id'] = rdf['id'] + '_' + dt.calendar_date.iloc[0]
    Results.append(rdf)

1
[0]	train-rmse:2.10185
[50]	train-rmse:0.498355
[100]	train-rmse:0.483795
[150]	train-rmse:0.471408
2
[0]	train-rmse:2.10159
[50]	train-rmse:0.498679
[100]	train-rmse:0.483639
[150]	train-rmse:0.472085
3
[0]	train-rmse:2.10413
[50]	train-rmse:0.49881
[100]	train-rmse:0.484752
[150]	train-rmse:0.473232
[200]	train-rmse:0.463003
4
[0]	train-rmse:2.10585
[50]	train-rmse:0.497598
[100]	train-rmse:0.48323
[150]	train-rmse:0.471215
5
[0]	train-rmse:2.10775
[50]	train-rmse:0.498305
[100]	train-rmse:0.484154
[150]	train-rmse:0.472071
6
[0]	train-rmse:2.11057
[50]	train-rmse:0.499006
[100]	train-rmse:0.482961
[150]	train-rmse:0.47119
7
[0]	train-rmse:2.11003
[50]	train-rmse:0.501154
[100]	train-rmse:0.48638
[150]	train-rmse:0.474276
8
[0]	train-rmse:2.10813
[50]	train-rmse:0.504144
[100]	train-rmse:0.488509
[150]	train-rmse:0.47595
9
[0]	train-rmse:2.10879
[50]	train-rmse:0.50489
[100]	train-rmse:0.490121
[150]	train-rmse:0.477537
[200]	train-rmse:0.466027
10
[0]	train-rmse:2.11077
[50]	train

In [92]:
# Stack the per-horizon prediction frames into one frame.
Result_df = pd.concat(Results)

# The sample submission supplies the expected ids; 'ord' remembers their original row order.
sample_submission = pd.read_csv(data_dir + '/sample_submission.csv.zip')
sample_submission['ord'] = sample_submission.index

# Inner join on id: only predictions whose id occurs in the sample submission are kept.
output_df = pd.merge(Result_df, sample_submission, on='id', suffixes=['','_r']).loc[:, ['id', 'visitors', 'ord']]

# Restore the sample-submission order and write the two submission columns.
output = output_df.sort_values(by='ord', axis=0).loc[:, ['id', 'visitors']]
output.to_csv(data_dir + '/submission4_1.csv', index=False)

## Submission 4.2 lightgbm

In [93]:
import lightgbm as lgb

In [94]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [95]:
# Parameter dict handed to lgb.cv and lgb.train in the following cell.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': {'rmse'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9
    }



In [96]:
# Hold-out check of the LightGBM setup on the data set for forecast horizon 3.
y_train, X_train, X_test, store = Data[3]

# The model is fitted on log(1 + visitors).
ly_train = np.log(y_train + 1)
# Keep 33% of the training rows aside as a validation set (fixed seed for repeatability).
X_train0, X_test0, ly_train0, ly_test0 = train_test_split(X_train, ly_train, test_size=0.33, random_state=42) 

lgb_train = lgb.Dataset(X_train0, ly_train0)

# 5-fold CV with early stopping; its result is only used to choose the number of boosting rounds.
cv_lgb = lgb.cv(params, lgb_train, num_boost_round = 1000, nfold=5, stratified=False, early_stopping_rounds = 50, 
            verbose_eval = False)
# presumably 'rmse-mean' has one entry per retained boosting round; 50 extra rounds are added on top
num_rounds = len(cv_lgb['rmse-mean']) + 50
gbm = lgb.train(params,lgb_train,num_boost_round=num_rounds)
ly_predict = gbm.predict(X_test0)
# Mean squared error on the log scale, passed as (prediction, truth); the measure is symmetric.
print('mlse=', mean_squared_error(ly_predict, ly_test0))

mlse= 0.26666857354


In [97]:
# Train one LightGBM model per forecast horizon and collect the predictions in Results.
last_train_date = np.max(air_visit_hist.day_ind)
Results = []
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': {'rmse'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9
    }

# Data is keyed by the horizon i (days after the last training day); iterating over its
# keys keeps this loop in step with whatever range create_train_test was called with.
for i in sorted(Data):
    print(i)
    y_train, X_train, X_test, stores = Data[i]
    # the model is fitted on log(1 + visitors)
    ly_train = np.log(y_train + 1)
    lgb_train = lgb.Dataset(X_train, ly_train)
    # cross-validation with early stopping is only used to pick the number of boosting rounds
    cv_lgb = lgb.cv(params, lgb_train, num_boost_round = 1000, nfold=5, stratified=False, early_stopping_rounds = 50,
                verbose_eval = False)
    num_rounds = len(cv_lgb['rmse-mean']) + 50
    gbm = lgb.train(params,lgb_train,num_boost_round=num_rounds)
    # predict and map back from the log scale
    ly_predict = gbm.predict(X_test)
    y_predict = np.exp(ly_predict) - 1
    rdf = pd.DataFrame({'id': stores, 'visitors': y_predict})
    dt = date_info[date_info.day_ind == (last_train_date + i)]
    # The id is "<air_store_id>_<calendar_date>" of the target day. Take the date as a scalar
    # and concatenate column-wise instead of adding a one-row Series to every row.
    # .iloc[0] raises IndexError if date_info has no row for the target day.
    rdf['id'] = rdf['id'] + '_' + dt.calendar_date.iloc[0]
    Results.append(rdf)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [29]:
# Stack the per-horizon prediction frames into one frame.
Result_df = pd.concat(Results)

# The sample submission supplies the expected ids; 'ord' remembers their original row order.
sample_submission = pd.read_csv(data_dir + '/sample_submission.csv.zip')
sample_submission['ord'] = sample_submission.index

# Inner join on id: only predictions whose id occurs in the sample submission are kept.
output_df = pd.merge(Result_df, sample_submission, on='id', suffixes=['','_r']).loc[:, ['id', 'visitors', 'ord']]

# Restore the sample-submission order and write the two submission columns.
output = output_df.sort_values(by='ord', axis=0).loc[:, ['id', 'visitors']]
output.to_csv(data_dir + '/submission4_2.csv', index=False)