## Idea:
Given $k$, build a model to predict the number of visitors after k days using the following features:
1. (holidayflag, day_of_week, is_closed, #visitors) for the past n weeks.
2. store_id, genre, area

Apply only label encoding to the categorical variables.

In [1]:
import os
import numpy as np
import pandas as pd
import platform
import pickle
from pathlib import Path
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%qtconsole

In [3]:
# Root folder holding the Recruit Holdings data files.
# The per-OS defaults below are the original hard-coded locations; set the
# RECRUIT_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
if platform.system() == 'Linux':
    data_dir = '/home/alin/Data/Recruit_Holding'
else:
    data_dir = 'C:/Users/alin/Documents/Data/Recruit_Holding'
data_dir = os.environ.get('RECRUIT_DATA_DIR', data_dir)

In [4]:
# Load the cached data frames from the pickle dump (presumably written by the
# EDA1 notebook, per the message printed when the dump is missing).
DFS_dump = data_dir + '/DFS.p'
if Path(DFS_dump).is_file():
    print('load previous dump')
    # NOTE: pickle.load can execute arbitrary code; only load dumps you created yourself.
    # The context manager closes the file handle even if unpickling fails.
    with open(DFS_dump, 'rb') as dump_file:
        DFS = pickle.load(dump_file)
    air_reserve = DFS['air_reserve']
    air_reserve_day = DFS['air_reserve_day']
    hpg_reserve = DFS['hpg_reserve']
    hpg_reserve_day = DFS['hpg_reserve_day']
    air_visit_hist = DFS['air_visit_hist']
    date_info = DFS['date_info']
    test = DFS['test']
    air_store_info = DFS['air_store_info']
    hpg_store_info = DFS['hpg_store_info']
    store_id_relation = DFS['store_id_relation']
else:
    # NOTE(review): none of the frames above are defined on this branch, so
    # later cells will fail with NameError until the dump exists.
    print('run EDA1 first')

load previous dump


### Build the training and testing datasets before label encoding

### step 1: add dates when a store is closed.

In [5]:
from itertools import product

In [6]:
def get_grid(k=3, visit_hist=None, dates=None, store_info=None):
    '''
    Build a complete store x day grid over the last k weeks of visit history.

    Every store that appears in the window gets one row per day of the window.
    Days on which a store has no visit record are assumed to be closed days:
    they get closed = 1, and every missing value in the row is filled with 0
    (so visitors = 0).

    Parameters
    ----------
    k : int
        Number of weeks (k * 7 days) to keep, counted back from the last day
        present in the visit history.
    visit_hist, dates, store_info : pandas.DataFrame, optional
        Visit history, per-day calendar information and per-store information.
        When omitted, the notebook-level frames air_visit_hist, date_info and
        air_store_info are used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per (air_store_id, day_ind) with visit, calendar and store
        columns plus the closed flag. The columns calendar_date, date,
        latitude, longitude and hpg_store_id are removed (KeyError if any of
        them is absent after the merges).
    '''
    visit_hist = air_visit_hist if visit_hist is None else visit_hist
    dates = date_info if dates is None else dates
    store_info = air_store_info if store_info is None else store_info

    last_train_day = int(visit_hist['day_ind'].max())
    first_train_day = last_train_day - k * 7 + 1

    # filter into the desired time frame (both ends inclusive)
    in_window = visit_hist['day_ind'].between(first_train_day, last_train_day)
    hist1 = visit_hist[in_window]
    all_stores = hist1['air_store_id'].unique()
    all_days = range(first_train_day, last_train_day + 1)

    # create the store x day grid; building it from a MultiIndex keeps day_ind
    # an integer instead of round-tripping it through a string array
    grid = pd.MultiIndex.from_product(
        [all_stores, all_days], names=['air_store_id', 'day_ind']
    ).to_frame(index=False)

    # add visit information
    all_data = grid.merge(hist1, how='left', on=['air_store_id', 'day_ind'])

    # add date type information, then store information. When both sides carry
    # the same column, the right-hand (lookup) value wins, because the left
    # value is NaN on the rows that had no visit record. Dropping the left
    # duplicates by exact name avoids removing unrelated columns whose name
    # merely ends with the letter "l".
    for lookup, key in ((dates, 'day_ind'), (store_info, 'air_store_id')):
        duplicated = [col for col in all_data.columns
                      if col != key and col in lookup.columns]
        all_data = all_data.drop(duplicated, axis=1).merge(lookup, on=key)

    all_data = all_data.drop(
        ['calendar_date', 'date', 'latitude', 'longitude', 'hpg_store_id'], axis=1)

    # for those dates on which the visit information of a store is missing,
    # assume that it was closed and had 0 visitors
    all_data['closed'] = all_data['visitors'].isnull().astype('int64')
    return all_data.fillna(0)
        
    
    

In [7]:
# Store x day grid covering the last 2 weeks of the visit history.
grid = get_grid(k=2)

### 2.  create data frames with lag information

Given a gap, create a training set with the features lag_gap, lag_(gap+1), ...

In [82]:
def append_lag(grid, lag_begin, lag_end):
    '''
    Add lag information to grid to create a training set and a test set.

    Specifically, given a row with day_ind = D, lag_begin = 7 and lag_end = 14,
    the values of visitors, day_of_week, holiday_flg, holiday_eve and closed
    observed on days D-7, D-8, ..., D-14 are attached to this row as
    <column>_lag_7, <column>_lag_8, ..., <column>_lag_14.

    This is used to train a model that forecasts the visitors lag_begin days
    in the future.

    Parameters
    ----------
    grid : pandas.DataFrame
        Store x day frame containing air_store_id, day_ind, month_ind and the
        five lagged columns listed above. It is not modified.
    lag_begin, lag_end : int
        First and last day shift (inclusive); lag_begin must not exceed lag_end.

    Returns
    -------
    (grid_train, grid_test) : tuple of pandas.DataFrame
        grid_train holds the rows whose lag_end values exist and whose store
        was not closed on the row's own day, without day_ind, month_ind and
        closed. grid_test holds one row per store built from the last day in
        grid: that day's values become the lag_begin features, the day before
        supplies lag_begin + 1, and so on; only day_ind is removed.
        NOTE(review): grid_test therefore keeps month_ind and has no un-lagged
        visitors / day_of_week / holiday_flg / holiday_eve columns, unlike
        grid_train - select the feature columns explicitly before modelling.

    Raises
    ------
    ValueError
        If lag_begin is greater than lag_end.
    '''
    if lag_begin > lag_end:
        raise ValueError(
            'lag_begin ({}) must not exceed lag_end ({})'.format(lag_begin, lag_end))

    index_cols = ['air_store_id', 'day_ind']
    cols_to_rename = ['visitors', 'day_of_week', 'holiday_flg', 'holiday_eve', 'closed']

    def lag_names(day_shift):
        # column -> lagged column name for a single shift
        return {col: '{}_lag_{}'.format(col, day_shift) for col in cols_to_rename}

    # ---- training rows: join every shifted copy back onto the grid ----
    lagged = grid
    for day_shift in range(lag_begin, lag_end + 1):
        shifted = grid[index_cols + cols_to_rename].copy()
        # moving day_ind forward makes the values of day D - day_shift line up with day D
        shifted['day_ind'] = shifted['day_ind'] + day_shift
        shifted = shifted.rename(columns=lag_names(day_shift))
        lagged = pd.merge(lagged, shifted, on=index_cols, how='left')

    has_full_history = lagged['visitors_lag_' + str(lag_end)].notnull()
    store_open = lagged['closed'] != 1
    # drop() returns a new frame, so nothing is modified in place on a slice
    grid_train = lagged[has_full_history & store_open].drop(
        ['day_ind', 'month_ind', 'closed'], axis=1)

    # ---- test rows: one per store, anchored on the last available day ----
    max_day_ind = grid['day_ind'].max()
    grid_test = grid[grid['day_ind'] == max_day_ind].rename(columns=lag_names(lag_begin))

    for day_shift in range(lag_begin + 1, lag_end + 1):
        source_day = max_day_ind - day_shift + lag_begin
        shifted = grid.loc[grid['day_ind'] == source_day,
                           ['air_store_id'] + cols_to_rename]
        shifted = shifted.rename(columns=lag_names(day_shift))
        grid_test = pd.merge(grid_test, shifted, on='air_store_id')

    grid_test = grid_test.drop(['day_ind'], axis=1)
    return grid_train, grid_test

In [83]:
# Train / test frames carrying the lag features for day shifts 3 through 5.
gtrain, gtest = append_lag(grid, 3, 5)

In [86]:
y_train = gtrain.visitors

In [95]:
# Names of the lag feature columns produced by append_lag(grid, 3, 5).
# The range end is exclusive, so it must be 5 + 1 to include the lag-5 columns.
lag_columns = [fld + '_lag_' +  str(lag) for lag in range(3, 5 + 1) for fld in ['visitors', 'day_of_week', 'holiday_flg', 'holiday_eve', 'closed']]

In [None]:
used_cols = ['air_store_id', 'air_genre_name', 'air_area_name']

In [43]:
gtrain.head(10)

Unnamed: 0,air_store_id,day_ind,visitors,day_of_week,holiday_flg,month_ind,holiday_eve,air_genre_name,air_area_name,closed,...,visitors_lag_4,day_of_week_lag_4,holiday_flg_lag_4,holiday_eve_lag_4,closed_lag_4,visitors_lag_5,day_of_week_lag_5,holiday_flg_lag_5,holiday_eve_lag_5,closed_lag_5
5,air_ba937bf13d40fb24,469,27.0,Friday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0,...,3.0,Monday,0.0,0.0,0.0,0.0,Sunday,0.0,0.0,1.0
6,air_ba937bf13d40fb24,470,16.0,Saturday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0,...,9.0,Tuesday,0.0,0.0,0.0,3.0,Monday,0.0,0.0,0.0
7,air_ba937bf13d40fb24,471,1.0,Sunday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0,...,7.0,Wednesday,0.0,0.0,0.0,9.0,Tuesday,0.0,0.0,0.0
8,air_ba937bf13d40fb24,472,10.0,Monday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0,...,17.0,Thursday,0.0,0.0,0.0,7.0,Wednesday,0.0,0.0,0.0
9,air_ba937bf13d40fb24,473,11.0,Tuesday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0,...,27.0,Friday,0.0,0.0,0.0,17.0,Thursday,0.0,0.0,0.0
10,air_ba937bf13d40fb24,474,11.0,Wednesday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0,...,16.0,Saturday,0.0,0.0,0.0,27.0,Friday,0.0,0.0,0.0
11,air_ba937bf13d40fb24,475,14.0,Thursday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0,...,1.0,Sunday,0.0,0.0,0.0,16.0,Saturday,0.0,0.0,0.0
12,air_ba937bf13d40fb24,476,40.0,Friday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0,...,10.0,Monday,0.0,0.0,0.0,1.0,Sunday,0.0,0.0,0.0
13,air_ba937bf13d40fb24,477,23.0,Saturday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0,...,11.0,Tuesday,0.0,0.0,0.0,10.0,Monday,0.0,0.0,0.0
19,air_8e4360a64dbd4c50,469,23.0,Friday,0,15,0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōgimachi,0,...,11.0,Monday,0.0,0.0,0.0,29.0,Sunday,0.0,0.0,0.0


In [14]:
cols_to_rename = ['visitors', 'day_of_week', 'holiday_flg', 'holiday_eve', 'closed']

In [10]:
max_day_ind = np.max(grid.day_ind)

In [11]:
g1 = grid[grid.day_ind == max_day_ind].copy()

In [15]:
f = lambda x: '{}_lag_{}'.format(x, 3) if x in cols_to_rename else x

In [16]:
g1 = g1.rename(columns=f)

In [28]:
g2 = grid[grid.day_ind == (max_day_ind - 1)][['air_store_id'] + cols_to_rename].copy()
                                             

In [29]:
f = lambda x: '{}_lag_{}'.format(x, 4) if x in cols_to_rename else x
g2 = g2.rename(columns=f)

In [31]:
grid_test = pd.merge(g1, g2, on = 'air_store_id')

In [141]:
grid_lag_new = append_lag(grid, 2, 8)

In [133]:
# NOTE(review): append_lag as defined in this notebook requires both lag_begin
# and lag_end and returns a (train, test) tuple, so this two-argument call
# raises TypeError on a fresh run - it looks like a leftover from an earlier
# version of the function; confirm and update or remove.
grid_lag = append_lag(grid, 1)

In [114]:
grid.head(14)

Unnamed: 0,air_store_id,day_ind,visitors,day_of_week,holiday_flg,month_ind,holiday_eve,air_genre_name,air_area_name,closed
0,air_ba937bf13d40fb24,464,0.0,Sunday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,1
1,air_ba937bf13d40fb24,465,3.0,Monday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0
2,air_ba937bf13d40fb24,466,9.0,Tuesday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0
3,air_ba937bf13d40fb24,467,7.0,Wednesday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0
4,air_ba937bf13d40fb24,468,17.0,Thursday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0
5,air_ba937bf13d40fb24,469,27.0,Friday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0
6,air_ba937bf13d40fb24,470,16.0,Saturday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0
7,air_ba937bf13d40fb24,471,1.0,Sunday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0
8,air_ba937bf13d40fb24,472,10.0,Monday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0
9,air_ba937bf13d40fb24,473,11.0,Tuesday,0,15,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,0


In [116]:
grid_lag[index_cols + ['visitors', 'day_of_week_lag_2', 'visitors_lag_2']]

Unnamed: 0,air_store_id,day_ind,visitors,day_of_week_lag_2,visitors_lag_2
0,air_ba937bf13d40fb24,464,0.0,,
1,air_ba937bf13d40fb24,465,3.0,,
2,air_ba937bf13d40fb24,466,9.0,Sunday,0.0
3,air_ba937bf13d40fb24,467,7.0,Monday,3.0
4,air_ba937bf13d40fb24,468,17.0,Tuesday,9.0
5,air_ba937bf13d40fb24,469,27.0,Wednesday,7.0
6,air_ba937bf13d40fb24,470,16.0,Thursday,17.0
7,air_ba937bf13d40fb24,471,1.0,Friday,27.0
8,air_ba937bf13d40fb24,472,10.0,Saturday,16.0
9,air_ba937bf13d40fb24,473,11.0,Sunday,1.0


In [81]:
# Rows of the grid belonging to a single store.
# NOTE(review): `st` is not defined in any cell visible in this notebook, so
# this fails with NameError on a fresh kernel - TODO define the store id here.
grid0 = grid[grid.air_store_id == st]

In [82]:
cols_to_rename = ['visitors', 'day_of_week', 'holiday_flg', 'holiday_eve']

In [83]:
day_shift = 2

In [84]:
grid0_shift = grid0[index_cols + cols_to_rename].copy()

In [89]:
grid0_shift['day_ind'] = grid0_shift['day_ind'] + day_shift

In [91]:
foo = lambda x: '{}_lag_{}'.format(x, day_shift) if x in cols_to_rename else x

In [92]:
grid0_shift = grid0_shift.rename(columns=foo)

In [95]:
grid0 = pd.merge(grid0, grid0_shift, on=index_cols, how='left')

In [None]:
# For each shift in shift_range: copy the index and feature columns, move
# date_block_num forward by the shift, suffix the feature columns with
# _lag_<shift>, left-merge the copy back onto all_data and fill gaps with 0.
# NOTE(review): tqdm_notebook is not imported, and neither a top-level
# index_cols nor a date_block_num column is defined anywhere in the visible
# notebook (index_cols exists only as a local inside append_lag) - this cell
# looks like a template pasted from another project; confirm and delete.
cols_to_rename = list(all_data.columns.difference(index_cols)) 

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)


In [32]:
# Scratch computation of the training window, counted back from the last day
# present in air_visit_hist.
# NOTE(review): the window here spans k * 14 days, whereas get_grid uses
# k * 7 - confirm which length is intended.
k = 3
last_train_day = max(air_visit_hist.day_ind)
first_train_day = last_train_day - k * 14 + 1

In [36]:
hist1 = air_visit_hist[(air_visit_hist.day_ind >= first_train_day) & (air_visit_hist.day_ind <= last_train_day)].copy()

In [38]:
all_stores = hist1.air_store_id.unique()

In [40]:
all_days = [i for i in range(first_train_day, last_train_day+1)]

In [58]:
grid = np.array(list(product(*[all_stores, all_days])))

In [60]:
grid = pd.DataFrame(grid, columns=['air_store_id', 'day_ind_str' ])

In [63]:
grid['day_ind'] = grid.apply(lambda r: int(r['day_ind_str']), axis=1)

In [66]:
grid.drop('day_ind_str', axis=1, inplace=True)

In [101]:
all_data = grid.merge(hist1, how='left', on=['air_store_id', 'day_ind'])

In [102]:
all_data = all_data.merge(date_info, on='day_ind', suffixes=['_l', ''])

In [103]:
drop_columns = [col for col in all_data.columns if col[-1] == 'l']

In [104]:
all_data.drop(drop_columns, inplace=True, axis=1)

In [105]:
all_data = all_data.merge(air_store_info, on = 'air_store_id', suffixes = ['_l', ''])

In [106]:
drop_columns = [col for col in all_data.columns if col[-1] == 'l'] + ['calendar_date', 'date', 'latitude', 'longitude', 'hpg_store_id']

In [107]:
all_data.drop(drop_columns, inplace=True, axis=1)

In [112]:
all_data['closed'] = all_data.apply(lambda r: 1 if pd.isnull(r['visitors']) else 0, axis=1)

In [116]:
all_data.fillna(0, inplace=True)