## Idea:
Given $k$, build a model to predict the number of visitors after k days using the following features:
1. (holidayflag, day_of_week, is_closed, #visitors) for the past n weeks.
2. store_id, gentre, area
3. Add store reverve information if available
4. label encoding to categorical variables


In [1]:
import os
import numpy as np
import pandas as pd
import platform
import pickle
from pathlib import Path
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%qtconsole

In [3]:
if platform.system() == 'Linux':
    data_dir = '/home/alin/Data/Recruit_Holding'
else:
    data_dir = 'C:/Users/alin/Documents/Data/Recruit_Holding'

In [4]:
DFS_dump = data_dir + '/DFS.p'
if Path(DFS_dump).is_file():
    print('load previous dump')
    DFS = pickle.load(open(DFS_dump, 'rb'))
    air_reserve = DFS['air_reserve']
    air_reserve_day = DFS['air_reserve_day']
    hpg_reserve = DFS['hpg_reserve']
    hpg_reserve_day = DFS['hpg_reserve_day']
    air_visit_hist = DFS['air_visit_hist']
    date_info = DFS['date_info']
    test = DFS['test']
    air_store_info = DFS['air_store_info']
    hpg_store_info = DFS['hpg_store_info']
    store_id_relation = DFS['store_id_relation']
    test = DFS['test']
else:
    print('run EDA1 first')

load previous dump


### Build the training and testing datasets before label encoding

### step 0: add reserve data to air_visit_hist

In [5]:
air_reserve_day1 = air_reserve_day[['air_store_id', 'day_ind', 'reserve_visitors']]
hpg_reserve_day1 = hpg_reserve_day[['hpg_store_id', 'day_ind', 'reserve_visitors']]

air_visit_hist = air_visit_hist.merge(air_reserve_day1, on=['air_store_id', 'day_ind'], how='left')

air_visit_hist = air_visit_hist.merge(hpg_reserve_day1, on=['hpg_store_id',  'day_ind'], how='left', suffixes=['_air', '_hpg'])

air_visit_hist.fillna({'reserve_visitors_air': 0, 'reserve_visitors_hpg': 0}, inplace=True)

air_visit_hist['reserve_visitors'] = air_visit_hist.apply(lambda r: r['reserve_visitors_air'] + r['reserve_visitors_hpg'], axis=1)

air_visit_hist.drop(['month_ind', 'latitude', 'longitude', 'reserve_visitors_air', 'reserve_visitors_hpg'], axis=1, inplace=True)

### step 1: add dates when a store is closed.

In [6]:
from itertools import product

In [7]:
def get_grid(k = 3):
    '''
    Keep the last k weeks of air_vist_hist, then for any store missing on any day,  create the corresponding 
    row with expacted valud 0
    '''
    last_train_day = max(air_visit_hist.day_ind)
    first_train_day = last_train_day - k * 7 + 1
    
    #filter into desire time frame
    hist1 = air_visit_hist[(air_visit_hist.day_ind >= first_train_day) & (air_visit_hist.day_ind <= last_train_day)].copy()
    all_stores = hist1.air_store_id.unique()
    all_days = [i for i in range(first_train_day, last_train_day+1)]
    
    #create store x day grid
    grid = np.array(list(product(*[all_stores, all_days])))
    grid = pd.DataFrame(grid, columns=['air_store_id', 'day_ind_str' ])
    grid['day_ind'] = grid.apply(lambda r: int(r['day_ind_str']), axis=1)
    grid.drop('day_ind_str', axis=1, inplace=True)
    
    # add visit information 
    all_data = grid.merge(hist1, how='left', on=['air_store_id', 'day_ind'])
    
    # add date type information
    all_data = all_data.merge(date_info, on='day_ind', suffixes=['_l', ''])
    drop_columns = [col for col in all_data.columns if col[-1] == 'l']
    all_data.drop(drop_columns, inplace=True, axis=1)
    
    # add store information
    all_data = all_data.merge(air_store_info, on = 'air_store_id', suffixes = ['_l', ''])
    #drop_columns = [col for col in all_data.columns if col[-1] == 'l'] + ['calendar_date', 'date', 'latitude', 'longitude', 'hpg_store_id']
    drop_columns = [col for col in all_data.columns if col[-1] == 'l'] + ['calendar_date', 'date', 'latitude', 'longitude', 'month_ind']
    
    all_data.drop(drop_columns, inplace=True, axis=1)
    
    # for those dates on which the visit informaiton of a store is missing, assume that it was closed abd with visit number 0
    all_data['closed'] = all_data.apply(lambda r: 1 if pd.isnull(r['visitors']) else 0, axis=1)
    all_data.fillna(0, inplace=True)
    return all_data
        
    
    

In [8]:
#grid = get_grid(k=5)

### 2.  create data frames with lag information

Given gap, create training set with lag_gap, lagp_(gap+1) ....

In [9]:
def append_lag(grid, lag_begin, lag_length ):
    ''' 
    Add lag information to  grid to create training set
    Specifically, given a row with day_ind = D, and lag_begin = 7, lag_end = 14
    we add lag_7, lag_8, ..., lag_14 to this row   
    
    This is used to traing a model to forecast the visitors lag_begin days in the future
    '''
    index_cols = ['air_store_id', 'day_ind']
    cols_to_rename = ['visitors', 'day_of_week', 'holiday_flg', 'holiday_eve', 'closed', 'reserve_visitors']
    
    grid_cp = grid.copy()
    lag_end = lag_begin + lag_length - 1
    for day_shift in range(lag_begin, lag_end + 1):
        print('train day:', day_shift)
        grid_shift = grid[index_cols + cols_to_rename].copy()
        grid_shift['day_ind'] = grid_shift['day_ind'] + day_shift   
        foo = lambda x: '{}_lag_{}'.format(x, day_shift) if x in cols_to_rename else x
        grid_shift = grid_shift.rename(columns=foo)
        grid = pd.merge(grid, grid_shift, on=index_cols, how='left')
        del grid_shift
    grid_train = grid[~pd.isnull(grid['visitors_lag_' + str(lag_end)])].copy()
    grid_train = grid_train[grid_train['closed'] != 1]
    grid_train.drop(['day_ind', 'closed', 'hpg_store_id'], axis=1, inplace=True)

    
    max_day_ind = np.max(grid.day_ind)
    target_day_ind = max_day_ind + lag_begin
    grid_test = grid_cp[grid_cp.day_ind == max_day_ind]
    print('shape', grid_test.shape)
    f = lambda x: '{}_lag_{}'.format(x, str(lag_begin)) if x in cols_to_rename else x
    grid_test = grid_test.rename(columns=f)
    grid_test['target_day_ind'] = grid_test['day_ind'] + lag_begin   
    for day_shift in range(lag_begin + 1, lag_end + 1):
        print('test day:', day_shift)
        grid_shift = grid_cp[grid_cp.day_ind == (max_day_ind - day_shift + lag_begin)][['air_store_id'] + cols_to_rename].copy()
        f = lambda x: '{}_lag_{}'.format(x, day_shift) if x in cols_to_rename else x
        grid_shift = grid_shift.rename(columns=f)
        grid_test = pd.merge(grid_test, grid_shift, on='air_store_id')
        del grid_shift
    grid_test.drop(['day_ind'], axis=1, inplace=True)
    grid_test = pd.merge(grid_test, date_info, left_on='target_day_ind', right_on = 'day_ind')
    grid_test.drop(['day_ind'], axis=1, inplace=True)
    grid_test = pd.merge(grid_test, air_reserve_day1, left_on = ['air_store_id', 'target_day_ind'],
                        right_on = ['air_store_id', 'day_ind'], how = 'left')
    grid_test.drop(['day_ind'], axis=1, inplace=True)
    grid_test = pd.merge(grid_test, hpg_reserve_day1, left_on = ['hpg_store_id', 'target_day_ind'],
                        right_on = ['hpg_store_id', 'day_ind'], suffixes = ['_air', '_hpg'], how = 'left')
    grid_test.fillna({'reserve_visitors_air': 0, 'reserve_visitors_hpg': 0}, inplace=True)
    grid_test['reserve_visitors'] = grid_test.apply(lambda r: r['reserve_visitors_air'] + r['reserve_visitors_hpg'], axis=1)
    grid_test.drop(['target_day_ind', 'calendar_date', 'date', 'day_ind', 'month_ind', 'reserve_visitors_air', 
                   'reserve_visitors_hpg', 'hpg_store_id'], axis=1, inplace=True)   
    return grid_train, grid_test

### Label encode the categorical variables

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

In [11]:
class CatLabler(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols, lag_begin, lag_length):
        self.cat_cols = cat_cols
        self.lag_begin = lag_begin
        self.lag_length = lag_length
    def fit(self, X, y=None):
        encoders = {}
        self.weekday_encoder = LabelEncoder()
        self.weekday_encoder.fit(['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'])
        for col in self.cat_cols:
            encoder = LabelEncoder()
            encoder.fit(X[col])
            encoders[col] = encoder
        self.encoders = encoders
        return self
    def transform(self, X, y=None):
        X_new = X.copy()
        for col in self.cat_cols:
            X_new[col] = self.encoders[col].transform(X[col])
        X_new['day_of_week'] = self.weekday_encoder.transform(X['day_of_week'])
        for i in range(self.lag_begin, self.lag_begin + self.lag_length):
            X_new['day_of_week_lag_'+str(i)] = self.weekday_encoder.transform(X['day_of_week_lag_'+str(i)])
        return X_new

In [133]:
genre_labler = LabelEncoder()
wday_labler = LabelEncoder()
store_labler = LabelEncoder()
area_labler = LabelEncoder()
wdays = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
genre_labler.fit(np.unique(air_store_info.air_genre_name))
wday_labler.fit(wdays)
store_labler.fit(np.unique(air_store_info.air_store_id))
area_labler.fit(np.unique(air_store_info.air_area_name))

LabelEncoder()

In [177]:
def encode_cat(X, lag_begin, lag_length):
    X_new = X.copy()
    X_new['day_of_week'] = wday_labler.transform(X['day_of_week'])
    X_new['air_store_id'] = store_labler.transform(X['air_store_id'])
    X_new['air_area_name'] = area_labler.transform(X['air_area_name'])
    X_new['air_genre_name'] = genre_labler.transform(X['air_genre_name'])
    for i in range(lag_begin, lag_begin + lag_length):
        X_new['day_of_week_lag_'+str(i)] = wday_labler.transform(X['day_of_week_lag_'+str(i)])
    return X_new

### Put all together to create final dataset

In [156]:
def create_train_test(k_grid=15, lag_length = 21, lag_begin0=1, lag_begin1=39):
    '''
    input:
    k_grid -- create k weeks grid starting from the last date in air_visit_hist
    k_train -- when creating the training data, if using the last_date in air_visit_hist as y, then X goes back to 
        last date - k_train * 7 day
        Example: if k_grid = 10 and lag_length = 21, the last day_ind in air_visit_hist is 477,
            then the grid will have every store with day_ind from 477 - 7*15 + 1 = 373 to 477.
        Now since lag_length = 21, then the training data with LAG = 2 will have the following:
        (i) day_ind = 477 as y, day_ind = 475, 474, ...., 475 - 21 + 1 = 475, 474, ..., 455 as X
        (ii) day_ind = 476 as y, day_ind = 474, 473, ....., 454 as X
        (iii) day_ind = 475 as y, day_ind = 473, 472, ...., 453 as X
         ....
         (x) day_ind = 464  as y, day_ind =  462                  408 as X
     
        The corresponding test/predict X_test and y_testwill be
        y_test for  day_ind = 477 + LAG = 479
        X_test consists of day_ind = 477, 476, ... 457
     
    lag: for each LAG between lag_begin0 and lag_begin1, create train set
    X_train_lag and y_train_lag where X_train has lag from LAG to 7 * k - 1,
    also create a test set X_test_lag 
    '''
    grid = get_grid(k_grid)
    last_train_day = np.max(air_visit_hist.day_ind)
    
    Data = {}
    Miss = {}
    for lag_begin in range(lag_begin0, lag_begin1 + 1):
        print('lag_begin=', lag_begin)
        lag_end = lag_begin + lag_length - 1
        gtrain, gtest = append_lag(grid, lag_begin, lag_length)
        cat_columns = ['air_store_id', 'air_genre_name', 'air_area_name']  
        catLabler = CatLabler(cat_columns, lag_begin, lag_length)
       
        y_train = gtrain.visitors
        X_train0 = gtrain[gtest.columns]
        X_test0 = gtest
       
        X_train1 = encode_cat(X_train0, lag_begin, lag_length)
        X_test1 = encode_cat(X_test0, lag_begin, lag_length)
        strain = set(gtrain.air_store_id)
        stest = set(gtest.air_store_id)
        target_day = last_train_day + lag_begin     
        Miss[target_day] = stest.difference(strain)
        #catLabler.fit(X_train0)
       
        #X_train1 = catLabler.transform(X_train0)
      
        #X_test1 = catLabler.transform(X_test0)
       
        Data[lag_begin] = (y_train, X_train1, X_test1, X_test0.air_store_id)
    return Data, Miss

In [184]:
Data, Miss = create_train_test(k_grid=24, lag_length = 63, lag_begin0=1, lag_begin1=39)

lag_begin= 1
train day: 1
train day: 2
train day: 3
train day: 4
train day: 5
train day: 6
train day: 7
train day: 8
train day: 9
train day: 10
train day: 11
train day: 12
train day: 13
train day: 14
train day: 15
train day: 16
train day: 17
train day: 18
train day: 19
train day: 20
train day: 21
train day: 22
train day: 23
train day: 24
train day: 25
train day: 26
train day: 27
train day: 28
train day: 29
train day: 30
train day: 31
train day: 32
train day: 33
train day: 34
train day: 35
train day: 36
train day: 37
train day: 38
train day: 39
train day: 40
train day: 41
train day: 42
train day: 43
train day: 44
train day: 45
train day: 46
train day: 47
train day: 48
train day: 49
train day: 50
train day: 51
train day: 52
train day: 53
train day: 54
train day: 55
train day: 56
train day: 57
train day: 58
train day: 59
train day: 60
train day: 61
train day: 62
train day: 63
shape (828, 11)
test day: 2
test day: 3
test day: 4
test day: 5
test day: 6
test day: 7
test day: 8
test day: 9
te

test day: 45
test day: 46
test day: 47
test day: 48
test day: 49
test day: 50
test day: 51
test day: 52
test day: 53
test day: 54
test day: 55
test day: 56
test day: 57
test day: 58
test day: 59
test day: 60
test day: 61
test day: 62
test day: 63
test day: 64
test day: 65
test day: 66
test day: 67
lag_begin= 6
train day: 6
train day: 7
train day: 8
train day: 9
train day: 10
train day: 11
train day: 12
train day: 13
train day: 14
train day: 15
train day: 16
train day: 17
train day: 18
train day: 19
train day: 20
train day: 21
train day: 22
train day: 23
train day: 24
train day: 25
train day: 26
train day: 27
train day: 28
train day: 29
train day: 30
train day: 31
train day: 32
train day: 33
train day: 34
train day: 35
train day: 36
train day: 37
train day: 38
train day: 39
train day: 40
train day: 41
train day: 42
train day: 43
train day: 44
train day: 45
train day: 46
train day: 47
train day: 48
train day: 49
train day: 50
train day: 51
train day: 52
train day: 53
train day: 54
train 

test day: 25
test day: 26
test day: 27
test day: 28
test day: 29
test day: 30
test day: 31
test day: 32
test day: 33
test day: 34
test day: 35
test day: 36
test day: 37
test day: 38
test day: 39
test day: 40
test day: 41
test day: 42
test day: 43
test day: 44
test day: 45
test day: 46
test day: 47
test day: 48
test day: 49
test day: 50
test day: 51
test day: 52
test day: 53
test day: 54
test day: 55
test day: 56
test day: 57
test day: 58
test day: 59
test day: 60
test day: 61
test day: 62
test day: 63
test day: 64
test day: 65
test day: 66
test day: 67
test day: 68
test day: 69
test day: 70
test day: 71
test day: 72
lag_begin= 11
train day: 11
train day: 12
train day: 13
train day: 14
train day: 15
train day: 16
train day: 17
train day: 18
train day: 19
train day: 20
train day: 21
train day: 22
train day: 23
train day: 24
train day: 25
train day: 26
train day: 27
train day: 28
train day: 29
train day: 30
train day: 31
train day: 32
train day: 33
train day: 34
train day: 35
train day: 3

train day: 64
train day: 65
train day: 66
train day: 67
train day: 68
train day: 69
train day: 70
train day: 71
train day: 72
train day: 73
train day: 74
train day: 75
train day: 76
train day: 77
shape (828, 11)
test day: 16
test day: 17
test day: 18
test day: 19
test day: 20
test day: 21
test day: 22
test day: 23
test day: 24
test day: 25
test day: 26
test day: 27
test day: 28
test day: 29
test day: 30
test day: 31
test day: 32
test day: 33
test day: 34
test day: 35
test day: 36
test day: 37
test day: 38
test day: 39
test day: 40
test day: 41
test day: 42
test day: 43
test day: 44
test day: 45
test day: 46
test day: 47
test day: 48
test day: 49
test day: 50
test day: 51
test day: 52
test day: 53
test day: 54
test day: 55
test day: 56
test day: 57
test day: 58
test day: 59
test day: 60
test day: 61
test day: 62
test day: 63
test day: 64
test day: 65
test day: 66
test day: 67
test day: 68
test day: 69
test day: 70
test day: 71
test day: 72
test day: 73
test day: 74
test day: 75
test day

train day: 41
train day: 42
train day: 43
train day: 44
train day: 45
train day: 46
train day: 47
train day: 48
train day: 49
train day: 50
train day: 51
train day: 52
train day: 53
train day: 54
train day: 55
train day: 56
train day: 57
train day: 58
train day: 59
train day: 60
train day: 61
train day: 62
train day: 63
train day: 64
train day: 65
train day: 66
train day: 67
train day: 68
train day: 69
train day: 70
train day: 71
train day: 72
train day: 73
train day: 74
train day: 75
train day: 76
train day: 77
train day: 78
train day: 79
train day: 80
train day: 81
train day: 82
shape (828, 11)
test day: 21
test day: 22
test day: 23
test day: 24
test day: 25
test day: 26
test day: 27
test day: 28
test day: 29
test day: 30
test day: 31
test day: 32
test day: 33
test day: 34
test day: 35
test day: 36
test day: 37
test day: 38
test day: 39
test day: 40
test day: 41
test day: 42
test day: 43
test day: 44
test day: 45
test day: 46
test day: 47
test day: 48
test day: 49
test day: 50
test d

test day: 85
test day: 86
lag_begin= 25
train day: 25
train day: 26
train day: 27
train day: 28
train day: 29
train day: 30
train day: 31
train day: 32
train day: 33
train day: 34
train day: 35
train day: 36
train day: 37
train day: 38
train day: 39
train day: 40
train day: 41
train day: 42
train day: 43
train day: 44
train day: 45
train day: 46
train day: 47
train day: 48
train day: 49
train day: 50
train day: 51
train day: 52
train day: 53
train day: 54
train day: 55
train day: 56
train day: 57
train day: 58
train day: 59
train day: 60
train day: 61
train day: 62
train day: 63
train day: 64
train day: 65
train day: 66
train day: 67
train day: 68
train day: 69
train day: 70
train day: 71
train day: 72
train day: 73
train day: 74
train day: 75
train day: 76
train day: 77
train day: 78
train day: 79
train day: 80
train day: 81
train day: 82
train day: 83
train day: 84
train day: 85
train day: 86
train day: 87
shape (828, 11)
test day: 26
test day: 27
test day: 28
test day: 29
test day: 

test day: 64
test day: 65
test day: 66
test day: 67
test day: 68
test day: 69
test day: 70
test day: 71
test day: 72
test day: 73
test day: 74
test day: 75
test day: 76
test day: 77
test day: 78
test day: 79
test day: 80
test day: 81
test day: 82
test day: 83
test day: 84
test day: 85
test day: 86
test day: 87
test day: 88
test day: 89
test day: 90
test day: 91
lag_begin= 30
train day: 30
train day: 31
train day: 32
train day: 33
train day: 34
train day: 35
train day: 36
train day: 37
train day: 38
train day: 39
train day: 40
train day: 41
train day: 42
train day: 43
train day: 44
train day: 45
train day: 46
train day: 47
train day: 48
train day: 49
train day: 50
train day: 51
train day: 52
train day: 53
train day: 54
train day: 55
train day: 56
train day: 57
train day: 58
train day: 59
train day: 60
train day: 61
train day: 62
train day: 63
train day: 64
train day: 65
train day: 66
train day: 67
train day: 68
train day: 69
train day: 70
train day: 71
train day: 72
train day: 73
train 

test day: 51
test day: 52
test day: 53
test day: 54
test day: 55
test day: 56
test day: 57
test day: 58
test day: 59
test day: 60
test day: 61
test day: 62
test day: 63
test day: 64
test day: 65
test day: 66
test day: 67
test day: 68
test day: 69
test day: 70
test day: 71
test day: 72
test day: 73
test day: 74
test day: 75
test day: 76
test day: 77
test day: 78
test day: 79
test day: 80
test day: 81
test day: 82
test day: 83
test day: 84
test day: 85
test day: 86
test day: 87
test day: 88
test day: 89
test day: 90
test day: 91
test day: 92
test day: 93
test day: 94
test day: 95
test day: 96
lag_begin= 35
train day: 35
train day: 36
train day: 37
train day: 38
train day: 39
train day: 40
train day: 41
train day: 42
train day: 43
train day: 44
train day: 45
train day: 46
train day: 47
train day: 48
train day: 49
train day: 50
train day: 51
train day: 52
train day: 53
train day: 54
train day: 55
train day: 56
train day: 57
train day: 58
train day: 59
train day: 60
train day: 61
train day:

train day: 90
train day: 91
train day: 92
train day: 93
train day: 94
train day: 95
train day: 96
train day: 97
train day: 98
train day: 99
train day: 100
train day: 101
shape (828, 11)
test day: 40
test day: 41
test day: 42
test day: 43
test day: 44
test day: 45
test day: 46
test day: 47
test day: 48
test day: 49
test day: 50
test day: 51
test day: 52
test day: 53
test day: 54
test day: 55
test day: 56
test day: 57
test day: 58
test day: 59
test day: 60
test day: 61
test day: 62
test day: 63
test day: 64
test day: 65
test day: 66
test day: 67
test day: 68
test day: 69
test day: 70
test day: 71
test day: 72
test day: 73
test day: 74
test day: 75
test day: 76
test day: 77
test day: 78
test day: 79
test day: 80
test day: 81
test day: 82
test day: 83
test day: 84
test day: 85
test day: 86
test day: 87
test day: 88
test day: 89
test day: 90
test day: 91
test day: 92
test day: 93
test day: 94
test day: 95
test day: 96
test day: 97
test day: 98
test day: 99
test day: 100
test day: 101


In [187]:
pickle.dump(Data, open(data_dir + '/SubData24_63.p', 'wb'))
pickle.dump(Miss, open(data_dir + '/Miss24_63.p', 'wb'))


MemoryError: 

## Submission 3.1 Xgboost

In [39]:
Data = pickle.load(open(data_dir + '/SubData15_21_d_2.p', 'rb'))

In [61]:
import xgboost as xgb



In [62]:
xgb_params = {
    'eta': 0.15,
    'max_depth': 5,
     'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [67]:
y_train, X_train, X_test, store = Data[36]

ly_train = np.log(y_train + 1)
X_train0, X_test0, ly_train0, ly_test0 = train_test_split(X_train, ly_train, test_size=0.33, random_state=42) 

xgtrain = xgb.DMatrix(X_train0.values, ly_train0)
xgtest = xgb.DMatrix(X_test0.values)

cvresult = xgb.cv(xgb_params, xgtrain, num_boost_round=1000, nfold=5, metrics='rmse', early_stopping_rounds=50, verbose_eval=False)

num_rounds = cvresult.shape[0] + 55
model = xgb.train(xgb_params, xgtrain, num_boost_round=num_rounds, evals = [(xgtrain, 'train')], verbose_eval=50)
    # make prediction
ly_predict = model.predict(xgtest)

mean_squared_error(ly_predict, ly_test0)

[0]	train-rmse:2.13085
[50]	train-rmse:0.494121
[100]	train-rmse:0.462974
[150]	train-rmse:0.440568
[200]	train-rmse:0.419467


0.26281886106566621

### try without day of week etc.

In [71]:
X_train1 = X_train0.copy()
X_test1 = X_test0.copy()

X_train1.drop(['reserve_visitors'], axis=1, inplace=True)
X_test1.drop(['reserve_visitors'], axis=1, inplace=True)


xgtrain1 = xgb.DMatrix(X_train1.values, ly_train0)
xgtest1 = xgb.DMatrix(X_test1.values)

cvresult1 = xgb.cv(xgb_params, xgtrain1, num_boost_round=1000, nfold=5, metrics='rmse', early_stopping_rounds=50, verbose_eval=False)

num_rounds = cvresult1.shape[0] + 55
model = xgb.train(xgb_params, xgtrain1, num_boost_round=num_rounds, evals = [(xgtrain1, 'train')], verbose_eval=50)
    # make prediction
ly_predict1 = model.predict(xgtest1)

print('mlse=',mean_squared_error(ly_predict1, ly_test0))

[0]	train-rmse:2.13085
[50]	train-rmse:0.510835
[100]	train-rmse:0.482686
[150]	train-rmse:0.456938
mlse= 0.289232914857


In [72]:
last_train_date = np.max(air_visit_hist.day_ind)
Results = []
xgb_params = {
    'eta': 0.15,
    'max_depth': 5,
     'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

for i in range(1, 40):
    print(i)
    y_train, X_train, X_test, stores = Data[i]
    ly_train = np.log(y_train + 1)
    xgtrain = xgb.DMatrix(X_train.values, ly_train)
    xgtest = xgb.DMatrix(X_test.values)
    cvresult = xgb.cv(xgb_params, xgtrain, num_boost_round=1000, nfold=5, metrics='rmse', early_stopping_rounds=50, verbose_eval=False)
    num_rounds = cvresult.shape[0] + 55
    model = xgb.train(xgb_params, xgtrain, num_boost_round=num_rounds, evals = [(xgtrain, 'train')], verbose_eval=50)
    # make prediction
    ly_predict = model.predict(xgtest)
    y_predict = np.exp(ly_predict) - 1
    rdf = pd.DataFrame({'id': stores, 'visitors': y_predict})
    dt = date_info[date_info.day_ind == (last_train_date + i)]
    rdf['id'] = rdf.apply(lambda r: r['id'] + '_' + dt.calendar_date, axis=1)
    Results.append(rdf)

1
[0]	train-rmse:2.10145
[50]	train-rmse:0.481059
[100]	train-rmse:0.466275
[150]	train-rmse:0.455069
[200]	train-rmse:0.444478
2
[0]	train-rmse:2.10091
[50]	train-rmse:0.480904
[100]	train-rmse:0.466096
[150]	train-rmse:0.454319
[200]	train-rmse:0.444357
3
[0]	train-rmse:2.1027
[50]	train-rmse:0.480754
[100]	train-rmse:0.466619
[150]	train-rmse:0.455077
[200]	train-rmse:0.445184
4
[0]	train-rmse:2.10534
[50]	train-rmse:0.480275
[100]	train-rmse:0.4659
[150]	train-rmse:0.454443
5
[0]	train-rmse:2.10885
[50]	train-rmse:0.480546
[100]	train-rmse:0.465576
[150]	train-rmse:0.454428
[200]	train-rmse:0.443052
6
[0]	train-rmse:2.11023
[50]	train-rmse:0.481422
[100]	train-rmse:0.465833
[150]	train-rmse:0.453805
[200]	train-rmse:0.442663
7
[0]	train-rmse:2.10962
[50]	train-rmse:0.482287
[100]	train-rmse:0.467819
[150]	train-rmse:0.456126
[200]	train-rmse:0.445478
8
[0]	train-rmse:2.10824
[50]	train-rmse:0.487318
[100]	train-rmse:0.471529
[150]	train-rmse:0.459225
[200]	train-rmse:0.447774
9
[0]

In [74]:
Result_df = pd.concat(Results)

sample_submission = pd.read_csv(data_dir + '/sample_submission.csv.zip')
sample_submission['ord'] = sample_submission.index

output_df = Result_df.merge(sample_submission, on='id', suffixes=['','_r'])[['id', 'visitors', 'ord']]

output = output_df.sort_values('ord', axis=0)[['id', 'visitors']]
output.to_csv(data_dir + '/submission5_1.csv', index=False)

## Submission 3.2 lightgbm

In [188]:
import lightgbm as lgb

In [189]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [190]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': {'rmse'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.5
    }



In [111]:
Data1 = pickle.load(open(data_dir + '/SubData15_21_d_3.p', 'rb'))

In [191]:
d = 16

y_train, X_train_a, X_test_a, store = Data[d]
#remove_cols = ['holiday_eve'] + ['holiday_eve_lag_' + str(i) for i in range(d, d + 35)]
#remove_cols = ['air_store_id']
X_train = X_train_a.copy()
X_test = X_test_a.copy()
#X_train.drop(remove_cols, axis=1, inplace=True)
#X_test.drop(remove_cols, axis=1, inplace=True)

ly_train = np.log(y_train + 1)
X_train0, X_test0, ly_train0, ly_test0 = train_test_split(X_train, ly_train, test_size=0.33, random_state=42) 

lgb_train = lgb.Dataset(X_train0, ly_train0)

cv_lgb = lgb.cv(params, lgb_train, num_boost_round = 1000, nfold=5, stratified=False, early_stopping_rounds = 50, 
            verbose_eval = False)
num_rounds = len(cv_lgb['rmse-mean']) + 50
gbm = lgb.train(params,lgb_train,num_boost_round=num_rounds)
ly_predict = gbm.predict(X_test0)
#0: 0.24386
#1: 0.2337
print('mlse=', mean_squared_error(ly_predict, ly_test0))

mlse= 0.240233324842


In [23]:
X_train1 = X_train0.copy()
X_test1 = X_test0.copy()

X_train1.drop(['day_of_week', 'holiday_flg', 'holiday_eve'], axis=1, inplace=True)
X_test1.drop(['day_of_week', 'holiday_flg', 'holiday_eve'], axis=1, inplace=True)

lgb_train = lgb.Dataset(X_train1, ly_train0)

cv_lgb = lgb.cv(params, lgb_train, num_boost_round = 1000, nfold=5, stratified=False, early_stopping_rounds = 50, 
            verbose_eval = False)
num_rounds = len(cv_lgb['rmse-mean']) + 50
gbm = lgb.train(params,lgb_train,num_boost_round=num_rounds)
ly_predict = gbm.predict(X_test0)
print('mlse=', mean_squared_error(ly_predict, ly_test0))


mlse= 0.26846038696


In [192]:
last_train_date = np.max(air_visit_hist.day_ind)
Results = []
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': {'rmse'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
    }

for i in range(1, 40):
    print(i)
    y_train, X_train, X_test, stores = Data[i]
    ly_train = np.log(y_train + 1)
    lgb_train = lgb.Dataset(X_train, ly_train)
    cv_lgb = lgb.cv(params, lgb_train, num_boost_round = 1000, nfold=5, stratified=False, early_stopping_rounds = 50, 
                verbose_eval = False)
    num_rounds = len(cv_lgb['rmse-mean']) + 50
    gbm = lgb.train(params,lgb_train,num_boost_round=num_rounds)
    ly_predict = gbm.predict(X_test)
    y_predict = np.exp(ly_predict) - 1
    rdf = pd.DataFrame({'id': stores, 'visitors': y_predict})
    dt = date_info[date_info.day_ind == (last_train_date + i)]
    rdf['id'] = rdf.apply(lambda r: r['id'] + '_' + dt.calendar_date, axis=1)
    Results.append(rdf)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [193]:
Result_df = pd.concat(Results)

sample_submission = pd.read_csv(data_dir + '/sample_submission.csv.zip')
sample_submission['ord'] = sample_submission.index

output_df = Result_df.merge(sample_submission, on='id', suffixes=['','_r'])[['id', 'visitors', 'ord']]

output = output_df.sort_values('ord', axis=0)[['id', 'visitors']]
output.to_csv(data_dir + '/submission5_2_d.csv', index=False)

## try combine submission5_2_c and submission5_2_d

In [6]:
output_d = pd.read_csv(data_dir + '/submission5_2_d.csv')

In [9]:
output_c = pd.read_csv(data_dir + '/submission5_2_c.csv')

In [30]:
output_dc = pd.merge(output_d, output_c, on='id', suffixes=['_d', '_c'])

In [32]:
output_dc['visitors'] = output_dc.apply(lambda r: (r['visitors_c'] + r['visitors_d']) / 2.0, axis=1)

In [34]:
output_dc.drop(['visitors_c', 'visitors_d'], axis=1, inplace = True)

In [36]:
output_dc.to_csv(data_dir + '/submission5_2_cd.csv', index=False)

In [40]:
output_dc['visitors'] = output_dc.apply(lambda r: int(r['visitors']) if r['visitors'] - int(r['visitors']) < 0.5 else int(r['visitors'])+1, axis=1)

In [None]:
output_dc.to_csv(data_dir + '/submission5_2_cdr.csv', index=False)

In [195]:
%qtconsole



In [196]:
Miss

{478: {'air_cb083b4789a8d3a2'},
 479: {'air_cb083b4789a8d3a2'},
 480: {'air_cb083b4789a8d3a2'},
 481: {'air_cb083b4789a8d3a2'},
 482: {'air_cb083b4789a8d3a2'},
 483: {'air_cb083b4789a8d3a2'},
 484: {'air_cb083b4789a8d3a2'},
 485: {'air_cb083b4789a8d3a2'},
 486: {'air_cb083b4789a8d3a2'},
 487: {'air_cb083b4789a8d3a2'},
 488: {'air_cb083b4789a8d3a2'},
 489: {'air_cb083b4789a8d3a2'},
 490: {'air_cb083b4789a8d3a2'},
 491: {'air_cb083b4789a8d3a2'},
 492: {'air_cb083b4789a8d3a2'},
 493: {'air_cb083b4789a8d3a2'},
 494: {'air_cb083b4789a8d3a2'},
 495: {'air_cb083b4789a8d3a2'},
 496: {'air_cb083b4789a8d3a2'},
 497: {'air_cb083b4789a8d3a2'},
 498: {'air_cb083b4789a8d3a2'},
 499: {'air_cb083b4789a8d3a2'},
 500: {'air_cb083b4789a8d3a2'},
 501: {'air_cb083b4789a8d3a2'},
 502: {'air_cb083b4789a8d3a2'},
 503: {'air_cb083b4789a8d3a2'},
 504: {'air_cb083b4789a8d3a2'},
 505: {'air_cb083b4789a8d3a2'},
 506: {'air_cb083b4789a8d3a2'},
 507: {'air_cb083b4789a8d3a2'},
 508: {'air_cb083b4789a8d3a2'},
 509: {'