In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# preprocess visitors (training) to store / date
av = pd.read_csv('air_visit_data.csv', converters={'visitors': lambda u: np.log1p(float(u))}, parse_dates = ['visit_date'])
sample = pd.read_csv('sample_submission.csv')

In [3]:
istore = sample['id'].map(lambda x: '_'.join(x.split('_')[:2])).unique()
idate_test = sample['id'].map(lambda x: x.split('_')[2]).unique()
idate_train = pd.date_range(av['visit_date'].min(), av['visit_date'].max())
idate = pd.date_range(date(2016,1,1), date(2017,5,31))

In [4]:
df_train = av.set_index(['air_store_id', 'visit_date']).unstack(level = 1)
df_train.columns = df_train.columns.get_level_values(1)
df_train = df_train.reindex(idate_train.values, axis = 1).reindex(istore, axis = 0).fillna(0).astype('int')

In [5]:
# preprocess reserves (all) store / date
ares = pd.read_csv('air_reserve.csv',
                   parse_dates=['visit_datetime','reserve_datetime'])
hres = pd.read_csv('hpg_reserve.csv',
                   parse_dates=['visit_datetime','reserve_datetime'])
ahrel= pd.read_csv('store_id_relation.csv')

In [6]:
hres['air_store_id'] = hres['hpg_store_id'].map(ahrel.set_index('hpg_store_id')['air_store_id'])
hres = hres.drop('hpg_store_id', axis = 1).dropna()
res = pd.concat([ares,hres])
res['visit_date'] = res['visit_datetime'].dt.date
res = res.drop(['reserve_datetime', 'visit_datetime'], axis = 1).groupby(['air_store_id', 'visit_date']).sum().unstack(level = 1)
res.columns = res.columns.get_level_values(1)
res = res.reindex(idate, axis = 1).reindex(istore, axis = 0).fillna(0).astype('int')

In [62]:
# preprocess categorical features
## store: genre, lat, long
astore = pd.read_csv('air_store_info.csv')
astore = astore.set_index('air_store_id').reindex(istore).drop('air_area_name',axis = 1)

In [63]:
le = LabelEncoder()
astore['genre'] = le.fit_transform(astore['air_genre_name'])
astore.drop(['air_genre_name'], axis = 1, inplace = True)

In [119]:
## day: holiday, dow
dinfo = pd.read_csv('date_info.csv', parse_dates = ['calendar_date'])
ledow = LabelEncoder()
dinfo['dow'] = ledow.fit_transform(dinfo['day_of_week'])
dinfo.drop(['day_of_week'], axis = 1, inplace = True)
dinfo.set_index('calendar_date', inplace = True)

In [None]:
# predict day by day, using both MA and cat features

In [38]:
def get_timespan(df, dt, minus, periods, freq='D'): #slicing data by date, trace back "minus" days
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [209]:
def prepare_dataset(data, res, store, holi, t_anchor, is_train=True):
    # MA & time series
    X = pd.DataFrame({
        "median_30_2017": get_timespan(data, t_anchor, 30, 30).median(axis=1).values,
        "median_60_2017": get_timespan(data, t_anchor, 60, 60).median(axis=1).values,
        "median_140_2017": get_timespan(data, t_anchor, 140, 140).median(axis=1).values
    }, index = data.index)
    
    for i in range(1,15):
        X["day_{}_2017".format(i)] = get_timespan(data, t_anchor, i, 1).values.ravel()
        
    for i in range(15):
        X["res_{}_2017".format(i)] = get_timespan(res, t_anchor, i, 1).values.ravel()
    
    for i in range(7):
        X['sum_4_dow{}_2017'.format(i)] = get_timespan(data, t_anchor, 28-i, 4, freq='7D').sum(axis=1).values
        X['median_4_dow{}_2017'.format(i)] = get_timespan(data, t_anchor, 28-i, 4, freq='7D').median(axis=1).values
    
    # add genre, lat, long
    X = X.join(store)
    
    X_39 = []
    for i in range(39):
        day = t_anchor + timedelta(days = i)
        X_temp = X.join(pd.DataFrame([dinfo.loc[day].tolist()]*X.shape[0], columns = ['holiday', 'dow'], index = X.index))
        for i in range(15):
            X_temp['res_{}'.format(i)]= res[day-timedelta(days = i)]
        X_temp['mres_30_2017'] = get_timespan(res, day, 30, 30).median(axis=1).values
        X_temp['mres_60_2017'] = get_timespan(res, day, 60, 60).median(axis=1).values
        X_temp['mres_140_2017'] = get_timespan(res, day, 140, 140).median(axis=1).values
        X_39.append(X_temp)
    if is_train:
        y = data[pd.date_range(t_anchor, periods=39)].values
        return X_39, y
    return X_39

In [210]:
t2017 = date(2017, 1, 23)
X_l, y_l = [], []
for i in range(50):
    delta = timedelta(days=i)
    X_tmp, y_tmp = prepare_dataset(df_train, res, astore, dinfo, t2017 + delta)       
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = list(map(lambda x: pd.concat(x), zip(*X_l)))
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(df_train,res, astore, dinfo, date(2017, 3, 14))
X_test = prepare_dataset(df_train, res, astore, dinfo, date(2017, 4, 23), is_train=False)

In [218]:
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 12,
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = ['genre', 'holiday', 'dow']

In [219]:
for i in range(39):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(X_train[i], label=y_train[:, i], categorical_feature=cate_vars)
    dval = lgb.Dataset(X_val[i], label=y_val[:, i], reference=dtrain, categorical_feature=cate_vars)
    bst = lgb.train(params, dtrain, num_boost_round=MAX_ROUNDS, valid_sets=[dtrain, dval],
                    early_stopping_rounds=50, verbose_eval=100)
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train[i].columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val[i], num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test[i], num_iteration=bst.best_iteration or MAX_ROUNDS))

Step 1
Training until validation scores don't improve for 50 rounds.




[100]	training's l2: 0.40278	valid_1's l2: 0.504911
[200]	training's l2: 0.377091	valid_1's l2: 0.501864
Early stopping, best iteration is:
[150]	training's l2: 0.388694	valid_1's l2: 0.501019
sum_4_dow0_2017: 68169.26
median_4_dow0_2017: 40849.25
day_7_2017: 7564.11
day_14_2017: 4069.19
res_0_2017: 3965.99
sum_4_dow6_2017: 2067.83
median_140_2017: 1863.61
dow: 1812.98
day_1_2017: 1454.81
genre: 1451.82
day_6_2017: 1134.04
median_4_dow1_2017: 820.48
mres_60_2017: 705.40
median_4_dow6_2017: 676.95
day_13_2017: 639.60
median_30_2017: 611.63
res_0: 584.85
sum_4_dow1_2017: 525.91
sum_4_dow5_2017: 510.01
latitude: 475.86
day_2_2017: 470.12
day_5_2017: 455.78
day_3_2017: 432.53
longitude: 429.69
res_7_2017: 389.85
day_4_2017: 389.11
mres_30_2017: 384.62
sum_4_dow3_2017: 379.94
median_60_2017: 353.72
res_10_2017: 242.15
sum_4_dow2_2017: 238.55
sum_4_dow4_2017: 237.73
median_4_dow5_2017: 224.60
median_4_dow2_2017: 222.07
day_8_2017: 212.46
res_4_2017: 183.90
res_14_2017: 181.57
res_6_2017: 171

[100]	training's l2: 0.411852	valid_1's l2: 0.635092
Early stopping, best iteration is:
[122]	training's l2: 0.40515	valid_1's l2: 0.631465
sum_4_dow5_2017: 59692.99
median_4_dow5_2017: 56031.46
res_0: 4430.89
day_2_2017: 3883.88
day_9_2017: 2015.47
dow: 1503.04
genre: 1451.45
median_30_2017: 1231.40
median_140_2017: 1210.75
sum_4_dow6_2017: 903.02
sum_4_dow4_2017: 851.16
day_1_2017: 722.67
median_4_dow4_2017: 604.37
median_60_2017: 549.35
median_4_dow6_2017: 518.09
longitude: 512.93
mres_60_2017: 472.93
latitude: 451.89
res_2_2017: 422.56
sum_4_dow3_2017: 413.62
day_10_2017: 373.69
day_3_2017: 372.09
mres_30_2017: 350.01
day_8_2017: 349.31
median_4_dow0_2017: 244.11
sum_4_dow2_2017: 242.95
sum_4_dow1_2017: 220.89
day_6_2017: 208.44
res_9_2017: 198.13
day_5_2017: 193.48
day_7_2017: 192.91
res_10_2017: 187.60
sum_4_dow0_2017: 183.93
median_4_dow3_2017: 183.73
day_12_2017: 179.10
res_12_2017: 156.96
res_8_2017: 143.29
median_4_dow1_2017: 133.07
res_14_2017: 130.49
res_6_2017: 126.09
res_

[200]	training's l2: 0.411242	valid_1's l2: 0.427612
Early stopping, best iteration is:
[206]	training's l2: 0.409644	valid_1's l2: 0.427084
sum_4_dow3_2017: 75829.35
median_4_dow3_2017: 29447.10
day_4_2017: 7140.57
res_0: 5600.27
genre: 2675.46
dow: 2292.13
day_11_2017: 2019.38
median_140_2017: 1841.26
sum_4_dow2_2017: 1676.90
median_4_dow2_2017: 1527.65
latitude: 1303.01
longitude: 1034.22
holiday: 1000.35
day_3_2017: 882.82
median_60_2017: 804.95
sum_4_dow4_2017: 774.92
day_5_2017: 740.24
median_4_dow4_2017: 681.85
day_12_2017: 595.44
day_1_2017: 590.75
sum_4_dow1_2017: 572.20
mres_60_2017: 553.84
sum_4_dow6_2017: 520.98
day_10_2017: 476.56
mres_30_2017: 428.82
sum_4_dow0_2017: 424.55
day_2_2017: 357.94
sum_4_dow5_2017: 347.48
median_4_dow1_2017: 310.96
median_4_dow6_2017: 293.62
res_4_2017: 281.76
mres_140_2017: 281.51
median_4_dow5_2017: 273.43
median_30_2017: 253.14
res_11_2017: 239.13
res_8_2017: 234.33
day_6_2017: 232.46
day_13_2017: 213.63
median_4_dow0_2017: 196.33
day_7_2017

[100]	training's l2: 0.458144	valid_1's l2: 0.490168
[200]	training's l2: 0.424897	valid_1's l2: 0.482391
Early stopping, best iteration is:
[211]	training's l2: 0.421641	valid_1's l2: 0.480943
sum_4_dow1_2017: 63399.99
median_4_dow1_2017: 38292.68
res_0: 6452.63
day_13_2017: 5419.17
day_6_2017: 4171.60
genre: 3364.74
median_140_2017: 2626.08
dow: 2394.53
sum_4_dow0_2017: 1914.87
latitude: 1559.27
median_4_dow0_2017: 1398.43
longitude: 1253.70
sum_4_dow2_2017: 955.24
day_5_2017: 871.81
day_7_2017: 867.41
sum_4_dow6_2017: 754.94
median_4_dow2_2017: 693.40
mres_60_2017: 683.11
holiday: 566.23
day_14_2017: 548.93
day_1_2017: 467.95
median_60_2017: 458.54
median_4_dow3_2017: 454.91
day_11_2017: 419.36
sum_4_dow4_2017: 407.10
sum_4_dow5_2017: 405.92
sum_4_dow3_2017: 404.47
median_30_2017: 389.94
median_4_dow6_2017: 384.16
median_4_dow5_2017: 383.67
day_4_2017: 382.22
day_3_2017: 352.66
res_6_2017: 345.30
day_2_2017: 286.24
res_11_2017: 277.17
median_4_dow4_2017: 267.00
res_3_2017: 264.09
da

Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.459472	valid_1's l2: 0.557424
Early stopping, best iteration is:
[112]	training's l2: 0.453288	valid_1's l2: 0.55553
sum_4_dow6_2017: 96390.70
median_4_dow6_2017: 12082.60
res_0: 6229.40
day_8_2017: 4037.07
day_1_2017: 3001.64
genre: 2368.59
median_140_2017: 1913.16
dow: 1564.47
sum_4_dow5_2017: 1182.07
latitude: 1070.70
median_4_dow5_2017: 1030.97
holiday: 820.63
median_60_2017: 812.76
longitude: 740.61
day_2_2017: 736.36
sum_4_dow0_2017: 586.85
day_9_2017: 545.70
sum_4_dow1_2017: 531.24
mres_60_2017: 494.58
sum_4_dow4_2017: 452.70
median_4_dow0_2017: 451.41
median_30_2017: 444.41
day_7_2017: 394.03
mres_140_2017: 377.37
day_12_2017: 321.15
median_4_dow4_2017: 319.27
day_14_2017: 298.83
day_6_2017: 248.10
sum_4_dow3_2017: 245.84
res_1_2017: 238.56
day_10_2017: 234.05
sum_4_dow2_2017: 231.01
res_6_2017: 210.97
median_4_dow3_2017: 210.55
median_4_dow2_2017: 178.13
mres_30_2017: 177.63
day_5_2017: 164.13

[100]	training's l2: 0.464805	valid_1's l2: 0.413368
[200]	training's l2: 0.429253	valid_1's l2: 0.411859
Early stopping, best iteration is:
[170]	training's l2: 0.438058	valid_1's l2: 0.409994
sum_4_dow4_2017: 60317.77
median_4_dow4_2017: 32321.55
day_3_2017: 14309.66
res_0: 6674.28
genre: 3271.06
day_10_2017: 2376.87
dow: 2201.72
median_140_2017: 1960.48
latitude: 1784.58
sum_4_dow3_2017: 1617.98
median_60_2017: 1103.08
longitude: 1059.13
median_4_dow3_2017: 967.80
day_4_2017: 928.68
holiday: 927.85
mres_60_2017: 893.67
sum_4_dow5_2017: 803.76
median_4_dow0_2017: 690.44
median_4_dow5_2017: 655.30
median_4_dow2_2017: 612.57
sum_4_dow2_2017: 571.73
sum_4_dow1_2017: 495.54
day_2_2017: 474.05
sum_4_dow6_2017: 465.61
day_11_2017: 465.54
day_1_2017: 413.35
mres_140_2017: 406.44
sum_4_dow0_2017: 405.76
day_12_2017: 376.59
day_9_2017: 352.37
day_7_2017: 343.40
median_4_dow1_2017: 329.56
day_6_2017: 304.44
median_30_2017: 281.01
res_12_2017: 247.74
res_3_2017: 235.85
day_13_2017: 220.48
res_1

Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.473903	valid_1's l2: 0.453379
Early stopping, best iteration is:
[130]	training's l2: 0.460247	valid_1's l2: 0.450711
sum_4_dow2_2017: 53653.61
median_4_dow2_2017: 43360.24
res_0: 7801.01
day_5_2017: 4659.07
genre: 3187.40
median_140_2017: 3107.52
latitude: 1944.37
dow: 1912.46
day_12_2017: 1727.42
sum_4_dow1_2017: 1296.19
median_4_dow1_2017: 1260.33
median_4_dow0_2017: 1221.25
mres_60_2017: 940.39
day_4_2017: 925.31
day_6_2017: 871.17
longitude: 870.82
sum_4_dow0_2017: 822.36
median_60_2017: 797.38
holiday: 788.27
median_4_dow3_2017: 786.04
sum_4_dow3_2017: 662.10
mres_140_2017: 481.21
res_14: 454.77
sum_4_dow4_2017: 411.66
day_11_2017: 411.58
median_4_dow5_2017: 376.09
day_13_2017: 367.38
day_14_2017: 302.20
day_1_2017: 300.93
sum_4_dow6_2017: 299.45
median_4_dow4_2017: 283.27
res_14_2017: 267.01
sum_4_dow5_2017: 262.86
median_30_2017: 245.42
day_7_2017: 243.49
res_13: 232.66
day_3_2017: 222.94
res_5

[100]	training's l2: 0.47655	valid_1's l2: 0.500632
Early stopping, best iteration is:
[143]	training's l2: 0.457707	valid_1's l2: 0.499202
sum_4_dow0_2017: 67680.63
median_4_dow0_2017: 26285.21
res_0: 8115.48
day_7_2017: 5518.15
genre: 3740.18
median_140_2017: 2850.29
day_14_2017: 2450.23
dow: 2266.10
sum_4_dow6_2017: 2169.30
latitude: 1991.76
median_4_dow5_2017: 1396.96
median_60_2017: 1333.62
longitude: 1115.00
median_4_dow6_2017: 1060.31
sum_4_dow5_2017: 980.13
median_4_dow1_2017: 948.83
holiday: 938.99
mres_60_2017: 837.60
median_4_dow3_2017: 781.94
day_6_2017: 696.60
day_1_2017: 676.36
sum_4_dow1_2017: 676.14
sum_4_dow2_2017: 496.18
mres_140_2017: 480.39
res_7: 388.28
day_13_2017: 367.82
day_11_2017: 334.09
sum_4_dow4_2017: 324.16
day_2_2017: 322.86
mres_30_2017: 304.22
day_5_2017: 287.84
sum_4_dow3_2017: 281.90
median_30_2017: 276.75
res_10_2017: 261.08
day_3_2017: 253.21
median_4_dow4_2017: 252.03
res_14: 239.25
res_7_2017: 238.07
res_13: 233.02
day_12_2017: 224.17
day_9_2017: 

In [220]:
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Validation mse: 0.5240798477282607


In [216]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(y_test, index=df_train.index, columns=pd.date_range("2017-04-23", periods=39)
                       ).stack().to_frame("visitors").reset_index()
df_preds['id'] = df_preds[['air_store_id', 'level_1']].apply(lambda x: x[0] + '_' + str(x[1].date()), axis = 1)
df_preds.drop(['air_store_id', 'level_1'], axis = 1, inplace = True)

Making submission...


In [217]:
submission = sample[["id"]].merge(df_preds, how="left", left_on = 'id', right_on = 'id')
submission["visitors"] = np.clip(np.around(np.expm1(submission["visitors"])), 0, 1000)
submission["visitors"] = submission['visitors'].astype('int')
submission.to_csv('try4.csv', index=None)