In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
from pandas.api.types import CategoricalDtype

In [3]:
# from matplotlib import pyplot as plt

In [4]:
# %matplotlib inline

In [5]:
df_train = pd.read_csv('train.csv', dtype={'onpromotion': bool},
                       converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
                       parse_dates=["date"], skiprows=range(1, 66458909)) #59038132 entries

In [6]:
#df_train = df_train_raw.loc[df_train_raw['date']>=pd.datetime(2016,1,1)]

In [7]:
# # train on 2017 data
# df_train = pd.read_csv('train.csv', dtype={'onpromotion': bool},
#                        converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
#                        parse_dates=["date"], skiprows = range(1,101688780))

In [8]:
date_index = pd.date_range(df_train['date'].min(), df_train['date'].max())

In [9]:
df_test = pd.read_csv("test.csv", dtype={'onpromotion': bool}, parse_dates=["date"])

In [10]:
items = pd.read_csv("items.csv").set_index("item_nbr")

In [11]:
# def add_holiday(data):
#     holiday = pd.read_csv('holidays_events.csv', index_col = 'date', parse_dates = ['date'])
#     holiday['holiday'] = holiday['transferred'] == False
#     holiday = holiday[holiday['holiday']]
#     holiday = holiday[['locale','locale_name','holiday']]
#     stores = pd.read_csv('stores.csv', index_col = 'store_nbr')
#     data = data.merge(stores[['city','state']], left_on = 'store_nbr', right_index = True, how = 'left')
#     city_holiday = holiday[holiday['locale'] == 'Local'][['locale_name', 'holiday']]
#     city_holiday.reset_index(inplace = True)
#     city_holiday.rename(columns = {'locale_name':'city','holiday':'city_holiday'}, inplace = True)
#     data = data.merge(city_holiday.drop_duplicates(), left_on = ['date', 'city'], right_on = ['date','city'], how = 'left')
#     state_holiday = holiday[holiday['locale'] == 'Regional'][['locale_name', 'holiday']]
#     state_holiday.reset_index(inplace = True)
#     state_holiday.rename(columns = {'locale_name':'state','holiday':'state_holiday'}, inplace = True)
#     data = data.merge(state_holiday.drop_duplicates(), left_on = ['date', 'state'], right_on = ['date','state'], how = 'left')
#     national_holiday = holiday[holiday['locale'] == 'National'].reset_index()[['date','holiday']].rename(columns = {'holiday':'national_holiday'})
#     data = data.merge(national_holiday.drop_duplicates(), left_on = ['date'], right_on = ['date'], how = 'left')    
    
#     data['holiday'] = ((data['city_holiday'] == True) | (data['state_holiday'] == True) | (data['national_holiday'] == True))
#     data.drop(['city_holiday', 'state_holiday', 'national_holiday', 'city', 'state'], axis = 1, inplace = True)
#     return data

In [12]:
# df_train = add_holiday(df_train)
# df_test = add_holiday(df_test)

In [13]:
def flatten_bool(df_train, df_test, fname):
    f_train = df_train.set_index(["store_nbr", "item_nbr", "date"])[[fname]].unstack(level=-1).fillna(False)
    f_train.columns = f_train.columns.get_level_values(1)
    f_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])[[fname]].unstack(level=-1).fillna(False)
    f_test.columns = f_test.columns.get_level_values(1)
    f_train = f_train.reindex(date_index.values, axis = 1).fillna(False)
    f_test = f_test.reindex(f_train.index).fillna(False) #align two ds, disregard missing test items 
    f = pd.concat([f_train, f_test], axis=1)
    return f

In [14]:
promo = flatten_bool(df_train, df_test, 'onpromotion')
# holiday = flatten_bool(df_train, df_test, 'holiday')

In [15]:
df_train = df_train.set_index(["store_nbr", "item_nbr", "date"])["unit_sales"].unstack(level=-1).fillna(0)
df_train = df_train.reindex(date_index.values, axis = 1).fillna(0)

In [16]:
items = items.reindex(df_train.index.get_level_values(1)) #delete items not in train, align item index to train index

In [17]:
stores = pd.read_csv('stores.csv', index_col = 'store_nbr')

In [19]:
cat_cluster = CategoricalDtype(categories=list(set(stores['cluster'])), ordered=False)
cat_stype = CategoricalDtype(categories=list(set(stores['type'])), ordered=False)
cat_family = CategoricalDtype(categories=list(set(items['family'])), ordered=False)
cat_class = CategoricalDtype(categories=list(set(items['class'])), ordered=False)
cat_store = CategoricalDtype(categories=list(set(df_train.reset_index()['store_nbr'])), ordered=False)

In [20]:
oil_date_index = pd.date_range(date_index.min(), df_test.reset_index()['date'].max())
oil = pd.read_csv('oil.csv', index_col = 'date', parse_dates = ['date'])
oil = oil.transpose().reindex(oil_date_index.values, axis = 1).fillna(method='bfill', axis = 1)

In [21]:
oil_change = pd.DataFrame({'oil_change':(oil.iloc[:,1:].values-oil.iloc[:,:-1].values).ravel()},
                          index = oil.columns[1:])
oil_change = oil_change.transpose()

In [22]:
df_train.reset_index(inplace = True)
df_train['salebin'] = pd.cut(df_train.mean(axis = 1),bins=5, labels = list(range(5))) #categorize items by mean sales
df_train['class'] = df_train['item_nbr'].map(items['class'].to_dict()).astype(cat_class).cat.codes
df_train['stype'] = df_train['store_nbr'].map(stores['type'].to_dict()).astype(cat_stype).cat.codes
df_train['cluster'] = df_train['store_nbr'].map(stores['cluster'].to_dict()).astype(cat_cluster).cat.codes
df_train.set_index(["store_nbr", "item_nbr"], inplace = True)

In [23]:
dtypes = {'transactions': np.dtype('int64')}
transactions = pd.read_csv('transactions.csv', parse_dates = ['date'])
transactions = transactions.set_index(['store_nbr', 'date']).unstack(level = -1)
transactions.columns = transactions.columns.get_level_values(1)
transactions = transactions.reindex(date_index, axis = 1).reindex(df_train.index.get_level_values(0))

In [24]:
def get_timespan(df, dt, minus, periods, freq='D'): #slicing data by date, trace back "minus" days
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [25]:
def prepare_dataset(data, tran, promo, oil, t_anchor, is_train=True):
    nsamples = data.shape[0]
    X = pd.DataFrame({
        "day_1_2017": get_timespan(data, t_anchor, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(data, t_anchor, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(data, t_anchor, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(data, t_anchor, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(data, t_anchor, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(data, t_anchor, 60, 60).mean(axis=1).values,
        "mean_140_2017": get_timespan(data, t_anchor, 140, 140).mean(axis=1).values,
        "tday_1_2017": get_timespan(tran, t_anchor, 1, 1).values.ravel(),
        "tmean_3_2017": get_timespan(tran, t_anchor, 3, 3).mean(axis=1).values,
        "tmean_7_2017": get_timespan(tran, t_anchor, 7, 7).mean(axis=1).values,
        "tmean_14_2017": get_timespan(tran, t_anchor, 14, 14).mean(axis=1).values,
        "tmean_30_2017": get_timespan(tran, t_anchor, 30, 30).mean(axis=1).values,
        "tmean_60_2017": get_timespan(tran, t_anchor, 60, 60).mean(axis=1).values,
        "tmean_140_2017": get_timespan(tran, t_anchor, 140, 140).mean(axis=1).values,
        "promo_3_2017": get_timespan(promo, t_anchor, 3, 3).sum(axis=1).values,
        "promo_7_2017": get_timespan(promo, t_anchor, 7, 7).sum(axis=1).values,
        "promo_14_2017": get_timespan(promo, t_anchor, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo, t_anchor, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo, t_anchor, 140, 140).sum(axis=1).values,
#         "holi_3_2017": get_timespan(holiday, t_anchor, 3, 3).sum(axis=1).values,
#         "holi_7_2017": get_timespan(holiday, t_anchor, 7, 7).sum(axis=1).values
        "oilchange_1_2017": np.tile(get_timespan(oil, t_anchor, 1, 1).sum(axis=1).values,(nsamples)),
        "oilchange_3_2017": np.tile(get_timespan(oil, t_anchor, 3, 3).sum(axis=1).values,(nsamples)),
        "oilchange_7_2017": np.tile(get_timespan(oil, t_anchor, 7, 7).sum(axis=1).values,(nsamples)),
        "oilchange_14_2017": np.tile(get_timespan(oil, t_anchor, 14, 14).sum(axis=1).values,(nsamples)),
        "oilchange_30_2017": np.tile(get_timespan(oil, t_anchor, 30, 30).sum(axis=1).values,(nsamples)),
        "oilchange_60_2017": np.tile(get_timespan(oil, t_anchor, 30, 30).sum(axis=1).values,(nsamples)),
        "oilchange_140_2017": np.tile(get_timespan(oil, t_anchor, 30, 30).sum(axis=1).values,(nsamples)),
    })
    
#     for i in range(1,15,1):
#         X['day_{}_2017'.format(i)] = get_timespan(data, t_anchor, i, 1).values.ravel()
        
    for i in range(2,15,1):
        X['mean_{}_2017'.format(i)] = get_timespan(data, t_anchor, i, i).mean(axis=1).values
#         X['oilchange_{}_2017'.format(i)] = get_timespan(oil, t_anchor, i, i).sum(axis=1)
        X['tmean_{}_2017'.format(i)] = get_timespan(tran, t_anchor, i, i).mean(axis=1).values
    
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(data, t_anchor, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(data, t_anchor, 140-i, 20, freq='7D').mean(axis=1).values
        X['tmean_4_dow{}_2017'.format(i)] = get_timespan(tran, t_anchor, 28-i, 4, freq='7D').mean(axis=1).values
        X['tmean_20_dow{}_2017'.format(i)] = get_timespan(tran, t_anchor, 140-i, 20, freq='7D').mean(axis=1).values
        
    for i in range(16):
        cur_day = t_anchor + timedelta(days=i)
        X["promo_{}".format(i)] = promo[cur_day].values.astype(int)
#         X["oilchange_{}".format(i)] = np.tile(oil[cur_day].values,(nsamples))
#         X["holi_{}".format(i)] = holiday[cur_day].values.astype(int)
#         X["day2MS_{}".format(i)] = (cur_day-cur_day.replace(day = 1)).days
#         # days to last salary day and next salary day
#         if cur_day.day <= 15:
#             X["day2last_{}".format(i)] = (cur_day - (cur_day.replace(day=1) - timedelta(days=1))).days
#             X["day2next_{}".format(i)] = (cur_day.replace(day=15) - cur_day).days
#         else:
#             X["day2last_{}".format(i)] = (cur_day - cur_day.replace(day=15)).days
#             X["day2next_{}".format(i)] = ((cur_day.replace(day=1, month= cur_day.month+1)- timedelta(days=1))-cur_day).days
    # categorical features
#     X['family'] = data.reset_index()['item_nbr'].map(items['family'].to_dict()).astype(cat_family).cat.codes
    X['class'] = data.reset_index()['class']
#     X['store'] = data.reset_index()['store']
    X['salebin'] = data.reset_index()['salebin']
    X['cluster'] = data.reset_index()['class']
    X['stype'] = data.reset_index()['stype']
    if is_train:
        y = data[pd.date_range(t_anchor, periods=16)].values
        return X, y
    return X

In [None]:
# def prepare_dataset(data, promo, t_anchor, is_train=True):
#     X = pd.DataFrame({
#         "day_1_2017": get_timespan(data, t_anchor, 1, 1).values.ravel(),
#         "mean_3_2017": get_timespan(data, t_anchor, 3, 3).mean(axis=1).values,
#         "mean_7_2017": get_timespan(data, t_anchor, 7, 7).mean(axis=1).values,
#         "mean_14_2017": get_timespan(data, t_anchor, 14, 14).mean(axis=1).values,
#         "mean_30_2017": get_timespan(data, t_anchor, 30, 30).mean(axis=1).values,
#         "mean_60_2017": get_timespan(data, t_anchor, 60, 60).mean(axis=1).values,
#         "mean_140_2017": get_timespan(data, t_anchor, 140, 140).mean(axis=1).values,
#         "promo_1_2017": get_timespan(promo, t_anchor, 1, 1).values.ravel().astype(int),
#         "promo_7_2017": get_timespan(promo, t_anchor, 7, 7).sum(axis=1).values,
#         "promo_14_2017": get_timespan(promo, t_anchor, 14, 14).sum(axis=1).values,
#         "promo_60_2017": get_timespan(promo, t_anchor, 60, 60).sum(axis=1).values,
#         "promo_140_2017": get_timespan(promo, t_anchor, 140, 140).sum(axis=1).values
#     })
#     for i in range(7):
#         X['mean_4_dow{}_2017'.format(i)] = get_timespan(data, t_anchor, 28-i, 4, freq='7D').mean(axis=1).values
#         X['mean_20_dow{}_2017'.format(i)] = get_timespan(data, t_anchor, 140-i, 20, freq='7D').mean(axis=1).values
#     for i in range(16):
#         X["promo_{}".format(i)] = promo[t_anchor + timedelta(days=i)].values.astype(np.uint8)
#     if is_train:
#         y = data[pd.date_range(t_anchor, periods=16)].values
#         return X, y
#     return X

In [26]:
t2017 = date(2016, 6, 15)
# t2017 = date(2017, 1, 18)
# t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(56):
# for i in range(6):
# for i in range(7):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(df_train, transactions, promo, oil_change, t2017 + delta)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(df_train, transactions,promo, oil_change, date(2017, 7, 26))
X_test = prepare_dataset(df_train, transactions, promo, oil_change, date(2017, 8, 16), is_train=False)

In [27]:
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 600,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 12
}

MAX_ROUNDS = 800
val_pred = []
test_pred = []
feat = X_train.columns.tolist()
cat = ['class','salebin','cluster','stype']

In [28]:
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(X_train, label=y_train[:, i], feature_name=feat, categorical_feature=cat,
#                         weight=pd.concat([items["perishable"]] * 50) * 0.25 + 1)
                         weight=pd.concat([items["perishable"]] * 56) * 0.25 + 1)
#                          weight=pd.concat([items["perishable"]] * 20) * 0.25 + 1)
    dval = lgb.Dataset(X_val, label=y_val[:, i], reference=dtrain, weight=items["perishable"] * 0.25 + 1,
                       feature_name=feat, categorical_feature=cat)
    bst = lgb.train(params, dtrain, num_boost_round=MAX_ROUNDS, valid_sets=[dtrain, dval],
                    early_stopping_rounds=50, verbose_eval=100)
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.285234	valid_1's l2: 0.286474
[200]	training's l2: 0.279353	valid_1's l2: 0.282445
[300]	training's l2: 0.276808	valid_1's l2: 0.280749
[400]	training's l2: 0.275139	valid_1's l2: 0.279552
[500]	training's l2: 0.273949	valid_1's l2: 0.278755
[600]	training's l2: 0.272981	valid_1's l2: 0.278207
[700]	training's l2: 0.272168	valid_1's l2: 0.277901
[800]	training's l2: 0.271509	valid_1's l2: 0.277668
mean_7_2017: 24527563.77
mean_14_2017: 15394584.57
mean_9_2017: 9546290.09
mean_8_2017: 6086992.09
mean_2_2017: 3601931.70
mean_30_2017: 2288848.90
mean_20_dow0_2017: 1783509.65
day_1_2017: 1691935.52
promo_0: 1633451.32
mean_4_dow0_2017: 972123.62
mean_5_2017: 853228.89
class: 655034.93
mean_60_2017: 449920.33
promo_7_2017: 237589.98
mean_3_2017: 183193.19
stype: 155979.82
oilchange_1_2017: 116471.53
cluster: 104617.36
promo_3_2017: 103705.71
promo_7: 103248.93
mean_140_2017: 98910.37
promo_14_2017: 98000.07

Step 4
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.332996	valid_1's l2: 0.351098
[200]	training's l2: 0.324564	valid_1's l2: 0.347225
[300]	training's l2: 0.321161	valid_1's l2: 0.346
[400]	training's l2: 0.318848	valid_1's l2: 0.344884
[500]	training's l2: 0.317071	valid_1's l2: 0.344135
[600]	training's l2: 0.315687	valid_1's l2: 0.343682
[700]	training's l2: 0.314486	valid_1's l2: 0.343378
[800]	training's l2: 0.313405	valid_1's l2: 0.342979
mean_12_2017: 29712805.17
mean_13_2017: 9797187.00
mean_5_2017: 8784380.05
mean_6_2017: 8255622.67
mean_30_2017: 5403908.79
mean_11_2017: 4248249.24
mean_20_dow3_2017: 3066393.63
mean_14_2017: 2901052.24
mean_4_dow3_2017: 1904789.61
promo_3: 1016100.91
mean_4_2017: 1012944.08
mean_60_2017: 771168.62
class: 737727.72
oilchange_14_2017: 229783.29
promo_7_2017: 196250.60
mean_7_2017: 148262.95
cluster: 129253.14
oilchange_7_2017: 125803.42
promo_14_2017: 120704.06
oilchange_140_2017: 119813.19
mean_2_2017: 1

Step 7
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.332283	valid_1's l2: 0.404417
[200]	training's l2: 0.324316	valid_1's l2: 0.402356
[300]	training's l2: 0.321067	valid_1's l2: 0.401128
[400]	training's l2: 0.318888	valid_1's l2: 0.400882
[500]	training's l2: 0.317314	valid_1's l2: 0.400254
Early stopping, best iteration is:
[514]	training's l2: 0.317135	valid_1's l2: 0.400175
mean_30_2017: 15690354.41
mean_10_2017: 13132970.93
mean_9_2017: 11091343.17
mean_14_2017: 8586826.40
mean_20_dow6_2017: 4149712.12
mean_11_2017: 2130862.28
mean_2_2017: 1804125.98
promo_6: 1734743.24
mean_60_2017: 1000852.71
mean_4_dow6_2017: 928636.55
mean_13_2017: 666515.68
class: 666065.46
mean_8_2017: 233373.92
day_1_2017: 226488.11
mean_3_2017: 221282.89
mean_4_2017: 203157.26
promo_7_2017: 184896.53
oilchange_140_2017: 177272.52
mean_140_2017: 161485.90
promo_7: 149527.85
cluster: 115867.19
oilchange_14_2017: 111096.12
promo_14_2017: 102537.68
mean_20_dow5_2017: 94

Step 10
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.339164	valid_1's l2: 0.357852
[200]	training's l2: 0.329962	valid_1's l2: 0.354638
[300]	training's l2: 0.326358	valid_1's l2: 0.353362
[400]	training's l2: 0.323855	valid_1's l2: 0.352677
[500]	training's l2: 0.322117	valid_1's l2: 0.35221
Early stopping, best iteration is:
[462]	training's l2: 0.322742	valid_1's l2: 0.352117
mean_14_2017: 16207643.28
mean_30_2017: 13553685.33
mean_20_dow2_2017: 9579571.20
mean_7_2017: 8524240.83
mean_13_2017: 7329524.72
promo_9: 2084129.36
mean_6_2017: 2069657.01
mean_12_2017: 1581285.49
mean_4_dow2_2017: 1168395.59
class: 1151334.36
mean_5_2017: 455748.60
mean_60_2017: 413923.76
mean_8_2017: 339335.82
oilchange_140_2017: 207310.94
cluster: 206410.17
day_1_2017: 179203.40
promo_14_2017: 153162.66
promo_7_2017: 149846.22
mean_2_2017: 144875.47
promo_10: 122656.83
oilchange_7_2017: 114829.12
promo_7: 102944.33
promo_2: 91051.16
promo_14: 86380.76
oilchange_1_20

Step 13
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.361638	valid_1's l2: 0.369322
[200]	training's l2: 0.35155	valid_1's l2: 0.366305
[300]	training's l2: 0.347548	valid_1's l2: 0.365482
[400]	training's l2: 0.344975	valid_1's l2: 0.364925
[500]	training's l2: 0.34306	valid_1's l2: 0.364321
[600]	training's l2: 0.341557	valid_1's l2: 0.363898
[700]	training's l2: 0.340304	valid_1's l2: 0.363634
[800]	training's l2: 0.339267	valid_1's l2: 0.363455
mean_30_2017: 23301242.74
mean_10_2017: 12484103.66
mean_14_2017: 5084401.36
mean_11_2017: 4486925.40
mean_60_2017: 4306673.17
mean_20_dow5_2017: 3979912.58
mean_9_2017: 2769351.14
promo_12: 1548140.60
mean_3_2017: 1454864.44
mean_4_2017: 1193019.57
class: 920644.67
mean_13_2017: 572507.24
mean_5_2017: 532078.18
mean_6_2017: 508344.39
mean_2_2017: 417072.50
mean_140_2017: 364705.33
oilchange_140_2017: 356735.41
mean_4_dow5_2017: 225371.39
promo_14: 186059.69
cluster: 157884.22
promo_13: 119772.74
promo_

Step 16
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.353665	valid_1's l2: 0.365792
[200]	training's l2: 0.344745	valid_1's l2: 0.36291
[300]	training's l2: 0.341106	valid_1's l2: 0.361657
[400]	training's l2: 0.338761	valid_1's l2: 0.3609
[500]	training's l2: 0.337038	valid_1's l2: 0.36044
[600]	training's l2: 0.335637	valid_1's l2: 0.360132
[700]	training's l2: 0.334439	valid_1's l2: 0.359833
[800]	training's l2: 0.333377	valid_1's l2: 0.35948
mean_30_2017: 20269434.05
mean_14_2017: 12225054.95
mean_60_2017: 5116233.92
mean_13_2017: 4725181.83
mean_20_dow1_2017: 3783055.80
mean_6_2017: 1967176.51
promo_15: 1930569.13
mean_7_2017: 1447156.78
class: 1019353.37
mean_8_2017: 681388.09
mean_140_2017: 361615.72
mean_20_dow2_2017: 343484.37
mean_5_2017: 252417.63
mean_2_2017: 222785.04
day_1_2017: 192113.07
oilchange_14_2017: 185714.22
promo_14: 180653.75
oilchange_140_2017: 174815.77
cluster: 166615.99
promo_14_2017: 145288.35
mean_9_2017: 135155.68
o

In [31]:
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose(), sample_weight = items["perishable"] * 0.25 + 1))

Validation mse: 0.346519566126


In [32]:
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Validation mse: 0.347130123169


In [33]:
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose())**0.5)

Validation mse: 0.589177497168


In [None]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(y_test, index=df_train.index, columns=pd.date_range("2017-08-16", periods=16)
                       ).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

In [None]:
submission = df_test.set_index(['store_nbr', 'item_nbr', 'date'])[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1500)
submission.to_csv('large.csv', float_format='%.4f', index=None)

In [None]:
large = pd.read_csv('large.csv')
hetty = pd.read_csv('try2.csv')
large['unit_sales'] = large['unit_sales'] * .5 + hetty['unit_sales'] * .5
large.to_csv('final.csv', float_format='%.4f', index=None)