In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
# df_train = pd.read_csv('train.csv', dtype={'onpromotion': bool},
#                        converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
#                        parse_dates=["date"], skiprows=range(1, 66458909))

In [3]:
#df_train = df_train_raw.loc[df_train_raw['date']>=pd.datetime(2016,1,1)]

In [4]:
# train on 2017 data
df_train = pd.read_csv('train.csv', dtype={'onpromotion': bool},
                       converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
                       parse_dates=["date"], skiprows = range(1,101688780))

In [5]:
date_index = pd.date_range(df_train['date'].min(), df_train['date'].max())

In [6]:
df_test = pd.read_csv("test.csv", dtype={'onpromotion': bool}, parse_dates=["date"]).set_index(['store_nbr', 'item_nbr', 'date'])

In [7]:
items = pd.read_csv("items.csv").set_index("item_nbr")

In [8]:
# flatten promo across dates
promo_train = df_train.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(level=-1).fillna(False)
promo_train.columns = promo_train.columns.get_level_values(1)
promo_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_test.columns = promo_test.columns.get_level_values(1)

In [9]:
promo_train = promo_train.reindex(date_index.values, axis = 1).fillna(False)

In [10]:
promo_test = promo_test.reindex(promo_train.index).fillna(False) #align two ds, disregard missing test items 
promo = pd.concat([promo_train, promo_test], axis=1)
del promo_test, promo_train

In [11]:
df_train = df_train.set_index(["store_nbr", "item_nbr", "date"])["unit_sales"].unstack(level=-1).fillna(0)

In [12]:
df_train = df_train.reindex(date_index.values, axis = 1).fillna(0)

In [13]:
items = items.reindex(df_train.index.get_level_values(1)) #delete items not in train, align item index to train index

In [14]:
def get_timespan(df, dt, minus, periods, freq='D'): #slicing data by date, trace back "minus" days
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [15]:
def prepare_dataset(data, promo, t_anchor, is_train=True):
    X = pd.DataFrame({
        "day_1_2017": get_timespan(data, t_anchor, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(data, t_anchor, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(data, t_anchor, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(data, t_anchor, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(data, t_anchor, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(data, t_anchor, 60, 60).mean(axis=1).values,
        "mean_140_2017": get_timespan(data, t_anchor, 140, 140).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo, t_anchor, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo, t_anchor, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo, t_anchor, 140, 140).sum(axis=1).values
#         "oil_14_2017": get_timespan(oil, t_anchor, 14, 14).mean(axis=1).values,
#         "oil_60_2017": get_timespan(oil, t_anchor, 60, 60).mean(axis=1).values,
#         "oil_140_2017": get_timespan(oil, t_anchor, 140, 140).mean(axis=1).values        
    })
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(data, t_anchor, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(data, t_anchor, 140-i, 20, freq='7D').mean(axis=1).values
    for i in range(16):
        X["promo_{}".format(i)] = promo[t_anchor + timedelta(days=i)].values.astype(np.uint8)
#         X["oil_{}".format(i)] = oil[t_anchor + timedelta(days=i)].values
    if is_train:
        y = data[pd.date_range(t_anchor, periods=16)].values
        return X, y
    return X

In [16]:
#t2017 = date(2016, 6, 16)
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
#for i in range(50):
for i in range(6):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(df_train, promo, t2017 + delta)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(df_train,promo, date(2017, 6, 16))
X_test = prepare_dataset(df_train, promo, date(2017, 8, 16), is_train=False)

In [21]:
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2_root',
    'num_threads': 12,
    'train_meric':True
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []

In [22]:
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(X_train, label=y_train[:, i], categorical_feature=cate_vars,
#                         weight=pd.concat([items["perishable"]] * 50) * 0.25 + 1)
                         weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1)
    dval = lgb.Dataset(X_val, label=y_val[:, i], reference=dtrain, weight=items["perishable"] * 0.25 + 1,
                       categorical_feature=cate_vars)
    bst = lgb.train(params, dtrain, num_boost_round=MAX_ROUNDS, valid_sets=[dtrain, dval],
                    early_stopping_rounds=50, verbose_eval=100)
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.549316	valid_1's rmse: 0.571352
[200]	training's rmse: 0.546129	valid_1's rmse: 0.569432
[300]	training's rmse: 0.543895	valid_1's rmse: 0.568983
Early stopping, best iteration is:
[320]	training's rmse: 0.543518	valid_1's rmse: 0.568938
mean_7_2017: 1903546.14
mean_14_2017: 1207198.66
promo_0: 104078.13
day_1_2017: 89370.55
mean_20_dow0_2017: 86659.22
mean_30_2017: 79924.94
mean_3_2017: 71306.47
mean_4_dow0_2017: 58256.87
promo_14_2017: 28477.77
mean_60_2017: 25679.34
promo_7: 8518.08
mean_140_2017: 7619.70
promo_60_2017: 6142.98
mean_4_dow5_2017: 5841.40
promo_140_2017: 5243.72
mean_20_dow4_2017: 4785.79
mean_4_dow6_2017: 4260.24
mean_20_dow2_2017: 3420.60
mean_4_dow2_2017: 3141.94
promo_9: 2849.14
promo_14: 2308.76
mean_20_dow3_2017: 2057.76
mean_4_dow1_2017: 1955.94
mean_20_dow1_2017: 1878.97
mean_4_dow3_2017: 1773.88
mean_20_dow6_2017: 1709.04
mean_4_dow4_2017: 1708.13
promo_15: 1385.90
mean_20_

Step 8
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.576755	valid_1's rmse: 0.592306
[200]	training's rmse: 0.573131	valid_1's rmse: 0.591929
Early stopping, best iteration is:
[168]	training's rmse: 0.574022	valid_1's rmse: 0.591541
mean_30_2017: 1109989.43
mean_14_2017: 1071423.47
mean_7_2017: 629795.22
promo_7: 180545.74
mean_20_dow0_2017: 150973.86
mean_60_2017: 150601.54
mean_4_dow0_2017: 79970.25
promo_0: 22557.85
mean_3_2017: 20700.54
day_1_2017: 16245.19
promo_14_2017: 13247.19
promo_60_2017: 11974.65
promo_14: 9229.45
promo_140_2017: 7837.04
promo_3: 5793.06
mean_140_2017: 5438.36
mean_20_dow2_2017: 4656.11
mean_20_dow4_2017: 4629.65
promo_5: 3654.87
mean_20_dow1_2017: 2917.56
promo_6: 2863.47
mean_4_dow5_2017: 2046.51
promo_9: 1904.09
mean_20_dow3_2017: 1865.66
mean_4_dow6_2017: 1848.67
promo_4: 1778.66
promo_2: 1725.99
mean_4_dow2_2017: 1549.78
mean_20_dow6_2017: 1449.08
promo_15: 1440.22
mean_4_dow3_2017: 1331.96
mean_4_dow1_2017: 13

Step 15
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.588958	valid_1's rmse: 0.60106
[200]	training's rmse: 0.585267	valid_1's rmse: 0.600251
[300]	training's rmse: 0.582426	valid_1's rmse: 0.600241
Early stopping, best iteration is:
[278]	training's rmse: 0.582937	valid_1's rmse: 0.600141
mean_30_2017: 1547763.74
mean_14_2017: 589243.44
mean_7_2017: 405147.83
mean_20_dow0_2017: 291282.75
promo_14: 195973.82
mean_60_2017: 171542.12
mean_4_dow0_2017: 69227.48
promo_7: 18019.69
promo_14_2017: 16765.04
day_1_2017: 15568.24
mean_3_2017: 12462.40
promo_0: 12404.69
promo_140_2017: 11337.14
promo_60_2017: 10862.15
mean_20_dow2_2017: 9055.40
promo_13: 8862.57
mean_140_2017: 7233.21
promo_12: 6307.16
mean_20_dow4_2017: 5291.20
mean_20_dow1_2017: 4524.15
promo_10: 4314.65
promo_9: 4022.94
promo_15: 3970.95
mean_4_dow6_2017: 2929.14
mean_4_dow2_2017: 2692.33
mean_20_dow6_2017: 2595.21
mean_4_dow1_2017: 2445.81
mean_4_dow5_2017: 2348.90
mean_20_dow3_2017: 2

In [23]:
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Validation mse: 0.366799456115


In [24]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(y_test, index=df_train.index, columns=pd.date_range("2017-08-16", periods=16)
                       ).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

Making submission...


In [25]:
submission = df_test[["id"]].join(df_preds, how="left")#.fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('default_nofill.csv', float_format='%.4f', index=None)

In [22]:
# predict unknown items by class average

In [26]:
# add class
df_preds2 = df_preds.unstack(-1)
item_idx = df_preds2.index.get_level_values(1)
df_preds2['class'] = items.reindex(item_idx)['class'].values
df_preds2['family'] = items.reindex(item_idx)['family'].values

In [28]:
class_pred = df_preds2.reset_index().drop(['item_nbr','family'], axis = 1).groupby(['store_nbr', 'class']).agg('mean')
class_pred = class_pred.stack()

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [59]:
# add family
family_pred = df_preds2.reset_index().drop(['item_nbr','class'], axis = 1).groupby(['store_nbr', 'family']).agg('mean')
family_pred = family_pred.stack()

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [30]:
submission = df_test[["id"]].join(df_preds, how="left")

In [31]:
item_idx = submission.index.get_level_values(1)

In [32]:
items = pd.read_csv("items.csv").set_index("item_nbr")

In [63]:
submission['class'] = items.reindex(item_idx)['class'].values
submission['family'] = items.reindex(item_idx)['family'].values

In [34]:
class_pred.rename(columns = {'unit_sales': 'class_sales'}, inplace = True)
submission = submission.reset_index().set_index(['store_nbr','class','date']).merge(class_pred, left_index = True,
                                                                      right_index = True, how = 'left')
submission['unit_sales'].fillna(submission['class_sales'], inplace = True)
submission.drop('class_sales', axis = 1, inplace = True)

In [65]:
family_pred.rename(columns = {'unit_sales': 'family_sales'}, inplace = True)
submission = submission.reset_index().set_index(['store_nbr','family','date']).merge(family_pred, left_index = True,
                                                                      right_index = True, how = 'left')
submission['unit_sales'].fillna(submission['family_sales'], inplace = True)
submission.drop('family_sales', axis = 1, inplace = True)

In [66]:
submission = submission.reset_index().drop(['family','class','store_nbr','item_nbr','date'], axis = 1).set_index(['id']).sort_index().fillna(0)

In [37]:
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('default+meanclass.csv', float_format='%.4f')