In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# df_train = pd.read_csv('train.csv', dtype={'onpromotion': bool},
#                        converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
#                        parse_dates=["date"], skiprows=range(1, 66458909))

In [3]:
#df_train = df_train_raw.loc[df_train_raw['date']>=pd.datetime(2016,1,1)]

In [4]:
# train on 2017 data
df_train = pd.read_csv('train.csv', dtype={'onpromotion': bool},
                       converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
                       parse_dates=["date"], skiprows = range(1,101688780))

In [5]:
date_index = pd.date_range(df_train['date'].min(), df_train['date'].max())

In [6]:
df_test = pd.read_csv("test.csv", dtype={'onpromotion': bool}, parse_dates=["date"]).set_index(['store_nbr', 'item_nbr', 'date'])

In [7]:
items = pd.read_csv("items.csv").set_index("item_nbr")

In [8]:
# flatten promo across dates
promo_train = df_train.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(level=-1).fillna(False)
promo_train.columns = promo_train.columns.get_level_values(1)
promo_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_test.columns = promo_test.columns.get_level_values(1)

In [9]:
promo_train = promo_train.reindex(date_index.values, axis = 1).fillna(False)

In [10]:
promo_test = promo_test.reindex(promo_train.index).fillna(False) #align two ds, disregard missing test items 
promo = pd.concat([promo_train, promo_test], axis=1)
del promo_test, promo_train

In [11]:
df_train = df_train.set_index(["store_nbr", "item_nbr", "date"])["unit_sales"].unstack(level=-1).fillna(0)

In [12]:
df_train = df_train.reindex(date_index.values, axis = 1).fillna(0)

In [13]:
items = items.reindex(df_train.index.get_level_values(1)) #delete items not in train, align item index to train index

In [14]:
oil_date_index = pd.date_range(date_index.min(), df_test.reset_index()['date'].max())

In [15]:
oil = pd.read_csv('oil.csv', index_col = 'date', parse_dates = ['date'])
oil = oil.transpose().reindex(oil_date_index.values, axis = 1).fillna(method='bfill', axis = 1)

In [16]:
def get_timespan(df, dt, minus, periods, freq='D'): #slicing data by date, trace back "minus" days
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [17]:
def prepare_dataset(data, promo, oil, t_anchor, is_train=True):
    nsamples = data.shape[0]
    X = pd.DataFrame({
        "day_1_2017": get_timespan(data, t_anchor, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(data, t_anchor, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(data, t_anchor, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(data, t_anchor, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(data, t_anchor, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(data, t_anchor, 60, 60).mean(axis=1).values,
        "mean_140_2017": get_timespan(data, t_anchor, 140, 140).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo, t_anchor, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo, t_anchor, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo, t_anchor, 140, 140).sum(axis=1).values,
        "oil_14_2017": np.tile(get_timespan(oil, t_anchor, 14, 14).mean(axis=1).values,(nsamples)),
        "oil_60_2017": np.tile(get_timespan(oil, t_anchor, 60, 60).mean(axis=1).values,(nsamples)),
        "oil_140_2017": np.tile(get_timespan(oil, t_anchor, 140, 140).mean(axis=1).values,(nsamples))        
    })
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(data, t_anchor, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(data, t_anchor, 140-i, 20, freq='7D').mean(axis=1).values
    for i in range(16):
        X["promo_{}".format(i)] = promo[t_anchor + timedelta(days=i)].values.astype(np.uint8)
        X["oil_{}".format(i)] = np.tile(oil[t_anchor + timedelta(days=i)].values,(nsamples))
    if is_train:
        y = data[pd.date_range(t_anchor, periods=16)].values
        return X, y
    return X

In [18]:
#t2017 = date(2016, 6, 16)
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
#for i in range(50):
for i in range(6):
    print(i)
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(df_train, promo, oil, t2017 + delta)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(df_train,promo, oil, date(2017, 6, 16))
X_test = prepare_dataset(df_train, promo, oil, date(2017, 8, 16), is_train=False)

0
1
2
3
4
5


In [19]:
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 12,
    'train_meric':True
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []

In [20]:
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(X_train, label=y_train[:, i], categorical_feature=cate_vars,
#                         weight=pd.concat([items["perishable"]] * 50) * 0.25 + 1)
                         weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1)
    dval = lgb.Dataset(X_val, label=y_val[:, i], reference=dtrain, weight=items["perishable"] * 0.25 + 1,
                       categorical_feature=cate_vars)
    bst = lgb.train(params, dtrain, num_boost_round=MAX_ROUNDS, valid_sets=[dtrain, dval],
                    early_stopping_rounds=50, verbose_eval=100)
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.300619	valid_1's l2: 0.329976
[200]	training's l2: 0.29676	valid_1's l2: 0.326779
[300]	training's l2: 0.294084	valid_1's l2: 0.326029
[400]	training's l2: 0.29192	valid_1's l2: 0.325921
Early stopping, best iteration is:
[377]	training's l2: 0.292406	valid_1's l2: 0.325797
mean_7_2017: 1525394.68
mean_14_2017: 1190235.08
mean_30_2017: 437676.97
promo_0: 109750.98
mean_3_2017: 107663.86
mean_20_dow0_2017: 89645.78
day_1_2017: 70401.01
mean_4_dow0_2017: 64347.10
mean_60_2017: 33151.40
promo_14_2017: 26253.65
promo_7: 8914.63
mean_4_dow5_2017: 6751.95
promo_60_2017: 6569.21
promo_140_2017: 5539.00
mean_140_2017: 5244.78
mean_20_dow4_2017: 5055.16
mean_4_dow6_2017: 4755.94
mean_20_dow2_2017: 4517.09
mean_4_dow2_2017: 4121.38
oil_7: 3121.15
promo_14: 2848.06
oil_140_2017: 2473.81
mean_20_dow3_2017: 2314.16
mean_4_dow3_2017: 2066.04
promo_9: 2045.19
mean_4_dow1_2017: 1980.56
mean_20_dow6_2017: 1969.87
mean_

Step 7
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[31]	training's l2: 0.356472	valid_1's l2: 0.347065
mean_30_2017: 1148711.59
mean_14_2017: 1076479.71
mean_7_2017: 290762.25
mean_3_2017: 170741.97
mean_20_dow6_2017: 169297.48
mean_60_2017: 128695.37
promo_6: 128289.33
mean_4_dow6_2017: 93439.82
promo_14_2017: 16709.02
promo_3: 10051.68
day_1_2017: 10026.88
oil_2: 5794.87
mean_20_dow5_2017: 5092.97
promo_7: 4644.15
oil_9: 4053.36
promo_60_2017: 3795.56
mean_140_2017: 3424.72
promo_5: 3035.72
mean_4_dow5_2017: 2227.36
promo_140_2017: 1486.01
oil_14_2017: 1322.91
mean_4_dow1_2017: 1298.91
promo_13: 1170.75
promo_9: 819.16
promo_0: 771.12
promo_4: 751.55
mean_20_dow1_2017: 712.90
oil_140_2017: 691.01
promo_14: 584.25
mean_4_dow0_2017: 364.64
promo_2: 329.38
oil_6: 328.83
promo_15: 328.28
mean_20_dow4_2017: 214.73
mean_20_dow3_2017: 206.42
promo_8: 165.45
mean_20_dow0_2017: 145.96
oil_0: 126.01
mean_4_dow2_2017: 82.99
promo_10: 53.51
me

Step 13
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.367607	valid_1's l2: 0.350263
[200]	training's l2: 0.362618	valid_1's l2: 0.348466
[300]	training's l2: 0.359053	valid_1's l2: 0.347847
[400]	training's l2: 0.356189	valid_1's l2: 0.347366
Early stopping, best iteration is:
[436]	training's l2: 0.35526	valid_1's l2: 0.347187
mean_30_2017: 1535001.21
mean_14_2017: 641717.23
mean_60_2017: 442304.49
mean_7_2017: 204274.29
mean_3_2017: 164287.96
promo_12: 108482.38
mean_4_dow5_2017: 105424.36
mean_20_dow5_2017: 84968.05
promo_14_2017: 17211.53
promo_14: 12852.35
promo_10: 10171.46
mean_140_2017: 9988.20
oil_0: 9025.76
promo_13: 9002.85
promo_60_2017: 8506.22
mean_20_dow0_2017: 7948.32
promo_140_2017: 6825.72
mean_20_dow6_2017: 6368.01
day_1_2017: 6184.79
mean_4_dow6_2017: 5007.90
mean_4_dow0_2017: 4536.91
mean_20_dow3_2017: 4325.45
promo_11: 3613.34
mean_20_dow2_2017: 3374.52
mean_4_dow2_2017: 3354.16
mean_20_dow1_2017: 3254.71
promo_9: 3251.57
oil

In [21]:
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Validation mse: 0.36693805534


In [22]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(y_test, index=df_train.index, columns=pd.date_range("2017-08-16", periods=16)
                       ).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

Making submission...


In [42]:
# submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
# submission.to_csv('tryx1.csv', float_format='%.4f', index=None)

In [22]:
# predict unknown items by class average

In [322]:
df_preds2 = df_preds.unstack(-1)
df_preds2.columns = df_preds2.columns.get_level_values(1)
df_preds2.reset_index(inplace = True)

In [323]:
# add class and family columns
df_preds2 = df_preds.unstack(-1)
df_preds2.columns = df_preds2.columns.get_level_values(1)
df_preds2.reset_index(inplace = True)
df_preds2['class'] = df_preds2['item_nbr'].map(items['class'].to_dict())
df_preds2['family'] = df_preds2['item_nbr'].map(items['family'].to_dict())

In [324]:
# calculate mean from existing prediction
class_pred = df_preds2.drop(['item_nbr','family'], axis = 1).groupby(['store_nbr', 'class']).agg('mean')
class_pred = class_pred.stack()
family_pred = df_preds2.drop(['item_nbr','class'], axis = 1).groupby(['store_nbr', 'family']).agg('mean')
family_pred = family_pred.stack()

In [None]:
# calculate mean from existing prediction
class_pred = df_preds2.drop(['item_nbr','family'], axis = 1).groupby(['store_nbr', 'class']).agg('mean')
class_pred = class_pred.stack()
family_pred = df_preds2.drop(['item_nbr','class'], axis = 1).groupby(['store_nbr', 'family']).agg('mean')
family_pred = family_pred.stack()

In [294]:
# items to fill
submission = df_test[["id"]].join(df_preds, how="left")
nullsub = submission[submission['unit_sales'].isnull()].reset_index()

In [296]:
# add tag
items = pd.read_csv("items.csv").set_index("item_nbr")
nullsub['class'] = nullsub['item_nbr'].map(items['class'].to_dict())
nullsub['family'] = nullsub['item_nbr'].map(items['family'].to_dict())

In [297]:
# fill with mean
nullsub.set_index(['store_nbr','family','date'], inplace = True)
nullsub['unit_sales'].update(family_pred)
nullsub.reset_index(inplace = True)
nullsub.set_index(['store_nbr','class','date'], inplace = True)
nullsub['unit_sales'].update(class_pred)
nullsub = nullsub.reset_index().drop(['class','family'],axis = 1).set_index(['store_nbr','item_nbr','date'])

In [305]:
# final submission
submission.fillna(nullsub,inplace = True)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('try13.csv', float_format='%.4f')