# Abstract

This is a clone of the script at https://www.kaggle.com/ceshine/lgbm-starter which is intended to give an idea of how to structure the data for trainig

# Prelude 

## Configuration

In [125]:
DataSetPath = "/home/bryanfeeney/Workspace/OttomanDiviner/favorita/"

StoresPath   = DataSetPath + "stores.csv.gz"
ItemsPath    = DataSetPath + "items.csv.gz"
OilPricePath = DataSetPath + "oil.csv.gz"
HolidaysPath = DataSetPath + "holidays_events.csv.gz"
Transactions = DataSetPath + "transactions.csv.gz"
TrainData    = DataSetPath + "train-2017.csv.gz"
TestData     = DataSetPath + "test.csv.gz"

FutureDaysToCalculate=16
WeeksOfHistoryForFeature=8
WeeksOfHistoryForFeatureOnValidation=3

## Imports

In [126]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Intro to the Data

In [127]:
cumul_sales = pd.read_csv(
    TrainData, 
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    compression='gzip'
)

cumul_sales_query = pd.read_csv(
    TestData,
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],  # , date_parser=parser
    compression='gzip'
)

In [128]:
query_start_date

'2017-08-16'

In [129]:
cumul_sales_query = cumul_sales_query.set_index(
    ['store_nbr', 'item_nbr', 'date']
)

In [130]:
cumul_sales.iloc[-1,:]

date           2017-08-15 00:00:00
store_nbr                       54
item_nbr                   2116416
unit_sales                 1.09861
onpromotion                  False
Name: 23808259, dtype: object

In [131]:
cumul_sales_query.iloc[0,:]

id             125497040
onpromotion        False
Name: (1, 96995, 2017-08-16 00:00:00), dtype: object

In [132]:
items = pd.read_csv(
    ItemsPath,
).set_index("item_nbr")

stores = pd.read_csv(
    StoresPath
).set_index("store_nbr")

In [133]:
cumul_sales_query

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,False
1,99197,2017-08-16,125497041,False
1,103501,2017-08-16,125497042,False
1,103520,2017-08-16,125497043,False
1,103665,2017-08-16,125497044,False
1,105574,2017-08-16,125497045,False
1,105575,2017-08-16,125497046,False
1,105576,2017-08-16,125497047,False
1,105577,2017-08-16,125497048,False
1,105693,2017-08-16,125497049,False


In [134]:
cumul_sales.shape

(23808260, 5)

In [135]:
cumul_sales_query.shape

(3370464, 2)

In [136]:
items.shape

(4100, 3)

## Select only Last Three Months

This is a peculiar one, and it **games the benchmark** in a not great way. Essentially it uses the last 11 weeks of data before the prediction threshold to predict what's happening next

In [137]:
now = date(2017, 8, 15)

# How far back to go to start generating trend features for demand
data_start             = now - timedelta(7*11) + timedelta(1)
training_history_start = now - timedelta(7*WeeksOfHistoryForFeature) + timedelta(1)
validation_start       = now - timedelta(7*WeeksOfHistoryForFeatureOnValidation) + timedelta(1)



In [138]:
data_start, training_history_start, query_start

(datetime.date(2017, 5, 31),
 datetime.date(2017, 6, 21),
 datetime.date(2017, 7, 26))

In [139]:
cumul_sales = cumul_sales[cumul_sales.date.isin(
    pd.date_range(data_start, periods=7 * 11))].copy()


In [140]:
cumul_sales.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
15682590,2017-05-31,1,96995,0.693147,False
15682591,2017-05-31,1,99197,0.693147,False
15682592,2017-05-31,1,103520,1.386294,False
15682593,2017-05-31,1,103665,2.197225,False
15682594,2017-05-31,1,105574,1.386294,False


In [141]:
cumul_sales.shape

(8125670, 5)

In [142]:
cumul_sales.iloc[-1,:]

date           2017-08-15 00:00:00
store_nbr                       54
item_nbr                   2116416
unit_sales                 1.09861
onpromotion                  False
Name: 23808259, dtype: object

## Creating Promotion Variables

So this is a tricky. If one presumes that on-promotion will lead to a boost in demand, if if we presume we'll know *whats on promotion in advance*, then we can create variables to say that this product will be on promotion 1, 2, 3, ... 16 days from now (16 days in the future is the target)

In this case, this is also peculiar, there is a column for every single day!

In [143]:
promo_variables = cumul_sales.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]]

In [144]:
promo_variables.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1
1,96995,2017-05-31,False
1,99197,2017-05-31,False
1,103520,2017-05-31,False
1,103665,2017-05-31,False
1,105574,2017-05-31,False


In [145]:
promo_variables = cumul_sales.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)



In [146]:
promo_variables.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
Unnamed: 0_level_1,date,2017-05-31,2017-06-01,2017-06-02,2017-06-03,2017-06-04,2017-06-05,2017-06-06,2017-06-07,2017-06-08,2017-06-09,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [147]:
promo_variables.columns = promo_variables.columns.get_level_values(1)

promo_variables_query = cumul_sales_query[["onpromotion"]].unstack(level=-1).fillna(False)
promo_variables_query.columns = promo_variables_query.columns.get_level_values(1)
promo_variables_query = promo_variables_query.reindex(promo_variables.index).fillna(False)

promo_variables_train_and_query = pd.concat([promo_variables, promo_variables_query], axis=1)


In [148]:
promo_variables.shape, items.shape[0] * stores.shape[0]

((156790, 77), 221400)

In [149]:
cumul_sales.shape, cumul_sales_query.shape

((8125670, 5), (3370464, 2))

#  Unstack unit sales - do it across all days in a sliding window

Ah... they're creating a multi-task learning problem

In [150]:
cumul_sales = cumul_sales.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
cumul_sales.columns = cumul_sales.columns.get_level_values(1)
cumul_sales.shape

(156790, 77)

In [151]:
cumul_sales.head()

Unnamed: 0_level_0,date,2017-05-31 00:00:00,2017-06-01 00:00:00,2017-06-02 00:00:00,2017-06-03 00:00:00,2017-06-04 00:00:00,2017-06-05 00:00:00,2017-06-06 00:00:00,2017-06-07 00:00:00,2017-06-08 00:00:00,2017-06-09 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.693147,1.386294,1.098612,1.94591,1.098612,1.098612,0.0,0.0,0.693147,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,1.386294,1.098612,1.098612,0.693147,0.0,0.693147,1.609438,0.693147,0.693147,1.098612,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,2.197225,0.0,1.791759,1.791759,1.098612,1.386294,1.791759,1.386294,0.0,1.098612,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,1.386294,2.484907,1.791759,1.386294,1.386294,1.386294,2.079442,2.397895,1.94591,2.079442,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


## Make items match other data frames

They're sacraficing generability

In [152]:
items = items.reindex(cumul_sales.index.get_level_values(1))
items.head()

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1
105574,GROCERY I,1045,0


In [153]:
items.shape

(156790, 3)

## Time futzing

In [154]:
# Return that portion of the data frame that corresponds to the time period
#   beginning "minus" days before "dt" and extending for "periods" days
def get_timespan(df, dt, minus, periods):
    return df[
        pd.date_range(dt - timedelta(days=minus), periods=periods)
    ]

In [155]:
def prepare_dataset(cumul_sales, promo_variables_train_and_query, start_date, is_train=True):
    X = pd.DataFrame({  # Mean target for different retrospective timespans & total # promotions
        "mean_3_2017": get_timespan(cumul_sales, start_date, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(cumul_sales, start_date, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(cumul_sales, start_date, 14, 14).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo_variables_train_and_query, start_date, 14, 14).sum(axis=1).values
    })
    for i in range(16):  # Promotions on future days
        X["promo_{}".format(i)] = promo_variables_train_and_query[
            start_date + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = cumul_sales[  # Target values for future days
            pd.date_range(start_date, periods=16)
        ].values
        return X, y
    return X

In [156]:
print("Preparing dataset...")
X_l, y_l = [], []
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(cumul_sales, promo_variables_train_and_query, training_history_start + delta)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

X_validate, y_validate = prepare_dataset(cumul_sales, promo_variables_train_and_query, validation_start)

X_query = prepare_dataset(cumul_sales, promo_variables_train_and_query, now, is_train=False)

Preparing dataset...


In [157]:
X_train.shape, X_validate.shape, X_query.shape

((627160, 20), (156790, 20), (156790, 20))

This dataset is **super gamey**. They're using the means for the week, fortnight, and last three days, and then seeing how to permute it to generate values for the following window of time. It's hardcoded to product IDs, not categories.

It does however, permit multi-task learning, and therefore better representation learning

It does not incorporate any information about seasonality at all, and so would fall arse over face at Christmas



In [158]:
print("Training and predicting models...")
params = {
    'num_leaves': 2**5 - 1,
    'objective': 'regression_l2',
    'max_depth': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 4
}

Training and predicting models...


In [159]:
MAX_ROUNDS = 1000
validate_pred = []
query_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 4) * 0.25 + 1
    )
    
    dvalidate = lgb.Dataset(
        X_validate, label=y_validate[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dvalidate], early_stopping_rounds=50, verbose_eval=50
    )
    
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    
    
    validate_pred.append(bst.predict(
        X_validate, num_iteration=bst.best_iteration or MAX_ROUNDS))
    
    query_pred.append(bst.predict(
        X_query, num_iteration=bst.best_iteration or MAX_ROUNDS))

Step 1




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.345147	valid_1's l2: 0.341487
[100]	training's l2: 0.333379	valid_1's l2: 0.330174
[150]	training's l2: 0.331405	valid_1's l2: 0.328823
[200]	training's l2: 0.330344	valid_1's l2: 0.328317
[250]	training's l2: 0.329476	valid_1's l2: 0.327889
[300]	training's l2: 0.328793	valid_1's l2: 0.327649
[350]	training's l2: 0.328187	valid_1's l2: 0.327459
[400]	training's l2: 0.327652	valid_1's l2: 0.327329
[450]	training's l2: 0.327151	valid_1's l2: 0.327218
[500]	training's l2: 0.326681	valid_1's l2: 0.327129
[550]	training's l2: 0.326264	valid_1's l2: 0.327102
[600]	training's l2: 0.325878	valid_1's l2: 0.327031
[650]	training's l2: 0.325453	valid_1's l2: 0.326991
[700]	training's l2: 0.325065	valid_1's l2: 0.326944
[750]	training's l2: 0.324698	valid_1's l2: 0.326981
Early stopping, best iteration is:
[712]	training's l2: 0.324969	valid_1's l2: 0.326938
mean_14_2017: 1984359.03
mean_7_2017: 1460047.19
mean_3_

Step 7
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.399249	valid_1's l2: 0.499368
[100]	training's l2: 0.387491	valid_1's l2: 0.484385
[150]	training's l2: 0.384487	valid_1's l2: 0.483271
[200]	training's l2: 0.382909	valid_1's l2: 0.482962
[250]	training's l2: 0.381732	valid_1's l2: 0.482566
[300]	training's l2: 0.380894	valid_1's l2: 0.482571
Early stopping, best iteration is:
[265]	training's l2: 0.381465	valid_1's l2: 0.482494
mean_14_2017: 2176120.92
mean_7_2017: 788873.90
mean_3_2017: 309822.71
promo_6: 154213.49
promo_14_2017: 50427.60
promo_3: 14230.01
promo_7: 10208.19
promo_13: 8896.97
promo_5: 7975.26
promo_0: 4785.81
promo_1: 4559.44
promo_4: 4185.97
promo_9: 3828.04
promo_2: 3637.22
promo_14: 3187.51
promo_8: 2329.80
promo_11: 1460.21
promo_15: 1382.96
promo_12: 1252.57
promo_10: 1153.89
Step 8
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.387984	valid_1's l2: 0.460446
[100]	training's l2: 0.37561

Step 15
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.410998	valid_1's l2: 0.410436
[100]	training's l2: 0.397157	valid_1's l2: 0.398619
[150]	training's l2: 0.394315	valid_1's l2: 0.397761
[200]	training's l2: 0.392755	valid_1's l2: 0.397444
[250]	training's l2: 0.391625	valid_1's l2: 0.397205
[300]	training's l2: 0.390647	valid_1's l2: 0.397032
[350]	training's l2: 0.389848	valid_1's l2: 0.396838
[400]	training's l2: 0.38916	valid_1's l2: 0.396693
[450]	training's l2: 0.388521	valid_1's l2: 0.396705
Early stopping, best iteration is:
[418]	training's l2: 0.388918	valid_1's l2: 0.396637
mean_14_2017: 2326294.16
mean_7_2017: 697294.28
mean_3_2017: 242807.08
promo_14: 232132.33
promo_14_2017: 49620.78
promo_7: 31713.90
promo_0: 29744.91
promo_15: 19946.17
promo_13: 11068.67
promo_9: 9069.83
promo_12: 8493.43
promo_10: 6377.01
promo_2: 4226.62
promo_6: 4122.19
promo_8: 3039.53
promo_11: 3002.61
promo_4: 2528.08
promo_1: 1724.67
promo_3: 1156.68
promo

In [162]:
print("Validation mse:", np.sqrt(mean_squared_error(
    np.expm1(y_validate), np.expm1(np.array(val_pred)).transpose())))

ValueError: Found input variables with inconsistent numbers of samples: [156790, 0]

In [None]:
validate_pred

In [173]:
y_query.shape, y_query

((156790, 16),
 array([[0.24158492, 0.24897964, 0.27285152, ..., 0.29180975, 0.30408553,
         0.27975772],
        [0.20325735, 0.19115871, 0.2179842 , ..., 0.2247866 , 0.21264327,
         0.20899921],
        [0.60762603, 0.5517181 , 0.62237221, ..., 0.6044998 , 0.60427741,
         0.56521111],
        ...,
        [0.2351554 , 0.21443665, 0.25158354, ..., 0.23555907, 0.22213154,
         0.21211291],
        [1.69932833, 1.56939365, 1.70439771, ..., 1.75361082, 1.68996464,
         1.71933721],
        [0.42327216, 0.39768466, 0.37982758, ..., 0.4117993 , 0.37641541,
         0.36814008]]))

In [174]:
print("Making submission...")
y_query = np.array(query_pred).transpose()
df_preds = pd.DataFrame(
    y_query, index=cumul_sales.index,
    columns=pd.date_range(query_start_date, periods=16)
).stack().to_frame("unit_sales")
df_preds.to_csv("/tmp/df_preds.csv")


Making submission...


In [176]:
df_preds.to_csv("/tmp/df_preds.csv")

In [168]:
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

In [169]:
df_preds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
store_nbr,item_nbr,date,Unnamed: 3_level_1
1,96995,2017-08-16,0.241585
1,96995,2017-08-17,0.248980
1,96995,2017-08-18,0.272852
1,96995,2017-08-19,0.400383
1,96995,2017-08-20,0.399940
1,96995,2017-08-21,0.315500
1,96995,2017-08-22,0.273534
1,96995,2017-08-23,0.298480
1,96995,2017-08-24,0.273323
1,96995,2017-08-25,0.282542


In [83]:

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)

NameError: name 'df_test' is not defined

In [61]:
submission

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,unit_sales
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,0.273266
1,99197,2017-08-16,125497041,0.225388
1,103501,2017-08-16,125497042,0.000000
1,103520,2017-08-16,125497043,0.648250
1,103665,2017-08-16,125497044,1.352113
1,105574,2017-08-16,125497045,3.222305
1,105575,2017-08-16,125497046,7.469437
1,105576,2017-08-16,125497047,0.000000
1,105577,2017-08-16,125497048,0.294655
1,105693,2017-08-16,125497049,0.319934


# Further Improvements

This is based on the work in this file: https://www.kaggle.com/vrtjso/lgbm-one-step-ahead

This was apparently in the top 10% at one point.

In [76]:
df_train = pd.read_csv(
    TrainData, usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)

df_test = pd.read_csv(
    TestData, usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

items = pd.read_csv(
    ItemsPath,
).set_index("item_nbr")

df_2017 = df_train.loc[df_train.date>=pd.datetime(2017,1,1)]
del df_train

promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

items = items.reindex(df_2017.index.get_level_values(1))

def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

print("Preparing dataset...")
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

print("Training and predicting models...")
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb.csv', float_format='%.4f', index=None)


Preparing dataset...
Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.30191	valid_1's l2: 0.29409
[200]	training's l2: 0.298363	valid_1's l2: 0.292741
[300]	training's l2: 0.295918	valid_1's l2: 0.292337
[400]	training's l2: 0.293791	valid_1's l2: 0.2921
[500]	training's l2: 0.29199	valid_1's l2: 0.29195
Did not meet early stopping. Best iteration is:
[500]	training's l2: 0.29199	valid_1's l2: 0.29195
mean_7_2017: 1882639.38
mean_14_2017: 1229821.77
promo_0: 104143.51
day_1_2017: 89857.46
mean_20_dow0_2017: 84245.48
mean_3_2017: 76646.29
mean_30_2017: 76583.57
mean_4_dow0_2017: 58919.38
mean_60_2017: 33035.18
promo_14_2017: 28619.72
promo_7: 9432.45
mean_4_dow5_2017: 7417.05
mean_140_2017: 7406.32
promo_60_2017: 6740.72
mean_20_dow4_2017: 5611.55
promo_140_2017: 5493.72
mean_4_dow6_2017: 4633.44
mean_4_dow2_2017: 3813.74
mean_20_dow2_2017: 3343.78
mean_4_dow3_2017: 2824.66
promo_9: 2814.25
mean_4_dow1_2017: 2707.00
mean_20_dow3_2017: 2642.58
mean_20_dow1_2017: 2616.99
me

Step 7
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.346219	valid_1's l2: 0.421254
[200]	training's l2: 0.341874	valid_1's l2: 0.420923
[300]	training's l2: 0.339035	valid_1's l2: 0.420617
[400]	training's l2: 0.336652	valid_1's l2: 0.420437
Early stopping, best iteration is:
[393]	training's l2: 0.336799	valid_1's l2: 0.420378
mean_14_2017: 1274004.42
mean_30_2017: 842088.63
mean_7_2017: 445757.89
mean_20_dow6_2017: 152802.16
mean_3_2017: 145241.24
promo_6: 128192.88
mean_4_dow6_2017: 127503.94
mean_60_2017: 123326.90
promo_14_2017: 21944.55
day_1_2017: 13872.06
promo_3: 11205.46
promo_7: 9013.03
mean_4_dow5_2017: 8533.91
mean_20_dow5_2017: 8280.78
promo_60_2017: 8112.57
promo_140_2017: 6018.04
mean_20_dow1_2017: 4884.13
promo_5: 4487.41
mean_140_2017: 4382.31
promo_13: 3877.01
mean_4_dow0_2017: 3650.61
mean_20_dow0_2017: 3639.82
mean_20_dow3_2017: 3507.37
mean_4_dow1_2017: 3355.21
mean_20_dow4_2017: 2943.12
promo_4: 2784.87
promo_0: 2663.62
mean

[200]	training's l2: 0.365615	valid_1's l2: 0.376333
[300]	training's l2: 0.362327	valid_1's l2: 0.376258
Early stopping, best iteration is:
[338]	training's l2: 0.361236	valid_1's l2: 0.37609
mean_30_2017: 1601076.04
mean_14_2017: 588977.50
mean_60_2017: 374936.35
mean_7_2017: 305452.67
mean_3_2017: 148957.18
promo_12: 93518.03
mean_20_dow5_2017: 85159.77
mean_4_dow5_2017: 71638.34
promo_13: 19595.91
promo_14_2017: 16011.22
promo_14: 13355.62
promo_10: 11265.36
mean_140_2017: 9927.43
promo_60_2017: 8798.33
day_1_2017: 7950.62
mean_20_dow0_2017: 7544.02
promo_140_2017: 6088.78
mean_20_dow6_2017: 5888.75
mean_4_dow6_2017: 4725.31
promo_11: 4479.14
mean_4_dow0_2017: 3756.38
mean_20_dow3_2017: 3664.43
promo_9: 3613.91
mean_20_dow2_2017: 3341.99
mean_4_dow2_2017: 2714.48
mean_20_dow1_2017: 2701.22
mean_4_dow3_2017: 2566.89
mean_20_dow4_2017: 2531.39
promo_15: 2495.30
mean_4_dow1_2017: 2301.79
mean_4_dow4_2017: 2274.90
promo_7: 2189.77
promo_0: 1734.63
promo_8: 1699.81
promo_5: 1614.77
prom

In [81]:
print("Validation mse:", mean_squared_error(
    np.expm1(y_validate), np.expm1(np.array(validate_pred)).transpose()))

Validation mse: 275.7862813287468


In [78]:
np.sqrt(275), np.sqrt(247)

(16.583123951777, 15.716233645501712)