# Abstract

This is a clone of the script at https://www.kaggle.com/ceshine/lgbm-starter which is intended to give an idea of how to structure the data for trainig

# Prelude 

## Configuration

In [1]:
DataSetPath = "/home/bryanfeeney/Workspace/OttomanDiviner/favorita/"

StoresPath   = DataSetPath + "stores.csv.gz"
ItemsPath    = DataSetPath + "items.csv.gz"
OilPricePath = DataSetPath + "oil.csv.gz"
HolidaysPath = DataSetPath + "holidays_events.csv.gz"
Transactions = DataSetPath + "transactions.csv.gz"
# TrainData    = DataSetPath + "train-2017.csv.gz"
# TestData     = DataSetPath + "test.csv.gz"
TrainData    = DataSetPath + "train-2018.csv.gz"
TestData     = DataSetPath + "query-2018.csv"

FutureDaysToCalculate=16
WeeksOfHistoryForFeature=8
WeeksOfHistoryForFeatureOnValidation=3

## Imports

In [21]:
from datetime import date, datetime, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Intro to the Data

In [3]:
cumul_sales = pd.read_csv(
    TrainData, 
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    compression='gzip'
)


In [4]:

cumul_sales_query = pd.read_csv(
    TestData,
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
)

In [5]:
query_start_date = str(cumul_sales_query.iloc[0,1]).split(" ")[0]

In [6]:
query_start_date

'2018-05-26'

In [7]:
cumul_sales_query = cumul_sales_query.set_index(
    ['store_nbr', 'item_nbr', 'date']
)

<font color="red">DEBUG</font>

In [8]:
cumul_sales_query

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,103520,2018-05-26,116829647,False
1,103665,2018-05-26,116829648,False
1,105574,2018-05-26,116829649,False
1,105575,2018-05-26,116829650,False
1,105857,2018-05-26,116829651,False
1,108079,2018-05-26,116829652,False
1,108696,2018-05-26,116829653,True
1,108698,2018-05-26,116829654,True
1,108701,2018-05-26,116829655,True
1,108797,2018-05-26,116829656,False


In [9]:
promo_variables_test = cumul_sales_query[["onpromotion"]].unstack(level=-1).fillna(False)

<font color=red>DEBUG</font>

In [10]:
cumul_sales.iloc[-1,:]

date           2018-08-15 00:00:00
store_nbr                       54
item_nbr                   2116416
unit_sales                 1.09861
onpromotion                  False
Name: 23808259, dtype: object

In [11]:
cumul_sales_query.iloc[0,:]

id             116829647
onpromotion        False
Name: (1, 103520, 2018-05-26 00:00:00), dtype: object

In [12]:
items = pd.read_csv(
    ItemsPath,
).set_index("item_nbr")

stores = pd.read_csv(
    StoresPath
).set_index("store_nbr")

In [13]:
cumul_sales_query

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,103520,2018-05-26,116829647,False
1,103665,2018-05-26,116829648,False
1,105574,2018-05-26,116829649,False
1,105575,2018-05-26,116829650,False
1,105857,2018-05-26,116829651,False
1,108079,2018-05-26,116829652,False
1,108696,2018-05-26,116829653,True
1,108698,2018-05-26,116829654,True
1,108701,2018-05-26,116829655,True
1,108797,2018-05-26,116829656,False


In [14]:
cumul_sales.shape

(23808260, 5)

In [15]:
cumul_sales_query.shape

(1609465, 2)

In [16]:
items.shape

(4100, 3)

## Select only Last Three Months

This is a peculiar one, and it **games the benchmark** in a not great way. Essentially it uses the last 11 weeks of data before the prediction threshold to predict what's happening next

In [22]:
nowtime = datetime.now()
now = date(nowtime.year, nowtime.month, nowtime.day)

# How far back to go to start generating trend features for demand
data_start             = now - timedelta(7*11) + timedelta(1)
training_history_start = now - timedelta(7*WeeksOfHistoryForFeature) + timedelta(1)
validation_start       = now - timedelta(7*WeeksOfHistoryForFeatureOnValidation) + timedelta(1)



In [23]:
data_start, training_history_start, query_start_date

(datetime.date(2018, 3, 10), datetime.date(2018, 3, 31), '2018-05-26')

In [24]:
cumul_sales = cumul_sales[cumul_sales.date.isin(
    pd.date_range(data_start, periods=7 * 11))].copy()


In [25]:
cumul_sales.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
7024144,2018-03-10,1,103520,0.693147,False
7024145,2018-03-10,1,103665,1.386294,False
7024146,2018-03-10,1,105574,2.397895,False
7024147,2018-03-10,1,105575,2.079442,False
7024148,2018-03-10,1,105693,1.098612,False


In [26]:
cumul_sales.shape

(8116723, 5)

In [27]:
cumul_sales.iloc[-1,:]

date           2018-05-25 00:00:00
store_nbr                       54
item_nbr                   2108569
unit_sales                0.693147
onpromotion                  False
Name: 15140866, dtype: object

## Creating Promotion Variables

So this is a tricky. If one presumes that on-promotion will lead to a boost in demand, if if we presume we'll know *whats on promotion in advance*, then we can create variables to say that this product will be on promotion 1, 2, 3, ... 16 days from now (16 days in the future is the target)

In this case, this is also peculiar, there is a column for every single day!

In [28]:
promo_variables = cumul_sales.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]]

In [29]:
promo_variables.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1
1,103520,2018-03-10,False
1,103665,2018-03-10,False
1,105574,2018-03-10,False
1,105575,2018-03-10,False
1,105693,2018-03-10,False


In [30]:
promo_variables = cumul_sales.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)



In [31]:
promo_variables.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
Unnamed: 0_level_1,date,2018-03-10,2018-03-11,2018-03-12,2018-03-13,2018-03-14,2018-03-15,2018-03-16,2018-03-17,2018-03-18,2018-03-19,...,2018-05-16,2018-05-17,2018-05-18,2018-05-19,2018-05-20,2018-05-21,2018-05-22,2018-05-23,2018-05-24,2018-05-25
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [32]:
promo_variables.columns = promo_variables.columns.get_level_values(1)

promo_variables_query = cumul_sales_query[["onpromotion"]].unstack(level=-1).fillna(False)
promo_variables_query.columns = promo_variables_query.columns.get_level_values(1)
promo_variables_query = promo_variables_query.reindex(promo_variables.index).fillna(False)

promo_variables_train_and_query = pd.concat([promo_variables, promo_variables_query], axis=1)


In [33]:
promo_variables.shape, items.shape[0] * stores.shape[0]

((159434, 77), 221400)

In [34]:
cumul_sales.shape, cumul_sales_query.shape

((8116723, 5), (1609465, 2))

#  Unstack unit sales - do it across all days in a sliding window

Ah... they're creating a multi-task learning problem

In [35]:
cumul_sales = cumul_sales.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
cumul_sales.columns = cumul_sales.columns.get_level_values(1)
cumul_sales.shape

(159434, 77)

In [36]:
cumul_sales.head()

Unnamed: 0_level_0,date,2018-03-10 00:00:00,2018-03-11 00:00:00,2018-03-12 00:00:00,2018-03-13 00:00:00,2018-03-14 00:00:00,2018-03-15 00:00:00,2018-03-16 00:00:00,2018-03-17 00:00:00,2018-03-18 00:00:00,2018-03-19 00:00:00,...,2018-05-16 00:00:00,2018-05-17 00:00:00,2018-05-18 00:00:00,2018-05-19 00:00:00,2018-05-20 00:00:00,2018-05-21 00:00:00,2018-05-22 00:00:00,2018-05-23 00:00:00,2018-05-24 00:00:00,2018-05-25 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147
1,99197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,1.098612,0.693147,1.098612,0.0,0.0,0.0,0.0,1.098612,1.386294
1,103520,0.693147,0.693147,0.0,0.693147,0.0,0.693147,0.693147,1.609438,1.609438,0.0,...,1.609438,0.693147,0.0,0.0,1.098612,0.0,0.693147,1.098612,1.609438,2.079442
1,103665,1.386294,0.693147,0.0,0.693147,0.0,1.94591,0.0,1.791759,1.386294,0.693147,...,1.791759,0.0,0.693147,2.197225,1.386294,0.0,1.098612,1.791759,0.0,1.098612
1,105574,2.397895,1.609438,0.693147,1.791759,1.94591,1.609438,2.397895,2.484907,1.609438,0.693147,...,2.197225,2.302585,2.302585,1.94591,0.693147,0.0,1.94591,2.772589,1.791759,1.386294


## Make items match other data frames

They're sacraficing generability

In [37]:
items = items.reindex(cumul_sales.index.get_level_values(1))
items.head()

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1
105574,GROCERY I,1045,0


In [38]:
items.shape

(159434, 3)

## Time futzing

In [39]:
# Return that portion of the data frame that corresponds to the time period
#   beginning "minus" days before "dt" and extending for "periods" days
def get_timespan(df, dt, minus, periods):
    return df[
        pd.date_range(dt - timedelta(days=minus), periods=periods)
    ]

In [40]:
def prepare_dataset(cumul_sales, promo_variables_train_and_query, start_date, is_train=True):
    X = pd.DataFrame({  # Mean target for different retrospective timespans & total # promotions
        "mean_3_2017": get_timespan(cumul_sales, start_date, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(cumul_sales, start_date, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(cumul_sales, start_date, 14, 14).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo_variables_train_and_query, start_date, 14, 14).sum(axis=1).values
    })
    for i in range(16):  # Promotions on future days
        X["promo_{}".format(i)] = promo_variables_train_and_query[
            start_date + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = cumul_sales[  # Target values for future days
            pd.date_range(start_date, periods=16)
        ].values
        return X, y
    return X

In [41]:
print("Preparing dataset...")
X_l, y_l = [], []
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(cumul_sales, promo_variables_train_and_query, training_history_start + delta)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

X_validate, y_validate = prepare_dataset(cumul_sales, promo_variables_train_and_query, validation_start)

X_query = prepare_dataset(cumul_sales, promo_variables_train_and_query, now, is_train=False)

Preparing dataset...


In [42]:
X_train.shape, X_validate.shape, X_query.shape

((637736, 20), (159434, 20), (159434, 20))

This dataset is **super gamey**. They're using the means for the week, fortnight, and last three days, and then seeing how to permute it to generate values for the following window of time. It's hardcoded to product IDs, not categories.

It does however, permit multi-task learning, and therefore better representation learning

It does not incorporate any information about seasonality at all, and so would fall arse over face at Christmas



In [43]:
print("Training and predicting models...")
params = {
    'num_leaves': 2**5 - 1,
    'objective': 'regression_l2',
    'max_depth': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 4
}

Training and predicting models...


In [44]:
MAX_ROUNDS = 1000
validate_pred = []
query_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 4) * 0.25 + 1
    )
    
    dvalidate = lgb.Dataset(
        X_validate, label=y_validate[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dvalidate], early_stopping_rounds=50, verbose_eval=50
    )
    
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    
    
    validate_pred.append(bst.predict(
        X_validate, num_iteration=bst.best_iteration or MAX_ROUNDS))
    
    query_pred.append(bst.predict(
        X_query, num_iteration=bst.best_iteration or MAX_ROUNDS))

Step 1




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.374993	valid_1's l2: 0.373291
[100]	training's l2: 0.362088	valid_1's l2: 0.365847
[150]	training's l2: 0.359539	valid_1's l2: 0.364535
[200]	training's l2: 0.358059	valid_1's l2: 0.363779
[250]	training's l2: 0.356911	valid_1's l2: 0.363265
[300]	training's l2: 0.35594	valid_1's l2: 0.362884
[350]	training's l2: 0.35518	valid_1's l2: 0.362592
[400]	training's l2: 0.354505	valid_1's l2: 0.362336
[450]	training's l2: 0.35387	valid_1's l2: 0.362213
[500]	training's l2: 0.35335	valid_1's l2: 0.362148
[550]	training's l2: 0.352837	valid_1's l2: 0.36203
[600]	training's l2: 0.352353	valid_1's l2: 0.361967
[650]	training's l2: 0.351841	valid_1's l2: 0.361916
[700]	training's l2: 0.35142	valid_1's l2: 0.361879
[750]	training's l2: 0.351006	valid_1's l2: 0.361831
[800]	training's l2: 0.350598	valid_1's l2: 0.361797
[850]	training's l2: 0.35019	valid_1's l2: 0.361708
[900]	training's l2: 0.349804	valid_1's l2: 0

[150]	training's l2: 0.407847	valid_1's l2: 0.403595
[200]	training's l2: 0.406059	valid_1's l2: 0.402703
[250]	training's l2: 0.404593	valid_1's l2: 0.402181
[300]	training's l2: 0.40352	valid_1's l2: 0.401689
[350]	training's l2: 0.402588	valid_1's l2: 0.401459
[400]	training's l2: 0.401756	valid_1's l2: 0.401272
[450]	training's l2: 0.401046	valid_1's l2: 0.40118
[500]	training's l2: 0.400417	valid_1's l2: 0.401106
[550]	training's l2: 0.399816	valid_1's l2: 0.401044
[600]	training's l2: 0.39924	valid_1's l2: 0.400945
[650]	training's l2: 0.398692	valid_1's l2: 0.400974
Early stopping, best iteration is:
[627]	training's l2: 0.398939	valid_1's l2: 0.400922
mean_14_2017: 1958442.69
mean_7_2017: 714103.30
mean_3_2017: 352482.60
promo_6: 102438.94
promo_14_2017: 50278.70
promo_5: 12217.17
promo_12: 10625.26
promo_7: 8544.62
promo_1: 7938.47
promo_9: 6824.24
promo_8: 6504.56
promo_10: 4083.75
promo_4: 3733.00
promo_11: 3240.62
promo_3: 3002.54
promo_14: 2983.49
promo_2: 2509.49
promo_0:

[550]	training's l2: 0.408382	valid_1's l2: 0.391766
[600]	training's l2: 0.407792	valid_1's l2: 0.391675
Early stopping, best iteration is:
[598]	training's l2: 0.407812	valid_1's l2: 0.391658
mean_14_2017: 2271912.93
mean_7_2017: 638725.93
mean_3_2017: 398574.51
promo_12: 252621.21
promo_14_2017: 61932.71
promo_5: 53149.55
promo_13: 13132.33
promo_11: 10844.69
promo_15: 9544.88
promo_14: 8554.80
promo_7: 7097.88
promo_10: 4841.04
promo_4: 4485.20
promo_0: 4133.15
promo_9: 3729.90
promo_6: 3319.76
promo_2: 2566.20
promo_8: 2271.34
promo_1: 1552.70
promo_3: 1309.66
Step 14
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.453843	valid_1's l2: 0.412024
[100]	training's l2: 0.441871	valid_1's l2: 0.403915
[150]	training's l2: 0.438235	valid_1's l2: 0.402105
[200]	training's l2: 0.436333	valid_1's l2: 0.401265
[250]	training's l2: 0.434812	valid_1's l2: 0.400748
[300]	training's l2: 0.433666	valid_1's l2: 0.400371
[350]	training's l2: 0.432696	valid_1's l

In [47]:
print("Validation mse:", np.sqrt(mean_squared_error(
    np.expm1(y_validate), np.expm1(np.array(validate_pred)).transpose())))

Validation mse: 17.635705675653934


In [48]:
validate_pred

[array([0.22670771, 0.03805695, 1.14243052, ..., 0.03805695, 0.03045197,
        0.03805695]),
 array([0.33118785, 0.06358992, 1.28310697, ..., 0.06358992, 0.10461153,
        0.06358992]),
 array([0.29328676, 0.08154939, 1.19042486, ..., 0.08154939, 0.13012389,
        0.08154939]),
 array([0.23788341, 0.08246951, 1.03112951, ..., 0.08246951, 0.13245651,
        0.08246951]),
 array([0.24203497, 0.08389569, 1.03652769, ..., 0.08389569, 0.06927573,
        0.08389569]),
 array([0.22278414, 0.09472484, 1.05182358, ..., 0.09472484, 0.13119284,
        0.09472484]),
 array([0.23614268, 0.16395045, 1.04323934, ..., 0.16395045, 0.37621929,
        0.16395045]),
 array([0.25710815, 0.18286433, 1.04976098, ..., 0.18286433, 0.4065457 ,
        0.18286433]),
 array([0.34092375, 0.21697018, 1.20642206, ..., 0.21697018, 0.5132463 ,
        0.21697018]),
 array([0.32994322, 0.22267369, 1.12193   , ..., 0.22267369, 0.49796372,
        0.22267369]),
 array([0.26906826, 0.20580286, 1.12292526, ..., 0

In [49]:
y_query.shape, y_query

NameError: name 'y_query' is not defined

In [50]:
print("Making submission...")
y_query = np.array(query_pred).transpose()
df_preds = pd.DataFrame(
    y_query, index=cumul_sales.index,
    columns=pd.date_range(query_start_date, periods=16)
).stack().to_frame("unit_sales")
df_preds.to_csv("/tmp/preds-2018.csv")


Making submission...


In [51]:
df_preds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1
1,96995,2018-05-26,0.082069
1,96995,2018-05-27,0.129561
1,96995,2018-05-28,0.134338
1,96995,2018-05-29,0.109364
1,96995,2018-05-30,0.105926
1,96995,2018-05-31,0.118327
1,96995,2018-06-01,0.123348
1,96995,2018-06-02,0.139309
1,96995,2018-06-03,0.176478
1,96995,2018-06-04,0.182017


In [168]:
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

In [83]:

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)

NameError: name 'df_test' is not defined

In [61]:
submission

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,unit_sales
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,0.273266
1,99197,2017-08-16,125497041,0.225388
1,103501,2017-08-16,125497042,0.000000
1,103520,2017-08-16,125497043,0.648250
1,103665,2017-08-16,125497044,1.352113
1,105574,2017-08-16,125497045,3.222305
1,105575,2017-08-16,125497046,7.469437
1,105576,2017-08-16,125497047,0.000000
1,105577,2017-08-16,125497048,0.294655
1,105693,2017-08-16,125497049,0.319934


# Further Improvements

This is based on the work in this file: https://www.kaggle.com/vrtjso/lgbm-one-step-ahead

This was apparently in the top 10% at one point.

In [76]:
df_train = pd.read_csv(
    TrainData, usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)

df_test = pd.read_csv(
    TestData, usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

items = pd.read_csv(
    ItemsPath,
).set_index("item_nbr")

df_2017 = df_train.loc[df_train.date>=pd.datetime(2017,1,1)]
del df_train

promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

items = items.reindex(df_2017.index.get_level_values(1))

def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

print("Preparing dataset...")
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

print("Training and predicting models...")
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb.csv', float_format='%.4f', index=None)


Preparing dataset...
Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.30191	valid_1's l2: 0.29409
[200]	training's l2: 0.298363	valid_1's l2: 0.292741
[300]	training's l2: 0.295918	valid_1's l2: 0.292337
[400]	training's l2: 0.293791	valid_1's l2: 0.2921
[500]	training's l2: 0.29199	valid_1's l2: 0.29195
Did not meet early stopping. Best iteration is:
[500]	training's l2: 0.29199	valid_1's l2: 0.29195
mean_7_2017: 1882639.38
mean_14_2017: 1229821.77
promo_0: 104143.51
day_1_2017: 89857.46
mean_20_dow0_2017: 84245.48
mean_3_2017: 76646.29
mean_30_2017: 76583.57
mean_4_dow0_2017: 58919.38
mean_60_2017: 33035.18
promo_14_2017: 28619.72
promo_7: 9432.45
mean_4_dow5_2017: 7417.05
mean_140_2017: 7406.32
promo_60_2017: 6740.72
mean_20_dow4_2017: 5611.55
promo_140_2017: 5493.72
mean_4_dow6_2017: 4633.44
mean_4_dow2_2017: 3813.74
mean_20_dow2_2017: 3343.78
mean_4_dow3_2017: 2824.66
promo_9: 2814.25
mean_4_dow1_2017: 2707.00
mean_20_dow3_2017: 2642.58
mean_20_dow1_2017: 2616.99
me

Step 7
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.346219	valid_1's l2: 0.421254
[200]	training's l2: 0.341874	valid_1's l2: 0.420923
[300]	training's l2: 0.339035	valid_1's l2: 0.420617
[400]	training's l2: 0.336652	valid_1's l2: 0.420437
Early stopping, best iteration is:
[393]	training's l2: 0.336799	valid_1's l2: 0.420378
mean_14_2017: 1274004.42
mean_30_2017: 842088.63
mean_7_2017: 445757.89
mean_20_dow6_2017: 152802.16
mean_3_2017: 145241.24
promo_6: 128192.88
mean_4_dow6_2017: 127503.94
mean_60_2017: 123326.90
promo_14_2017: 21944.55
day_1_2017: 13872.06
promo_3: 11205.46
promo_7: 9013.03
mean_4_dow5_2017: 8533.91
mean_20_dow5_2017: 8280.78
promo_60_2017: 8112.57
promo_140_2017: 6018.04
mean_20_dow1_2017: 4884.13
promo_5: 4487.41
mean_140_2017: 4382.31
promo_13: 3877.01
mean_4_dow0_2017: 3650.61
mean_20_dow0_2017: 3639.82
mean_20_dow3_2017: 3507.37
mean_4_dow1_2017: 3355.21
mean_20_dow4_2017: 2943.12
promo_4: 2784.87
promo_0: 2663.62
mean

[200]	training's l2: 0.365615	valid_1's l2: 0.376333
[300]	training's l2: 0.362327	valid_1's l2: 0.376258
Early stopping, best iteration is:
[338]	training's l2: 0.361236	valid_1's l2: 0.37609
mean_30_2017: 1601076.04
mean_14_2017: 588977.50
mean_60_2017: 374936.35
mean_7_2017: 305452.67
mean_3_2017: 148957.18
promo_12: 93518.03
mean_20_dow5_2017: 85159.77
mean_4_dow5_2017: 71638.34
promo_13: 19595.91
promo_14_2017: 16011.22
promo_14: 13355.62
promo_10: 11265.36
mean_140_2017: 9927.43
promo_60_2017: 8798.33
day_1_2017: 7950.62
mean_20_dow0_2017: 7544.02
promo_140_2017: 6088.78
mean_20_dow6_2017: 5888.75
mean_4_dow6_2017: 4725.31
promo_11: 4479.14
mean_4_dow0_2017: 3756.38
mean_20_dow3_2017: 3664.43
promo_9: 3613.91
mean_20_dow2_2017: 3341.99
mean_4_dow2_2017: 2714.48
mean_20_dow1_2017: 2701.22
mean_4_dow3_2017: 2566.89
mean_20_dow4_2017: 2531.39
promo_15: 2495.30
mean_4_dow1_2017: 2301.79
mean_4_dow4_2017: 2274.90
promo_7: 2189.77
promo_0: 1734.63
promo_8: 1699.81
promo_5: 1614.77
prom

In [81]:
print("Validation mse:", mean_squared_error(
    np.expm1(y_validate), np.expm1(np.array(validate_pred)).transpose()))

Validation mse: 275.7862813287468


In [78]:
np.sqrt(275), np.sqrt(247)

(16.583123951777, 15.716233645501712)