## Libraries

In [1]:
%pip install -qqq "mlforecast[lag_transforms]"

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path

import lightgbm as lgb
import mlforecast
import numpy as np
import pandas as pd
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean, SeasonalRollingMean

In [None]:
import mlforecast

In [3]:
mlforecast.__version__

'0.12.1'

## Data setup

In [None]:
# Directory holding the competition files.
# NOTE(review): in this notebook only the sample-submission read uses `input_path`;
# the calendar/prices/sales loaders read bare filenames from the working directory.
# Confirm which location is intended and make the loaders consistent.
input_path = Path('../input/m5-forecasting-accuracy/')

### Calendar

In [4]:
# Compact dtypes for the calendar columns we keep; `date` is parsed separately below.
cal_dtypes = {
    'd': 'category',
    'wm_yr_wk': np.uint16,
    **{
        f'event_{kind}_{slot}': 'category'
        for slot in (1, 2)
        for kind in ('name', 'type')
    },
    **{f'snap_{state}': np.uint8 for state in ('CA', 'TX', 'WI')},
}
cal = pd.read_csv(
    'calendar.csv',
    dtype=cal_dtypes,
    usecols=[*cal_dtypes, 'date'],
    parse_dates=['date'],
)

# Missing events become an explicit 'nan' category instead of NaN.
event_cols = [name for name in cal_dtypes if name.startswith('event')]
for col in event_cols:
    with_nan_category = cal[col].cat.add_categories('nan')
    cal[col] = with_nan_category.fillna('nan')

### Prices

In [5]:
# Weekly sell prices per (store, item); categorical keys and narrow numerics keep memory down.
prices_dtypes = dict(
    store_id='category',
    item_id='category',
    wm_yr_wk=np.uint16,
    sell_price=np.float32,
)
prices = pd.read_csv('sell_prices.csv', dtype=prices_dtypes)

### Sales

In [6]:
# Identifier columns are categorical; `item_id` reuses the dtype already
# inferred for the prices table so both frames share the same categories.
sales_dtypes = {
    'id': 'category',
    'item_id': prices['item_id'].dtype,
    'dept_id': 'category',
    'cat_id': 'category',
    'store_id': 'category',
    'state_id': 'category',
}
# One float32 column per daily sales column name `d_<i>`.
sales_dtypes.update(dict.fromkeys((f'd_{i}' for i in range(1942)), np.float32))

sales = pd.read_csv('sales_train_evaluation.csv', dtype=sales_dtypes)

In [None]:
cal

In [7]:
# Wide -> long: one row per (series, day) with the day label in `d` and the sales in `y`.
long = pd.melt(
    sales,
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='y',
)

In [None]:
# Restrict the long frame to the HOBBIES category.
is_hobbies = long['cat_id'] == "HOBBIES"
long = long[is_hobbies]

In [None]:
long

In [8]:
%%time
print(long.shape[0])
long['date_idx'] = long['d'].str.replace('d_', '').astype('int32')
dates = sorted(long['date_idx'].unique())
long = long.sort_values(['id', 'date_idx'])


59181090
CPU times: total: 41.4 s
Wall time: 1min 13s


In [None]:
# Per series, True from the first row with positive sales onwards (running max of `y > 0`),
# i.e. False only for the leading run of zero-sales rows.
has_sales = long['y'] > 0
without_leading_zeros = has_sales.groupby(long['id']).transform('cummax')


In [None]:
without_leading_zeros

In [None]:
# Keep rows that are past each series' leading zeros AND fall within the last 400 day indices.
min_date_idx = dates[-400]
above_min_date = long['date_idx'].ge(min_date_idx)
keep_mask = without_leading_zeros & above_min_date
long = long[keep_mask]
print(len(long))

In [None]:
# Drop rows without a price. `sell_price` only exists after the merge with `prices`
# further down, so on a top-to-bottom run this cell used to raise KeyError; guard it
# so it is a no-op until the column is present.
if 'sell_price' in long.columns:
    long = long[long['sell_price'].notna()]

In [17]:
long

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,y,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOUSEHOLD_2_461_CA_3_evaluation,HOUSEHOLD_2_461,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0.0,2014-03-19,,,,,0,0,0,9.97
1,HOUSEHOLD_1_047_WI_2_evaluation,HOUSEHOLD_1_047,HOUSEHOLD_1,HOUSEHOLD,WI_2,WI,0.0,2015-05-22,,,,,0,0,0,4.23
2,FOODS_3_686_TX_2_evaluation,FOODS_3_686,FOODS_3,FOODS,TX_2,TX,0.0,2011-06-29,,,,,0,0,0,3.50
3,HOUSEHOLD_1_506_TX_1_evaluation,HOUSEHOLD_1_506,HOUSEHOLD_1,HOUSEHOLD,TX_1,TX,0.0,2013-07-25,,,,,0,0,0,3.28
4,FOODS_1_161_CA_2_evaluation,FOODS_1_161,FOODS_1,FOODS,CA_2,CA,2.0,2014-11-22,,,,,0,0,0,0.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46881672,HOBBIES_1_055_CA_3_evaluation,HOBBIES_1_055,HOBBIES_1,HOBBIES,CA_3,CA,2.0,2012-05-20,,,,,0,0,0,7.44
46881673,FOODS_3_565_CA_4_evaluation,FOODS_3_565,FOODS_3,FOODS,CA_4,CA,2.0,2013-05-01,,,,,1,1,0,2.68
46881674,FOODS_3_132_WI_3_evaluation,FOODS_3_132,FOODS_3,FOODS,WI_3,WI,1.0,2015-08-25,,,,,0,0,0,2.68
46881675,HOBBIES_2_011_TX_2_evaluation,HOBBIES_2_011,HOBBIES_2,HOBBIES,TX_2,TX,0.0,2011-07-05,,,,,1,1,1,0.97


In [None]:

# Drop each series' leading zero-sales rows.
# The mask was computed on the frame before any row filtering, so align it to the
# rows currently in `long` explicitly instead of relying on pandas' implicit
# reindexing of a mismatched boolean Series (which emits a UserWarning).
keep_mask = without_leading_zeros.reindex(long.index, fill_value=False)
long = long[keep_mask]
print(long.shape[0])

In [9]:
# Cast the day label to the calendar's categorical dtype so the merge key dtypes match.
calendar_day_dtype = cal['d'].dtype
long['d'] = long['d'].astype(calendar_day_dtype)


In [10]:
# Attach calendar features by day label, then weekly prices by (store, item, week).
long = (
    long
    .merge(cal, on=['d'])
    .merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'])
)

# Remember where training ends; the future-features cell starts from these.
last_wmyrwk = long['wm_yr_wk'].max()
last_date_train = long['date'].max()

# Remove the join/helper keys, then shuffle rows with a fixed seed and renumber the index.
long = (
    long
    .drop(columns=['d', 'date_idx', 'wm_yr_wk'])
    .sample(frac=1.0, random_state=0)
    .reset_index(drop=True)
)




In [None]:
long

In [None]:
long["sell_price"].isna().any()

In [None]:
# Drop rows whose price is missing while keeping `long` a DataFrame.
# (Selecting the column first would rebind `long` to a single Series and
# break every later cell that expects the full frame, e.g. the fit call.)
long = long.dropna(subset=['sell_price'])

In [None]:
long

In [None]:
long[long['id']=="HOBBIES_1_002_CA_1_evaluation"].sort_values('date')

In [None]:
# `big` is only read from disk in the following cell, so filtering it here failed
# with NameError on a fresh kernel. Load it first, then keep the HOBBIES rows.
big = pd.read_csv("final.csv")
big = big[big['cat_id'] == "HOBBIES"]

In [None]:
big=pd.read_csv("final.csv")

In [None]:
big.head()

In [None]:
big.shape

In [None]:
long.shape

In [None]:
# Preview `long` without rows that lack a price (the result is displayed, not assigned).
# The previous subset column, 'Salary', does not exist in this dataset and raised KeyError.
long.dropna(subset=['sell_price'])

In [11]:
# build future X
future_cal = cal[cal['date'] > last_date_train]
future_prices = prices[prices['wm_yr_wk'] >= last_wmyrwk].copy()
future_prices['id'] = future_prices['item_id'].astype(str) + '_' + future_prices['store_id'].astype(str) + '_evaluation'
future_prices['id']=future_prices['id'].astype(sales.id.dtype)
X_df = future_prices.merge(future_cal, on='wm_yr_wk').drop(columns=['store_id', 'item_id', 'wm_yr_wk', 'd'])

In [18]:
X_df

Unnamed: 0,sell_price,id,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-23,,,,,0,0,0
1,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-24,,,,,0,0,0
2,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-25,,,,,0,0,0
3,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-26,,,,,0,0,0
4,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-27,,,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...
853715,3.98,FOODS_3_825_WI_3_evaluation,2016-06-19,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0
853716,1.28,FOODS_3_826_WI_3_evaluation,2016-06-18,,,,,0,0,0
853717,1.28,FOODS_3_826_WI_3_evaluation,2016-06-19,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0
853718,1.00,FOODS_3_827_WI_3_evaluation,2016-06-18,,,,,0,0,0


## Training

In [12]:
# LightGBM hyper-parameters for the single regressor.
model_params = dict(
    verbose=-1,
    force_col_wise=True,
    num_leaves=256,
    n_estimators=50,
)

# Plain lags at 7, 14, ..., 56.
weekly_lags = list(range(7, 7 * 8 + 1, 7))

# An expanding mean on lag 1, plus the same rolling / seasonal-rolling means on
# lags 7, 14 and 28 (fresh transform instances for every lag).
lag_transforms = {
    1: [ExpandingMean()],
    **{
        lag: [
            *(RollingMean(window) for window in (7, 14, 28)),
            SeasonalRollingMean(7, 4),
        ]
        for lag in (7, 14, 28)
    },
}

fcst = MLForecast(
    models=[lgb.LGBMRegressor(**model_params)],
    freq='D',
    lags=weekly_lags,
    lag_transforms=lag_transforms,
    date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week'],
    num_threads=4,
)

In [None]:
# NOTE(review): `df` is not defined by any earlier cell in this notebook (the only
# assignment to `df` is in the plotting cell at the very end), so on a fresh
# top-to-bottom run this cell raises NameError. It also mutates in place.
# Looks like leftover scratch work — confirm which frame was intended, or remove.
df.drop(['item_id','dept_id','d','wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI'],axis=1,inplace=True)

In [13]:
%%time
fcst.fit(
    long,
    id_col='id',
    time_col='date',
    target_col='y',
    static_features=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
)

CPU times: total: 18min 11s
Wall time: 9min 27s


MLForecast(models=[LGBMRegressor], freq=D, lag_features=['lag7', 'lag14', 'lag21', 'lag28', 'lag35', 'lag42', 'lag49', 'lag56', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size7', 'rolling_mean_lag7_window_size14', 'rolling_mean_lag7_window_size28', 'seasonal_rolling_mean_lag7_season_length7_window_size4', 'rolling_mean_lag14_window_size7', 'rolling_mean_lag14_window_size14', 'rolling_mean_lag14_window_size28', 'seasonal_rolling_mean_lag14_season_length7_window_size4', 'rolling_mean_lag28_window_size7', 'rolling_mean_lag28_window_size14', 'rolling_mean_lag28_window_size28', 'seasonal_rolling_mean_lag28_season_length7_window_size4'], date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week'], num_threads=4)

## Forecasting

In [14]:
# Forecast 28 steps per series, supplying the future prices/calendar features
# built above as `X_df`; `%time` reports how long the prediction takes.
%time preds = fcst.predict(28, X_df=X_df)

CPU times: total: 12.8 s
Wall time: 7.19 s


In [15]:
preds

Unnamed: 0,id,date,LGBMRegressor
0,HOBBIES_1_001_CA_1_evaluation,2016-05-23,0.877976
1,HOBBIES_1_001_CA_1_evaluation,2016-05-24,0.835212
2,HOBBIES_1_001_CA_1_evaluation,2016-05-25,0.775414
3,HOBBIES_1_001_CA_1_evaluation,2016-05-26,0.931718
4,HOBBIES_1_001_CA_1_evaluation,2016-05-27,1.062886
...,...,...,...
853715,FOODS_3_827_WI_3_evaluation,2016-06-15,1.400227
853716,FOODS_3_827_WI_3_evaluation,2016-06-16,1.398280
853717,FOODS_3_827_WI_3_evaluation,2016-06-17,1.448629
853718,FOODS_3_827_WI_3_evaluation,2016-06-18,1.874117


## Submission

In [16]:
# Reshape to the submission layout: one row per series id, one column per forecast day.
horizon_labels = [f'F{step}' for step in range(1, 29)]
wide = preds.pivot_table(index='id', columns='date')
wide.columns = horizon_labels
wide.columns.name = None
wide.index.name = 'id'
wide

Unnamed: 0_level_0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOBBIES_1_001_CA_1_evaluation,0.877976,0.835212,0.775414,0.931718,1.062886,1.387005,1.357186,0.928795,1.000231,0.980687,...,1.029555,1.410138,1.307139,0.856688,0.827875,0.803359,0.931401,0.955330,1.143615,1.163005
HOBBIES_1_002_CA_1_evaluation,0.262983,0.281592,0.254399,0.250680,0.300234,0.340215,0.330691,0.248301,0.242641,0.251359,...,0.285726,0.362500,0.391959,0.276867,0.269464,0.263859,0.261514,0.296068,0.357660,0.361725
HOBBIES_1_003_CA_1_evaluation,0.521123,0.494400,0.501995,0.550923,0.729368,0.758479,0.792038,0.601835,0.596176,0.621793,...,0.731635,0.755471,0.847113,0.561550,0.540159,0.542203,0.591882,0.702492,0.744334,0.811306
HOBBIES_1_004_CA_1_evaluation,1.665364,1.320832,1.366851,1.522473,1.613744,2.090503,2.431325,1.709499,1.481126,1.580113,...,1.724806,2.238522,2.541457,1.734087,1.425817,1.528251,1.603435,1.646503,2.207976,2.427049
HOBBIES_1_005_CA_1_evaluation,1.159528,1.139842,1.146845,1.199997,1.312087,1.486001,1.503625,1.040549,1.049936,1.101666,...,1.173144,1.344920,1.395648,1.079079,1.016841,0.992127,0.944137,1.079758,1.282912,1.359598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOODS_3_823_WI_3_evaluation,0.466912,0.514764,0.508233,0.496473,0.548206,0.626661,0.637273,0.499262,0.508275,0.556374,...,0.582747,0.687132,0.811324,0.567329,0.597014,0.597684,0.544427,0.588002,0.670443,0.663765
FOODS_3_824_WI_3_evaluation,0.231844,0.224804,0.216259,0.303353,0.293367,0.353854,0.328532,0.292526,0.274600,0.328220,...,0.311013,0.402009,0.394360,0.322389,0.345262,0.355183,0.341185,0.329087,0.364387,0.340217
FOODS_3_825_WI_3_evaluation,0.763536,0.641234,0.682685,0.628957,0.720214,0.816047,0.962030,0.764978,0.704615,0.847358,...,0.792696,1.007890,1.076326,0.817604,0.800241,0.837650,0.707697,0.771415,0.937359,0.944033
FOODS_3_826_WI_3_evaluation,1.099694,1.243254,1.055367,1.110753,1.135527,1.380320,1.238029,1.066665,0.998495,1.055498,...,1.131306,1.447542,1.445021,1.163226,1.273698,1.145746,1.062390,1.139882,1.358111,1.355568


In [None]:
# Start from the sample submission and overwrite the rows we have forecasts for.
submission_template = input_path / 'sample_submission.csv'
sample_sub = pd.read_csv(submission_template, index_col='id')
sample_sub.update(wide)

# Sanity check: the grand total written to the submission equals the sum of all predictions.
submitted_total = sample_sub.sum().sum()
forecast_total = preds['LGBMRegressor'].sum()
np.testing.assert_allclose(submitted_total, forecast_total)

sample_sub.to_csv('submission.csv')

In [None]:
import plotly.express as px

In [None]:
# Plotly demo: line chart of life expectancy for Canada from the bundled gapminder data.
# NOTE(review): this is unrelated to the forecasting pipeline above and rebinds the
# generic name `df` — looks like leftover scratch work; confirm whether it should stay.
df = px.data.gapminder().query("country=='Canada'")
fig = px.line(df, x="year", y="lifeExp", title='Life expectancy in Canada')
fig.show()