In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from statsforecast import StatsForecast

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error
import time

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor
import lightgbm as lgb
import seaborn as sns
from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from mlforecast.lag_transforms import RollingMean, RollingStd, ExpandingMean
from sklearn.linear_model import Ridge
import lightgbm as lgb

In [2]:
dir='data/store-sales-time-series-forecasting'
file='train.csv'
df=pd.read_csv(f'{dir}/{file}')
df.date=pd.to_datetime(df.date)


file='test.csv'
df_test=pd.read_csv(f'{dir}/{file}')
df_test.date=pd.to_datetime(df_test.date)



In [3]:


df_train = df.copy()

df_train['unique_id'] = (
    df_train['store_nbr'].astype(str) + '_' + df_train['family']
)

df_train = df_train.rename(
    columns={'date': 'ds', 'sales': 'y'}
)

df_train = df_train[['unique_id', 'ds', 'y', 'onpromotion']]

In [4]:
models = {
    'lgbm': lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
}

fcst = MLForecast(
    models=models,
    freq='D',

    # ðŸ”¹ Time-delay embedding (automatic)
    lags=[1, 7, 14],
    lag_transforms={
        7: [RollingMean(7), RollingStd(7)],
        14: [RollingMean(14)]
    },

    # ðŸ”¹ Temporal embedding
    date_features=['dayofweek', 'month'],
    
    # ðŸ”¹ Target transformations
    target_transforms=[
        Differences([1, 7])  # trend + weekly seasonality
    ]
)

In [5]:
fcst.fit(
    df_train,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    static_features=[]
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1756
[LightGBM] [Info] Number of data points in the train set: 2938518, number of used features: 9
[LightGBM] [Info] Start training from score -0.004748


MLForecast(models=[lgbm], freq=D, lag_features=['lag1', 'lag7', 'lag14', 'rolling_mean_lag7_window_size7', 'rolling_std_lag7_window_size7', 'rolling_mean_lag14_window_size14'], date_features=['dayofweek', 'month'], num_threads=1)

In [6]:
df_test_mlf = df_test.copy()

df_test_mlf['unique_id'] = (
    df_test_mlf['store_nbr'].astype(str) + '_' + df_test_mlf['family']
)

df_test_mlf = df_test_mlf.rename(columns={'date': 'ds'})

df_test_mlf = df_test_mlf[['unique_id', 'ds', 'onpromotion']]

In [7]:
preds = fcst.predict(
    h=df_test_mlf['ds'].nunique(),
    X_df=df_test_mlf
)

In [11]:
df_test_merge = df_test.copy()

df_test_merge['unique_id'] = (
    df_test_merge['store_nbr'].astype(str) + '_' + df_test_merge['family']
)

df_test_merge = df_test_merge.rename(columns={'date': 'ds'})

In [12]:
submission = (
    df_test_merge[['id', 'unique_id', 'ds']]
    .merge(
        preds.rename(columns={'lgbm': 'sales'}),
        on=['unique_id', 'ds'],
        how='left'
    )
)

In [13]:
submission['sales'] = submission['sales'].clip(lower=0)

submission[['id', 'sales']].to_csv(
    'MLF_submission.csv',
    index=False
)

In [14]:
submission.isna().sum()


id           0
unique_id    0
ds           0
sales        0
dtype: int64

In [15]:
submission.head()


Unnamed: 0,id,unique_id,ds,sales
0,3000888,1_AUTOMOTIVE,2017-08-16,2.902297
1,3000889,1_BABY CARE,2017-08-16,0.496832
2,3000890,1_BEAUTY,2017-08-16,6.644416
3,3000891,1_BEVERAGES,2017-08-16,1967.180848
4,3000892,1_BOOKS,2017-08-16,0.496832


In [16]:
submission.tail()

Unnamed: 0,id,unique_id,ds,sales
28507,3029395,9_POULTRY,2017-08-31,429.751799
28508,3029396,9_PREPARED FOODS,2017-08-31,219.898098
28509,3029397,9_PRODUCE,2017-08-31,1657.530079
28510,3029398,9_SCHOOL AND OFFICE SUPPLIES,2017-08-31,243.602315
28511,3029399,9_SEAFOOD,2017-08-31,20.784633


In [17]:
submission.shape

(28512, 4)