In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from statsforecast import StatsForecast

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error
import time

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor
import lightgbm as lgb
import seaborn as sns
from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from mlforecast.lag_transforms import RollingMean, RollingStd, ExpandingMean
from sklearn.linear_model import Ridge
import lightgbm as lgb

In [2]:
dir='data/store-sales-time-series-forecasting'
file='train.csv'
df=pd.read_csv(f'{dir}/{file}')
df.date=pd.to_datetime(df.date)


file='test.csv'
df_test=pd.read_csv(f'{dir}/{file}')
df_test.date=pd.to_datetime(df_test.date)


file='oil.csv'
df_oil=pd.read_csv(f'{dir}/{file}')
df_oil.date=pd.to_datetime(df_oil.date)


In [3]:
# Clean oil data
df_oil_clean = (
    df_oil
    .rename(columns={'date': 'ds', 'dcoilwtico': 'oil_price'})
    .set_index('ds')
    .asfreq('D')                # ensure daily frequency
    .ffill()                    # forward-fill missing prices
    .reset_index()
)

df_oil_clean

Unnamed: 0,ds,oil_price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-05,93.12
...,...,...
1699,2017-08-27,47.65
1700,2017-08-28,46.40
1701,2017-08-29,46.46
1702,2017-08-30,45.96


In [4]:


df_train = df.copy()

df_train['unique_id'] = (
    df_train['store_nbr'].astype(str) + '_' + df_train['family']
)

df_train = df_train.rename(
    columns={'date': 'ds', 'sales': 'y'}
)

df_train = df_train[['unique_id', 'ds', 'y', 'onpromotion']]

df_train = (
    df_train
    .merge(df_oil_clean, on='ds', how='left')
)

# Optional: safety fill (early NaNs)
df_train['oil_price'] = df_train['oil_price'].ffill().bfill()

In [5]:
df_train.head()

Unnamed: 0,unique_id,ds,y,onpromotion,oil_price
0,1_AUTOMOTIVE,2013-01-01,0.0,0,93.14
1,1_BABY CARE,2013-01-01,0.0,0,93.14
2,1_BEAUTY,2013-01-01,0.0,0,93.14
3,1_BEVERAGES,2013-01-01,0.0,0,93.14
4,1_BOOKS,2013-01-01,0.0,0,93.14


In [6]:
models = {
    'lgbm': lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
}

fcst = MLForecast(
    models=models,
    freq='D',

    # ðŸ”¹ Time-delay embedding (automatic)
    lags=[1, 7, 14],
    lag_transforms={
        7: [RollingMean(7), RollingStd(7)],
        14: [RollingMean(14)]
    },

    # ðŸ”¹ Temporal embedding
    date_features=['dayofweek', 'month'],
    
    # ðŸ”¹ Target transformations
    target_transforms=[
        Differences([1, 7])  # trend + weekly seasonality
    ]
)

In [7]:
fcst.fit(
    df_train,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    static_features=[]
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008645 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2010
[LightGBM] [Info] Number of data points in the train set: 2938518, number of used features: 10
[LightGBM] [Info] Start training from score -0.004748


MLForecast(models=[lgbm], freq=D, lag_features=['lag1', 'lag7', 'lag14', 'rolling_mean_lag7_window_size7', 'rolling_std_lag7_window_size7', 'rolling_mean_lag14_window_size14'], date_features=['dayofweek', 'month'], num_threads=1)

In [8]:
df_test_mlf = df_test.copy()

df_test_mlf['unique_id'] = (
    df_test_mlf['store_nbr'].astype(str) + '_' + df_test_mlf['family']
)

df_test_mlf = df_test_mlf.rename(columns={'date': 'ds'})

df_test_mlf = df_test_mlf[['unique_id', 'ds', 'onpromotion']]

df_test_mlf = (
    df_test_mlf
    .merge(df_oil_clean, on='ds', how='left')
)

df_test_mlf['oil_price'] = df_test_mlf['oil_price'].ffill().bfill()

In [9]:
df_test_mlf.head()

Unnamed: 0,unique_id,ds,onpromotion,oil_price
0,1_AUTOMOTIVE,2017-08-16,0,46.8
1,1_BABY CARE,2017-08-16,0,46.8
2,1_BEAUTY,2017-08-16,2,46.8
3,1_BEVERAGES,2017-08-16,20,46.8
4,1_BOOKS,2017-08-16,0,46.8


In [10]:
preds = fcst.predict(
    h=df_test_mlf['ds'].nunique(),
    X_df=df_test_mlf
)

In [11]:
df_test_merge = df_test.copy()

df_test_merge['unique_id'] = (
    df_test_merge['store_nbr'].astype(str) + '_' + df_test_merge['family']
)

df_test_merge = df_test_merge.rename(columns={'date': 'ds'})

In [12]:
submission = (
    df_test_merge[['id', 'unique_id', 'ds']]
    .merge(
        preds.rename(columns={'lgbm': 'sales'}),
        on=['unique_id', 'ds'],
        how='left'
    )
)

In [13]:
submission['sales'] = submission['sales'].clip(lower=0)

submission[['id', 'sales']].to_csv(
    'MLF_Oil_submission.csv',
    index=False
)

In [14]:
submission.isna().sum()


id           0
unique_id    0
ds           0
sales        0
dtype: int64

In [15]:
submission.head()


Unnamed: 0,id,unique_id,ds,sales
0,3000888,1_AUTOMOTIVE,2017-08-16,2.850697
1,3000889,1_BABY CARE,2017-08-16,0.349844
2,3000890,1_BEAUTY,2017-08-16,6.242645
3,3000891,1_BEVERAGES,2017-08-16,1960.950917
4,3000892,1_BOOKS,2017-08-16,0.349844


In [16]:
submission.tail()

Unnamed: 0,id,unique_id,ds,sales
28507,3029395,9_POULTRY,2017-08-31,404.697696
28508,3029396,9_PREPARED FOODS,2017-08-31,222.588489
28509,3029397,9_PRODUCE,2017-08-31,1671.431314
28510,3029398,9_SCHOOL AND OFFICE SUPPLIES,2017-08-31,228.406752
28511,3029399,9_SEAFOOD,2017-08-31,21.906337


In [17]:
submission.shape

(28512, 4)