# Time Series Forecasting using LightGBM

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Notebook-wide setup: silence library warnings and use seaborn's whitegrid theme.
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

# Location of the input CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/time-series-uk-supermarket-data/All_Data_ASDA.csv'

# Parse the 'date' column as datetimes while loading; later cells use the
# .dt accessor and date arithmetic on it.
df = pd.read_csv(DATA_PATH, parse_dates=['date'])

df.head()


In [None]:
# Report the dataset dimensions as a (rows, columns) tuple.
print((len(df), len(df.columns)))


In [None]:
# Show the column index, then the dtype / non-null summary. DataFrame.info()
# writes its report itself and returns None, so it is called on its own rather
# than being passed to print() (which would emit a stray "None").
print(df.columns)
df.info()

In [None]:
# Column roles used by the cells below.
target_col = 'prices_unit_(£)'  # value to forecast
date_col = 'date'               # observation timestamp
id_col = 'names'                # product identifier (one series per value)
cat_cols = ['unit', 'category', 'own_brand']

# Keep a single row per (product, date) and order each product's rows by
# date; the index is rebuilt so it runs 0..n-1 in that order.
df = (
    df.drop_duplicates(subset=[id_col, date_col])
      .sort_values([id_col, date_col])
      .reset_index(drop=True)
)

In [None]:
def fill_missing_dates(df, id_col, date_col):
    """Expand *df* so every id has one row for every calendar day.

    The day range runs from the earliest to the latest value of ``date_col``
    found anywhere in ``df`` (not per id). Rows that did not exist before are
    added with NaN in every other column.

    Args:
        df: Frame with at most one row per ``(id_col, date_col)`` pair;
            ``reindex`` raises on duplicate pairs.
        id_col: Name of the series-identifier column.
        date_col: Name of the datetime column.

    Returns:
        A new DataFrame with ``id_col`` and ``date_col`` as the leading
        columns, ordered id by id (in order of first appearance) with
        ascending dates inside each id.
    """
    dates = df[date_col]
    every_day = pd.date_range(start=dates.min(), end=dates.max(), freq='D')
    series_ids = df[id_col].unique()

    # The Cartesian product of ids x days is the target index.
    complete_index = pd.MultiIndex.from_product(
        (series_ids, every_day), names=[id_col, date_col]
    )

    indexed = df.set_index([id_col, date_col])
    return indexed.reindex(complete_index).reset_index()

# Densify the panel: afterwards every product has a row for every day in the
# overall date range, with NaN in the other columns of newly created rows.
df = fill_missing_dates(df=df, id_col=id_col, date_col=date_col)

In [None]:
# Impute the rows created by the date fill.
#
# 1. Within each product, carry the last known value forward (then the first
#    known value backward) for the price AND for the descriptive columns in
#    `cat_cols`. The gap-filled rows have NaN in those columns too, and the
#    category-level fallback below cannot see a row whose category is missing.
#    NOTE(review): this assumes unit / category / own_brand do not change
#    within a product — confirm against the data.
fill_cols = cat_cols + [target_col]
df[fill_cols] = df.groupby(id_col)[fill_cols].ffill()
df[fill_cols] = df.groupby(id_col)[fill_cols].bfill()

# 2. Fall back to the category median, then to the global median. `fillna`
#    only touches values that are still missing; assigning the output of a
#    groupby transform directly would blank out already-filled prices on rows
#    with a NaN category, because groupby drops NaN keys by default.
category_median = df.groupby('category')[target_col].transform('median')
df[target_col] = df[target_col].fillna(category_median)
df[target_col] = df[target_col].fillna(df[target_col].median())


In [None]:
# Sanity check: number of target values still missing after imputation.
remaining_missing = df[target_col].isna().sum()
print(remaining_missing)

In [None]:
# Encode own_brand as 0/1. Real booleans and their string spellings are both
# recognised; anything else (including missing) becomes 0.
brand_codes = {True: 1, False: 0, 'True': 1, 'False': 0}
own_brand_flag = df['own_brand'].map(brand_codes)
df['own_brand'] = own_brand_flag.fillna(0).astype(int)

# Missing text attributes get an explicit placeholder level.
for text_col in ('unit', 'category'):
    df[text_col] = df[text_col].fillna('Unknown')

In [None]:
def safe_lag_rolling(df, id_col, target, lags=(1, 7, 14), windows=(7, 30)):
    """Add per-series lag and trailing-window features, then drop incomplete rows.

    For every lag ``k`` a column ``lag_k`` holds the target ``k`` rows earlier
    within the same id. For every window ``w`` the columns ``roll_mean_w`` and
    ``roll_std_w`` summarise up to ``w`` values ending at the previous row of
    the same id, so the current row's target never feeds its own features.

    Rows must already be ordered by time within each id; lags and windows
    count rows, not calendar days.

    Args:
        df: Input frame; it is not modified.
        id_col: Column identifying each series.
        target: Column the features are derived from.
        lags: Row offsets for the lag features.
        windows: Window lengths (in rows) for the rolling statistics.

    Returns:
        A new DataFrame with the feature columns appended, containing only
        rows where the target and every feature are non-null, with a fresh
        0..n-1 index.
    """
    # The result is re-indexed on return anyway, so start from a clean unique
    # index; the rolling results below are aligned back by index label.
    df = df.reset_index(drop=True)
    ids = df[id_col]
    by_id = df.groupby(id_col)[target]

    for lag in lags:
        df[f'lag_{lag}'] = by_id.shift(lag)

    # Shift once so every window ends at the previous observation of the series.
    previous = by_id.shift(1)
    for w in windows:
        # Group again before rolling: groupby().shift() returns a plain Series,
        # and rolling over that would let a window at the start of one series
        # reach into the tail of the series stored above it.
        per_series = previous.groupby(ids).rolling(w, min_periods=1)
        df[f'roll_mean_{w}'] = per_series.mean().reset_index(level=0, drop=True)
        df[f'roll_std_{w}'] = per_series.std().reset_index(level=0, drop=True)

    feature_cols = (
        [f'lag_{lag}' for lag in lags]
        + [f'roll_mean_{w}' for w in windows]
        + [f'roll_std_{w}' for w in windows]
    )
    return df.dropna(subset=[target] + feature_cols).reset_index(drop=True)

# Build the lag / rolling features using the function's default lags and windows.
df = safe_lag_rolling(df, id_col=id_col, target=target_col)

# Report how much data survives the removal of incomplete feature rows.
n_rows = df.shape[0]
n_products = df[id_col].nunique()
print(f"After lag/rolling → {n_rows:,} rows, {n_products} products")


In [None]:
# Calendar features read off the date column. Keys are the new column names,
# values are the matching attributes of the pandas .dt accessor.
calendar_parts = {
    'dayofweek': 'dayofweek',
    'month': 'month',
    'quarter': 'quarter',
    'year': 'year',
    'dayofmonth': 'day',
}
for feature_name, dt_attr in calendar_parts.items():
    df[feature_name] = getattr(df[date_col].dt, dt_attr)

# pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = df['dayofweek'].ge(5).astype(int)


In [None]:
# Display the column index to review the columns present after feature engineering.
df.columns

In [None]:
# Hold out the final `horizon` days of the data as the test period.
horizon = 30
max_date = df[date_col].max()
cutoff = max_date - pd.Timedelta(days=horizon)

# df is ordered product-by-product, but TimeSeriesSplit (used for tuning
# below) cuts folds by row position. Sort both partitions by date so that
# "earlier rows" really means "earlier in time"; the stable sort keeps the
# existing product order within each day.
train = df[df[date_col] <= cutoff].sort_values(date_col, kind='stable')
test = df[df[date_col] > cutoff].sort_values(date_col, kind='stable')

# The target and the raw timestamp are excluded from the feature matrices.
X_train = train.drop(columns=[target_col, date_col])
y_train = train[target_col]
X_test = test.drop(columns=[target_col, date_col])
y_test = test[target_col]

In [None]:
# Text columns are converted to pandas categoricals before model fitting.
obj_cols = list(X_train.select_dtypes(include='object').columns)
print("Object columns:", obj_cols)

# The column list comes from the training frame and is applied to both frames.
for features in (X_train, X_test):
    for col in obj_cols:
        features[col] = features[col].astype('category')


In [None]:
# Base estimator. subsample_freq=1 switches row bagging on (re-drawn every
# iteration); with LightGBM's default of 0 the `subsample` values in the grid
# below are ignored and half of the candidates would be duplicates.
lgb_model = lgb.LGBMRegressor(random_state=42, subsample_freq=1)

# Expanding-window CV: each fold trains on an initial block of rows and
# validates on the block that follows, so the rows passed to fit() must be in
# chronological order for the folds to respect time.
tscv = TimeSeriesSplit(n_splits=3)

# 2^5 = 32 candidates, each fitted on 3 folds.
param_grid = {
    'num_leaves': [31, 63],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [200, 500],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Scores are negated RMSE, so "higher is better" still selects the lowest error.
grid = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
# best_estimator_ is the winning configuration refit on all of X_train.
best_model = grid.best_estimator_

In [None]:
# Score the tuned model on the held-out period.
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# scikit-learn reports MAPE as a fraction (0.05 means 5%), not a percentage.
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f" Test RMSE: {rmse:.3f}")
print(f" Test MAPE: {mape:.3f}")

In [None]:
# Plotting one point per row would chain every product's prices into a single
# jagged line, so average actuals and forecasts across products per date first.
daily = (
    pd.DataFrame({
        'date': test[date_col].to_numpy(),
        'actual': np.asarray(y_test),
        'forecast': np.asarray(y_pred),
    })
    .groupby('date')[['actual', 'forecast']]
    .mean()
)

plt.figure(figsize=(12,5))
plt.plot(daily.index, daily['actual'], label='Actual', color='black')
plt.plot(daily.index, daily['forecast'], label='Forecast', color='red')
plt.title("Price per Unit Forecast (daily mean across products)")
plt.xlabel("Date")
plt.ylabel("Price (£)")
plt.legend()
plt.show()