In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from statsforecast import StatsForecast

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error
import time

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor
import lightgbm as lgb
import seaborn as sns



# Fourier terms (weekly)
def fourier_series(t, period, K):
    return pd.DataFrame({
        f'sin_{period}_{k}': np.sin(2 * np.pi * k * t / period)
        for k in range(1, K + 1)
    } | {
        f'cos_{period}_{k}': np.cos(2 * np.pi * k * t / period)
        for k in range(1, K + 1)
    })


In [7]:
dir='data/store-sales-time-series-forecasting'
file='train.csv'
df=pd.read_csv(f'{dir}/{file}')
df.date=pd.to_datetime(df.date)


file='test.csv'
df_test=pd.read_csv(f'{dir}/{file}')
df_test.date=pd.to_datetime(df_test.date)



In [8]:
df_test

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [3]:
global_rows = []

for (store_nbr, family), df_sf in df.groupby(['store_nbr', 'family']):
    
    df_sf = df_sf.sort_values('date').set_index('date')
    y = df_sf['sales']

    # ----- target transforms -----
    y_log = np.log1p(y)
    
    t = np.arange(len(y_log)).reshape(-1, 1)
    trend_model = LinearRegression()
    trend_model.fit(t, y_log.values)
    trend = trend_model.predict(t)
    
    y_detrended = y_log - trend
    weekly_seasonality = y_detrended.groupby(y_detrended.index.dayofweek).transform('mean')
    y_transformed = y_detrended - weekly_seasonality
    y_transformed = y_transformed.dropna()

    X = pd.DataFrame(index=y_transformed.index)

    # lags
    for lag in [1, 7, 14]:
        X[f'lag_{lag}'] = y_transformed.shift(lag)

    # rolling stats
    X['roll_mean_7'] = y_transformed.rolling(7).mean()
    X['roll_std_7'] = y_transformed.rolling(7).std()
    X['ewm_7'] = y_transformed.ewm(span=7).mean()

    # calendar
    X['dow'] = X.index.dayofweek
    X['month'] = X.index.month
    X['time_idx'] = np.arange(len(X))

    # Fourier
    fourier = fourier_series(
        t=np.arange(len(X)),
        period=7,
        K=2
    )
    fourier.index = X.index
    X = pd.concat([X, fourier], axis=1)

    data = pd.concat([X, y_transformed.rename('y')], axis=1).dropna()

    data['store_nbr'] = store_nbr
    data['family'] = family
    data['unique_id'] = f'{store_nbr}_{family}'

    global_rows.append(data)

In [26]:
global_train_df = pd.concat(global_rows).reset_index(drop=True)



drop_columns=[  'y',  'unique_id']

X_global = global_train_df.drop(columns=drop_columns)
y_global = global_train_df['y']

In [27]:
def clean_feature_names(df):
    df = df.copy()
    df.columns = (
        df.columns
        .str.replace(r"[^\w]", "_", regex=True)  # replace all non-alphanumeric
        .str.replace("__+", "_", regex=True)     # collapse multiple underscores
        .str.strip("_")
    )
    return df

In [28]:
X_global = pd.get_dummies(
    X_global,
    columns=['store_nbr', 'family'],
    drop_first=False
)

X_global = clean_feature_names(X_global)
feature_columns = X_global.columns

In [29]:
X_global.columns

Index(['lag_1', 'lag_7', 'lag_14', 'roll_mean_7', 'roll_std_7', 'ewm_7', 'dow',
       'month', 'time_idx', 'sin_7_1', 'sin_7_2', 'cos_7_1', 'cos_7_2',
       'store_nbr_1', 'store_nbr_2', 'store_nbr_3', 'store_nbr_4',
       'store_nbr_5', 'store_nbr_6', 'store_nbr_7', 'store_nbr_8',
       'store_nbr_9', 'store_nbr_10', 'store_nbr_11', 'store_nbr_12',
       'store_nbr_13', 'store_nbr_14', 'store_nbr_15', 'store_nbr_16',
       'store_nbr_17', 'store_nbr_18', 'store_nbr_19', 'store_nbr_20',
       'store_nbr_21', 'store_nbr_22', 'store_nbr_23', 'store_nbr_24',
       'store_nbr_25', 'store_nbr_26', 'store_nbr_27', 'store_nbr_28',
       'store_nbr_29', 'store_nbr_30', 'store_nbr_31', 'store_nbr_32',
       'store_nbr_33', 'store_nbr_34', 'store_nbr_35', 'store_nbr_36',
       'store_nbr_37', 'store_nbr_38', 'store_nbr_39', 'store_nbr_40',
       'store_nbr_41', 'store_nbr_42', 'store_nbr_43', 'store_nbr_44',
       'store_nbr_45', 'store_nbr_46', 'store_nbr_47', 'store_nbr_48',
     

In [30]:


global_model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

global_model.fit(X_global, y_global)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007781 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2996
[LightGBM] [Info] Number of data points in the train set: 2975940, number of used features: 100
[LightGBM] [Info] Start training from score 0.000658


In [33]:
test_feature_rows = []

for (store_nbr, family), df_sf in df.groupby(['store_nbr', 'family']):

    # ---- TRAIN HISTORY ----
    df_sf = df_sf.sort_values('date').set_index('date')
    y = df_sf['sales']

    # Target transforms (same as training)
    y_log = np.log1p(y)

    t = np.arange(len(y_log)).reshape(-1, 1)
    trend_model = LinearRegression()
    trend_model.fit(t, y_log.values)
    trend = trend_model.predict(t)

    y_detrended = y_log - trend
    weekly_seasonality = (
        y_detrended
        .groupby(y_detrended.index.dayofweek)
        .transform('mean')
    )
    y_transformed = y_detrended - weekly_seasonality

    # ---- TEST DATES ----
    test_dates = df_test.loc[
        (df_test.store_nbr == store_nbr) &
        (df_test.family == family),
        'date'
    ]

    if len(test_dates) == 0:
        continue

    # ---- EXTEND SERIES ----
    test_index = pd.DatetimeIndex(test_dates)
    full_index = y_transformed.index.append(test_index)
    y_full = y_transformed.reindex(full_index)

    # ---- FEATURES ----
    X = pd.DataFrame(index=full_index)

    for lag in [1, 7, 14]:
        X[f'lag_{lag}'] = y_full.shift(lag)

    X['roll_mean_7'] = y_full.rolling(7).mean()
    X['roll_std_7'] = y_full.rolling(7).std()
    X['ewm_7'] = y_full.ewm(span=7).mean()

    X['dow'] = X.index.dayofweek
    X['month'] = X.index.month
    X['time_idx'] = np.arange(len(X))

    fourier = fourier_series(
        t=np.arange(len(X)),
        period=7,
        K=2
    )
    fourier.index = X.index
    X = pd.concat([X, fourier], axis=1)

    # ---- KEEP ONLY TEST ROWS ----
    X = X.loc[test_dates]
    X['store_nbr'] = store_nbr
    X['family'] = family

    test_feature_rows.append(X)

In [34]:
assert isinstance(full_index, pd.DatetimeIndex)
assert full_index.is_monotonic_increasing

In [35]:
X_test_global = pd.concat(test_feature_rows)

In [36]:
X_test_global = pd.get_dummies(
    X_test_global,
    columns=['store_nbr', 'family'],
    drop_first=False
)

In [37]:
X_test_global = clean_feature_names(X_test_global)

In [38]:
X_test_global = X_test_global.reindex(
    columns=feature_columns,
    fill_value=0
)

In [39]:
assert list(X_test_global.columns) == list(feature_columns)

In [40]:
y_pred_transformed = global_model.predict(X_test_global)

In [41]:
# Add back mean seasonality and last trend value
seasonal_level = weekly_seasonality.mean()
trend_last = trend[-1]

y_pred_log = y_pred_transformed + seasonal_level + trend_last
y_pred = np.expm1(y_pred_log)

# Safety
y_pred = np.clip(y_pred, 0, None)

In [42]:
submission = df_test[['id']].copy()
submission['sales'] = y_pred

submission.to_csv('submission.csv', index=False)

In [43]:
submission.head()


Unnamed: 0,id,sales
0,3000888,0.257046
1,3000889,0.145738
2,3000890,0.161245
3,3000891,0.240132
4,3000892,0.10577


In [44]:
submission.isna().sum()


id       0
sales    0
dtype: int64

In [45]:
submission['sales'].describe()

count    28512.000000
mean         6.944674
std         32.691214
min          0.000000
25%          0.000000
50%          0.399409
75%          1.312754
max        613.791266
Name: sales, dtype: float64

In [46]:
submission.shape

(28512, 2)