In [1]:
import os, gc, math, warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# Modeling
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

SEED = 42
np.random.seed(SEED)

# ---- Metrics (fixed) ----
def wmape(y_true, y_pred, eps=1e-9):
    denom = max(np.abs(y_true).sum(), eps)
    return np.abs(y_true - y_pred).sum() / denom

def accuracy_from_wmape(w):  # convenience
    return 1.0 - w

def bias(y_true, y_pred, eps=1e-9):
    pred_sum = max(np.abs(y_pred).sum(), eps)
    return (np.abs(y_true).sum() / pred_sum) - 1.0

def print_metrics(y_true, y_pred, label=""):
    w = wmape(y_true, y_pred)
    b = bias(y_true, y_pred)
    print(f"{label} WMAPE: {w:.4f} | Accuracy: {1-w:.4f} | Bias: {b:.4f}")


In [2]:
df = pd.read_csv('sales_pred_case/sales_pred_case.csv') # adjust if needed


# Types
df['YearWeek'] = df['YearWeek'].astype(str)
df['Key'] = df['Key'].astype(str)
df['Sales'] = df['Sales'].astype(float)

print("Shape:", df.shape)
print("Keys:", df['Key'].nunique(), "| Weeks:", df['YearWeek'].min(), "→", df['YearWeek'].max())
print("Zero %:", (df['Sales']==0).mean()*100)


Shape: (143273, 20)
Keys: 970 | Weeks: 2020-01 → 2023-03
Zero %: 56.21505796626022


In [3]:
# Exact prediction window (9 weeks)
PRED_WEEKS = [f"2022-{w:02d}" for w in range(46, 53)] + [f"2023-{w:02d}" for w in range(1, 3)]
CUTOFF_TRAIN = "2022-45"
VALID_WEEKS = [f"2022-{w:02d}" for w in range(41, 46)]

def add_time_index(d):
    d = d.copy()
    year = d['YearWeek'].str[:4].astype(int)
    week = d['YearWeek'].str[5:7].astype(int)
    d['yw_index'] = (year - year.min()) * 60 + week
    return d

df = add_time_index(df)
print("Valid weeks:", VALID_WEEKS)
print("Pred weeks :", PRED_WEEKS)


Valid weeks: ['2022-41', '2022-42', '2022-43', '2022-44', '2022-45']
Pred weeks : ['2022-46', '2022-47', '2022-48', '2022-49', '2022-50', '2022-51', '2022-52', '2023-01', '2023-02']


In [4]:
print("="*50)
print("FEATURE ENGINEERING (leak-safe)")
print("="*50)

dfe = df.sort_values(['Key','yw_index']).copy()

# Calendar helpers from existing cols (already provided)
# We keep: Week, Month, Qtr, holidays, price, promo, objectives, etc.

# Lags
LAGS = (1,2,3,4,8,12,26,52)
def make_lags(g):
    g = g.sort_values('yw_index').copy()
    for L in LAGS:
        g[f'Sales_lag_{L}'] = g['Sales'].shift(L)
    return g

dfe = dfe.groupby('Key', group_keys=False).apply(make_lags)

# Rolling stats on shifted sales (no leakage)
ROLLS = (3,8,12,26)
def make_rolls(g):
    g = g.sort_values('yw_index').copy()
    s = g['Sales'].shift(1)
    for W in ROLLS:
        g[f'Sales_rollmean_{W}'] = s.rolling(W, min_periods=2).mean()
        g[f'Sales_rollstd_{W}']  = s.rolling(W, min_periods=2).std()
    return g

dfe = dfe.groupby('Key', group_keys=False).apply(make_rolls)

# EWM on shifted sales
def make_ewm(g):
    g = g.sort_values('yw_index').copy()
    s = g['Sales'].shift(1)
    g['Sales_ewm_4'] = s.ewm(span=4).mean()
    g['Sales_ewm_8'] = s.ewm(span=8).mean()
    return g

dfe = dfe.groupby('Key', group_keys=False).apply(make_ewm)

# Growth rates from lags (no current value)
dfe['Sales_growth_1'] = (dfe['Sales_lag_1'] - dfe['Sales_lag_2']) / (dfe['Sales_lag_2'].replace(0, np.nan))
dfe['Sales_growth_4'] = (dfe['Sales_lag_1'] - dfe['Sales_lag_4']) / (dfe['Sales_lag_4'].replace(0, np.nan))
dfe[['Sales_growth_1','Sales_growth_4']] = dfe[['Sales_growth_1','Sales_growth_4']].replace([np.inf,-np.inf], np.nan)

# Interactions
dfe['DiscountedPrice_x_PromoShipment'] = dfe['DiscountedPrice'] * dfe['PromoShipment']
dfe['Holiday_sum'] = dfe['New_Year'] + dfe['Christmas_Day'] + dfe['Easter_Monday'] + dfe['Other_Holidays']
dfe['Is_Holiday'] = (dfe['Holiday_sum'] > 0).astype(int)

# Key-level stats (safe: computed from entire history, small leakage risk but static)
key_stats = dfe.groupby('Key')['Sales'].agg(['mean','std','min','max']).add_prefix('Key_')
dfe = dfe.merge(key_stats, left_on='Key', right_index=True, how='left')

print("After FE:", dfe.shape)


FEATURE ENGINEERING (leak-safe)
After FE: (143273, 48)


In [5]:
def split_weeks(d):
    train = d[d['YearWeek'] <= CUTOFF_TRAIN].copy()
    valid = d[d['YearWeek'].isin(VALID_WEEKS)].copy()
    testp = d[d['YearWeek'].isin(PRED_WEEKS)].copy()
    return train, valid, testp

train, valid, testp = split_weeks(dfe)

# Drop rows without required history
train = train.dropna(subset=[c for c in train.columns if c.startswith('Sales_lag_')])

BASE_FEATS = [
    'Material','Customer','CustomerGroup','Category',
    'Week','Month','Qtr',
    'New_Year','Christmas_Day','Easter_Monday','Other_Holidays',
    'DiscountedPrice','PromoShipment','Objective1','Objective2','PromoMethod','PromoStatus',
    'DiscountedPrice_x_PromoShipment','Is_Holiday','Holiday_sum',
    'Key_mean','Key_std','Key_min','Key_max'
]
LAG_FEATS  = [c for c in dfe.columns if c.startswith('Sales_lag_')]
ROLL_FEATS = [c for c in dfe.columns if c.startswith('Sales_roll')]
EWM_FEATS  = ['Sales_ewm_4','Sales_ewm_8']
GRW_FEATS  = ['Sales_growth_1','Sales_growth_4']

FEATS = BASE_FEATS + LAG_FEATS + ROLL_FEATS + EWM_FEATS + GRW_FEATS

print("Train weeks:", train['YearWeek'].min(), "→", train['YearWeek'].max())
print("Valid weeks:", sorted(valid['YearWeek'].unique()))
print("Pred weeks :", sorted(testp['YearWeek'].unique()))
print("Features  :", len(FEATS))


Train weeks: 2020-53 → 2022-45
Valid weeks: ['2022-41', '2022-42', '2022-43', '2022-44', '2022-45']
Pred weeks : ['2022-46', '2022-47', '2022-48', '2022-49', '2022-50', '2022-51', '2022-52', '2023-01', '2023-02']
Features  : 44


In [6]:
X_tr = train[FEATS].copy()
y_tr = train['Sales'].values
X_va = valid[FEATS].copy()
y_va = valid['Sales'].values

# Fill remaining NaNs (from std/growth early periods)
X_tr = X_tr.fillna(0.0)
X_va = X_va.fillna(0.0)

lgb_params = dict(
    objective='mae',      # L1 to align with WMAPE
    metric='mae',
    boosting_type='gbdt',
    num_leaves=128,
    learning_rate=0.02,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    min_data_in_leaf=50,
    lambda_l1=0.1,
    lambda_l2=0.1,
    verbose=-1,
    random_state=SEED
)

dtrain = lgb.Dataset(X_tr, label=y_tr)
dvalid = lgb.Dataset(X_va, label=y_va, reference=dtrain)

lgb_model = lgb.train(
    lgb_params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    valid_names=['train','valid'],
    num_boost_round=2000,
    callbacks=[lgb.early_stopping(150), lgb.log_evaluation(200)]
)

va_pred = lgb_model.predict(X_va)
print_metrics(y_va, va_pred, "Validation (2022-41..45)")

# ---- Baseline blend (safe, time-ordered, no leakage) ----
alpha = 0.2  # 20% baseline, 80% model

# Ensure training data is sorted by time so "tail(4)" is truly the latest 4 per key
train_sorted = train.sort_values(['Key', 'yw_index'])

# Per-key average of the last 4 observed weeks in TRAIN (≤ 2022-45)
last4wk = train_sorted.groupby('Key')['Sales'].apply(lambda s: s.tail(4).mean())

# Map baseline to the validation rows by Key
baseline = valid['Key'].map(last4wk).astype(float).fillna(0.0).values

# Blend baseline with the LGB predictions
blended = alpha * baseline + (1.0 - alpha) * va_pred
print_metrics(y_va, blended, "LGBM+Baseline (20% key-avg + 80% model)")


Training until validation scores don't improve for 150 rounds
[200]	train's l1: 131.248	valid's l1: 165.584
[400]	train's l1: 127.82	valid's l1: 162.415
[600]	train's l1: 125.153	valid's l1: 159.392
[800]	train's l1: 122.785	valid's l1: 156.777
[1000]	train's l1: 120.734	valid's l1: 154.275
[1200]	train's l1: 118.725	valid's l1: 151.707
[1400]	train's l1: 117.138	valid's l1: 149.99
[1600]	train's l1: 115.771	valid's l1: 148.339
[1800]	train's l1: 114.453	valid's l1: 146.739
[2000]	train's l1: 113.414	valid's l1: 145.525
Did not meet early stopping. Best iteration is:
[2000]	train's l1: 113.414	valid's l1: 145.525
Validation (2022-41..45) WMAPE: 0.5472 | Accuracy: 0.4528 | Bias: 0.2748
LGBM+Baseline (20% key-avg + 80% model) WMAPE: 0.5608 | Accuracy: 0.4392 | Bias: 0.2110


In [7]:
rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=16,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    n_jobs=-1,
    random_state=SEED
)
rf.fit(X_tr, y_tr)
va_pred_rf = rf.predict(X_va)

print_metrics(y_va, va_pred_rf, "Validation RF")

# Grid of weights; here we MINIMIZE WMAPE (not maximize)
candidates = [(0.9,0.1),(0.8,0.2),(0.7,0.3),(0.6,0.4),(0.5,0.5)]
best = (None, 1e9)
for w_lgb, w_rf in candidates:
    blend = w_lgb*va_pred + w_rf*va_pred_rf
    w = wmape(y_va, blend)
    if w < best[1]:
        best = ((w_lgb, w_rf), w)
best_weights, best_wmape = best
print(f"Best ensemble weights (LGB, RF): {best_weights} | WMAPE: {best_wmape:.4f}")


Validation RF WMAPE: 0.5360 | Accuracy: 0.4640 | Bias: 0.0097
Best ensemble weights (LGB, RF): (0.5, 0.5) | WMAPE: 0.5387


In [8]:
# Retrain on all data up to cutoff
full_hist = dfe[dfe['YearWeek'] <= CUTOFF_TRAIN].copy()
full_hist = full_hist.dropna(subset=[c for c in full_hist.columns if c.startswith('Sales_lag_')])

X_full = full_hist[FEATS].fillna(0.0)
y_full = full_hist['Sales'].values

final_lgb = lgb.train(lgb_params, lgb.Dataset(X_full, label=y_full), num_boost_round=lgb_model.best_iteration or 1000)

# If using the ensemble:
use_rf = True
if use_rf:
    final_rf = RandomForestRegressor(
        n_estimators=rf.n_estimators,
        max_depth=rf.max_depth,
        min_samples_split=rf.min_samples_split,
        min_samples_leaf=rf.min_samples_leaf,
        max_features=rf.max_features,
        n_jobs=-1,
        random_state=SEED
    )
    final_rf.fit(X_full, y_full)

# Make a working copy we can roll forward
work = df.copy()
work = add_time_index(work)
work = work.sort_values(['Key','yw_index'])
work = work.groupby('Key', group_keys=False).apply(make_lags)
work = work.groupby('Key', group_keys=False).apply(make_rolls)
work = work.groupby('Key', group_keys=False).apply(make_ewm)

preds = []
for wk in PRED_WEEKS:
    step = work[work['YearWeek'] == wk].copy()

    # Build feature frame for this step
    # We need all engineered columns; recompute helper cols that depend on lagged sales already in `work`
    step = step.merge(
        dfe[['Key','YearWeek'] + BASE_FEATS].drop_duplicates(),
        on=['Key','YearWeek'],
        how='left'
    )

    # Ensure all FEATS exist
    for c in FEATS:
        if c not in step.columns:
            step[c] = np.nan

    X_step = step[FEATS].fillna(0.0)
    yhat_lgb = final_lgb.predict(X_step)
    if use_rf:
        yhat_rf = final_rf.predict(X_step)
        yhat = best_weights[0]*yhat_lgb + best_weights[1]*yhat_rf
    else:
        yhat = yhat_lgb

    step['Pred'] = np.clip(yhat, 0, None)  # non-negative
    preds.append(step[['Key','YearWeek','Pred','yw_index']])

    # Write predictions back into 'work' as if they were observed Sales to advance lags
    idx = work['YearWeek'].eq(wk)
    work.loc[idx, 'Sales'] = step['Pred'].values

    # Rebuild lags/rolls/ewm for future weeks
    work = work.sort_values(['Key','yw_index'])
    work = work.groupby('Key', group_keys=False).apply(make_lags)
    work = work.groupby('Key', group_keys=False).apply(make_rolls)
    work = work.groupby('Key', group_keys=False).apply(make_ewm)

pred_df = pd.concat(preds, ignore_index=True)
pred_df = pred_df.sort_values(['Key','yw_index'])
print("Forecast rows:", pred_df.shape)
print("Weeks covered:", pred_df['YearWeek'].unique())


Forecast rows: (8730, 4)
Weeks covered: ['2022-46' '2022-47' '2022-48' '2022-49' '2022-50' '2022-51' '2022-52'
 '2023-01' '2023-02']


In [9]:
print("="*50)
print("EVALUATION & EXPORT")
print("="*50)

# 1) Proper evaluation on validation weeks only
val_truth = valid[['Key','YearWeek','Sales']].copy()
val_pred  = pd.DataFrame({'Key': valid['Key'], 'YearWeek': valid['YearWeek']})
val_pred['Pred'] = va_pred  # from the LGB validation prediction
if 'va_pred_rf' in globals():
    # use ensemble if you trained RF and found best_weights
    val_pred['Pred'] = best_weights[0]*va_pred + best_weights[1]*va_pred_rf

val_eval = val_truth.merge(val_pred, on=['Key','YearWeek'], how='left')
print_metrics(val_eval['Sales'].values, val_eval['Pred'].values, "Validation (2022-41..45)")

print("\nValidation weekly sums (sanity check):")
print(val_eval.groupby('YearWeek').agg(Sales=('Sales','sum'), Pred=('Pred','sum')))

# 2) Final 9-week forecast: export only (do not evaluate against 'Sales' since they are zero placeholders)
truth_9w = df[df['YearWeek'].isin(PRED_WEEKS)][['Key','YearWeek','Sales']].copy()
sums_9w_truth = truth_9w['Sales'].abs().sum()
print(f"\nGround-truth availability for forecast window: Σ|Sales| = {sums_9w_truth:.1f}")
if sums_9w_truth < 1e-6:
    print("Ground truth appears hidden (all zeros). Skipping 9-week WMAPE/Bias.")

# Save predictions
submission = pred_df[['Key','YearWeek','Pred']].rename(columns={'Pred':'Prediction'}).copy()
submission = submission.sort_values(['Key','YearWeek'])
submission.to_csv("predictions_2022-46_to_2023-02.csv", index=False)
print("Saved: predictions_2022-46_to_2023-02.csv")


EVALUATION & EXPORT
Validation (2022-41..45) WMAPE: 0.5387 | Accuracy: 0.4613 | Bias: 0.1272

Validation weekly sums (sanity check):
             Sales           Pred
YearWeek                         
2022-41   265160.0  207178.737454
2022-42   232719.0  237227.966944
2022-43   261467.0  241134.539897
2022-44   252236.0  239431.807359
2022-45   278212.0  219279.502823

Ground-truth availability for forecast window: Σ|Sales| = 0.0
Ground truth appears hidden (all zeros). Skipping 9-week WMAPE/Bias.
Saved: predictions_2022-46_to_2023-02.csv
