In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_log_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
PATH = "results/df_featured_full.parquet"
df = pd.read_parquet(PATH)

if not np.issubdtype(df['date'].dtype, np.datetime64):
    df['date'] = pd.to_datetime(df['date'])

pairs = df.groupby(['store_nbr', 'item_nbr']).agg({
    'family': 'first',
    'perishable': 'first',
}).reset_index()

PAIRS_PER_FAMILY = 50
sampled_pairs = (
    pairs.sample(frac=1, random_state=42)
    .groupby('family')
    .head(PAIRS_PER_FAMILY)
    .reset_index(drop=True)
)

df_sample = df.merge(
    sampled_pairs[['store_nbr', 'item_nbr']],
    on=['store_nbr', 'item_nbr'],
    how='inner'
).copy()

print(f"Sampled dataset: {df_sample.shape}")
del df
import gc
gc.collect()

Sampled dataset: (866805, 46)


0

In [3]:
df_sample = df_sample.sort_values(['store_nbr', 'item_nbr', 'date']).reset_index(drop=True)

print("✓ Data sorted")


✓ Data sorted


In [4]:
max_date = df_sample['date'].max()
test_start = max_date - pd.Timedelta(days=15)

# Test set is the last 16 days
test_mask = df_sample['date'] >= test_start

# Split remaining data into train and validation
non_test = df_sample[~test_mask].index.tolist()
np.random.seed(42)
np.random.shuffle(non_test)

valid_size = int(0.015 * len(non_test))
valid_idx = non_test[:valid_size]
train_idx = non_test[valid_size:]

df_sample['set'] = 'test'
df_sample.loc[train_idx, 'set'] = 'train'
df_sample.loc[valid_idx, 'set'] = 'valid'

print(f"\nSplit distribution:")
print(df_sample['set'].value_counts())
print(df_sample.groupby('set')['date'].agg(['min', 'max']))


Split distribution:
set
train    842725
valid     12833
test      11247
Name: count, dtype: int64
             min        max
set                        
test  2017-07-31 2017-08-15
train 2013-01-01 2017-07-30
valid 2013-01-02 2017-07-30


In [5]:
feature_cols = [
    'store_nbr',
    'item_nbr',
    'onpromotion',
    'day_of_week',
    'day_of_month',
    'month',
    'perishable',
    'is_weekend',
    'is_holiday',
]

print(f"\nTotal features: {len(feature_cols)}")



Total features: 9


In [6]:
train_df = df_sample[df_sample['set'] == 'train'].copy()
valid_df = df_sample[df_sample['set'] == 'valid'].copy()
test_df  = df_sample[df_sample['set'] == 'test'].copy()

X_train = train_df[feature_cols].copy()
X_valid = valid_df[feature_cols].copy()
X_test  = test_df[feature_cols].copy()

y_train = train_df['unit_sales'].values
y_valid = valid_df['unit_sales'].values
y_test  = test_df['unit_sales'].values

# Handle categorical columns
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
bool_cols = X_train.select_dtypes(include=['bool']).columns.tolist()

for X in [X_train, X_valid, X_test]:
    for c in cat_cols:
        X[c] = X[c].astype(str).astype('category')
    for c in bool_cols:
        X[c] = X[c].astype(int)

print(f"\nX_train: {X_train.shape}")
print(f"X_valid: {X_valid.shape}")
print(f"X_test:  {X_test.shape}")

train_weights = 1 + train_df['perishable'].values
valid_weights = 1 + valid_df['perishable'].values


X_train: (842725, 9)
X_valid: (12833, 9)
X_test:  (11247, 9)


In [7]:
def rmsle(y_true, y_pred):
    y_true = np.clip(y_true, 0, None)
    y_pred = np.clip(y_pred, 0, None)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

def smape(y_true, y_pred):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    denom = np.abs(y_true) + np.abs(y_pred)
    mask = denom != 0
    out = np.zeros_like(denom)
    out[mask] = 2.0 * np.abs(y_pred[mask] - y_true[mask]) / denom[mask]
    return np.mean(out)

def nwrmsle(y_true, y_pred, perishable):
    y_true = np.clip(y_true, 0, None)
    y_pred = np.clip(y_pred, 0, None)
    w = 1 + (perishable == 1)
    msle = (w * (np.log1p(y_pred) - np.log1p(y_true))**2).sum() / w.sum()
    return np.sqrt(msle)


In [8]:
print("\nCreating DMatrix objects...")

dtrain = xgb.DMatrix(
    X_train, 
    label=y_train, 
    weight=train_weights,
    enable_categorical=True
)

dvalid = xgb.DMatrix(
    X_valid, 
    label=y_valid,
    weight=valid_weights,
    enable_categorical=True
)

dtest = xgb.DMatrix(
    X_test, 
    label=y_test,
    enable_categorical=True
)

print("✓ DMatrix created")


Creating DMatrix objects...
✓ DMatrix created


In [9]:
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'device': 'cuda',
    'max_depth': 25,
    'learning_rate': 0.3,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'min_child_weight': 1,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0,
    'random_state': 42,
}

print("\nTraining XGBoost...")

evals = [(dtrain, 'train'), (dvalid, 'valid')]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=2000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=100
)

print(f"\nBest iteration: {model.best_iteration}")


Training XGBoost...
[0]	train-rmse:11.43120	valid-rmse:11.80289
[53]	train-rmse:0.55418	valid-rmse:10.60624

Best iteration: 3


In [10]:
y_train_pred = model.predict(dtrain)
y_valid_pred = model.predict(dvalid)
y_test_pred  = model.predict(dtest)

In [11]:
print("\n" + "="*60)
print("XGBOOST EVALUATION RESULTS")
print("="*60)

print("\n--- TRAIN ---")
print(f"RMSLE:  {rmsle(y_train, y_train_pred):.6f}")
print(f"SMAPE:  {smape(y_train, y_train_pred):.6f}")

print("\n--- VALID ---")
print(f"RMSLE:  {rmsle(y_valid, y_valid_pred):.6f}")
print(f"SMAPE:  {smape(y_valid, y_valid_pred):.6f}")

print("\n--- TEST (last 16 days) ---")
print(f"RMSLE:  {rmsle(y_test, y_test_pred):.6f}")
print(f"SMAPE:  {smape(y_test, y_test_pred):.6f}")

print(f"\nNWRMSLE (train): {nwrmsle(y_train, y_train_pred, train_df['perishable'].values):.6f}")
print(f"NWRMSLE (valid): {nwrmsle(y_valid, y_valid_pred, valid_df['perishable'].values):.6f}")
print(f"NWRMSLE (test):  {nwrmsle(y_test, y_test_pred, test_df['perishable'].values):.6f}")


XGBOOST EVALUATION RESULTS

--- TRAIN ---
RMSLE:  0.067780
SMAPE:  0.010723

--- VALID ---
RMSLE:  0.647197
SMAPE:  0.577340

--- TEST (last 16 days) ---
RMSLE:  0.710084
SMAPE:  0.610904

NWRMSLE (train): 0.061692
NWRMSLE (valid): 0.654354
NWRMSLE (test):  0.713441


In [12]:
importance = model.get_score(importance_type='gain')

fi = pd.DataFrame({
    'feature': list(importance.keys()),
    'importance': list(importance.values())
}).sort_values('importance', ascending=False)

print("\n" + "="*60)
print("FEATURE IMPORTANCE (GAIN)")
print("="*60)
print(fi.to_string(index=False))


FEATURE IMPORTANCE (GAIN)
     feature  importance
  perishable  984.841187
    item_nbr  142.423126
 onpromotion   87.837112
   store_nbr   67.609825
day_of_month   23.381880
 day_of_week   21.657513
  is_holiday   19.437176
       month   13.377021


In [13]:
test_df = test_df.copy()
test_df['pred'] = y_test_pred

date_rmsle = (
    test_df.groupby('date')
    .apply(lambda x: rmsle(x['unit_sales'], x['pred']))
    .rename('rmsle')
    .reset_index()
)

date_holiday_flag = df_sample[['date', 'is_holiday']].drop_duplicates()
date_rmsle = date_rmsle.merge(date_holiday_flag, on='date', how='left')

print("\n" + "="*60)
print("RMSLE BY DATE (TEST) - with holiday flag")
print("="*60)
print(date_rmsle.to_string(index=False))
print(f"\nMean RMSLE: {date_rmsle['rmsle'].mean():.6f}")
print(f"Std RMSLE:  {date_rmsle['rmsle'].std():.6f}")



RMSLE BY DATE (TEST) - with holiday flag
      date    rmsle  is_holiday
2017-07-31 0.714330           0
2017-08-01 0.712513           0
2017-08-02 0.709579           0
2017-08-03 0.682135           0
2017-08-04 0.701710           0
2017-08-05 0.710719           1
2017-08-06 0.718463           0
2017-08-07 0.743466           0
2017-08-08 0.704992           0
2017-08-09 0.708029           0
2017-08-10 0.743440           1
2017-08-11 0.691486           1
2017-08-12 0.701753           0
2017-08-13 0.709716           0
2017-08-14 0.701770           0
2017-08-15 0.705739           1

Mean RMSLE: 0.709990
Std RMSLE:  0.015719


In [15]:
# RANDOM SAMPLE OF 20 ITEMS - FOR MODEL COMPARISON
np.random.seed(99)
all_items = test_df['item_nbr'].unique()
SAMPLE_ITEMS = np.random.choice(all_items, size=20, replace=False)

print("="*90)
print("RANDOM 20 ITEMS - MODEL COMPARISON")
print("="*90)
print(f"Selected Items: {sorted(SAMPLE_ITEMS)}")

# Filter for sample items (all test dates)
sample_df = test_df[test_df['item_nbr'].isin(SAMPLE_ITEMS)].copy()

print(f"Total records: {len(sample_df)}")

# Calculate errors
sample_df['error'] = sample_df['pred'] - sample_df['unit_sales']
sample_df['abs_error'] = np.abs(sample_df['error'])

# RMSLE BY ITEM
print("\n" + "="*90)
print("RMSLE BY ITEM")
print("="*90)

item_metrics = sample_df.groupby('item_nbr').apply(
    lambda x: pd.Series({
        'n_records': len(x),
        'avg_true_sales': x['unit_sales'].mean(),
        'avg_predicted': x['pred'].mean(),
        'MAE': x['abs_error'].mean(),
        'RMSLE': rmsle(x['unit_sales'], x['pred']),
        'family': x['family'].iloc[0]
    })
).round(4)

item_metrics = item_metrics.sort_values('RMSLE', ascending=False)
print(item_metrics.to_string())

# OVERALL METRICS FOR THESE 20 ITEMS
print("\n" + "="*90)
print("OVERALL METRICS (20 ITEMS SAMPLE)")
print("="*90)
print(f"RMSLE:       {rmsle(sample_df['unit_sales'], sample_df['pred']):.6f}")
print(f"MAE:         {sample_df['abs_error'].mean():.4f}")
print(f"Avg True:    {sample_df['unit_sales'].mean():.4f}")
print(f"Avg Pred:    {sample_df['pred'].mean():.4f}")

# SAMPLE PREDICTIONS (first 50 rows)
print("\n" + "="*90)
print("SAMPLE PREDICTIONS (50 rows)")
print("="*90)

sample_preview = sample_df[[
    'date', 'store_nbr', 'item_nbr', 'family', 'day_of_week', 'onpromotion', 'is_holiday',
    'unit_sales', 'pred', 'abs_error'
]].copy()

sample_preview = sample_preview.rename(columns={
    'unit_sales': 'true_sales',
    'pred': 'predicted',
    'day_of_week': 'dow',
    'onpromotion': 'promo'
})

sample_preview['predicted'] = sample_preview['predicted'].round(2)
sample_preview['abs_error'] = sample_preview['abs_error'].round(2)
sample_preview = sample_preview.sort_values(['item_nbr', 'date', 'store_nbr'])

print(sample_preview.head(50).to_string(index=False))

# METRICS BY FAMILY
print("\n" + "="*90)
print("RMSLE BY FAMILY (within 20 items sample)")
print("="*90)

family_metrics = sample_df.groupby('family').apply(
    lambda x: pd.Series({
        'n_items': x['item_nbr'].nunique(),
        'n_records': len(x),
        'avg_true': x['unit_sales'].mean(),
        'avg_pred': x['pred'].mean(),
        'MAE': x['abs_error'].mean(),
        'RMSLE': rmsle(x['unit_sales'], x['pred'])
    })
).round(4)

family_metrics = family_metrics.sort_values('RMSLE', ascending=False)
print(family_metrics.to_string())

# METRICS BY PROMOTION STATUS
print("\n" + "="*90)
print("RMSLE BY PROMOTION STATUS (within 20 items sample)")
print("="*90)

promo_metrics = sample_df.groupby('onpromotion').apply(
    lambda x: pd.Series({
        'n_records': len(x),
        'avg_true': x['unit_sales'].mean(),
        'avg_pred': x['pred'].mean(),
        'MAE': x['abs_error'].mean(),
        'RMSLE': rmsle(x['unit_sales'], x['pred'])
    })
).round(4)

promo_metrics.index = ['No Promo', 'On Promo']
print(promo_metrics.to_string())

# METRICS BY DAY OF WEEK
print("\n" + "="*90)
print("RMSLE BY DAY OF WEEK (within 20 items sample)")
print("="*90)

dow_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 
             4: 'Friday', 5: 'Saturday', 6: 'Sunday'}

dow_metrics = sample_df.groupby('day_of_week').apply(
    lambda x: pd.Series({
        'n_records': len(x),
        'avg_true': x['unit_sales'].mean(),
        'avg_pred': x['pred'].mean(),
        'MAE': x['abs_error'].mean(),
        'RMSLE': rmsle(x['unit_sales'], x['pred'])
    })
).round(4)

dow_metrics.index = dow_metrics.index.map(dow_names)
print(dow_metrics.to_string())

# ============================================================
# QUICK COMPARISON SUMMARY
# ============================================================
print("\n" + "="*90)
print("COMPARISON")
print("="*90)
print(f"""
MODEL: XGboost Final
------------------------
Overall RMSLE (20 items): {rmsle(sample_df['unit_sales'], sample_df['pred']):.6f}
Overall MAE (20 items):   {sample_df['abs_error'].mean():.4f}

Best Item RMSLE:  {item_metrics['RMSLE'].min():.4f} (item {item_metrics['RMSLE'].idxmin()})
Worst Item RMSLE: {item_metrics['RMSLE'].max():.4f} (item {item_metrics['RMSLE'].idxmax()})
Std of Item RMSLE: {item_metrics['RMSLE'].std():.4f}
""")

RANDOM 20 ITEMS - MODEL COMPARISON
Selected Items: [167437, 322094, 507958, 759894, 852937, 956013, 1085246, 1229440, 1239808, 1370542, 1373944, 1430083, 1456910, 1459058, 1584348, 1900715, 1931079, 2046297, 2046298, 2053751]
Total records: 274

RMSLE BY ITEM
          n_records  avg_true_sales  avg_predicted      MAE   RMSLE               family
item_nbr                                                                                
507958           23         13.2666       7.073000  11.6019  1.1607              POULTRY
2053751          16          6.3125       4.438100   5.8738  1.1405            BEVERAGES
1239808          14          7.9286      11.586800   6.7829  0.8078                DAIRY
759894           28          3.0714       4.925900   3.1916  0.7913        PERSONAL CARE
1085246          16         11.1907       7.412000   5.6248  0.7012                MEATS
167437            6          3.3333       4.288700   3.0898  0.6837           GROCERY II
1430083           5         