#Env Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

with open('/content/drive/MyDrive/MLFinal/git_token.env', 'r') as f:
    token = f.read().strip()

username = "badrilosaberidze"

%cd /content/drive/MyDrive/MLFinal/walmart-sales-forecasting
!git remote set-url origin https://{username}:{token}@github.com/{username}/Walmart-Recruiting---Store-Sales-Forecasting.git
!git pull

/content/drive/MyDrive/MLFinal/walmart-sales-forecasting
Already up to date.


In [3]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, TargetEncoder
from sklearn.metrics import mean_absolute_error
import wandb

#Custom WMae functions

In [4]:
def wmae_objective(y_pred, y_true, is_holiday):
    """
    Custom WMAE objective function for XGBoost
    """
    # Get holiday weights
    weights = np.where(is_holiday, 5.0, 1.0)

    # Calculate weighted residuals
    residuals = y_true.get_label() - y_pred
    weighted_residuals = weights * residuals

    # Gradient: derivative of WMAE w.r.t. predictions
    grad = -np.sign(weighted_residuals) * weights / np.sum(weights)

    # Hessian: second derivative (approximate for MAE)
    hess = np.ones_like(grad) * 0.1  # Small constant for numerical stability

    return grad, hess


In [5]:
def wmae_eval(y_pred, y_true, is_holiday):
    """
    Custom WMAE evaluation function for XGBoost
    """
    weights = np.where(is_holiday, 5.0, 1.0)
    weighted_errors = weights * np.abs(y_true.get_label() - y_pred)
    wmae = np.sum(weighted_errors) / np.sum(weights)
    return 'wmae', wmae

In [6]:
def calculate_wmae(y_true, y_pred, is_holiday):
    """
    Calculate Weighted Mean Absolute Error (WMAE)
    Holiday weeks have 5x weight
    """
    weights = np.where(is_holiday, 5.0, 1.0)
    weighted_errors = weights * np.abs(y_true - y_pred)
    wmae = np.sum(weighted_errors) / np.sum(weights)
    return wmae

#Advanced XGBoost model

In [22]:
class AdvancedWalmartXGBoost:
    """
    Advanced XGBoost with time series features and custom WMAE objective
    """

    def __init__(self):
        self.model = None
        self.feature_names = None
        self.target_encoders = {}
        self.feature_importance = None
        self.metrics = None

    def create_advanced_features(self, df, features_df, stores_df, is_training=True):
        """
        Create advanced features
        """
        print("🔧 Creating features (same for train/test)...")

        data = df.copy()
        data = data.merge(features_df, on=['Store', 'Date'], how='left')
        data = data.merge(stores_df, on='Store', how='left')
        data['Date'] = pd.to_datetime(data['Date'])

        # TIME FEATURES (same for both)
        data['Year'] = data['Date'].dt.year
        data['Month'] = data['Date'].dt.month
        data['Week'] = data['Date'].dt.isocalendar().week
        data['Quarter'] = data['Date'].dt.quarter
        data['WeekOfMonth'] = data['Date'].dt.day // 7 + 1

        # HOLIDAY FEATURES (same for both)
        holiday_col = 'IsHoliday_x' if 'IsHoliday_x' in data.columns else 'IsHoliday'
        if holiday_col in data.columns:
            data['IsHoliday'] = data[holiday_col].astype(int)
        else:
            data['IsHoliday'] = 0

        data['Holiday_Month'] = data['IsHoliday'] * data['Month']
        data['Christmas_Season'] = ((data['Month'] == 12) & (data['Week'] >= 50)).astype(int)
        data['Thanksgiving_Week'] = ((data['Month'] == 11) & (data['Week'] >= 46)).astype(int)

        # CATEGORICAL ENCODING (same for both)
        data['Store'] = data['Store'].astype('category').cat.codes
        data['Dept'] = data['Dept'].astype('category').cat.codes
        data['Type'] = data['Type'].astype('category').cat.codes

        # EXTERNAL FEATURES (same for both)
        numeric_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size']
        for col in numeric_cols:
            if col in data.columns:
                data[col] = data[col].fillna(data[col].median())
                data[f'{col}_x_Holiday'] = data[col] * data['IsHoliday']

        # INTERACTIONS (same for both)
        data['Store_x_Dept'] = data['Store'] * data['Dept']
        data['Month_x_Dept'] = data['Month'] * data['Dept']
        data['Holiday_x_Size'] = data['IsHoliday'] * data['Size']
        data['Christmas_x_Dept'] = data['Christmas_Season'] * data['Dept']

        # SEASONAL ENCODING (same for both)
        data['Month_sin'] = np.sin(2 * np.pi * data['Month'] / 12)
        data['Month_cos'] = np.cos(2 * np.pi * data['Month'] / 12)
        data['Week_sin'] = np.sin(2 * np.pi * data['Week'] / 52)
        data['Week_cos'] = np.cos(2 * np.pi * data['Week'] / 52)

        data = data.fillna(0)

        print(f"✅ Feature engineering complete. Shape: {data.shape}")
        print(f"📊 Features are identical for train/test: {not is_training}")

        return data

    def prepare_data(self, train_df, features_df, stores_df, test_df=None):
        """
        Prepare data with advanced features
        """
        print("📊 Preparing data with proper target encoding...")

        # Create features for training (without target encoding)
        train_features = self.create_advanced_features(train_df, features_df, stores_df, is_training=True)

        # TIME-AWARE SPLIT FIRST (before target encoding)
        split_idx = int(len(train_features) * 0.8)
        train_part = train_features.iloc[:split_idx].copy()
        val_part = train_features.iloc[split_idx:].copy()

        print(f"📊 Split: Train {len(train_part)}, Validation {len(val_part)}")

        # TARGET ENCODING: Use only train part to calculate encodings
        if 'Weekly_Sales' in train_part.columns:
            print("📊 Calculating target encodings from train part only...")

            # Calculate encodings from train part only
            store_means = train_part.groupby('Store')['Weekly_Sales'].mean()
            dept_means = train_part.groupby('Dept')['Weekly_Sales'].mean()

            # Apply to train part
            train_part['Store_TargetEnc'] = train_part['Store'].map(store_means)
            train_part['Dept_TargetEnc'] = train_part['Dept'].map(dept_means)

            # Apply to validation part (no leakage)
            val_part['Store_TargetEnc'] = val_part['Store'].map(store_means).fillna(store_means.mean())
            val_part['Dept_TargetEnc'] = val_part['Dept'].map(dept_means).fillna(dept_means.mean())

            # Store encoders for test data
            self.target_encoders = {'Store': store_means, 'Dept': dept_means}

            # Combine back
            train_features_fixed = pd.concat([train_part, val_part], ignore_index=True)
        else:
            train_features_fixed = train_features
            train_features_fixed['Store_TargetEnc'] = 0
            train_features_fixed['Dept_TargetEnc'] = 0

        # Feature columns
        exclude_cols = ['Weekly_Sales', 'Date', 'Id']
        feature_cols = [col for col in train_features_fixed.columns if col not in exclude_cols]

        X_train = train_features_fixed[feature_cols]
        y_train = train_features_fixed['Weekly_Sales']

        self.feature_names = feature_cols

        # Prepare test data
        X_test = None
        if test_df is not None:
            test_features = self.create_advanced_features(test_df, features_df, stores_df, is_training=False)

            # Apply stored target encodings to test
            if hasattr(self, 'target_encoders'):
                test_features['Store_TargetEnc'] = test_features['Store'].map(self.target_encoders['Store']).fillna(self.target_encoders['Store'].mean())
                test_features['Dept_TargetEnc'] = test_features['Dept'].map(self.target_encoders['Dept']).fillna(self.target_encoders['Dept'].mean())
            else:
                test_features['Store_TargetEnc'] = 0
                test_features['Dept_TargetEnc'] = 0

            X_test = test_features[feature_cols]

        print(f"✅ Data prepared - Train: {X_train.shape}, Target: {y_train.shape}")
        if X_test is not None:
            print(f"✅ Test data: {X_test.shape}")

        return X_train, y_train, X_test

    def train(self, X_train, y_train, validation_split=0.2):
        print("🚀 Training XGBoost with standard objective...")

        # Time series split
        split_idx = int(len(X_train) * (1 - validation_split))

        X_tr = X_train.iloc[:split_idx]
        X_val = X_train.iloc[split_idx:]
        y_tr = y_train.iloc[:split_idx]
        y_val = y_train.iloc[split_idx:]

        # Get holiday flags for evaluation only
        is_holiday_train = X_tr['IsHoliday'].values.astype(bool)
        is_holiday_val = X_val['IsHoliday'].values.astype(bool)

        print(f"📊 Holiday weeks - Train: {is_holiday_train.sum()}/{len(is_holiday_train)}, Val: {is_holiday_val.sum()}/{len(is_holiday_val)}")

        # FIXED: Use standard XGBoost objective (not custom)
        params = {
            'objective': 'reg:squarederror',  # Standard objective
            'eval_metric': 'mae',
            'max_depth': 7,
            'learning_rate': 0.05,
            'n_estimators': 800,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'n_jobs': -1,
            'verbosity': 1
        }

        # FIXED: Standard XGBRegressor training
        self.model = xgb.XGBRegressor(**params)
        self.model.fit(X_tr, y_tr)

        # Calculate metrics
        train_pred = self.model.predict(X_tr)
        val_pred = self.model.predict(X_val)

        train_mae = mean_absolute_error(y_tr, train_pred)
        val_mae = mean_absolute_error(y_val, val_pred)
        train_wmae = calculate_wmae(y_tr, train_pred, is_holiday_train)
        val_wmae = calculate_wmae(y_val, val_pred, is_holiday_val)

        # Feature importance
        self.feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

        self.metrics = {
            'train_mae': train_mae,
            'val_mae': val_mae,
            'train_wmae': train_wmae,
            'val_wmae': val_wmae,
            'n_estimators': params['n_estimators']
        }

        print(f"✅ Training complete!")
        print(f"   Training MAE: {train_mae:.2f}")
        print(f"   Validation MAE: {val_mae:.2f}")
        print(f"   Training WMAE: {train_wmae:.2f}")
        print(f"   Validation WMAE: {val_wmae:.2f}")

        return self.metrics

    def predict(self, X_test):
        if self.model is None:
            raise ValueError("Model not trained yet!")

        # FIXED: Standard sklearn prediction
        predictions = self.model.predict(X_test)
        predictions = np.maximum(0, predictions)  # Ensure non-negative

        return predictions

    def get_feature_importance(self, top_n=20):
        """
        Get feature importance
        """
        if self.feature_importance is None:
            return None
        return self.feature_importance.head(top_n)

#Complete Pipeline

In [24]:
def complete_advanced_xgboost_pipeline(train_path, test_path, features_path, stores_path):
    """
    Complete advanced XGBoost pipeline with custom WMAE objective
    """
    print("="*80)
    print("WALMART SALES FORECASTING - ADVANCED XGBOOST")
    print("="*80)

    wandb.init(
        project="walmart-forecasting_XGBoost",
        name=f"xgboost-advanced-{pd.Timestamp.now().strftime('%Y%m%d-%H%M%S')}",
        tags=["xgboost", "advanced", "custom-wmae", "time-series-features"]
    )

    try:
        # Load data
        print("📂 Loading data...")
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        features_df = pd.read_csv(features_path)
        stores_df = pd.read_csv(stores_path)

        # Convert dates
        train_df['Date'] = pd.to_datetime(train_df['Date'])
        test_df['Date'] = pd.to_datetime(test_df['Date'])
        features_df['Date'] = pd.to_datetime(features_df['Date'])

        # Create submission ID
        test_df['Id'] = (test_df['Store'].astype(str) + '_' +
                        test_df['Dept'].astype(str) + '_' +
                        test_df['Date'].dt.strftime('%Y-%m-%d'))

        # Log config
        wandb.config.update({
            'model_type': 'XGBoost Advanced',
            'custom_objective': 'WMAE',
            'time_series_features': True,
            'target_encoding': True,
            'train_rows': len(train_df)
        })

        # Initialize and train
        model = AdvancedWalmartXGBoost()
        X_train, y_train, X_test = model.prepare_data(train_df, features_df, stores_df, test_df)

        # Train with custom WMAE
        metrics = model.train(X_train, y_train)

        # Log metrics
        wandb.log({
            'train_mae': metrics['train_mae'],
            'val_mae': metrics['val_mae'],
            'train_wmae': metrics['train_wmae'],
            'val_wmae': metrics['val_wmae'],
            'n_estimators': metrics['n_estimators'],
            'features_count': len(model.feature_names)
        })

        # Feature importance
        importance_df = model.get_feature_importance()
        print("\n🔍 Top 15 Feature Importances:")
        print(importance_df.head(15))

        # Generate predictions
        predictions = model.predict(X_test)

        # Create submission
        submission_df = pd.DataFrame({
            'Id': test_df['Id'],
            'Weekly_Sales': predictions
        })

        # Save results
        submission_filename = 'walmart_xgboost_advanced_submission.csv'
        submission_df.to_csv(submission_filename, index=False)

        print(f"\n✅ Advanced XGBoost Complete!")
        print(f"📊 Validation WMAE: {metrics['val_wmae']:.2f}")
        print(f"📁 Submission: {submission_filename}")

        return model, submission_df, metrics

    except Exception as e:
        print(f"❌ Pipeline failed: {e}")
        import traceback
        traceback.print_exc()
        return None

    finally:
        wandb.finish()

In [23]:
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"
FEATURES_PATH = "data/features.csv"
STORES_PATH = "data/stores.csv"

result = complete_advanced_xgboost_pipeline(TRAIN_PATH, TEST_PATH, FEATURES_PATH, STORES_PATH)

WALMART SALES FORECASTING - ADVANCED XGBOOST


📂 Loading data...
📊 Preparing data with proper target encoding...
🔧 Creating features (same for train/test)...
✅ Feature engineering complete. Shape: (421570, 39)
📊 Features are identical for train/test: False
📊 Split: Train 337256, Validation 84314
📊 Calculating target encodings from train part only...
🔧 Creating features (same for train/test)...
✅ Feature engineering complete. Shape: (115064, 39)
📊 Features are identical for train/test: True
✅ Data prepared - Train: (421570, 39), Target: (421570,)
✅ Test data: (115064, 39)
🚀 Training XGBoost with standard objective...
📊 Holiday weeks - Train: 23740/337256, Val: 5921/84314
✅ Training complete!
   Training MAE: 1718.00
   Validation MAE: 8386.46
   Training WMAE: 1747.35
   Validation WMAE: 8594.48

🔍 Top 15 Feature Importances:
              feature  importance
38     Dept_TargetEnc    0.184926
37    Store_TargetEnc    0.167549
1                Dept    0.087168
14               Size    0.082060
16              Month    0.049637
13    

0,1
features_count,▁
n_estimators,▁
train_mae,▁
train_wmae,▁
val_mae,▁
val_wmae,▁

0,1
features_count,39.0
n_estimators,800.0
train_mae,1718.00391
train_wmae,1747.35188
val_mae,8386.4559
val_wmae,8594.48372
