#Env Setup


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

with open('/content/drive/MyDrive/MLFinal/git_token.env', 'r') as f:
    token = f.read().strip()

username = "badrilosaberidze"

%cd /content/drive/MyDrive/MLFinal/walmart-sales-forecasting
!git remote set-url origin https://{username}:{token}@github.com/{username}/Walmart-Recruiting---Store-Sales-Forecasting.git
!git pull

/content/drive/MyDrive/MLFinal/walmart-sales-forecasting
Already up to date.


In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import wandb
import warnings
warnings.filterwarnings('ignore')

#Wmae Calculation

In [17]:
def calculate_wmae(y_true, y_pred, is_holiday):
    """
    Calculate Weighted Mean Absolute Error (WMAE)
    Holiday weeks have 5x weight
    """
    weights = np.where(is_holiday, 5.0, 1.0)
    weighted_errors = weights * np.abs(y_true - y_pred)
    wmae = np.sum(weighted_errors) / np.sum(weights)
    return wmae

#XGBoost Model

In [18]:
class WalmartXGBoostBaseline:
    """
    XGBoost baseline for Walmart sales forecasting

    Key Insights from ARIMA work to apply:
    1. Department-level patterns matter most
    2. Holiday effects are crucial (5x weight in WMAE)
    3. Seasonal patterns are strong
    4. Store characteristics affect sales magnitude
    """

    def __init__(self):
        self.model = None
        self.feature_names = None
        self.label_encoders = {}
        self.feature_importance = None

    def create_features(self, df, features_df, stores_df, is_training=True):
        """
        Create features for XGBoost using learnings from ARIMA phase
        """
        print("🔧 Creating XGBoost features...")

        # Start with base data
        data = df.copy()

        # Merge external data
        data = data.merge(features_df, on=['Store', 'Date'], how='left')
        data = data.merge(stores_df, on='Store', how='left')

        # Convert dates
        data['Date'] = pd.to_datetime(data['Date'])

        # 📅 TIME FEATURES
        data['Year'] = data['Date'].dt.year
        data['Month'] = data['Date'].dt.month
        data['Week'] = data['Date'].dt.isocalendar().week
        data['DayOfYear'] = data['Date'].dt.dayofyear
        data['Quarter'] = data['Date'].dt.quarter
        data['WeekOfYear'] = data['Date'].dt.isocalendar().week

        # 🎄 HOLIDAY FEATURES
        data['IsHoliday_x'] = data['IsHoliday_x'].astype(int)

        # Holiday timing features
        data['Month_Dec'] = (data['Month'] == 12).astype(int)  # Christmas month
        data['Month_Nov'] = (data['Month'] == 11).astype(int)  # Thanksgiving
        data['Week_51_52'] = ((data['Week'] == 51) | (data['Week'] == 52)).astype(int)

        # 🏪 STORE & DEPARTMENT FEATURES
        # Keep as categorical - XGBoost handles these well
        data['Store'] = data['Store'].astype('category')
        data['Dept'] = data['Dept'].astype('category')
        data['Type'] = data['Type'].astype('category')

        # Store size (numeric)
        data['Size'] = data['Size'].fillna(data['Size'].median())

        # 🌡️ EXTERNAL FEATURES (clean and enhance)
        # Handle missing values
        numeric_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
        for col in numeric_cols:
            data[col] = data[col].fillna(data[col].median())

        # 📊 INTERACTION FEATURES (key for XGBoost performance)
        data['Holiday_x_Month'] = data['IsHoliday_x'] * data['Month']
        data['Holiday_x_Dept'] = data['IsHoliday_x'] * data['Dept'].cat.codes
        data['Store_x_Dept'] = data['Store'].cat.codes * data['Dept'].cat.codes

        # 📈 SEASONAL PATTERNS (from ARIMA insights)
        # Cyclical encoding for seasonality
        data['Month_sin'] = np.sin(2 * np.pi * data['Month'] / 12)
        data['Month_cos'] = np.cos(2 * np.pi * data['Month'] / 12)
        data['Week_sin'] = np.sin(2 * np.pi * data['Week'] / 52)
        data['Week_cos'] = np.cos(2 * np.pi * data['Week'] / 52)

        print(f"✅ Feature engineering complete. Shape: {data.shape}")

        return data

    def prepare_data(self, train_df, features_df, stores_df, test_df=None):
        """
        Prepare training and test data for XGBoost
        """
        print("📊 Preparing data for XGBoost...")

        # Create features for training data
        train_features = self.create_features(train_df, features_df, stores_df, is_training=True)

        # Features to use (exclude target and identifiers)
        exclude_cols = ['Weekly_Sales', 'Date']
        if 'Id' in train_features.columns:
            exclude_cols.append('Id')

        feature_cols = [col for col in train_features.columns if col not in exclude_cols]

        # Handle categorical variables with label encoding
        categorical_cols = ['Store', 'Dept', 'Type']

        for col in categorical_cols:
            if col in train_features.columns:
                le = LabelEncoder()
                train_features[col] = le.fit_transform(train_features[col].astype(str))
                self.label_encoders[col] = le

        # Extract features and target
        X_train = train_features[feature_cols]
        y_train = train_features['Weekly_Sales']

        self.feature_names = feature_cols

        # Prepare test data if provided
        X_test = None
        if test_df is not None:
            test_features = self.create_features(test_df, features_df, stores_df, is_training=False)

            # Apply same label encoding
            for col in categorical_cols:
                if col in test_features.columns and col in self.label_encoders:
                    # Handle unseen categories
                    test_features[col] = test_features[col].astype(str)
                    mask = test_features[col].isin(self.label_encoders[col].classes_)
                    test_features.loc[~mask, col] = self.label_encoders[col].classes_[0]  # Default to first class
                    test_features[col] = self.label_encoders[col].transform(test_features[col])

            X_test = test_features[feature_cols]

        print(f"✅ Data prepared - Train: {X_train.shape}, Target: {y_train.shape}")
        if X_test is not None:
            print(f"✅ Test data prepared: {X_test.shape}")

        return X_train, y_train, X_test

    def train(self, X_train, y_train, validation_split=0.2):
        """
        Train XGBoost model with validation metrics
        """
        print("🚀 Training XGBoost model...")

        # Split for validation
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train, y_train, test_size=validation_split, random_state=42
        )

        # Find holiday column for weighted metrics
        holiday_features = [col for col in X_train.columns if 'Holiday' in col or 'IsHoliday' in col]

        if 'IsHoliday' in X_train.columns:
            holiday_col = 'IsHoliday'
        elif holiday_features:
            holiday_col = holiday_features[0]
        else:
            holiday_col = None

        # Extract holiday flags
        if holiday_col:
            is_holiday_train = X_tr[holiday_col].values.astype(bool)
            is_holiday_val = X_val[holiday_col].values.astype(bool)
            print(f"📊 Holiday weeks in train: {is_holiday_train.sum()}/{len(is_holiday_train)}")
            print(f"📊 Holiday weeks in val: {is_holiday_val.sum()}/{len(is_holiday_val)}")
        else:
            print("⚠️ No holiday feature found")
            is_holiday_train = np.zeros(len(X_tr), dtype=bool)
            is_holiday_val = np.zeros(len(X_val), dtype=bool)

        # XGBoost parameters
        params = {
            'objective': 'reg:squarederror',
            'max_depth': 6,
            'learning_rate': 0.1,
            'n_estimators': 500,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'n_jobs': -1,
            'verbosity': 1
        }

        # Train model
        self.model = xgb.XGBRegressor(**params)
        self.model.fit(X_tr, y_tr)

        # Calculate metrics
        train_pred = self.model.predict(X_tr)
        val_pred = self.model.predict(X_val)

        train_mae = mean_absolute_error(y_tr, train_pred)
        val_mae = mean_absolute_error(y_val, val_pred)
        train_wmae = calculate_wmae(y_tr, train_pred, is_holiday_train)
        val_wmae = calculate_wmae(y_val, val_pred, is_holiday_val)

        # Feature importance
        self.feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

        # Store metrics
        self.metrics = {
            'train_mae': train_mae,
            'val_mae': val_mae,
            'train_wmae': train_wmae,
            'val_wmae': val_wmae,
            'n_estimators': params['n_estimators']
        }

        print(f"✅ Training complete!")
        print(f"   Training MAE: {train_mae:.2f}")
        print(f"   Validation MAE: {val_mae:.2f}")
        print(f"   Training WMAE: {train_wmae:.2f}")
        print(f"   Validation WMAE: {val_wmae:.2f}")

        return self.metrics

    def predict(self, X_test):
        """
        Generate predictions
        """
        if self.model is None:
            raise ValueError("Model not trained yet!")

        predictions = self.model.predict(X_test)

        # Ensure non-negative predictions
        predictions = np.maximum(0, predictions)

        return predictions

    def get_feature_importance(self, top_n=20):
        """
        Get top feature importances
        """
        if self.feature_importance is None:
            return None

        return self.feature_importance.head(top_n)

#XGBoost Pipeline

In [19]:
def complete_xgboost_pipeline(train_path, test_path, features_path, stores_path):
    """
    Complete XGBoost pipeline for Walmart forecasting
    """
    print("="*80)
    print("WALMART SALES FORECASTING - XGBOOST BASELINE")
    print("="*80)

    # Setup WandB
    wandb.init(
        project="walmart-forecasting",
        name=f"xgboost-baseline-{pd.Timestamp.now().strftime('%Y%m%d-%H%M%S')}",
        tags=["xgboost", "baseline"]
    )

    try:
        # Load data
        print("📂 Loading data...")
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        features_df = pd.read_csv(features_path)
        stores_df = pd.read_csv(stores_path)

        # Convert dates
        train_df['Date'] = pd.to_datetime(train_df['Date'])
        test_df['Date'] = pd.to_datetime(test_df['Date'])
        features_df['Date'] = pd.to_datetime(features_df['Date'])

        # Create submission ID
        test_df['Id'] = (test_df['Store'].astype(str) + '_' +
                        test_df['Dept'].astype(str) + '_' +
                        test_df['Date'].dt.strftime('%Y-%m-%d'))

        # Log data info
        wandb.config.update({
            'train_rows': len(train_df),
            'test_rows': len(test_df),
            'model_type': 'XGBoost',
            'approach': 'baseline'
        })

        # Initialize and prepare
        model = WalmartXGBoostBaseline()
        X_train, y_train, X_test = model.prepare_data(train_df, features_df, stores_df, test_df)

        # Train model
        metrics = model.train(X_train, y_train)

        # Log metrics
        wandb.log({
            'train_mae': metrics['train_mae'],
            'val_mae': metrics['val_mae'],
            'train_wmae': metrics['train_wmae'],
            'val_wmae': metrics['val_wmae'],
            'train_samples': len(X_train),
            'features_count': len(model.feature_names)
        })

        # Feature importance
        importance_df = model.get_feature_importance()
        if importance_df is not None:
            print("\n🔍 Top 10 Feature Importances:")
            print(importance_df.head(10))

            importance_table = wandb.Table(dataframe=importance_df.head(20))
            wandb.log({"feature_importance": importance_table})

        # Generate predictions
        print("🔮 Generating predictions...")
        predictions = model.predict(X_test)

        # Create submission
        submission_df = pd.DataFrame({
            'Id': test_df['Id'],
            'Weekly_Sales': predictions
        })

        # Log submission stats
        wandb.log({
            'submission_total': len(submission_df),
            'submission_avg': predictions.mean(),
            'submission_std': predictions.std(),
            'submission_min': predictions.min(),
            'submission_max': predictions.max()
        })

        # Save submission
        submission_filename = 'walmart_xgboost_submission.csv'
        submission_df.to_csv(submission_filename, index=False)

        # Save model
        import joblib
        model_filename = 'xgboost_model.pkl'
        joblib.dump(model, model_filename)

        # Create artifacts
        model_artifact = wandb.Artifact("xgboost_model", type="model")
        model_artifact.add_file(model_filename)

        submission_artifact = wandb.Artifact("submission", type="submission")
        submission_artifact.add_file(submission_filename)

        wandb.log_artifact(model_artifact)
        wandb.log_artifact(submission_artifact)

        print(f"\n✅ Pipeline Complete!")
        print(f"📊 Validation WMAE: {metrics['val_wmae']:.2f}")
        print(f"📊 Validation MAE: {metrics['val_mae']:.2f}")
        print(f"📁 Submission: {submission_filename}")
        print(f"🔗 WandB: {wandb.run.url}")

        return model, submission_df, metrics

    except Exception as e:
        print(f"❌ Pipeline failed: {e}")
        import traceback
        traceback.print_exc()
        return None

    finally:
        wandb.finish()

In [22]:
def run_xgboost_baseline():
    """
    Run XGBoost baseline
    """
    TRAIN_PATH = "data/train.csv"
    TEST_PATH = "data/test.csv"
    FEATURES_PATH = "data/features.csv"
    STORES_PATH = "data/stores.csv"

    return complete_xgboost_pipeline(TRAIN_PATH, TEST_PATH, FEATURES_PATH, STORES_PATH)

In [23]:
result = run_xgboost_baseline()

WALMART SALES FORECASTING - XGBOOST BASELINE


📂 Loading data...
📊 Preparing data for XGBoost...
🔧 Creating XGBoost features...
✅ Feature engineering complete. Shape: (421570, 33)
🔧 Creating XGBoost features...
✅ Feature engineering complete. Shape: (115064, 33)
✅ Data prepared - Train: (421570, 31), Target: (421570,)
✅ Test data prepared: (115064, 31)
🚀 Training XGBoost model...
📊 Holiday weeks in train: 23675/337256
📊 Holiday weeks in val: 5986/84314
✅ Training complete!
   Training MAE: 2426.44
   Validation MAE: 2539.97
   Training WMAE: 2481.33
   Validation WMAE: 2636.78

🔍 Top 10 Feature Importances:
            feature  importance
1              Dept    0.229880
13             Type    0.138694
14             Size    0.138065
16            Month    0.054093
17             Week    0.045742
0             Store    0.043364
25   Holiday_x_Dept    0.043112
24  Holiday_x_Month    0.033984
26     Store_x_Dept    0.032191
20       WeekOfYear    0.026174
🔮 Generating predictions...

✅ Pipeline Complete!
📊 Validation WMAE: 2636.78
📊 V

0,1
features_count,▁
submission_avg,▁
submission_max,▁
submission_min,▁
submission_std,▁
submission_total,▁
train_mae,▁
train_samples,▁
train_wmae,▁
val_mae,▁

0,1
features_count,31.0
submission_avg,16626.85156
submission_max,557650.0
submission_min,0.0
submission_std,22371.36328
submission_total,115064.0
train_mae,2426.43801
train_samples,421570.0
train_wmae,2481.32775
val_mae,2539.96817
