In [None]:
! pip install kaggle



In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; a later cell copies the Kaggle
# credentials from a path under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! mkdir ~/.kaggle

In [None]:
# Copy the Kaggle API credentials file from the mounted Drive into ~/.kaggle.
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json


In [None]:
# Restrict the credentials file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition data with the Kaggle CLI (the recorded output shows
# a single .zip archive being written to /content).
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 565MB/s]


In [None]:
! unzip walmart-recruiting-store-sales-forecasting

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder # For Type encoding if not using category dtype directly
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import gc # For garbage collection
# Display configuration for printed DataFrames: no limit on the number of
# columns shown, no fixed display width, and no wrapping of wide frames across
# multiple blocks of lines.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

In [None]:
# Load the raw competition tables. The zipped CSVs are read directly from the
# archives produced by the unzip step; only stores.csv is an uncompressed file.
train = pd.read_csv('train.csv.zip')
test = pd.read_csv('test.csv.zip')
features = pd.read_csv('features.csv.zip')
stores = pd.read_csv('stores.csv')
sample = pd.read_csv('sampleSubmission.csv.zip')

In [None]:
# Convert 'Date' columns to datetime objects for easier manipulation
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])


def _attach_features_and_stores(sales_df, features_df, stores_df):
    """Left-join the per-store weekly features and the store metadata onto ``sales_df``.

    The feature join uses ('Store', 'Date') only. 'IsHoliday' exists on both
    sides and is deliberately NOT a join key, so pandas keeps both copies as
    'IsHoliday_x' (from ``sales_df``) and 'IsHoliday_y' (from ``features_df``).
    """
    with_features = pd.merge(sales_df, features_df, on=['Store', 'Date'], how='left')
    return pd.merge(with_features, stores_df, on='Store', how='left')


def _combine_holiday_flags(merged_df):
    """Collapse 'IsHoliday_x'/'IsHoliday_y' into a single 'IsHoliday' column.

    A row is flagged as a holiday when either source says so; the two suffixed
    columns are dropped and 'IsHoliday' ends up as the last column.
    """
    is_holiday = merged_df['IsHoliday_x'] | merged_df['IsHoliday_y']
    return merged_df.assign(IsHoliday=is_holiday).drop(columns=['IsHoliday_x', 'IsHoliday_y'])


# Merge features and store information into the train and test data.
train_df = _attach_features_and_stores(train, features, stores)
test_df = _attach_features_and_stores(test, features, stores)

# Inspect the merged frames (still carrying both IsHoliday_x and IsHoliday_y).
print("\n--- Merged Train Data Head ---")
print(train_df.head())
print("\n--- Merged Test Data Head ---")
print(test_df.head())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
print("\n--- Merged Train Data Info ---")
train_df.info()
print("\n--- Merged Test Data Info ---")
test_df.info()

# Reconcile the duplicated holiday flag before any preprocessing.
train_df = _combine_holiday_flags(train_df)
test_df = _combine_holiday_flags(test_df)

# Free up memory. After this, re-running the cell requires re-running the
# loading cell first, because the raw frames no longer exist.
del train, test, features, stores
gc.collect()


--- Merged Train Data Head ---
   Store  Dept       Date  Weekly_Sales  IsHoliday_x  Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment  IsHoliday_y Type    Size
0      1     1 2010-02-05      24924.50        False        42.31       2.572        NaN        NaN        NaN        NaN        NaN  211.096358         8.106        False    A  151315
1      1     1 2010-02-12      46039.49         True        38.51       2.548        NaN        NaN        NaN        NaN        NaN  211.242170         8.106         True    A  151315
2      1     1 2010-02-19      41595.55        False        39.93       2.514        NaN        NaN        NaN        NaN        NaN  211.289143         8.106        False    A  151315
3      1     1 2010-02-26      19403.54        False        46.63       2.561        NaN        NaN        NaN        NaN        NaN  211.319643         8.106        False    A  151315
4      1     1 2010-03-05      21827.90    

16

TODO: XGBoost handles missing values natively, so compare results with and without the `MissingValueImputer` step.

In [None]:
class MissingMarkdownHandler(BaseEstimator, TransformerMixin):
    """
    Flag and zero-fill missing values in the MarkDown1..MarkDown5 columns.

    For every markdown column present in the input, ``transform`` adds an
    integer ``<column>_was_missing`` indicator (1 where the value was NaN,
    0 otherwise) and then replaces the NaNs in the column itself with 0.
    Markdown columns absent from the input are skipped.
    """
    def __init__(self):
        self.markdown_cols = ['MarkDown%d' % number for number in range(1, 6)]

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with indicator columns added and NaNs zero-filled."""
        result = X.copy()
        present = [name for name in self.markdown_cols if name in result.columns]
        for name in present:
            # Record missingness before it is erased by the fill below.
            missing_mask = result[name].isna()
            result[name + "_was_missing"] = missing_mask.astype(int)
            result[name] = result[name].fillna(0)
        return result

In [None]:
class MissingValueImputer(BaseEstimator, TransformerMixin):
    """
    Impute missing values in selected numerical columns.

    Each configured column is forward-filled and then backward-filled in the
    row order of the frame passed to ``transform``. Any value still missing
    afterwards (e.g. a column that is entirely NaN) is replaced by the column
    mean learned in ``fit``.

    MarkDown columns are not touched by this transformer; zero-filling them is
    done by ``MissingMarkdownHandler``.

    NOTE(review): the fills are not grouped by Store/Dept, so values can carry
    over across store boundaries depending on row order - confirm this is the
    intended behaviour.

    Parameters
    ----------
    numerical_cols_to_impute : list of str, optional
        Columns to impute. Defaults to
        ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment'].
    """
    def __init__(self, numerical_cols_to_impute=None):
        self.numerical_cols_to_impute = numerical_cols_to_impute if numerical_cols_to_impute is not None else ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
        self.means = {} # To store means for fallback imputation during transform

    def fit(self, X, y=None):
        """Learn the fallback mean of every configured column present in ``X``."""
        # Start from a clean slate so refitting never keeps means of columns
        # that only existed in a previous fit.
        self.means = {}
        for col in self.numerical_cols_to_impute:
            if col in X.columns:
                self.means[col] = X[col].mean()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns imputed."""
        X_copy = X.copy()

        for col in self.numerical_cols_to_impute:
            if col not in X_copy.columns:
                continue
            # Series.ffill()/bfill() replace fillna(method=...), which is
            # deprecated in pandas and emits a FutureWarning.
            X_copy[col] = X_copy[col].ffill().bfill()
            # Fallback to the training mean if NaNs still exist
            # (e.g. every value in the column was NaN).
            if col in self.means and X_copy[col].isnull().any():
                X_copy[col] = X_copy[col].fillna(self.means[col])
        return X_copy

In [None]:
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Derive calendar, markdown and economic features from a merged sales frame.

    Despite its name, ``transform`` adds three groups of columns:

    - calendar: Year, Month, Week (ISO week), Day, DayOfWeek and sine/cosine
      encodings of Month (period 12) and Week (period 52);
    - markdown: Total_MarkDown (row-wise sum of the MarkDown1..5 columns that
      are present) and MarkDown_Intensity (total divided by the mean total
      seen in ``fit``, plus 1);
    - economic: Fuel_CPI_Ratio and Economic_Index, which require the
      'Fuel_Price', 'CPI' and 'Unemployment' columns.

    A boolean 'IsHoliday' column, if present, is converted to 0/1 integers.
    The date column itself is kept in the output.

    Parameters
    ----------
    date_column : str, default 'Date'
        Name of the column holding the observation date.
    """
    def __init__(self, date_column='Date'):
        self.date_column = date_column

    @staticmethod
    def _total_markdown(X):
        """Row-wise sum of whichever MarkDown1..MarkDown5 columns exist in ``X``."""
        present = [col for col in (f'MarkDown{i}' for i in range(1, 6)) if col in X.columns]
        # DataFrame.sum skips NaN, so a missing markdown contributes 0.
        return X[present].sum(axis=1)

    def fit(self, X, y=None):
        """Learn the mean total markdown used to scale MarkDown_Intensity."""
        # Learned here (not in transform) so training and test rows are scaled
        # by the same reference value instead of each batch's own mean.
        self.total_markdown_mean_ = self._total_markdown(X).mean()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived feature columns appended.

        Raises
        ------
        ValueError
            If ``date_column`` is not a column of ``X``.
        """
        X_copy = X.copy()
        if self.date_column not in X_copy.columns:
            raise ValueError(f"Date column '{self.date_column}' not found in DataFrame.")

        X_copy[self.date_column] = pd.to_datetime(X_copy[self.date_column])
        date_parts = X_copy[self.date_column].dt

        X_copy['Year'] = date_parts.year
        X_copy['Month'] = date_parts.month
        X_copy['Month_sin'] = np.sin(2 * np.pi * X_copy['Month'] / 12)
        X_copy['Month_cos'] = np.cos(2 * np.pi * X_copy['Month'] / 12)

        # Using .dt.isocalendar().week for consistent week numbering across years
        X_copy['Week'] = date_parts.isocalendar().week.astype(int)
        X_copy['Day'] = date_parts.day
        X_copy['DayOfWeek'] = date_parts.dayofweek

        # ISO weeks can reach 53; with a period of 52 such a week lands on the
        # same point of the cycle as week 1.
        X_copy['Week_sin'] = np.sin(2 * np.pi * X_copy['Week'] / 52)
        X_copy['Week_cos'] = np.cos(2 * np.pi * X_copy['Week'] / 52)

        # Markdown aggregation
        total_markdown = self._total_markdown(X_copy)
        X_copy['Total_MarkDown'] = total_markdown
        reference_mean = getattr(self, 'total_markdown_mean_', None)
        if reference_mean is None:
            # transform() called without fit(): keep the historical behaviour
            # of scaling by the mean of the batch being transformed.
            reference_mean = total_markdown.mean()
        # The +1 keeps the denominator away from zero when no markdowns exist.
        X_copy['MarkDown_Intensity'] = total_markdown / (reference_mean + 1)

        # Economic indicators
        X_copy['Fuel_CPI_Ratio'] = X_copy['Fuel_Price'] / X_copy['CPI']
        X_copy['Economic_Index'] = (X_copy['CPI'] * 0.4 + (100 - X_copy['Unemployment']) * 0.6) / 100

        # Convert IsHoliday to integer if it exists and is boolean
        if 'IsHoliday' in X_copy.columns and X_copy['IsHoliday'].dtype == bool:
            X_copy['IsHoliday'] = X_copy['IsHoliday'].astype(int)

        # The date column is intentionally kept; callers drop it when a model
        # cannot consume raw datetimes.
        return X_copy


TODO: try other categorical encoders in place of this label encoder.

In [None]:
class XGBoostLabelEncoder(BaseEstimator, TransformerMixin):
    """
    Integer-encode categorical columns, with a reserved code for unseen values.

    Every configured column is cast to ``str`` and encoded with its own
    ``LabelEncoder``. An extra ``'unknown'`` class is registered at fit time
    (appended after the learned classes, so it receives the highest code), and
    any value not seen during ``fit`` is mapped to it during ``transform``.

    Parameters
    ----------
    categorical_cols : list of str, optional
        Columns to encode. Defaults to ['Store', 'Dept', 'Type'].
    """
    # Placeholder label assigned to categories that were not seen during fit.
    _UNKNOWN = 'unknown'

    def __init__(self, categorical_cols=None):
        self.categorical_cols = categorical_cols if categorical_cols is not None else ['Store', 'Dept', 'Type']
        self.label_encoders = {}

    def fit(self, X, y=None):
        """Fit one encoder per configured column that is present in ``X``."""
        self.label_encoders = {}
        for col in self.categorical_cols:
            if col in X.columns:
                encoder = LabelEncoder()
                encoder.fit(X[col].astype(str))
                # Register the placeholder once, here, so transform() never
                # has to modify fitted state.
                if self._UNKNOWN not in encoder.classes_:
                    encoder.classes_ = np.array(list(encoder.classes_) + [self._UNKNOWN])
                self.label_encoders[col] = encoder
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by integer codes."""
        X_copy = X.copy()
        for col in self.categorical_cols:
            if col not in X_copy.columns or col not in self.label_encoders:
                continue
            encoder = self.label_encoders[col]
            values = X_copy[col].astype(str)
            # Vectorised replacement of unseen categories by the placeholder.
            values = values.where(values.isin(encoder.classes_), self._UNKNOWN)
            X_copy[col] = encoder.transform(values)
        return X_copy

TODO: let's use a target encoder instead of the label encoder.

In [None]:
class XGBoostTargetEncoder(BaseEstimator, TransformerMixin):
    """
    Replace categorical columns by a smoothed mean of the target per category.

    The encoding is computed with pandas only (the notebook never imports a
    ``TargetEncoder`` class, so relying on one raised ``NameError`` in ``fit``).
    For a category with ``n`` training rows and target sum ``s`` the encoded
    value is ``(s + smoothing * prior) / (n + smoothing)``, where ``prior`` is
    the overall target mean. Categories not seen in ``fit`` (and missing
    values) are encoded as ``prior``.

    NOTE(review): this additive smoothing may differ from the formula of the
    third-party encoder the original code presumably intended - confirm the
    desired formula before comparing results.
    NOTE(review): transforming the same rows that were used for fitting leaks
    the target into the features; consider out-of-fold encoding.

    Parameters
    ----------
    categorical_cols : list of str, optional
        Columns to encode. Defaults to ['Store', 'Dept', 'Type'].
    smoothing : float, default 1.0
        Weight of the prior, expressed as a number of pseudo-observations.

    Attributes
    ----------
    target_encoders : dict
        Maps each fitted column name to a Series indexed by category value and
        holding that category's encoded value.
    global_mean_ : float
        Overall target mean learned in ``fit`` (the prior).
    """
    def __init__(self, categorical_cols=None, smoothing=1.0):
        self.categorical_cols = categorical_cols if categorical_cols is not None else ['Store', 'Dept', 'Type']
        self.smoothing = smoothing
        self.target_encoders = {}

    def fit(self, X, y):
        """Learn the per-category encodings from ``X`` and the target ``y``."""
        # Align the target with X positionally so a y with a different index
        # (or a plain array) is handled the same way.
        target = pd.Series(np.asarray(y, dtype=float), index=X.index)
        self.global_mean_ = target.mean()
        self.target_encoders = {}
        for col in self.categorical_cols:
            if col in X.columns:
                stats = target.groupby(X[col]).agg(['sum', 'count'])
                self.target_encoders[col] = (
                    (stats['sum'] + self.smoothing * self.global_mean_)
                    / (stats['count'] + self.smoothing)
                )
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by their encodings."""
        X_copy = X.copy()
        for col in self.categorical_cols:
            if col in X_copy.columns and col in self.target_encoders:
                encoded = X_copy[col].map(self.target_encoders[col]).astype(float)
                # Unseen categories and missing values fall back to the prior.
                X_copy[col] = encoded.fillna(self.global_mean_)
        return X_copy

In [None]:
# Install the dagshub package (imported below for dagshub.init) into the
# kernel's environment; -q reduces installer output.
%pip install -q dagshub

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m104.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.4/203.4 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install mlflow==2.7.1



In [None]:
import dagshub
# Initialise the DagsHub client for this repository with mlflow=True
# (presumably this points MLflow tracking at the DagsHub repo - confirm).
# NOTE(review): nothing in this cell reads credentials from the environment;
# the recorded output shows an interactive browser OAuth prompt instead.
# Confirm how non-interactive runs are expected to authenticate.
dagshub.init(
    repo_owner='abarb22',
    repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
    mlflow=True
)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=d9a72f37-0de1-4ed0-9b6d-a282b90c0535&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=95a6593ddd55274f50d9439490e0bce8749b867c3aa9dec203707fd88e202dca




In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Build the preprocessing pipeline. Step order matters: markdown NaNs are
# flagged and zero-filled first, the remaining numeric gaps are imputed, the
# derived features are computed from the completed columns, and the categorical
# columns are integer-encoded last.
preprocessing_pipeline = Pipeline([
    ('missing_markdown', MissingMarkdownHandler()),
    ('missing_imputer', MissingValueImputer()),
    ('date_features', DateFeatureExtractor()),
    ('label_encoder', XGBoostLabelEncoder())  # or XGBoostTargetEncoder if you want target encoding
])

# Prepare the data
print("Preparing training data...")
X_train = train_df.drop(['Weekly_Sales'], axis=1)
y_train = train_df['Weekly_Sales']

print("\n--- Applying Preprocessing Pipeline to Train Data ---")
X_train_processed = preprocessing_pipeline.fit_transform(X_train, y_train)

# Remove the raw Date column: the calendar features derived from it are kept,
# and the model is trained on numeric columns only. The dates are kept under a
# split-specific name so the test dates below cannot overwrite them.
if 'Date' in X_train_processed.columns:
    train_dates = X_train_processed['Date']  # Keep for potential time-based validation
    X_train_processed = X_train_processed.drop(['Date'], axis=1)

print(f"Training data shape: {X_train_processed.shape}")
print(f"Features: {list(X_train_processed.columns)}")

print("\n--- Applying Preprocessing Pipeline to Test Data ---")
# For the test set, we only call transform, as fit was done on the training data.
X_test_processed = preprocessing_pipeline.transform(test_df.drop(columns=['Id'], errors='ignore'))

if 'Date' in X_test_processed.columns:
    test_dates = X_test_processed['Date']
    X_test_processed = X_test_processed.drop(['Date'], axis=1)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
print("\nProcessed X_train_processed info:")
X_train_processed.info()
print("\nProcessed X_test_processed info:")
X_test_processed.info()

# Verify no missing values in processed data
print("\nMissing values in processed X_train_processed:\n", X_train_processed.isnull().sum().sum())
print("Missing values in processed X_test_processed:\n", X_test_processed.isnull().sum().sum())

Preparing training data...

--- Applying Preprocessing Pipeline to Train Data ---


  X_copy[col] = X_copy[col].fillna(method='ffill').fillna(method='bfill')


Training data shape: (421570, 32)
Features: ['Store', 'Dept', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'IsHoliday', 'MarkDown1_was_missing', 'MarkDown2_was_missing', 'MarkDown3_was_missing', 'MarkDown4_was_missing', 'MarkDown5_was_missing', 'Year', 'Month', 'Month_sin', 'Month_cos', 'Week', 'Day', 'DayOfWeek', 'Week_sin', 'Week_cos', 'Total_MarkDown', 'MarkDown_Intensity', 'Fuel_CPI_Ratio', 'Economic_Index']

--- Applying Preprocessing Pipeline to Test Data ---


  X_copy[col] = X_copy[col].fillna(method='ffill').fillna(method='bfill')



Processed X_train_processed info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 32 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Store                  421570 non-null  int64  
 1   Dept                   421570 non-null  int64  
 2   Temperature            421570 non-null  float64
 3   Fuel_Price             421570 non-null  float64
 4   MarkDown1              421570 non-null  float64
 5   MarkDown2              421570 non-null  float64
 6   MarkDown3              421570 non-null  float64
 7   MarkDown4              421570 non-null  float64
 8   MarkDown5              421570 non-null  float64
 9   CPI                    421570 non-null  float64
 10  Unemployment           421570 non-null  float64
 11  Type                   421570 non-null  int64  
 12  Size                   421570 non-null  int64  
 13  IsHoliday              421570 non-null  int64  
 14  M



In [None]:
# Column inventory of the processed training frame.
features_after_pipeline = list(X_train_processed.columns)
categorical_features_after_pipeline = [
    name for name in ('Store', 'Dept', 'Type') if name in features_after_pipeline
]

# WMAE weighting for the full training set: rows whose IsHoliday flag is 1
# weigh 5, all other rows weigh 1.
train_weights = np.where(X_train_processed['IsHoliday'].eq(1), 5, 1)

# Submission identifiers in the form "<Store>_<Dept>_<YYYY-MM-DD>".
test_ids = test_df['Store'].astype(str).str.cat(
    [test_df['Dept'].astype(str), test_df['Date'].dt.strftime('%Y-%m-%d')],
    sep='_',
)

# Re-attach the date and the target to the processed features, then order the
# rows chronologically so the split below is a genuine past/future split.
temp_train_df = (
    X_train_processed
    .assign(Date=pd.to_datetime(train_df['Date']), Weekly_Sales=y_train)
    .sort_values(by='Date')
    .reset_index(drop=True)
)

# Rows dated on or after this cutoff form the validation set (no random
# shuffling for time-series data).
validation_cutoff_date = pd.to_datetime('2012-09-01')

# Model inputs: every processed column except a raw Date column.
features_for_training = [name for name in features_after_pipeline if name != 'Date']

# Time-based split.
before_cutoff = temp_train_df['Date'] < validation_cutoff_date
from_cutoff = temp_train_df['Date'] >= validation_cutoff_date

X_train_split = temp_train_df.loc[before_cutoff, features_for_training]
y_train_split = temp_train_df.loc[before_cutoff, 'Weekly_Sales']
X_val_split = temp_train_df.loc[from_cutoff, features_for_training]
y_val_split = temp_train_df.loc[from_cutoff, 'Weekly_Sales']

# Calculate weights for validation split
def weighted_mean_absolute_error(y_true, y_pred, weights):
    """Return the weighted mean absolute error, sum(w * |y - y_hat|) / sum(w).

    All three inputs are converted to float NumPy arrays first, so plain lists
    are accepted and pandas objects are compared by position rather than being
    aligned on their index labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values, of equal length.
    weights : array-like
        Non-negative per-row weights, same length as ``y_true``.

    Returns
    -------
    float
        The weighted mean absolute error (NaN/inf if the weights sum to zero).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    weights = np.asarray(weights, dtype=float)
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

# Per-row weights for each split: 5 where the IsHoliday flag equals 1, else 1.
val_weights = np.where(X_val_split['IsHoliday'] == 1, 5, 1)
train_weights_split = np.where(X_train_split['IsHoliday'] == 1, 5, 1)

In [None]:
# from sklearn.metrics import make_scorer

# wmae_scorer = make_scorer(weighted_mean_absolute_error, greater_is_better=False)

In [None]:
import mlflow
import mlflow.xgboost
import xgboost as xgb
from sklearn.model_selection import ParameterGrid

# Base model parameters shared by every grid point.
xgb_base_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 1,
    'tree_method': 'hist',
    'random_state': 42,
    'n_jobs': -1,
    'eval_metric': 'mae',
    'early_stopping_rounds': 50  # patience, measured on the eval_set given to fit()
}

# Grid to search
param_grid = {
    'max_depth': [6, 8],
    'learning_rate': [0.05, 0.1]
}

# Best tracking (lower validation WMAE is better)
best_score = float('inf')
best_params = None
best_model = None

# Start MLflow experiment
mlflow.set_experiment("XGBoost_Training")

# One parent run for the whole search, one nested run per grid point, and a
# final nested run that re-logs the winning model.
with mlflow.start_run(run_name="XGBoost_HyperParameter_Tuning"):
    for params in ParameterGrid(param_grid):
        subrun_name = f"depth={params['max_depth']}_lr={params['learning_rate']}"
        with mlflow.start_run(run_name=subrun_name, nested=True):
            model_params = {**xgb_base_params, **params}
            mlflow.log_params(model_params)

            model = xgb.XGBRegressor(**model_params)

            # sample_weight_eval_set makes the 'mae' metric monitored by early
            # stopping holiday-weighted, i.e. the same WMAE used below to pick
            # the best configuration. Without it early stopping would track
            # plain MAE while model selection tracks WMAE.
            # NOTE(review): the validation split drives both early stopping and
            # model selection, so the reported validation WMAE is optimistic.
            model.fit(
                X_train_split,
                y_train_split,
                sample_weight=train_weights_split,
                eval_set=[(X_val_split, y_val_split)],
                sample_weight_eval_set=[val_weights],
                verbose=False
            )

            # Predict and evaluate
            val_preds = model.predict(X_val_split)
            train_preds = model.predict(X_train_split)

            val_wmae = weighted_mean_absolute_error(y_val_split, val_preds, val_weights)
            train_wmae = weighted_mean_absolute_error(y_train_split, train_preds, train_weights_split)

            # Log metrics
            mlflow.log_metrics({
                "val_wmae": val_wmae,
                "train_wmae": train_wmae,
                "best_iteration": model.best_iteration,
                "n_estimators_used": model.best_iteration + 1,  # +1 because iterations are 0-indexed
                "total_estimators": model.n_estimators
            })

            # Log model
            signature = mlflow.models.infer_signature(X_train_split, train_preds)
            mlflow.xgboost.log_model(model, "model", signature=signature)

            print(f"WMAE - Train: {train_wmae:.4f} | Val: {val_wmae:.4f} | Params: {params}")

            if val_wmae < best_score:
                best_score = val_wmae
                best_params = model_params
                best_model = model

    # Log best score and best params on the parent run
    mlflow.log_metric("best_val_wmae", best_score)
    mlflow.log_params({"best_" + k: v for k, v in best_params.items()})

    # Log best model in separate subrun
    with mlflow.start_run(run_name="Best_Model", nested=True):
        mlflow.log_params(best_params)
        mlflow.log_metrics({
            "val_wmae": best_score,
            "best_iteration": best_model.best_iteration
        })

        signature = mlflow.models.infer_signature(X_train_split, best_model.predict(X_train_split))
        mlflow.xgboost.log_model(
            best_model,
            "best_model",
            signature=signature,
            input_example=X_train_split.iloc[:1]
        )
        print(f"Best model logged with WMAE: {best_score:.4f}")

# Final print
print("\nFinal Results:")
print(f"Best WMAE: {best_score:.4f}")
print("Best parameters:", {k: v for k, v in best_params.items() if k in param_grid})

  inputs = _infer_schema(model_input) if model_input is not None else None


WMAE - Train: 4201.0748 | Val: 3915.0076 | Params: {'learning_rate': 0.05, 'max_depth': 6}


  inputs = _infer_schema(model_input) if model_input is not None else None


WMAE - Train: 2783.5565 | Val: 2764.9281 | Params: {'learning_rate': 0.05, 'max_depth': 8}


  inputs = _infer_schema(model_input) if model_input is not None else None


WMAE - Train: 3542.0185 | Val: 3391.6105 | Params: {'learning_rate': 0.1, 'max_depth': 6}


  inputs = _infer_schema(model_input) if model_input is not None else None


WMAE - Train: 2239.7041 | Val: 2371.9634 | Params: {'learning_rate': 0.1, 'max_depth': 8}


  inputs = _infer_schema(model_input) if model_input is not None else None


Best model logged with WMAE: 2371.9634

Final Results:
Best WMAE: 2371.9634
Best parameters: {'learning_rate': 0.1, 'max_depth': 8}


In [None]:
# # Prepare test data for predictions
# print("\nPreparing test data...")
# X_test_processed = preprocessing_pipeline.transform(test_df)

# # Remove Date column from test data
# X_test_final = X_test_processed[features_for_training]

# # Make predictions
# test_predictions = final_model.predict(X_test_final)

# # Create submission file using proper test IDs
# submission = pd.DataFrame({
#     'Id': test_df['Id'],  # Use the original Id from test data
#     'Weekly_Sales': test_predictions
# })

# submission.to_csv('walmart_xgboost_submission.csv', index=False)
# print("Submission file created: walmart_xgboost_submission.csv")