<a href="https://colab.research.google.com/github/abarb2022/Walmart-Recruiting---Store-Sales-Forecasting/blob/main/model_experiment_sarima.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:
! mkdir ~/.kaggle

In [3]:
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 773MB/s]


In [6]:
! unzip walmart-recruiting-store-sales-forecasting

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [7]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder # For Type encoding if not using category dtype directly
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import gc # For garbage collection
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)


In [8]:
stores = pd.read_csv('stores.csv')
train = pd.read_csv("train.csv.zip")
features = pd.read_csv('features.csv.zip')
sample = pd.read_csv('sampleSubmission.csv.zip')
test = pd.read_csv('test.csv.zip')

In [9]:
# Convert 'Date' columns to datetime objects for easier manipulation
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# Merge features with train and test data.
# Note: 'IsHoliday' is present in both train/test and features.csv.
# We'll merge on it to ensure consistency, but if there were discrepancies,
# we'd need a more careful merge strategy.
train_df = pd.merge(train, features, on=['Store', 'Date', 'IsHoliday'], how='left')
test_df = pd.merge(test, features, on=['Store', 'Date', 'IsHoliday'], how='left')

# Merge store information
train_df = pd.merge(train_df, stores, on='Store', how='left')
test_df = pd.merge(test_df, stores, on='Store', how='left')

print("\n--- Merged Train Data Head ---")
print(train_df.head())
print("\n--- Merged Test Data Head ---")
print(test_df.head())

print("\n--- Merged Train Data Info ---")
print(train_df.info())
print("\n--- Merged Test Data Info ---")
print(test_df.info())

# Free up memory
del train, test, features, stores
gc.collect()


--- Merged Train Data Head ---
   Store  Dept       Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment Type    Size
0      1     1 2010-02-05      24924.50      False        42.31       2.572        NaN        NaN        NaN        NaN        NaN  211.096358         8.106    A  151315
1      1     1 2010-02-12      46039.49       True        38.51       2.548        NaN        NaN        NaN        NaN        NaN  211.242170         8.106    A  151315
2      1     1 2010-02-19      41595.55      False        39.93       2.514        NaN        NaN        NaN        NaN        NaN  211.289143         8.106    A  151315
3      1     1 2010-02-26      19403.54      False        46.63       2.561        NaN        NaN        NaN        NaN        NaN  211.319643         8.106    A  151315
4      1     1 2010-03-05      21827.90      False        46.50       2.625        NaN        NaN        NaN        Na

0

## **DATA CLEANING**


In [10]:
class MissingValueImputer(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to handle missing values for specific columns.
    - MarkDown columns: fill with 0.
    - Other specified numerical columns: fill with ffill then bfill, fallback to mean.
    """
    def __init__(self, markdown_cols=None, numerical_cols_to_impute=None):
        self.markdown_cols = markdown_cols if markdown_cols is not None else [f'MarkDown{i}' for i in range(1, 6)]
        self.numerical_cols_to_impute = numerical_cols_to_impute if numerical_cols_to_impute is not None else ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
        self.means = {} # To store means for fallback imputation during transform

    def fit(self, X, y=None):
        # Calculate means for fallback imputation from the training data
        for col in self.numerical_cols_to_impute:
            if col in X.columns:
                self.means[col] = X[col].mean()
        return self

    def transform(self, X):
        X_copy = X.copy()


        for col in self.markdown_cols:
          if col in X_copy.columns:
            X_copy[f"{col}_was_missing"] = X_copy[col].isna().astype(int)
            X_copy[col] = X_copy[col].fillna(0)


        # Impute other numerical columns with ffill then bfill, fallback to mean
        for col in self.numerical_cols_to_impute:
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].ffill().bfill() # Use ffill and bfill directly
                # Fallback to mean if NaNs still exist (e.g., if all values were NaN in a column)
                if X_copy[col].isnull().any() and col in self.means:
                    X_copy[col] = X_copy[col].fillna(self.means[col])
        return X_copy

In [16]:
class FastWalmartHolidayFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, date_column='Date', verbose=False):
        self.date_column = date_column
        self.verbose = verbose
        self.holiday_dates = {
            'super_bowl': pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08']),
            'labor_day': pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06']),
            'thanksgiving': pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29']),
            'christmas': pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])
        }

    def _get_holiday_features_vectorized(self, dates):
        features = pd.DataFrame(index=dates.index)
        for holiday in self.holiday_dates.keys():
            features[f'is_{holiday}'] = 0
            features[f'days_until_{holiday}'] = 999
            features[f'days_since_{holiday}'] = 999
        features['holiday_week_flag'] = 0

        for holiday_name, holiday_list in self.holiday_dates.items():
            is_holiday = dates.isin(holiday_list)
            features[f'is_{holiday_name}'] = is_holiday.astype(int)
            for holiday_date in holiday_list:
                days_diff = (holiday_date - dates).dt.days
                future_mask = days_diff > 0
                current_until = features[f'days_until_{holiday_name}']
                features[f'days_until_{holiday_name}'] = np.where(future_mask & (days_diff < current_until), days_diff, current_until)
                past_mask = days_diff <= 0
                days_since = -days_diff
                current_since = features[f'days_since_{holiday_name}']
                features[f'days_since_{holiday_name}'] = np.where(past_mask & (days_since < current_since), days_since, current_since)
        holiday_cols = [f'is_{holiday}' for holiday in self.holiday_dates.keys()]
        features['holiday_week_flag'] = features[holiday_cols].sum(axis=1).clip(0, 1)
        return features

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        dates = pd.to_datetime(X_copy[self.date_column])
        try:
            X_copy['week_of_year'] = dates.dt.isocalendar().week.astype(int)
        except:
            X_copy['week_of_year'] = dates.dt.week.astype(int)
        X_copy['day_of_week'] = dates.dt.dayofweek.astype(int)
        holiday_features = self._get_holiday_features_vectorized(dates)
        X_copy = pd.concat([X_copy, holiday_features], axis=1)
        X_copy['black_friday_week'] = ((X_copy['is_thanksgiving'] == 1) | (X_copy['days_since_thanksgiving'] <= 7)).astype(int)
        X_copy['christmas_buildup'] = ((X_copy['days_until_christmas'] <= 21) & (X_copy['days_until_christmas'] > 0)).astype(int)
        X_copy['post_super_bowl'] = ((X_copy['days_since_super_bowl'] <= 14) & (X_copy['days_since_super_bowl'] > 0)).astype(int)
        X_copy['seasonal_sin'] = np.sin(2 * np.pi * X_copy['week_of_year'] / 52)
        X_copy['seasonal_cos'] = np.cos(2 * np.pi * X_copy['week_of_year'] / 52)
        if 'CPI' in X_copy.columns and 'Fuel_Price' in X_copy.columns:
            X_copy['holiday_economic_index'] = (X_copy['CPI'] * X_copy['holiday_week_flag'] + X_copy['Fuel_Price'] * X_copy['holiday_week_flag'])
        return X_copy

In [17]:
class CategoricalFeatureConverter(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to convert specified columns to 'category' dtype
    for LightGBM to handle them efficiently.
    """
    def __init__(self, categorical_cols=None):
        self.categorical_cols = categorical_cols if categorical_cols is not None else ['Store', 'Dept', 'Type']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in self.categorical_cols:
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].astype('category')
        return X_copy

# Ensure Date column is preserved through transformations for SARIMAX indexing
class KeepDateImputer(MissingValueImputer):
    def transform(self, X):
        X_copy = super().transform(X)
        if 'Date' not in X_copy.columns and 'Date' in X.columns:
            X_copy['Date'] = X['Date'].copy()
        return X_copy

class KeepDateFeatureExtractor(FastWalmartHolidayFeatureExtractor):
    def transform(self, X):
        X_copy = super().transform(X)
        # Ensure Date is kept or recreated after feature extraction
        if 'Date' not in X_copy.columns and self.date_column in X.columns:
            X_copy['Date'] = X[self.date_column].copy()
        return X_copy

In [11]:
y_train = train_df['Weekly_Sales']
X_train = train_df.drop(columns=['Weekly_Sales', 'Id'], errors='ignore')

temp_train_df = X_train.copy()
temp_train_df['Date'] = pd.to_datetime(train_df['Date']) # Get original dates back for sorting
temp_train_df['Weekly_Sales'] = y_train

temp_train_df = temp_train_df.sort_values(by='Date').reset_index(drop=True)

# Define a cutoff date for validation
validation_cutoff_date = pd.to_datetime('2012-09-01')

X_train_split = temp_train_df[temp_train_df['Date'] < validation_cutoff_date]
y_train_split = temp_train_df[temp_train_df['Date'] < validation_cutoff_date]['Weekly_Sales']

X_val_split = temp_train_df[temp_train_df['Date'] >= validation_cutoff_date]
y_val_split = temp_train_df[temp_train_df['Date'] >= validation_cutoff_date]['Weekly_Sales']

def weighted_mean_absolute_error(y_true, y_pred, weights):
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

val_weights_raw = X_val_split['IsHoliday'].apply(lambda x: 5 if x else 1)


In [19]:
import pandas as pd
import numpy as np
import gc
from sklearn.pipeline import Pipeline
import warnings
from statsmodels.tsa.arima.model import ARIMA  # SARIMA without exogenous variables

# Suppress warnings from statsmodels
warnings.filterwarnings("ignore", category=UserWarning, module="statsmodels")
warnings.filterwarnings("ignore", category=RuntimeWarning, module="statsmodels")
warnings.filterwarnings("ignore", category=FutureWarning, module="statsmodels")
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Define preprocessing pipeline
preprocessing_with_date = Pipeline(steps=[
    ('imputer', MissingValueImputer()),
    ('categorical_converter', CategoricalFeatureConverter())
])

# Apply preprocessing
print("\n--- Applying Preprocessing Pipeline ---")
X_train_processed = preprocessing_with_date.fit_transform(X_train_split)
X_val_processed = preprocessing_with_date.transform(X_val_split)
print("Preprocessing complete.")

print(f"\n--- Data Dimensions After Preprocessing ---")
print(f"X_train_processed shape: {X_train_processed.shape}")
print(f"X_val_processed shape: {X_val_processed.shape}")

# SARIMA Parameters
sarima_order = (1, 0, 2)
sarima_seasonal_order = (0, 0, 0, 52)

def weighted_mean_absolute_error(y_true, y_pred, weights):
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

# Model storage
fitted_models = {}
all_individual_predictions = []
overall_results_summary = []

# Unique Store-Dept combinations
unique_store_depts_df = X_train_split[['Store', 'Dept']].drop_duplicates().sort_values(by=['Store', 'Dept']).reset_index(drop=True)
unique_store_depts = unique_store_depts_df.values

print(f"\n--- Starting SARIMA Modeling for {len(unique_store_depts)} Store-Department Combinations ---")

models_processed_count = 0
min_obs_required = 2 * sarima_seasonal_order[3] + sum(sarima_order) + sum(sarima_seasonal_order[:3]) + 1

for store_id, dept_id in unique_store_depts:
    models_processed_count += 1
    print(f"\n--- Processing Store: {int(store_id)}, Department: {int(dept_id)} ({models_processed_count}/{len(unique_store_depts)}) ---")

    train_idx_combo = (X_train_split['Store'] == store_id) & (X_train_split['Dept'] == dept_id)
    val_idx_combo = (X_val_split['Store'] == store_id) & (X_val_split['Dept'] == dept_id)

    single_train_X = X_train_processed.loc[train_idx_combo]
    single_train_y = y_train_split.loc[train_idx_combo]
    single_val_X = X_val_processed.loc[val_idx_combo]
    single_val_y = y_val_split.loc[val_idx_combo]

    if len(single_train_X) < min_obs_required:
        print(f"  Skipping (Store {int(store_id)}, Dept {int(dept_id)}): Not enough training data points.")
        overall_results_summary.append({
            'Store': int(store_id), 'Dept': int(dept_id), 'WMAE': np.nan, 'Status': 'Skipped (Too few train data)',
            'Train_Points': len(single_train_X), 'Val_Points': len(single_val_X)
        })
        gc.collect()
        continue

    if len(single_val_X) == 0:
        print(f"  Skipping (Store {int(store_id)}, Dept {int(dept_id)}): No validation data points.")
        overall_results_summary.append({
            'Store': int(store_id), 'Dept': int(dept_id), 'WMAE': np.nan, 'Status': 'Skipped (No val data)',
            'Train_Points': len(single_train_X), 'Val_Points': len(single_val_X)
        })
        gc.collect()
        continue

    try:
        y_train_series = single_train_y.set_axis(single_train_X['Date']).sort_index()
        y_val_series = single_val_y.set_axis(single_val_X['Date']).sort_index()
        val_weights = val_weights_raw.loc[val_idx_combo].set_axis(single_val_X['Date']).sort_index()

        if y_train_series.nunique() == 1:
            predicted_value = y_train_series.iloc[0]
            print(f"  Constant training series (value: {predicted_value:.2f}). Predicting constant value.")
            predictions = pd.Series(predicted_value, index=y_val_series.index)
            predictions_for_overall = pd.Series(predictions.values, index=single_val_X.index, name='Weekly_Sales_Pred')
            all_individual_predictions.append(predictions_for_overall)

            wmae = weighted_mean_absolute_error(y_val_series, predictions, val_weights)
            overall_results_summary.append({
                'Store': int(store_id), 'Dept': int(dept_id), 'WMAE': wmae,
                'Status': 'Success (Constant Series Prediction)',
                'Train_Points': len(single_train_X), 'Val_Points': len(single_val_X)
            })
            gc.collect()
            continue

        sarima_model = ARIMA(
            y_train_series,
            order=sarima_order,
            seasonal_order=sarima_seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False
        )
        sarima_results = sarima_model.fit()

        fitted_models[(int(store_id), int(dept_id))] = sarima_results

        predictions = sarima_results.predict(
            start=y_val_series.index[0],
            end=y_val_series.index[-1]
        )
        predictions[predictions < 0] = 0

        predictions_for_overall = pd.Series(predictions.values, index=single_val_X.index, name='Weekly_Sales_Pred')
        all_individual_predictions.append(predictions_for_overall)

        wmae = weighted_mean_absolute_error(y_val_series, predictions, val_weights)
        status = 'Success'

        print(f"  WMAE: {wmae:.2f}")
        overall_results_summary.append({
            'Store': int(store_id), 'Dept': int(dept_id), 'WMAE': wmae, 'Status': status,
            'Train_Points': len(single_train_X), 'Val_Points': len(single_val_X)
        })

    except Exception as e:
        print(f"  Error fitting/predicting (Store {int(store_id)}, Dept {int(dept_id)}): {e}")
        overall_results_summary.append({
            'Store': int(store_id), 'Dept': int(dept_id), 'WMAE': np.nan, 'Status': f'Failed: {type(e).__name__}',
            'Train_Points': len(single_train_X), 'Val_Points': len(single_val_X)
        })
    finally:
        gc.collect()

print("\n--- All SARIMA Modeling Attempts Complete ---")

# Combine all individual predictions for overall WMAE
print("\n--- Combining all individual predictions for overall WMAE ---")

if all_individual_predictions:
    combined_predictions = pd.concat(all_individual_predictions).sort_index()
    y_true_overall = y_val_split.copy()
    y_pred_overall = combined_predictions.reindex(y_val_split.index)
    weights_overall = val_weights_raw.copy()

    overall_df = pd.DataFrame({
        'y_true': y_true_overall,
        'y_pred': y_pred_overall,
        'weights': weights_overall
    }).dropna(subset=['y_true', 'y_pred', 'weights'])

    if not overall_df.empty and overall_df['weights'].sum() > 0:
        overall_wmae = (overall_df['weights'] * np.abs(overall_df['y_true'] - overall_df['y_pred'])).sum() / overall_df['weights'].sum()
        print(f"\n--- Overall Weighted Mean Absolute Error (WMAE) on Validation Set: {overall_wmae:.2f} ---")
    else:
        print("\n--- Cannot calculate overall WMAE: No valid predictions or weights available. ---")
else:
    print("\n--- No individual predictions were generated. Overall WMAE cannot be calculated. ---")

results_df = pd.DataFrame(overall_results_summary)
print("\n--- Summary of Individual Store-Dept Model Results ---")
print(results_df.head(15))

successful_models_summary = results_df.dropna(subset=['WMAE'])
if not successful_models_summary.empty:
    print(f"\nTotal successful models fitted: {len(successful_models_summary)} (out of {len(unique_store_depts)})")
    print(f"Total failed/skipped models: {len(results_df) - len(successful_models_summary)}")
    print(f"Average WMAE for successfully fitted models (individual series): {successful_models_summary['WMAE'].mean():.2f}")
else:
    print("\nNo successful SARIMA models were fitted or predicted.")

print("\n--- Number of fitted models stored: ", len(fitted_models))



--- Applying Preprocessing Pipeline ---
Preprocessing complete.

--- Data Dimensions After Preprocessing ---
X_train_processed shape: (397841, 21)
X_val_processed shape: (23729, 21)




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
--- Processing Store: 22, Department: 83 (1670/3326) ---
  WMAE: 106.59

--- Processing Store: 22, Department: 85 (1671/3326) ---
  WMAE: 288.34

--- Processing Store: 22, Department: 87 (1672/3326) ---
  WMAE: 2828.16

--- Processing Store: 22, Department: 90 (1673/3326) ---
  WMAE: 1995.79

--- Processing Store: 22, Department: 91 (1674/3326) ---
  WMAE: 1336.71

--- Processing Store: 22, Department: 92 (1675/3326) ---
  WMAE: 5384.35

--- Processing Store: 22, Department: 93 (1676/3326) ---
  WMAE: 467.61

--- Processing Store: 22, Department: 94 (1677/3326) ---
  Error fitting/predicting (Store 22, Dept 94): 'The `start` argument could not be matched to a location related to the index of the data.'

--- Processing Store: 22, Department: 95 (1678/3326) ---
  WMAE: 3651.32

--- Processing Store: 22, Department: 96 (1679/3326) ---
  Skipping (Store 22, Dept 96): Not enough training data points.

--- Processing Store: 22,

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import gc
from statsmodels.tsa.statespace.sarimax import SARIMAX # Import SARIMAX
import warnings

# Suppress specific warnings from statsmodels that can be noisy during loops
warnings.filterwarnings("ignore", category=UserWarning, module="statsmodels")
warnings.filterwarnings("ignore", category=RuntimeWarning, module="statsmodels")
warnings.filterwarnings("ignore", category=FutureWarning, module="statsmodels")
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning, module="statsmodels") # Often appears with older numpy/statsmodels versions
preprocessing_with_date = Pipeline(steps=[
    ('imputer', KeepDateImputer()),
    ('feature_extractor', KeepDateFeatureExtractor(verbose=False)),
    ('categorical_converter', CategoricalFeatureConverter())
])



print("\n--- Applying Preprocessing Pipeline ---")
X_train_processed = preprocessing_with_date.fit_transform(X_train_split)
X_val_processed = preprocessing_with_date.transform(X_val_split)
print("Preprocessing complete.")

print(f"\n--- Data Dimensions After Preprocessing ---")
print(f"X_train_processed shape: {X_train_processed.shape}")
print(f"X_val_processed shape: {X_val_processed.shape}")

# Prepare common exogenous variable columns
numeric_exog_cols = [
    'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
    'week_of_year', 'day_of_week', 'is_super_bowl', 'is_labor_day',
    'is_thanksgiving', 'is_christmas', 'days_until_super_bowl',
    'days_since_super_bowl', 'days_until_labor_day', 'days_since_labor_day',
    'days_until_thanksgiving', 'days_since_thanksgiving',
    'days_until_christmas', 'days_since_christmas', 'holiday_week_flag',
    'black_friday_week', 'christmas_buildup', 'post_super_bowl',
    'seasonal_sin', 'seasonal_cos', 'holiday_economic_index'
]
# Filter to only include columns that actually exist and are numeric after preprocessing
final_exog_cols = [col for col in numeric_exog_cols if col in X_train_processed.columns and pd.api.types.is_numeric_dtype(X_train_processed[col])]
print(f"Final exogenous columns used ({len(final_exog_cols)}): {final_exog_cols}")

# SARIMA Model Parameters (can be tuned, these are just examples)
sarima_order = (1, 0, 0)
sarima_seasonal_order = (0, 0, 0, 52) # 52 for weekly seasonality

# Define weighted_mean_absolute_error function
def weighted_mean_absolute_error(y_true, y_pred, weights):
    # Ensure all inputs are pandas Series for consistent indexing
    if not isinstance(y_true, pd.Series):
        y_true = pd.Series(y_true)
    if not isinstance(y_pred, pd.Series):
        y_pred = pd.Series(y_pred)
    if not isinstance(weights, pd.Series):
        weights = pd.Series(weights)

    # Align all series based on their DatetimeIndex (or original index if Dates are not unique identifiers)
    # Since y_true, y_pred, weights might come from different filtering operations,
    # it's best to align them by their *original* index if they represent different parts of the same validation set.
    # However, if they are already series of the *same* validation period, direct alignment is fine.
    # For overall WMAE, we will concatenate based on the original index of X_val_split.

    # This function is used for *per-series* WMAE calculation first, then for overall
    # For per-series: they should already be aligned by date if you indexed them by date.
    # For overall: use the index of y_val_split.

    # If y_true, y_pred, weights are from a single (Store,Dept) combination
    # and have already been indexed by date, this alignment ensures integrity.
    aligned_df = pd.DataFrame({
        'y_true': y_true,
        'y_pred': y_pred.reindex(y_true.index), # Reindex y_pred to y_true's index
        'weights': weights.reindex(y_true.index) # Reindex weights to y_true's index
    }).dropna() # Drop rows where any of the three are missing after alignment

    if aligned_df.empty:
        return np.nan

    aligned_df['weights'] = aligned_df['weights'].replace([np.inf, -np.inf], np.nan).fillna(0)

    if aligned_df['weights'].sum() == 0:
        return np.nan # Avoid division by zero if all weights are zero or NaN

    return (aligned_df['weights'] * np.abs(aligned_df['y_true'] - aligned_df['y_pred'])).sum() / aligned_df['weights'].sum()


# --- Store Models and Predictions ---
fitted_models = {} # Dictionary to store SARIMA results objects: {(Store, Dept): sarima_results}
all_individual_predictions = [] # List to collect predictions for later overall WMAE calculation
overall_results_summary = [] # List to store WMAE per series and status


# --- Loop through all unique Store-Department combinations ---
# Create a DataFrame of unique Store-Dept combinations to iterate
unique_store_depts_df = X_train_split[['Store', 'Dept']].drop_duplicates().sort_values(by=['Store', 'Dept']).reset_index(drop=True)
unique_store_depts = unique_store_depts_df.values

print(f"\n--- Starting SARIMA Modeling for {len(unique_store_depts)} Store-Department Combinations ---")

models_processed_count = 0

# No max_combinations_to_run limit, as per your request to process all.
# Be aware: This WILL take a very long time and consume significant memory.

for store_id, dept_id in unique_store_depts:
    models_processed_count += 1
    print(f"\n--- Processing Store: {int(store_id)}, Department: {int(dept_id)} ({models_processed_count}/{len(unique_store_depts)}) ---")

    # Filter processed data for the specific store and department using boolean indexing
    train_idx_combo = (X_train_split['Store'] == store_id) & (X_train_split['Dept'] == dept_id)
    val_idx_combo = (X_val_split['Store'] == store_id) & (X_val_split['Dept'] == dept_id)

    single_train_X = X_train_processed.loc[train_idx_combo]
    single_train_y = y_train_split.loc[train_idx_combo]

    single_val_X = X_val_processed.loc[val_idx_combo]
    single_val_y = y_val_split.loc[val_idx_combo]

    # Check if there's enough data for this combination
    min_obs_required = 2 * sarima_seasonal_order[3] + sum(sarima_order) + sum(sarima_seasonal_order[:3]) + len(final_exog_cols) + 1

    if len(single_train_X) < min_obs_required:
        print(f"  Skipping (Store {int(store_id)}, Dept {int(dept_id)}): Not enough training data points ({len(single_train_X)} < {min_obs_required} required).")
        overall_results_summary.append({
            'Store': int(store_id), 'Dept': int(dept_id), 'WMAE': np.nan, 'Status': 'Skipped (Too few train data)',
            'Train_Points': len(single_train_X), 'Val_Points': len(single_val_X)
        })
        gc.collect()
        continue
    if len(single_val_X) == 0:
        print(f"  Skipping (Store {int(store_id)}, Dept {int(dept_id)}): No validation data points.")
        overall_results_summary.append({
            'Store': int(store_id), 'Dept': int(dept_id), 'WMAE': np.nan, 'Status': 'Skipped (No val data)',
            'Train_Points': len(single_train_X), 'Val_Points': len(single_val_X)
        })
        gc.collect()
        continue

    try:
        # Set 'Date' as index for SARIMAX for the single series
        y_train_sarimax_single = single_train_y.set_axis(single_train_X['Date']).sort_index()
        y_val_sarimax_single = single_val_y.set_axis(single_val_X['Date']).sort_index()

        # Prepare weights for the single validation series, also indexed by Date
        single_val_weights = val_weights_raw.loc[val_idx_combo].set_axis(single_val_X['Date']).sort_index()

        # Prepare exogenous variables for the single series
        exog_train_single = single_train_X[final_exog_cols].set_index(single_train_X['Date']).sort_index()
        exog_val_single = single_val_X[final_exog_cols].set_index(single_val_X['Date']).sort_index()

        # Check for constant series (SARIMA often struggles with constant data)
        if y_train_sarimax_single.nunique() == 1:
            predicted_value = y_train_sarimax_single.iloc[0]
            print(f"  Training series is constant (Sales: {predicted_value:.2f}). Predicting constant value.")
            predictions_sarima_single = pd.Series(predicted_value, index=y_val_sarimax_single.index)

            # Store prediction with its original index for later concatenation
            predictions_for_overall_wmae = pd.Series(predictions_sarima_single.values, index=single_val_X.index, name='Weekly_Sales_Pred')
            all_individual_predictions.append(predictions_for_overall_wmae)

            wmae = weighted_mean_absolute_error(y_val_sarimax_single, predictions_sarima_single, single_val_weights)
            print(f"  WMAE: {wmae:.2f}")
            overall_results_summary.append({
                'Store': int(store_id), 'Dept': int(dept_id), 'WMAE': wmae, 'Status': 'Success (Constant Series Prediction)',
                'Train_Points': len(single_train_X), 'Val_Points': len(single_val_X)
            })
            gc.collect()
            continue

        # Fit SARIMAX model
        sarima_model = SARIMAX(
            y_train_sarimax_single,
            exog=exog_train_single,
            order=sarima_order,
            seasonal_order=sarima_seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False
        )
        sarima_results = sarima_model.fit(disp=False, maxiter=500)

        # Store the fitted model
        fitted_models[(int(store_id), int(dept_id))] = sarima_results

        # Make predictions
        exog_val_aligned_single = exog_val_single.reindex(y_val_sarimax_single.index)

        if exog_val_aligned_single.empty:
            print(f"  Warning: exog_val_aligned_single is empty for (Store {int(store_id)}, Dept {int(dept_id)}). Cannot predict.")
            wmae = np.nan
            status = 'Failed: Empty exog_val for prediction'
        else:
            predictions_sarima_single = sarima_results.predict(
                start=y_val_sarimax_single.index[0],
                end=y_val_sarimax_single.index[-1],
                exog=exog_val_aligned_single
            )
            predictions_sarima_single[predictions_sarima_single < 0] = 0 # Ensure non-negative sales

            # Store prediction with its original index for later concatenation
            predictions_for_overall_wmae = pd.Series(predictions_sarima_single.values, index=single_val_X.index, name='Weekly_Sales_Pred')
            all_individual_predictions.append(predictions_for_overall_wmae)

            wmae = weighted_mean_absolute_error(y_val_sarimax_single, predictions_sarima_single, single_val_weights)
            status = 'Success'

        print(f"  WMAE: {wmae:.2f}")
        overall_results_summary.append({
            'Store': int(store_id), 'Dept': int(dept_id), 'WMAE': wmae, 'Status': status,
            'Train_Points': len(single_train_X), 'Val_Points': len(single_val_X)
        })

    except Exception as e:
        print(f"  Error fitting/predicting (Store {int(store_id)}, Dept {int(dept_id)}): {e}")
        overall_results_summary.append({
            'Store': int(store_id), 'Dept': int(dept_id), 'WMAE': np.nan, 'Status': f'Failed: {type(e).__name__}',
            'Train_Points': len(single_train_X), 'Val_Points': len(single_val_X)
        })
    finally:
        gc.collect()

print("\n--- All SARIMA Modeling Attempts Complete ---")

# --- Combine all predictions for overall WMAE ---
print("\n--- Combining all individual predictions for overall WMAE ---")

if all_individual_predictions:
    # Concatenate all predictions. Use pd.concat and then reindex to the original X_val_split index
    # to ensure all original validation points are covered, and missing ones are NaNs.
    combined_predictions = pd.concat(all_individual_predictions).sort_index()

    # Reindex y_val_split and val_weights_raw to ensure they align perfectly with the predictions
    # based on the original X_val_split index.
    y_true_overall = y_val_split.copy()
    y_pred_overall = combined_predictions.reindex(y_val_split.index) # Align predictions to true values' index
    weights_overall = val_weights_raw.copy()

    # Create a DataFrame to handle missing predictions gracefully for overall WMAE calculation
    overall_df = pd.DataFrame({
        'y_true': y_true_overall,
        'y_pred': y_pred_overall,
        'weights': weights_overall
    }).dropna(subset=['y_true', 'y_pred', 'weights']) # Only consider points where we have all three

    if not overall_df.empty and overall_df['weights'].sum() > 0:
        overall_wmae = (overall_df['weights'] * np.abs(overall_df['y_true'] - overall_df['y_pred'])).sum() / overall_df['weights'].sum()
        print(f"\n--- Overall Weighted Mean Absolute Error (WMAE) on Validation Set: {overall_wmae:.2f} ---")
    else:
        print("\n--- Cannot calculate overall WMAE: No valid predictions or weights available. ---")
else:
    print("\n--- No individual predictions were generated. Overall WMAE cannot be calculated. ---")


results_df = pd.DataFrame(overall_results_summary)
print("\n--- Summary of Individual Store-Dept Model Results ---")
print(results_df.head(15)) # Print first 15 results

successful_models_summary = results_df.dropna(subset=['WMAE'])
if not successful_models_summary.empty:
    print(f"\nTotal successful models fitted: {len(successful_models_summary)} (out of {len(unique_store_depts)})")
    print(f"Total failed/skipped models: {len(results_df) - len(successful_models_summary)}")
    print(f"Average WMAE for successfully fitted models (individual series): {successful_models_summary['WMAE'].mean():.2f}")
else:
    print("\nNo successful SARIMA models were fitted or predicted.")

print("\n--- Number of fitted models stored: ", len(fitted_models))



--- Applying Preprocessing Pipeline ---
Preprocessing complete.

--- Data Dimensions After Preprocessing ---
X_train_processed shape: (397841, 42)
X_val_processed shape: (23729, 42)
Final exogenous columns used (25): ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'week_of_year', 'day_of_week', 'is_super_bowl', 'is_labor_day', 'is_thanksgiving', 'is_christmas', 'days_until_super_bowl', 'days_since_super_bowl', 'days_until_labor_day', 'days_since_labor_day', 'days_until_thanksgiving', 'days_since_thanksgiving', 'days_until_christmas', 'days_since_christmas', 'holiday_week_flag', 'black_friday_week', 'christmas_buildup', 'post_super_bowl', 'seasonal_sin', 'seasonal_cos', 'holiday_economic_index']

--- Starting SARIMA Modeling for 3326 Store-Department Combinations ---

--- Processing Store: 1, Department: 1 (1/3326) ---
  WMAE: 6001.46

--- Processing Store: 1, Department: 2 (2/3326) ---
  WMAE: 1692.87

--- Processing Store: 1, Department: 3 (3/3326) ---
  WMAE: 9365.77

--- Proce

In [None]:
%pip install -q dagshub

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.4/203.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install mlflow==2.7.1

Collecting mlflow==2.7.1
  Downloading mlflow-2.7.1-py3-none-any.whl.metadata (12 kB)
Collecting cloudpickle<3 (from mlflow==2.7.1)
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting databricks-cli<1,>=0.8.7 (from mlflow==2.7.1)
  Downloading databricks_cli-0.18.0-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting protobuf<5,>=3.12.0 (from mlflow==2.7.1)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pytz<2024 (from mlflow==2.7.1)
  Downloading pytz-2023.4-py2.py3-none-any.whl.metadata (22 kB)
Collecting packaging<24 (from mlflow==2.7.1)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Collecting importlib-metadata!=4.7.0,<7,>=3.7.0 (from mlflow==2.7.1)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl.metadata (4.9 kB)
Collecting alembic!=1.10.0,<2 (from mlflow==2.7.1)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<7,>=4.0.0 (from mlflow==2.7.1)
  Downlo

In [None]:

import dagshub
# Try to get credentials from environment first
dagshub.init(
    repo_owner='abarb22',
    repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
    mlflow=True
)



In [None]:

import mlflow
import mlflow.lightgbm  # Use sklearn logging instead
from datetime import datetime

mlflow.set_experiment("LightGBM_Training")

<Experiment: artifact_location='mlflow-artifacts:/d44fa10e33c34452b1f57e1d7b550038', creation_time=1751542655791, experiment_id='0', last_update_time=1751542655791, lifecycle_stage='active', name='LightGBM_Training', tags={}>

In [None]:
with mlflow.start_run(run_name="LightGBM_Data_Cleaning"):
    # Log data cleaning parameters
    mlflow.log_param("missing_value_strategy", "MarkDowns->0, others->ffill/bfill/mean")
    mlflow.log_param("date_features_extracted", True)


    # After cleaning, log metrics about data quality
    mlflow.log_metric("train_samples", len(train_df))
    mlflow.log_metric("missing_values_filled", train_df.isna().sum().sum())

🏃 View run LightGBM_Data_Cleaning at: https://dagshub.com/abarb22/Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/0/runs/00101f5024ba48a6babbe257b51d7453
🧪 View experiment at: https://dagshub.com/abarb22/Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/0
