<a href="https://colab.research.google.com/github/abarb2022/Walmart-Recruiting---Store-Sales-Forecasting/blob/main/model_experiment_N-BEATS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Downloading Kaggle data sets directly into Colab**

Install the kaggle python library

In [32]:
! pip install kaggle



Mount the Google drive so you can store your kaggle API credentials for future use

In [33]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Create a `.kaggle` directory in the home directory of the temporary Colab instance.

Download your kaggle API key (.json file). You can do this by going to your kaggle account page and clicking 'Create new API token' under the API section.

In [34]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


Upload the json file to Google Drive and then copy to the temporary location.

In [35]:
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

Restrict the file permissions so that only the owner can read and write the credentials file (mode 600).

In [36]:
! chmod 600 ~/.kaggle/kaggle.json

**Competitions and Datasets are the two types of Kaggle data**

**1. Download competition data**

If you get 403 Forbidden error, you need to click 'Late Submission' on the Kaggle page for that competition.

In [37]:
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

walmart-recruiting-store-sales-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)


Unzip, in case the downloaded file is zipped. Refresh the files on the left hand side to update the view.

In [38]:
! unzip walmart-recruiting-store-sales-forecasting

Archive:  walmart-recruiting-store-sales-forecasting.zip
replace features.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [53]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import gc # For garbage collection
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

In [54]:
# Load the raw competition tables from the working directory.
train = pd.read_csv('train.csv.zip')
test = pd.read_csv('test.csv.zip')
features = pd.read_csv('features.csv.zip')
stores = pd.read_csv('stores.csv')
sample = pd.read_csv('sampleSubmission.csv.zip')

In [55]:
# Parse the 'Date' columns so rows can be filtered and sorted chronologically.
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# 'IsHoliday' is present in both train/test and features.csv, so it is part of
# the join key. validate='many_to_one' makes pandas raise a MergeError if the
# right-hand table has duplicate keys, which would otherwise silently
# duplicate sales rows in the left join.
feature_keys = ['Store', 'Date', 'IsHoliday']
train_df = pd.merge(train, features, on=feature_keys, how='left', validate='many_to_one')
test_df = pd.merge(test, features, on=feature_keys, how='left', validate='many_to_one')

# Attach the per-store attributes.
train_df = pd.merge(train_df, stores, on='Store', how='left', validate='many_to_one')
test_df = pd.merge(test_df, stores, on='Store', how='left', validate='many_to_one')

print("\n--- Merged Train Data Head ---")
print(train_df.head())
print("\n--- Merged Test Data Head ---")
print(test_df.head())

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\n--- Merged Train Data Info ---")
train_df.info()
print("\n--- Merged Test Data Info ---")
test_df.info()

# Free up memory. NOTE: after this the raw frames are gone, so re-running this
# cell requires re-running the loading cell first.
del train, test, features, stores
gc.collect();  # trailing ';' keeps the collected-object count out of the cell output


--- Merged Train Data Head ---
   Store  Dept       Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment Type    Size
0      1     1 2010-02-05      24924.50      False        42.31       2.572        NaN        NaN        NaN        NaN        NaN  211.096358         8.106    A  151315
1      1     1 2010-02-12      46039.49       True        38.51       2.548        NaN        NaN        NaN        NaN        NaN  211.242170         8.106    A  151315
2      1     1 2010-02-19      41595.55      False        39.93       2.514        NaN        NaN        NaN        NaN        NaN  211.289143         8.106    A  151315
3      1     1 2010-02-26      19403.54      False        46.63       2.561        NaN        NaN        NaN        NaN        NaN  211.319643         8.106    A  151315
4      1     1 2010-03-05      21827.90      False        46.50       2.625        NaN        NaN        NaN        Na

26

In [56]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7d4c59bc7b90>

In [57]:
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [58]:
class WalmartDataset(Dataset):
    """Sliding-window dataset over one chronologically ordered sequence.

    Sample ``i`` is ``(X[i : i + lookback_window], y[i + lookback_window])``,
    followed by ``weights[i + lookback_window]`` when weights were supplied.

    Args:
        X: Sequence of input rows; converted to a float tensor.
        y: Sequence of targets with the same length as ``X``.
        weights: Optional per-row weights with the same length as ``y``.
        lookback_window: Number of consecutive rows in each input window.

    Raises:
        ValueError: If ``lookback_window`` is not positive, if the lengths of
            ``X``, ``y`` and ``weights`` disagree, or if there are fewer rows
            than ``lookback_window``.
    """
    def __init__(self, X, y, weights=None, lookback_window=52):
        if lookback_window < 1:
            raise ValueError("lookback_window must be a positive integer")

        self.X = torch.FloatTensor(X)  # Features (e.g., normalized sales)
        self.y = torch.FloatTensor(y)  # Targets (e.g., next week's sales)
        self.weights = torch.FloatTensor(weights) if weights is not None else None
        self.lookback_window = lookback_window

        # Validate shapes
        if len(self.X) != len(self.y):
            raise ValueError("X and y must have the same length")
        if self.weights is not None and len(self.weights) != len(self.y):
            raise ValueError("weights must match the length of y")
        if len(self.X) < self.lookback_window:
            raise ValueError(f"Not enough samples for lookback_window={lookback_window}")

    def __len__(self):
        # One sample per row that has a full window of history before it.
        return max(0, len(self.X) - self.lookback_window)

    def __getitem__(self, idx):
        n_samples = len(self)
        # Support Python-style negative indices. Without this, a negative idx
        # would slice an empty window and pair it with an unrelated target.
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"index out of range for dataset of length {n_samples}")

        target_pos = idx + self.lookback_window
        x = self.X[idx:target_pos]
        y = self.y[target_pos]

        if self.weights is not None:
            return x, y, self.weights[target_pos]
        return x, y

In [59]:
class NBeatsBlock(nn.Module):
    """Single N-BEATS block: an MLP that emits a backcast and a 1-step forecast.

    The input passes through ``layers`` ReLU-activated linear layers, is
    projected to ``theta_size`` coefficients, and those coefficients are
    mapped by two linear bases to a backcast of width ``input_size`` and a
    forecast of width 1.

    Args:
        input_size: Width of the (flattened) input window.
        theta_size: Number of basis coefficients.
        basis_function: Basis type; only ``'generic'`` is implemented.
        layers: Number of fully connected layers (at least one is created).
        layer_size: Hidden width of the fully connected layers.

    Raises:
        ValueError: If ``basis_function`` is not a supported basis.
    """
    SUPPORTED_BASES = ('generic',)

    def __init__(self, input_size, theta_size, basis_function, layers, layer_size):
        super().__init__()
        # Fail at construction time: an unsupported basis would otherwise
        # build a block with no basis layers and crash later inside forward().
        if basis_function not in self.SUPPORTED_BASES:
            raise ValueError(
                f"Unsupported basis_function {basis_function!r}; "
                f"expected one of {self.SUPPORTED_BASES}"
            )

        self.input_size = input_size
        self.theta_size = theta_size
        self.basis_function = basis_function

        # Fully connected layers
        self.fc_layers = nn.ModuleList()
        self.fc_layers.append(nn.Linear(input_size, layer_size))
        for _ in range(layers - 1):
            self.fc_layers.append(nn.Linear(layer_size, layer_size))

        # Theta layer
        self.theta_layer = nn.Linear(layer_size, theta_size)

        # Generic basis: learned linear maps from theta to backcast / forecast.
        self.backcast_basis = nn.Linear(theta_size, input_size)
        self.forecast_basis = nn.Linear(theta_size, 1)  # Forecasting 1 step ahead

    def forward(self, x):
        """Return ``(backcast, forecast)`` for a batch ``x`` whose last dimension is ``input_size``."""
        for layer in self.fc_layers:
            x = torch.relu(layer(x))

        theta = self.theta_layer(x)

        backcast = self.backcast_basis(theta)
        forecast = self.forecast_basis(theta)
        return backcast, forecast

In [60]:
class NBeatsNet(nn.Module):
    """N-BEATS network built from ``stacks * blocks_per_stack`` generic blocks.

    Every block receives the residual left by the previous blocks' backcasts;
    the network output is the sum of all block forecasts, shaped
    ``(batch_size, 1)``.
    """
    def __init__(self, input_size, stacks=2, blocks_per_stack=3, layers=4, layer_size=256, theta_size=16):
        super().__init__()
        self.input_size = input_size
        self.stacks = stacks
        self.blocks_per_stack = blocks_per_stack

        # All blocks share one configuration; they are stored flat, stack after stack.
        block_config = dict(
            input_size=input_size,
            theta_size=theta_size,
            basis_function='generic',
            layers=layers,
            layer_size=layer_size,
        )
        self.blocks = nn.ModuleList([
            NBeatsBlock(**block_config)
            for _stack in range(stacks)
            for _block in range(blocks_per_stack)
        ])

    def forward(self, x):
        # Collapse everything after the batch dimension into one feature axis.
        n_rows = x.shape[0]
        remaining = x.view(n_rows, -1)

        total_forecast = torch.zeros(n_rows, 1, device=remaining.device)
        for blk in self.blocks:
            block_backcast, block_forecast = blk(remaining)
            # Remove what this block explained before handing on the residual.
            remaining = remaining - block_backcast
            total_forecast = total_forecast + block_forecast

        return total_forecast

In [61]:
class MissingValueImputer(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to handle missing values for specific columns.
    - MarkDown columns: add a ``<col>_was_missing`` 0/1 indicator, then fill with 0.
    - Other specified numerical columns: forward-fill, then back-fill, and
      finally fall back to the column mean learned in ``fit``.

    Args:
        markdown_cols: Columns treated as markdowns; defaults to
            ``MarkDown1`` .. ``MarkDown5``.
        numerical_cols_to_impute: Columns imputed by ffill/bfill/mean; defaults
            to ``Temperature``, ``Fuel_Price``, ``CPI`` and ``Unemployment``.
    """
    def __init__(self, markdown_cols=None, numerical_cols_to_impute=None):
        self.markdown_cols = markdown_cols if markdown_cols is not None else [f'MarkDown{i}' for i in range(1, 6)]
        self.numerical_cols_to_impute = numerical_cols_to_impute if numerical_cols_to_impute is not None else ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
        self.means = {} # To store means for fallback imputation during transform

    def fit(self, X, y=None):
        """Learn the fallback mean of every configured numerical column present in ``X``."""
        # Rebuilt from scratch so a second fit() never keeps means from an earlier fit.
        self.means = {
            col: X[col].mean()
            for col in self.numerical_cols_to_impute
            if col in X.columns
        }
        return self

    def transform(self, X):
        """Return an imputed copy of ``X``; the input frame is left untouched."""
        X_copy = X.copy()

        for col in self.markdown_cols:
            if col in X_copy.columns:
                # Record missingness before it is erased by the zero fill.
                X_copy[f"{col}_was_missing"] = X_copy[col].isna().astype(int)
                X_copy[col] = X_copy[col].fillna(0)

        # Impute other numerical columns with ffill then bfill, fallback to mean.
        # NOTE(review): the fill follows the row order of X and does not group
        # by store, so a gap can be filled from a neighbouring row that belongs
        # to a different store — confirm this is acceptable.
        for col in self.numerical_cols_to_impute:
            if col in X_copy.columns:
                # .ffill()/.bfill() replace the deprecated fillna(method=...).
                X_copy[col] = X_copy[col].ffill().bfill()
                # Fallback to mean if NaNs still exist (e.g., if all values were NaN in a column)
                if X_copy[col].isnull().any() and col in self.means:
                    X_copy[col] = X_copy[col].fillna(self.means[col])
        return X_copy

In [62]:
class NBEATSLabelEncoder(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to label-encode categorical columns.

    Values are compared as strings. Any value not seen during ``fit`` is
    encoded as the extra ``'unknown'`` class, which is registered on each
    encoder at fit time so that ``transform`` never modifies fitted state.

    Args:
        categorical_cols: Columns to encode; defaults to ``Store``, ``Dept``
            and ``Type``.
    """
    def __init__(self, categorical_cols=None):
        self.categorical_cols = categorical_cols if categorical_cols is not None else ['Store', 'Dept', 'Type']
        self.label_encoders = {}

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per configured column that is present in ``X``."""
        self.label_encoders = {}
        for col in self.categorical_cols:
            if col in X.columns:
                encoder = LabelEncoder()
                encoder.fit(X[col].astype(str))
                # Reserve the fallback class now (appended last, so the codes
                # of the real categories are unchanged).
                if 'unknown' not in encoder.classes_:
                    encoder.classes_ = np.array(list(encoder.classes_) + ['unknown'])
                self.label_encoders[col] = encoder
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by integer codes."""
        X_copy = X.copy()
        for col in self.categorical_cols:
            if col in X_copy.columns and col in self.label_encoders:
                encoder = self.label_encoders[col]
                as_text = X_copy[col].astype(str)
                # Vectorised replacement of unseen categories by 'unknown'.
                as_text = as_text.where(as_text.isin(encoder.classes_), 'unknown')
                X_copy[col] = encoder.transform(as_text)
        return X_copy

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Impute missing values first, then label-encode the categorical columns.
preprocessing_pipeline = Pipeline([
    ('missing_value_imputer', MissingValueImputer()),
    ('label_encoder', NBEATSLabelEncoder())
])

print("Preparing training data...")
X_train = train_df.drop(['Weekly_Sales'], axis=1)
y_train = train_df['Weekly_Sales']

print("\n--- Applying Preprocessing Pipeline to Train Data ---")
X_train_processed = preprocessing_pipeline.fit_transform(X_train, y_train)
X_train_processed['Weekly_Sales'] = y_train.values

print("\n--- Applying Preprocessing Pipeline to Test Data ---")
# For the test set, we only call transform, as fit was done on the training data.
X_test_processed = preprocessing_pipeline.transform(test_df.drop(columns=['Id'], errors='ignore'))

# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
print("\nProcessed X_train_processed info:")
X_train_processed.info()
print("\nProcessed X_test_processed info:")
X_test_processed.info()

# Verify no missing values in processed data
print("\nMissing values in processed X_train_processed:\n", X_train_processed.isnull().sum().sum())
print("Missing values in processed X_test_processed:\n", X_test_processed.isnull().sum().sum())

Preparing training data...

--- Applying Preprocessing Pipeline to Train Data ---

--- Applying Preprocessing Pipeline to Test Data ---

Processed X_train_processed info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 21 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Store                  421570 non-null  int64         
 1   Dept                   421570 non-null  int64         
 2   Date                   421570 non-null  datetime64[ns]
 3   IsHoliday              421570 non-null  bool          
 4   Temperature            421570 non-null  float64       
 5   Fuel_Price             421570 non-null  float64       
 6   MarkDown1              421570 non-null  float64       
 7   MarkDown2              421570 non-null  float64       
 8   MarkDown3              421570 non-null  float64       
 9   MarkDown4              421570 non-null  float64      

In [64]:

validation_cutoff_date = pd.to_datetime('2012-09-01')
lookback_window = 52

# Order rows so that every (Store, Dept) series is contiguous and chronological.
single_series_df = train_df.copy()
single_series_df = single_series_df.sort_values(["Store", "Dept", "Date"])

# Chronological split at the cutoff date.
train_series = single_series_df[single_series_df["Date"] < validation_cutoff_date]
val_series = single_series_df[single_series_df["Date"] >= validation_cutoff_date]

# Normalize based on training only
sales_mean = train_series["Weekly_Sales"].mean()
sales_std = train_series["Weekly_Sales"].std()

train_sales = (train_series["Weekly_Sales"].values - sales_mean) / sales_std
val_sales = (val_series["Weekly_Sales"].values - sales_mean) / sales_std  # normalize using train stats

# Flat arrays and weights, kept for any cell that still reads them. The
# loaders below are NOT built from these concatenated arrays.
X_train = train_sales.reshape(-1, 1)
y_train = train_sales.reshape(-1)

X_val = val_sales.reshape(-1, 1)
y_val = val_sales.reshape(-1)

def weighted_mean_absolute_error(y_true, y_pred, weights):
    """Return sum(weights * |y_true - y_pred|) / sum(weights)."""
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

val_weights = np.where(val_series['IsHoliday'] == 1, 5, 1)
train_weights_split = np.where(train_series['IsHoliday'] == 1, 5, 1)


def build_windowed_dataset(frame, first_target_date=None):
    """Build sliding-window samples separately for every (Store, Dept) series.

    Windowing the concatenation of all series would create inputs that start
    in one store/department and end in another. Here each series gets its own
    WalmartDataset, and the results are concatenated.

    Args:
        frame: Rows sorted by Store, Dept, Date with 'Weekly_Sales' and
            'IsHoliday' columns.
        first_target_date: When given, only rows dated on or after it are used
            as targets; the ``lookback_window`` rows preceding them in the same
            series serve purely as input history.

    Returns:
        A ``torch.utils.data.ConcatDataset`` yielding ``(x, y, weight)``.

    Raises:
        ValueError: If no series is long enough to produce a single window.
    """
    # NOTE(review): assumes each series has one row per consecutive week —
    # TODO confirm; a gap would make a window span more calendar time.
    per_series = []
    for _, group in frame.groupby(["Store", "Dept"], sort=False):
        if first_target_date is not None:
            n_targets = int((group["Date"] >= first_target_date).sum())
            if n_targets == 0:
                continue
            # Keep just the targets plus the history needed to predict them.
            group = group.iloc[-(n_targets + lookback_window):]
        if len(group) <= lookback_window:
            continue  # too short to yield even one (window, target) pair

        sales = (group["Weekly_Sales"].to_numpy(dtype=float) - sales_mean) / sales_std
        weights = np.where(group["IsHoliday"].to_numpy() == 1, 5.0, 1.0)
        per_series.append(
            WalmartDataset(
                sales.reshape(-1, 1),
                sales,
                weights=weights,
                lookback_window=lookback_window,
            )
        )

    if not per_series:
        raise ValueError("No (Store, Dept) series is longer than lookback_window")
    return torch.utils.data.ConcatDataset(per_series)


# Training windows use pre-cutoff rows only. Validation targets are the
# post-cutoff rows, each predicted from the preceding weeks of its own series.
train_dataset = build_windowed_dataset(train_series)
val_dataset = build_windowed_dataset(single_series_df, first_target_date=validation_cutoff_date)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [65]:
%pip install -q dagshub


In [29]:
!pip install mlflow==2.7.1



In [27]:
import dagshub
# Initialise the DagsHub client for the repository below with the MLflow
# integration switched on (mlflow=True).
# NOTE(review): this cell does not read credentials from the environment; the
# recorded output shows an interactive OAuth authorisation prompt — confirm
# whether token-based authentication was intended.
# NOTE(review): the Colab badge at the top links to GitHub user 'abarb2022'
# while repo_owner here is 'abarb22' — confirm the owner name is correct.
dagshub.init(
    repo_owner='abarb22',
    repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
    mlflow=True
)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=812522d3-30ea-4ac8-bf67-68ea0039c902&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=d2a8217de374f01cf4740237fcbbd3010ac4e7322c7150a0ef1de3dc8b72032c




In [66]:
class ModelTrainer:
    """Train a forecasting model and track validation MSE and WMAE per epoch.

    Batches from the loaders are either ``(x, y, weights)`` or ``(x, y)``;
    missing weights are treated as all ones. Targets and predictions are in
    normalised units; ``sales_mean`` and ``sales_std`` are used to convert
    them back to sales units when the WMAE is computed.

    Args:
        model: ``nn.Module`` whose output has one value per sample.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        sales_mean: Mean used to normalise the targets.
        sales_std: Standard deviation used to normalise the targets.
        lr: Adam learning rate.
        num_epochs: Number of passes over ``train_loader``.
        use_wmae_loss: Train with the weighted MAE loss instead of MSE.

    Attributes:
        train_losses, val_losses, val_wmaes: One entry per completed epoch.
        val_preds, val_targets, val_weights_used: Normalised values collected
            during the most recent validation pass.
    """
    def __init__(self, model, train_loader, val_loader, sales_mean, sales_std,
                 lr=1e-3, num_epochs=100, use_wmae_loss=False):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.sales_mean = sales_mean
        self.sales_std = sales_std
        self.lr = lr
        self.num_epochs = num_epochs
        self.use_wmae_loss = use_wmae_loss

        # Loss function
        if use_wmae_loss:
            self.criterion = self.weighted_mae_loss
        else:
            self.criterion = nn.MSELoss()

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.train_losses = []
        self.val_losses = []
        self.val_wmaes = []
        self.val_preds = []
        self.val_targets = []
        self.val_weights_used = []

    def weighted_mae_loss(self, y_pred, y_true, weights):
        """Return the batch mean of ``weights * |y_pred - y_true|``.

        Note that this divides by the batch size, not by the sum of weights.
        """
        return torch.mean(weights * torch.abs(y_pred - y_true))

    @staticmethod
    def _unpack_batch(batch_data):
        """Return ``(x, y, weights)`` on ``device`` with ``y`` and ``weights`` flattened to 1-D."""
        if len(batch_data) == 3:  # (x, y, weights)
            x_batch, y_batch, weights_batch = batch_data
        else:  # (x, y) -> every sample counts equally
            x_batch, y_batch = batch_data
            weights_batch = torch.ones_like(y_batch)
        return (
            x_batch.to(device),
            y_batch.to(device).reshape(-1),
            weights_batch.to(device).reshape(-1),
        )

    def train(self):
        """Run ``num_epochs`` epochs, recording train loss, val loss and val WMAE after each."""
        for epoch in range(self.num_epochs):
            self.model.train()
            running_train_loss = 0.0
            train_samples = 0

            for batch_data in self.train_loader:
                x_batch, y_batch, weights_batch = self._unpack_batch(batch_data)
                self.optimizer.zero_grad()
                # reshape(-1) keeps a 1-D tensor even for a batch of one,
                # where a bare squeeze() would collapse to a 0-d scalar.
                output = self.model(x_batch).reshape(-1)

                if self.use_wmae_loss:
                    loss = self.weighted_mae_loss(output, y_batch, weights_batch)
                else:
                    loss = self.criterion(output, y_batch)

                loss.backward()
                self.optimizer.step()
                running_train_loss += loss.item() * x_batch.size(0)
                train_samples += x_batch.size(0)

            self.train_losses.append(running_train_loss / max(train_samples, 1))
            val_loss, val_wmae = self._validate()
            self.val_losses.append(val_loss)
            self.val_wmaes.append(val_wmae)

    def _validate(self):
        """Run one pass over ``val_loader``.

        Returns:
            ``(val_loss, val_wmae)``: sample-weighted mean MSE in normalised
            units, and WMAE in sales units.

        Raises:
            ValueError: If the validation loader yields no samples.
        """
        self.model.eval()
        running_val_loss = 0.0
        val_samples = 0
        self.val_preds.clear()
        self.val_targets.clear()
        self.val_weights_used.clear()
        mse = nn.MSELoss()

        with torch.no_grad():
            for batch_data in self.val_loader:
                x_batch, y_batch, weights_batch = self._unpack_batch(batch_data)
                output = self.model(x_batch).reshape(-1)

                # Regular loss for tracking
                loss = mse(output, y_batch)
                running_val_loss += loss.item() * x_batch.size(0)
                val_samples += x_batch.size(0)

                # Store predictions, targets, and weights
                self.val_preds.extend(output.cpu().numpy())
                self.val_targets.extend(y_batch.cpu().numpy())
                self.val_weights_used.extend(weights_batch.cpu().numpy())

        if val_samples == 0:
            raise ValueError("val_loader produced no samples")

        val_loss = running_val_loss / val_samples
        val_wmae = self._calculate_wmae()

        return val_loss, val_wmae

    def _calculate_wmae(self):
        """Return the WMAE of the last validation pass in sales units (NaN if the weights sum to zero)."""
        preds = np.asarray(self.val_preds, dtype=np.float64) * self.sales_std + self.sales_mean
        targets = np.asarray(self.val_targets, dtype=np.float64) * self.sales_std + self.sales_mean
        weights = np.asarray(self.val_weights_used, dtype=np.float64)

        total_weight = weights.sum()
        if total_weight == 0:
            return float('nan')
        return float(np.sum(weights * np.abs(targets - preds)) / total_weight)

    def evaluate(self):
        """Validate the current model and return its WMAE in sales units."""
        _, val_wmae = self._validate()
        return val_wmae

    def plot_losses(self, save_path=None):
        """Plot the per-epoch train and validation losses.

        Args:
            save_path: When given, the figure is also written to this path.
        """
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(range(1, len(self.train_losses) + 1), self.train_losses, label="Train loss")
        ax.plot(range(1, len(self.val_losses) + 1), self.val_losses, label="Validation MSE")
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Loss (normalised units)")
        ax.set_title("Training and validation loss")
        ax.legend()
        if save_path is not None:
            fig.savefig(save_path, bbox_inches="tight")
        plt.show()
        plt.close(fig)

In [None]:
import mlflow
import mlflow.pytorch  # imported explicitly because log_model below lives in this submodule

# Fresh model and trainer for this run, trained with the weighted MAE loss.
model_2 = NBeatsNet(input_size=lookback_window)

trainer_2 = ModelTrainer(
    model=model_2,
    train_loader=train_loader,
    val_loader=val_loader,
    sales_mean=sales_mean,
    sales_std=sales_std,
    lr=1e-3,
    num_epochs=100,
    use_wmae_loss=True
)

mlflow.set_experiment("N-BEATS_Training")

with mlflow.start_run(run_name="N-BEATS_HyperParameter_Tuning"):
    # Log the configuration first so it is recorded even if training fails.
    # Values are read from the objects actually used, not repeated as literals.
    mlflow.log_param("lookback_window", lookback_window)
    mlflow.log_param("epochs", trainer_2.num_epochs)
    mlflow.log_param("batch_size", train_loader.batch_size)
    mlflow.log_param("lr", trainer_2.lr)
    mlflow.log_param("use_wmae_loss", trainer_2.use_wmae_loss)

    trainer_2.train()
    wmae = trainer_2.evaluate()

    mlflow.log_metric("val_wmae", wmae)
    # Guard against an empty history (e.g. zero epochs) instead of raising IndexError.
    if trainer_2.val_losses:
        mlflow.log_metric("val_loss", trainer_2.val_losses[-1])

    # Log loss plot
    plot_path = "loss_plot.png"
    trainer_2.plot_losses(save_path=plot_path)
    mlflow.log_artifact(plot_path)

    # Log trained model
    mlflow.pytorch.log_model(trainer_2.model, artifact_path="model")