<a href="https://colab.research.google.com/github/alaki22/Contacts/blob/main/model_experiment_N-BEATS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Downloading Kaggle data sets directly into Colab**

Install the kaggle python library

In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install -q kaggle



Mount Google Drive so that your Kaggle API credentials can be stored there and reused in future sessions.

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive so files stored
# there (the Kaggle credentials copied further down) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Make a directory for kaggle at the temporary instance location on Colab drive.

Download your kaggle API key (.json file). You can do this by going to your kaggle account page and clicking 'Create new API token' under the API section.

In [None]:
# -p makes the cell safe to re-run: no error if ~/.kaggle already exists.
! mkdir -p ~/.kaggle

Upload the json file to Google Drive and then copy to the temporary location.

In [None]:
# Copy the API token from the mounted Drive folder to ~/.kaggle/kaggle.json.
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

Restrict the file permissions so that only the owner can read and write the credentials file.

In [None]:
# Mode 600: read/write for the owner, no access for group or others.
! chmod 600 ~/.kaggle/kaggle.json

**Competitions and Datasets are the two types of Kaggle data**

**1. Download competition data**

If you get a 403 Forbidden error, you first need to click 'Late Submission' on the Kaggle page for that competition.

In [None]:
# Download the competition archive into the current working directory
# (/content in the recorded run).
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 417MB/s]


Unzip, in case the downloaded file is zipped. Refresh the files on the left hand side to update the view.

In [None]:
# -o overwrites existing files without prompting, so re-running the cell
# does not stall on unzip's interactive "replace?" question.
! unzip -o walmart-recruiting-store-sales-forecasting

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import gc # For garbage collection
# Printing options for the wide merged frames shown below: do not limit the
# number of displayed columns and do not wrap a frame's repr across lines.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

In [None]:

# Load the five competition tables. Only stores.csv is unzipped by the
# download step; the other four are read straight from their .zip archives.
stores = pd.read_csv('stores.csv')
train, features, sample, test = (
    pd.read_csv(archive_name)
    for archive_name in (
        "train.csv.zip",
        'features.csv.zip',
        'sampleSubmission.csv.zip',
        'test.csv.zip',
    )
)

In [None]:
# Convert 'Date' columns to datetime objects for easier manipulation
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# Merge features with train and test data.
# Note: 'IsHoliday' is present in both train/test and features.csv, so it is
# part of the join key. validate='many_to_one' makes pandas raise a MergeError
# if the right-hand table has duplicate keys, which would otherwise silently
# duplicate sales rows.
feature_keys = ['Store', 'Date', 'IsHoliday']
train_df = pd.merge(train, features, on=feature_keys, how='left', validate='many_to_one')
test_df = pd.merge(test, features, on=feature_keys, how='left', validate='many_to_one')

# Merge store information
train_df = pd.merge(train_df, stores, on='Store', how='left', validate='many_to_one')
test_df = pd.merge(test_df, stores, on='Store', how='left', validate='many_to_one')

# A left merge against unique keys must keep exactly one output row per input row.
assert len(train_df) == len(train), "merge changed the number of training rows"
assert len(test_df) == len(test), "merge changed the number of test rows"

print("\n--- Merged Train Data Head ---")
print(train_df.head())
print("\n--- Merged Test Data Head ---")
print(test_df.head())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
print("\n--- Merged Train Data Info ---")
train_df.info()
print("\n--- Merged Test Data Info ---")
test_df.info()

# Free up memory. After this the raw frames are gone, so this cell can only be
# re-run after re-running the loading cell.
del train, test, features, stores
gc.collect();  # trailing ';' hides the collected-object count in the cell output


--- Merged Train Data Head ---
   Store  Dept       Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment Type    Size
0      1     1 2010-02-05      24924.50      False        42.31       2.572        NaN        NaN        NaN        NaN        NaN  211.096358         8.106    A  151315
1      1     1 2010-02-12      46039.49       True        38.51       2.548        NaN        NaN        NaN        NaN        NaN  211.242170         8.106    A  151315
2      1     1 2010-02-19      41595.55      False        39.93       2.514        NaN        NaN        NaN        NaN        NaN  211.289143         8.106    A  151315
3      1     1 2010-02-26      19403.54      False        46.63       2.561        NaN        NaN        NaN        NaN        NaN  211.319643         8.106    A  151315
4      1     1 2010-03-05      21827.90      False        46.50       2.625        NaN        NaN        NaN        Na

13

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader
# NOTE(review): this filter has no category or module restriction, so it hides
# every warning, deprecation notices included — consider narrowing it.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility (NumPy's global RNG and PyTorch's RNG).
np.random.seed(42)
torch.manual_seed(42)

In [None]:
# Use the GPU when this runtime exposes one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
class WalmartDataset(Dataset):
    """Sliding-window dataset over one time-ordered series.

    Sample ``i`` pairs the window ``X[i : i + lookback_window]`` with the
    target ``y[i + lookback_window]``, i.e. the value right after the window.

    Args:
        X: array-like of inputs, converted with ``torch.FloatTensor``.
        y: array-like of targets, converted with ``torch.FloatTensor``.
        lookback_window: number of consecutive rows of ``X`` in each sample.
    """
    def __init__(self, X, y, lookback_window=52):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
        self.lookback_window = lookback_window

    def __len__(self):
        # Clamp at zero: a series shorter than the window has no samples.
        # Returning a negative number would make len() raise ValueError.
        return max(0, len(self.X) - self.lookback_window)

    def __getitem__(self, idx):
        """Return ``(window, target)`` for sample ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for dataset of length {n_samples}")
        window_end = idx + self.lookback_window
        return self.X[idx:window_end], self.y[window_end]

In [None]:
class NBeatsBlock(nn.Module):
    """Single N-BEATS block.

    The input passes through a ReLU MLP, is projected to ``theta`` and then
    mapped by two linear heads to a backcast (same width as the input) and a
    one-step forecast.

    Args:
        input_size: width of the flattened input window.
        theta_size: width of the ``theta`` projection.
        basis_function: only ``'generic'`` is implemented.
        layers: number of fully connected layers; the first layer is always
            created, so values below 1 still give one layer.
        layer_size: hidden width of the fully connected layers.

    Raises:
        ValueError: if ``basis_function`` is not supported. Without this check
            the block would be built without basis layers and ``forward``
            would fail later on undefined locals.
    """
    SUPPORTED_BASIS_FUNCTIONS = ('generic',)

    def __init__(self, input_size, theta_size, basis_function, layers, layer_size):
        super().__init__()
        if basis_function not in self.SUPPORTED_BASIS_FUNCTIONS:
            raise ValueError(
                f"Unsupported basis_function {basis_function!r}; "
                f"expected one of {self.SUPPORTED_BASIS_FUNCTIONS}"
            )
        self.input_size = input_size
        self.theta_size = theta_size
        self.basis_function = basis_function

        # Fully connected layers: input_size -> layer_size -> ... -> layer_size
        self.fc_layers = nn.ModuleList()
        self.fc_layers.append(nn.Linear(input_size, layer_size))
        for _ in range(layers - 1):
            self.fc_layers.append(nn.Linear(layer_size, layer_size))

        # Theta layer
        self.theta_layer = nn.Linear(layer_size, theta_size)

        # Generic basis: learned linear maps from theta
        self.backcast_basis = nn.Linear(theta_size, input_size)
        self.forecast_basis = nn.Linear(theta_size, 1)  # Forecasting 1 step ahead

    def forward(self, x):
        """Return ``(backcast, forecast)`` for a batch ``x`` of width ``input_size``."""
        hidden = x
        for layer in self.fc_layers:
            hidden = torch.relu(layer(hidden))

        theta = self.theta_layer(hidden)
        return self.backcast_basis(theta), self.forecast_basis(theta)

In [None]:
class NBeatsNet(nn.Module):
    """N-BEATS neural network built from generic blocks.

    ``stacks * blocks_per_stack`` blocks are held in one flat list. Each block
    receives the residual left by the previous blocks; its backcast is
    subtracted from the residual and its forecast is added to the running
    forecast.

    Args:
        input_size: width of the flattened input each block expects.
        stacks: number of stacks.
        blocks_per_stack: number of blocks in each stack.
        layers: fully connected layers per block.
        layer_size: hidden width of each block.
        theta_size: width of each block's theta projection.
    """
    def __init__(self, input_size, stacks=2, blocks_per_stack=3, layers=4, layer_size=256, theta_size=16):
        super().__init__()
        self.input_size = input_size
        self.stacks = stacks
        self.blocks_per_stack = blocks_per_stack

        # Create blocks (all identical in configuration)
        self.blocks = nn.ModuleList()
        for _ in range(stacks):
            for _ in range(blocks_per_stack):
                self.blocks.append(
                    NBeatsBlock(
                        input_size=input_size,
                        theta_size=theta_size,
                        basis_function='generic',
                        layers=layers,
                        layer_size=layer_size
                    )
                )

    def forward(self, x):
        """Return a ``(batch_size, 1)`` forecast for a batch of input windows."""
        batch_size = x.shape[0]
        # Flatten to (batch_size, features). reshape, unlike view, also works
        # for non-contiguous inputs such as transposed or sliced tensors.
        x = x.reshape(batch_size, -1)

        residual = x
        forecast = torch.zeros(batch_size, 1, device=x.device)

        for block in self.blocks:
            backcast, block_forecast = block(residual)
            residual = residual - backcast
            forecast = forecast + block_forecast

        return forecast

In [None]:
class MissingValueImputer(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to handle missing values for specific columns.

    - MarkDown columns: a ``<col>_was_missing`` 0/1 indicator column is added,
      then missing values are filled with 0.
    - Other specified numerical columns: forward-fill, then back-fill, then
      fall back to the column mean learned in ``fit``.

    Filling follows the row order of the frame passed to ``transform``; no
    grouping (for example per store) is applied.

    Parameters
    ----------
    markdown_cols : list of str, optional
        Defaults to ``MarkDown1`` .. ``MarkDown5``.
    numerical_cols_to_impute : list of str, optional
        Defaults to ``Temperature``, ``Fuel_Price``, ``CPI``, ``Unemployment``.
    """
    def __init__(self, markdown_cols=None, numerical_cols_to_impute=None):
        self.markdown_cols = markdown_cols if markdown_cols is not None else [f'MarkDown{i}' for i in range(1, 6)]
        self.numerical_cols_to_impute = numerical_cols_to_impute if numerical_cols_to_impute is not None else ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
        self.means = {} # To store means for fallback imputation during transform

    def fit(self, X, y=None):
        """Learn the fallback mean of every configured numerical column present in ``X``."""
        # Rebuilt from scratch so a refit never keeps means from an earlier frame.
        self.means = {
            col: X[col].mean()
            for col in self.numerical_cols_to_impute
            if col in X.columns
        }
        return self

    def transform(self, X):
        """Return an imputed copy of ``X``; ``X`` itself is not modified."""
        X_copy = X.copy()

        for col in self.markdown_cols:
            if col in X_copy.columns:
                # Record missingness before it is overwritten with 0.
                X_copy[f"{col}_was_missing"] = X_copy[col].isna().astype(int)
                X_copy[col] = X_copy[col].fillna(0)

        # Impute other numerical columns with ffill then bfill, fallback to mean
        for col in self.numerical_cols_to_impute:
            if col in X_copy.columns:
                # .ffill()/.bfill() replace fillna(method=...), which pandas
                # deprecated in 2.1.
                X_copy[col] = X_copy[col].ffill().bfill()
                # Fallback to mean if NaNs still exist (e.g., if all values were NaN in a column)
                if X_copy[col].isnull().any() and col in self.means:
                    X_copy[col] = X_copy[col].fillna(self.means[col])
        return X_copy

In [None]:
class NBEATSLabelEncoder(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to integer-encode categorical columns.

    Every configured column is cast to ``str`` and encoded with its own
    ``LabelEncoder``. The extra class ``'unknown'`` is appended after the
    fitted classes (unless it is already one of them), and any value not seen
    during ``fit`` is encoded as that class. Because classes are strings, codes
    follow string sort order: ``'10'`` sorts before ``'2'``.

    Parameters
    ----------
    categorical_cols : list of str, optional
        Defaults to ``['Store', 'Dept', 'Type']``.
    """
    UNKNOWN_TOKEN = 'unknown'

    def __init__(self, categorical_cols=None):
        self.categorical_cols = categorical_cols if categorical_cols is not None else ['Store', 'Dept', 'Type']
        self.label_encoders = {}

    def fit(self, X, y=None):
        """Fit one encoder per configured column present in ``X``."""
        self.label_encoders = {}
        for col in self.categorical_cols:
            if col in X.columns:
                encoder = LabelEncoder()
                encoder.fit(X[col].astype(str))
                # Register the fallback class here, appended last, so that
                # transform() never has to modify fitted state.
                if self.UNKNOWN_TOKEN not in encoder.classes_:
                    encoder.classes_ = np.array(list(encoder.classes_) + [self.UNKNOWN_TOKEN])
                self.label_encoders[col] = encoder
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by integer codes."""
        X_copy = X.copy()
        for col in self.categorical_cols:
            if col not in X_copy.columns or col not in self.label_encoders:
                continue
            classes = self.label_encoders[col].classes_
            code_by_label = {label: code for code, label in enumerate(classes)}
            # If the encoder has no 'unknown' class, use the code it would get
            # if appended (one past the last fitted class).
            unknown_code = code_by_label.get(self.UNKNOWN_TOKEN, len(classes))
            # Vectorized lookup: labels missing from the mapping become NaN and
            # are then replaced by the 'unknown' code.
            codes = X_copy[col].astype(str).map(code_by_label)
            X_copy[col] = codes.fillna(unknown_code).astype(np.int64)
        return X_copy

In [None]:
# Imputation first, then integer-encoding of the categorical columns.
preprocessing_pipeline = Pipeline([
    ('missing_value_imputer', MissingValueImputer()),
    ('label_encoder', NBEATSLabelEncoder())
])

print("Preparing training data...")
X_train = train_df.drop(['Weekly_Sales'], axis=1)
y_train = train_df['Weekly_Sales']

print("\n--- Applying Preprocessing Pipeline to Train Data ---")
X_train_processed = preprocessing_pipeline.fit_transform(X_train, y_train)

print("\n--- Applying Preprocessing Pipeline to Test Data ---")
# For the test set, we only call transform, as fit was done on the training data.
X_test_processed = preprocessing_pipeline.transform(test_df.drop(columns=['Id'], errors='ignore'))


# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
print("\nProcessed X_train_processed info:")
X_train_processed.info()
print("\nProcessed X_test_processed info:")
X_test_processed.info()

# Verify no missing values in processed data
print("\nMissing values in processed X_train_processed:\n", X_train_processed.isnull().sum().sum())
print("Missing values in processed X_test_processed:\n", X_test_processed.isnull().sum().sum())

In [None]:

validation_cutoff_date = pd.to_datetime('2012-09-01')
lookback_window = 52

# Take the series from the raw merged frame rather than X_train_processed:
# the processed frame has no 'Weekly_Sales' column (it was dropped to build the
# feature matrix), and its 'Store'/'Dept' values are label-encoder codes, so
# `== 1` there does not select store 1 / department 1.
single_series_df = train_df[(train_df["Store"] == 1) & (train_df["Dept"] == 1)].copy()
single_series_df = single_series_df.sort_values("Date")

# Rows are sorted by date, so the training rows are exactly the first n_train_rows.
n_train_rows = int((single_series_df["Date"] < validation_cutoff_date).sum())
assert n_train_rows > lookback_window, "need more than `lookback_window` training weeks"
assert n_train_rows < len(single_series_df), "no rows on or after the validation cutoff"

train_series = single_series_df.iloc[:n_train_rows]
# val_series deliberately starts `lookback_window` rows BEFORE the cutoff.
# Those rows are only input history: the first window covers exactly them, so
# every target drawn from val_series is dated on or after the cutoff. Using
# only the rows after the cutoff would leave fewer rows than one window and
# give an empty validation set.
val_series = single_series_df.iloc[n_train_rows - lookback_window:]

# Normalize based on training only
sales_mean = train_series["Weekly_Sales"].mean()
sales_std = train_series["Weekly_Sales"].std()
if not sales_std > 0:
    sales_std = 1.0  # constant series: avoid dividing by zero

train_sales = (train_series["Weekly_Sales"].values - sales_mean) / sales_std
val_sales = (val_series["Weekly_Sales"].values - sales_mean) / sales_std  # normalize using train stats

# Prepare dataset: the model input and the target are the same sales series.
X_train = train_sales.reshape(-1, 1)
y_train = train_sales.reshape(-1)

X_val = val_sales.reshape(-1, 1)
y_val = val_sales.reshape(-1)

train_dataset = WalmartDataset(X_train, y_train, lookback_window)
val_dataset = WalmartDataset(X_val, y_val, lookback_window)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

def weighted_mean_absolute_error(y_true, y_pred, weights):
    """Weighted mean absolute error: ``sum(w * |y_true - y_pred|) / sum(w)``.

    All three inputs are flattened to 1-D first, so a column vector such as an
    ``(n, 1)`` model output is compared element-wise with an ``(n,)`` target
    instead of being broadcast to an ``(n, n)`` matrix.

    Args:
        y_true: array-like of true values.
        y_pred: array-like of predictions, same number of elements as ``y_true``.
        weights: array-like of weights, same number of elements as ``y_true``.

    Returns:
        The weighted mean absolute error as a float.

    Raises:
        ValueError: if the inputs differ in size or the weights sum to zero.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    weights = np.asarray(weights, dtype=float).ravel()

    if not (y_true.shape == y_pred.shape == weights.shape):
        raise ValueError(
            "y_true, y_pred and weights must have the same number of elements, "
            f"got {y_true.size}, {y_pred.size} and {weights.size}"
        )

    total_weight = weights.sum()
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    return np.sum(weights * np.abs(y_true - y_pred)) / total_weight

# Per-row error weights: 5 where IsHoliday equals 1 (True), 1 everywhere else.
train_weights_split = np.where(train_series['IsHoliday'].eq(1), 5, 1)
val_weights = np.where(val_series['IsHoliday'].eq(1), 5, 1)

## **Experiment tracking setup (DagsHub + MLflow)**


In [None]:
# Install the DagsHub client quietly (-q); no version is pinned here.
%pip install -q dagshub


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.4/203.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# %pip (not !pip) installs into the environment of the running kernel;
# -q keeps the long dependency log out of the notebook.
%pip install -q mlflow==2.7.1

Collecting mlflow==2.7.1
  Downloading mlflow-2.7.1-py3-none-any.whl.metadata (12 kB)
Collecting cloudpickle<3 (from mlflow==2.7.1)
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting databricks-cli<1,>=0.8.7 (from mlflow==2.7.1)
  Downloading databricks_cli-0.18.0-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting protobuf<5,>=3.12.0 (from mlflow==2.7.1)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pytz<2024 (from mlflow==2.7.1)
  Downloading pytz-2023.4-py2.py3-none-any.whl.metadata (22 kB)
Collecting packaging<24 (from mlflow==2.7.1)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Collecting importlib-metadata!=4.7.0,<7,>=3.7.0 (from mlflow==2.7.1)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl.metadata (4.9 kB)
Collecting alembic!=1.10.0,<2 (from mlflow==2.7.1)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<7,>=4.0.0 (from mlflow==2.7.1)
  Downlo

In [None]:

import dagshub
# Initialise DagsHub for the repository named below with MLflow enabled.
# No credentials are passed in this call.
# NOTE(review): presumably dagshub.init handles authentication and points
# MLflow at this repo — confirm against the dagshub documentation.
dagshub.init(
    repo_owner='abarb22',
    repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
    mlflow=True
)



In [None]:
import mlflow
import mlflow.pytorch
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

class ModelTrainer:
    """Train a model with Adam and MSE loss, tracking train/validation loss per epoch.

    Training runs on normalized targets; ``evaluate`` converts predictions and
    targets back with ``sales_mean`` / ``sales_std`` before scoring.

    Args:
        model: ``nn.Module`` producing one value per sample.
        train_loader: loader of ``(x_batch, y_batch)`` pairs; must expose ``.dataset``.
        val_loader: loader of ``(x_batch, y_batch)`` pairs; must expose ``.dataset``.
        sales_mean: mean used to normalize the targets.
        sales_std: standard deviation used to normalize the targets.
        val_weights: weights for the validation metric. The last ``n`` entries
            must correspond, in order, to the ``n`` validation targets; any
            leading entries (e.g. lookback history) are ignored.
        lr: Adam learning rate.
        num_epochs: number of passes over ``train_loader``.
        device: optional torch device. Defaults to CUDA when available, else
            CPU, the same rule the notebook uses for its ``device`` variable.
    """
    def __init__(self, model, train_loader, val_loader, sales_mean, sales_std, val_weights, lr=1e-3, num_epochs=100, device=None):
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.device = device
        self.model = model.to(self.device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.sales_mean = sales_mean
        self.sales_std = sales_std
        self.val_weights = val_weights
        self.lr = lr
        self.num_epochs = num_epochs
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.train_losses = []
        self.val_losses = []
        self.val_preds = []    # normalized predictions from the latest epoch
        self.val_targets = []  # normalized targets from the latest epoch

    def train(self):
        """Run ``num_epochs`` epochs of training plus validation; returns ``self``."""
        for epoch in range(self.num_epochs):
            self.model.train()
            running_train_loss = 0.0

            for x_batch, y_batch in self.train_loader:
                x_batch = x_batch.to(self.device)
                y_batch = y_batch.to(self.device).reshape(-1)
                self.optimizer.zero_grad()
                # reshape(-1) keeps a 1-D tensor even for a batch of one sample.
                # squeeze() would return a 0-d tensor there, which cannot be
                # iterated when collecting predictions.
                output = self.model(x_batch).reshape(-1)
                loss = self.criterion(output, y_batch)
                loss.backward()
                self.optimizer.step()
                # criterion returns the batch mean; weight it by batch size so
                # the epoch value is a per-sample average.
                running_train_loss += loss.item() * x_batch.size(0)

            train_loss = running_train_loss / len(self.train_loader.dataset)
            self.train_losses.append(train_loss)

            # Validation
            self.model.eval()
            running_val_loss = 0.0
            self.val_preds.clear()
            self.val_targets.clear()

            with torch.no_grad():
                for x_batch, y_batch in self.val_loader:
                    x_batch = x_batch.to(self.device)
                    y_batch = y_batch.to(self.device).reshape(-1)
                    output = self.model(x_batch).reshape(-1)
                    loss = self.criterion(output, y_batch)
                    running_val_loss += loss.item() * x_batch.size(0)
                    self.val_preds.extend(output.cpu().numpy())
                    self.val_targets.extend(y_batch.cpu().numpy())

            val_loss = running_val_loss / len(self.val_loader.dataset)
            self.val_losses.append(val_loss)

            print(f"Epoch {epoch+1}/{self.num_epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        return self

    def evaluate(self):
        """Return the WMAE of the latest validation pass in original sales units.

        Uses the notebook's ``weighted_mean_absolute_error`` helper.

        Raises:
            RuntimeError: if there are no validation predictions yet.
            ValueError: if ``val_weights`` has fewer entries than predictions.
        """
        if len(self.val_preds) == 0:
            raise RuntimeError("No validation predictions available; call train() first.")

        denorm_preds = np.array(self.val_preds) * self.sales_std + self.sales_mean
        denorm_targets = np.array(self.val_targets) * self.sales_std + self.sales_mean

        weights = np.asarray(self.val_weights)
        n_preds = len(denorm_preds)
        if len(weights) < n_preds:
            raise ValueError(
                f"val_weights has {len(weights)} entries but there are {n_preds} validation predictions"
            )
        # Align from the end. This matches dropping a leading lookback prefix
        # whenever the prefix length is exactly the lookback, without relying
        # on a global `lookback_window`.
        wmae = weighted_mean_absolute_error(denorm_targets, denorm_preds, weights[-n_preds:])
        return wmae

    def plot_losses(self, save_path=None):
        """Plot train and validation loss per epoch; save to ``save_path`` if given."""
        plt.figure(figsize=(10, 5))
        plt.plot(self.train_losses, label='Train Loss')
        plt.plot(self.val_losses, label='Val Loss')
        plt.xlabel("Epoch")
        plt.ylabel("MSE loss")
        plt.legend()
        plt.title("Loss Curves")
        plt.grid(True)
        if save_path:
            # Save before show(): some backends clear the figure once it is shown.
            plt.savefig(save_path)
        plt.show()


In [None]:
store, dept = 1, 1  # just one combo to overfit


def create_sequences(series, window):
    """Split a 1-D array into sliding windows and next-step targets.

    Args:
        series: 1-D NumPy array ordered in time.
        window: number of consecutive values in each input window.

    Returns:
        ``(inputs, targets)``: ``inputs`` has shape ``(len(series) - window, window)``
        and ``targets[i]`` is the value right after ``inputs[i]``.

    Raises:
        ValueError: if ``series`` is not longer than ``window``.
    """
    n_samples = len(series) - window
    if n_samples <= 0:
        raise ValueError(f"series of length {len(series)} is too short for window {window}")
    inputs = np.stack([series[start:start + window] for start in range(n_samples)])
    targets = series[window:]
    return inputs, targets


# train_df is the merged training frame built earlier in the notebook; it still
# has the original Store/Dept ids and the 'Weekly_Sales' column.
df = train_df[(train_df['Store'] == store) & (train_df['Dept'] == dept)].copy()
df = df.sort_values('Date')

sales = df['Weekly_Sales'].values.astype(np.float32)
weights = df['IsHoliday'].apply(lambda x: 5 if x else 1).values.astype(np.float32)

sales_mean = sales.mean()
raw_sales_std = sales.std()
sales_std = raw_sales_std if raw_sales_std > 0 else 1  # constant series: avoid dividing by zero
sales_norm = (sales - sales_mean) / sales_std

X, y = create_sequences(sales_norm, lookback_window)

# shuffle=False keeps time order: the validation part is the tail of the series.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
val_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

model = NBeatsNet(input_size=lookback_window)

# The validation targets are the last len(y_val) weeks of the series. Pass their
# weights preceded by exactly `lookback_window` history weights, so the weights
# that remain after the lookback prefix line up one-to-one with the targets.
overfit_val_weights = weights[-(len(y_val) + lookback_window):]

trainer = ModelTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    sales_mean=sales_mean,
    sales_std=sales_std,
    val_weights=overfit_val_weights,
    lr=1e-3,
    num_epochs=100
)


In [None]:
with mlflow.start_run(run_name=f"Overfit_Store_{store}_Dept_{dept}"):
    trainer.train()
    wmae = trainer.evaluate()

    # Log params and metrics. The batch size is read from the loader that was
    # actually used, so the logged value cannot drift from the real one.
    mlflow.log_params({
        "store": store,
        "dept": dept,
        "lookback_window": lookback_window,
        "epochs": trainer.num_epochs,
        "batch_size": trainer.train_loader.batch_size,
    })
    mlflow.log_metrics({
        "val_wmae": wmae,
        "val_loss": trainer.val_losses[-1],
    })

    # Log loss plot
    plot_path = f"loss_plot_store{store}_dept{dept}.png"
    trainer.plot_losses(save_path=plot_path)
    mlflow.log_artifact(plot_path)

    # Log trained model
    mlflow.pytorch.log_model(trainer.model, artifact_path="model")