# Define the model class

In [None]:
# always depends :(

class PerfectModel(nn.Module):
    """Placeholder model skeleton to be filled in with a concrete architecture.

    The class only establishes the ``nn.Module`` structure: it registers no
    layers and ``forward`` performs no computation yet.
    """

    def __init__(self):
        # nn.Module has to be initialised before attributes are assigned on
        # the instance, otherwise its parameter/submodule bookkeeping is missing.
        super().__init__()
        # Slot for the wrapped network; stays None until a real module is assigned.
        self.module = None

    def forward(self):
        """Placeholder forward pass; returns None until it is implemented."""
        pass


# Hyperparameter Definition

In [None]:
# --- Optimisation schedule ---
LEARNING_RATE = 0.001    # Step size handed to the optimizer
EPOCHS = 500             # Upper bound on the number of training epochs
PATIENCE = 50            # Early-stopping patience, in epochs

# --- Network architecture ---
HIDDEN_LAYERS = 1        # Number of hidden layers
HIDDEN_SIZE = 256        # Units in each hidden layer
MODEL_TYPE = "LSTM"      # One of: RNN, LSTM, GRU
BIDIRECTIONAL = False    # Whether the recurrent layers run in both directions

# --- Regularisation strengths (0 disables the term) ---
DROPOUT_RATE = 0         # Probability of dropping a unit
L1_LAMBDA = 0            # Coefficient of the L1 penalty
L2_LAMBDA = 0            # Coefficient of the L2 penalty

# --- Loss function: mean squared error ---
criterion = nn.MSELoss()

# All the boilerplate code for model training

In [None]:
def train_one_epoch(model, train_loader, criterion, optimizer, scaler, device, l1_lambda=0, l2_lambda=0):
    """
    Perform one complete training epoch through the entire training dataset.
    Adapted for Regression (Forecasting).
    Calculates and reports RMSE.
    Optimizes on the provided criterion (assumed to be MSELoss).

    Args:
        model (nn.Module): The neural network model to train
        train_loader (DataLoader): PyTorch DataLoader containing training data batches
        criterion (nn.Module): Loss function (e.g., MSELoss)
        optimizer (torch.optim): Optimization algorithm (e.g., Adam, SGD)
        scaler (GradScaler): PyTorch's gradient scaler for mixed precision training
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)
        l1_lambda (float): Lambda for L1 regularization
        l2_lambda (float): Lambda for L2 regularization

    Returns:
        float: average_rmse - Training RMSE for this epoch
    """
    model.train()  # Set model to training mode

    running_mse_loss = 0.0

    # Iterate through training batches
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Move data to device (GPU/CPU)
        inputs, targets = inputs.to(device), targets.to(device)

        # Clear gradients from previous step
        optimizer.zero_grad(set_to_none=True)

        # Forward pass with mixed precision (if CUDA available)
        with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
            predictions = model(inputs)
            # Calculate the loss (e.G., MSE)
            loss = criterion(predictions, targets)

            # Add L1 and L2 regularization
            l1_norm = sum(p.abs().sum() for p in model.parameters())
            l2_norm = sum(p.pow(2).sum() for p in model.parameters())
            loss = loss + l1_lambda * l1_norm + l2_lambda * l2_norm

        # Backward pass with gradient scaling
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Accumulate metrics
        # We store the *squared* error (MSE) from the loss function
        running_mse_loss += loss.item() * inputs.size(0)

    # Calculate epoch metrics
    epoch_mse = running_mse_loss / len(train_loader.dataset)
    epoch_rmse = np.sqrt(epoch_mse) # Convert final MSE to RMSE for reporting

    return epoch_rmse


def validate_one_epoch(model, val_loader, criterion, device):
    """
    Perform one complete validation pass and return the validation RMSE.

    The model is switched to evaluation mode and no gradients are computed.
    The RMSE is the square root of the sample-weighted average of the
    per-batch criterion values, which assumes a mean-reduced squared-error
    criterion (e.g. the default ``nn.MSELoss()``).

    Args:
        model (nn.Module): The neural network model to evaluate (set to eval mode here)
        val_loader (DataLoader): Iterable yielding ``(inputs, targets)`` batches
        criterion (nn.Module): Loss function used to calculate validation loss (e.g., MSELoss)
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)

    Returns:
        float: Validation RMSE for this epoch

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    model.eval()  # Set model to evaluation mode

    running_sq_error = 0.0
    samples_seen = 0
    use_amp = device.type == 'cuda'  # Mixed precision is only enabled on CUDA

    # Disable gradient computation for validation
    with torch.no_grad():
        for inputs, targets in val_loader:
            # Move data to device
            inputs, targets = inputs.to(device), targets.to(device)
            batch_size = inputs.size(0)

            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                predictions = model(inputs)
                loss = criterion(predictions, targets)

            # Weight each batch by its size so unequal batches average correctly
            running_sq_error += loss.item() * batch_size
            samples_seen += batch_size

    if samples_seen == 0:
        raise ValueError("val_loader yielded no samples; cannot compute RMSE.")

    # Normalise by the samples actually evaluated rather than len(dataset),
    # which is wrong when the loader drops or subsamples data.
    epoch_mse = running_sq_error / samples_seen
    epoch_rmse = np.sqrt(epoch_mse)  # Convert final MSE to RMSE for reporting

    return epoch_rmse


def log_metrics_to_tensorboard(writer, epoch, train_rmse, val_rmse, model):
    """
    Write the epoch's RMSE values and per-parameter histograms to TensorBoard.

    Args:
        writer (SummaryWriter): Object exposing ``add_scalar`` and ``add_histogram``
        epoch (int): Step value attached to every logged entry
        train_rmse (float): Training RMSE for this epoch
        val_rmse (float): Validation RMSE for this epoch
        model (nn.Module): Model whose trainable parameters and gradients are logged
    """
    # Scalar curves, training first and validation second
    scalar_entries = (
        ('RMSE/Training', train_rmse),
        ('RMSE/Validation', val_rmse),
    )
    for tag, value in scalar_entries:
        writer.add_scalar(tag, value, epoch)

    # Histograms for every trainable parameter
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue

        # Empty tensors are skipped rather than passed to add_histogram
        if param.numel() > 0:
            writer.add_histogram(f'{name}/weights', param.data, epoch)

        grad = param.grad
        if grad is None or grad.numel() == 0:
            continue

        # Gradients containing NaN/Inf values are not logged
        if torch.isfinite(grad).all():
            writer.add_histogram(f'{name}/gradients', grad.data, epoch)


def fit(model, train_loader, val_loader, epochs, criterion, optimizer, scaler, device,
        l1_lambda=0, l2_lambda=0, patience=0, scheduler=None, # Added scheduler parameter
        evaluation_metric="val_rmse", mode='min', # Monitors val_rmse and minimizes
        restore_best_weights=True, writer=None, verbose=10, experiment_name=""):
    """
    Train the model, validate after every epoch, and optionally stop early.

    Each epoch runs ``train_one_epoch`` and ``validate_one_epoch``, steps the
    scheduler (if any), records the metrics, and logs them to TensorBoard when
    a writer is supplied. With ``patience > 0`` the monitored metric drives
    early stopping and the best weights are checkpointed to
    ``models/{experiment_name}_model.pt``; with ``patience == 0`` the final
    weights are saved to the same path after the last epoch.

    Args:
        model (nn.Module): The neural network model to train
        train_loader (DataLoader): PyTorch DataLoader containing training data batches
        val_loader (DataLoader): PyTorch DataLoader containing validation data batches
        epochs (int): Number of training epochs
        criterion (nn.Module): Loss function (e.g., MSELoss)
        optimizer (torch.optim): Optimization algorithm (e.g., Adam, SGD)
        scaler (GradScaler): PyTorch's gradient scaler for mixed precision training
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)
        l1_lambda (float): L1 regularization coefficient (default: 0)
        l2_lambda (float): L2 regularization coefficient (default: 0)
        patience (int): Number of epochs to wait for improvement before early stopping;
            0 disables early stopping (default: 0)
        scheduler (torch.optim.lr_scheduler, optional): Learning rate scheduler (default: None)
        evaluation_metric (str): Key of the training history to monitor for early
            stopping (default: "val_rmse")
        mode (str): 'min' to minimize the metric; any other value maximizes it (default: 'min')
        restore_best_weights (bool): Whether to reload the best checkpoint after training;
            only has an effect when ``patience > 0`` (default: True)
        writer (SummaryWriter, optional): TensorBoard SummaryWriter object for logging;
            it is closed before returning (default: None)
        verbose (int, optional): Print progress every ``verbose`` epochs; 0 is silent (default: 10)
        experiment_name (str, optional): Experiment name for saving models (default: "")

    Returns:
        tuple: (model, training_history) - Trained model and metrics history

    Raises:
        ValueError: If early stopping is enabled and ``evaluation_metric`` is not
            a key of the training history.
    """
    # Initialize metrics tracking
    training_history = {
        'train_rmse': [], 'val_rmse': [], 'lr': []
    }

    early_stopping = patience > 0

    # Fail before training starts instead of after the first full epoch
    if early_stopping and evaluation_metric not in training_history:
        raise ValueError(
            f"Unknown evaluation_metric {evaluation_metric!r}; "
            f"expected one of {sorted(training_history)}."
        )

    # Every configuration writes a checkpoint (best weights or final weights),
    # so the directory is needed regardless of the early-stopping settings.
    if not os.path.isdir("models"):
        os.makedirs("models", exist_ok=True)
        print("Created 'models' directory for saving model checkpoints.")

    model_path = os.path.join("models", f"{experiment_name}_model.pt")

    # Early-stopping state; best_epoch == 0 means "no checkpoint written this run"
    patience_counter = 0
    best_metric = float('inf') if mode == 'min' else float('-inf')
    best_epoch = 0

    print(f"Training {epochs} epochs...")

    # Main training loop: iterate through epochs
    for epoch in range(1, epochs + 1):

        # Forward pass through training data, compute gradients, update weights
        train_rmse = train_one_epoch(
            model, train_loader, criterion, optimizer, scaler, device, l1_lambda, l2_lambda
        )

        # Evaluate model on validation data without updating weights
        val_rmse = validate_one_epoch(
            model, val_loader, criterion, device
        )

        # Step the scheduler if provided; ReduceLROnPlateau needs the monitored value
        if scheduler is not None:
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_rmse)
            else:
                scheduler.step()

        # Read after the scheduler step, so this is the rate the next epoch will use
        current_lr = optimizer.param_groups[0]['lr']

        # Store metrics for plotting and analysis
        training_history['train_rmse'].append(train_rmse)
        training_history['val_rmse'].append(val_rmse)
        training_history['lr'].append(current_lr)

        # Write metrics to TensorBoard for visualization
        if writer is not None:
            log_metrics_to_tensorboard(
                writer, epoch, train_rmse, val_rmse, model
            )
            writer.add_scalar('Learning Rate', current_lr, epoch)

        # Print progress every N epochs or on first epoch
        if verbose > 0 and (epoch % verbose == 0 or epoch == 1):
            print(f"Epoch {epoch:3d}/{epochs} | "
                  f"Train: RMSE={train_rmse:.4f} | "
                  f"Val: RMSE={val_rmse:.4f} | "
                  f"LR: {current_lr:.6f}")

        # Early stopping logic: monitor metric and save best model
        if early_stopping:
            current_metric = training_history[evaluation_metric][-1]
            is_improvement = (current_metric < best_metric) if mode == 'min' else (current_metric > best_metric)

            if is_improvement:
                best_metric = current_metric
                best_epoch = epoch
                torch.save(model.state_dict(), model_path)
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping triggered after {epoch} epochs.")
                    break

    # Restore best model weights if early stopping was used
    if restore_best_weights and early_stopping:
        if best_epoch == 0:
            # Nothing was saved in this run (e.g. the metric was never finite);
            # loading model_path here could pick up a stale file from an earlier run.
            print(f"Warning: no epoch improved {evaluation_metric}; no checkpoint was saved. Using last model.")
        else:
            try:
                # map_location keeps the restored tensors on the training device
                model.load_state_dict(torch.load(model_path, map_location=device))
                print(f"Best model restored from epoch {best_epoch} with {evaluation_metric} {best_metric:.4f}")
            except FileNotFoundError:
                print(f"Warning: Could not find best model checkpoint at {model_path}. Using last model.")

    # Save final model if no early stopping
    if patience == 0:
        torch.save(model.state_dict(), model_path)

    # Close TensorBoard writer
    if writer is not None:
        writer.close()

    return model, training_history

In [None]:
# NOTE(review): template placeholder - `model_name...` is not valid Python and has to be
# replaced with the actual model construction before this cell can run.
# The optimizer and training cells refer to `rnn_model`, not `model`; presumably this
# assignment should bind that name - confirm when filling it in.
model = model_name...

In [None]:
# AdamW over the model's parameters; L2_LAMBDA is passed as its weight-decay coefficient
optimizer = torch.optim.AdamW(
    rnn_model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=L2_LAMBDA,
)

# Gradient scaler for mixed-precision training; only enabled when running on CUDA
scaler = torch.amp.GradScaler(
    enabled=(device.type == 'cuda'),
)

In [None]:
%%time
# Train model and track training history

# Learning rate scheduler: cut the rate when the monitored metric stops improving.
# fit() steps a ReduceLROnPlateau scheduler with the validation RMSE.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',                        # The monitored metric (RMSE) should decrease
    factor=0.1,                        # Multiply the learning rate by this on a plateau
    patience=max(10, PATIENCE // 2),   # Epochs without improvement before reducing; at least 10
    min_lr=1e-6,                       # Lower bound for the learning rate
)

rnn_model, training_history = fit(
    model=rnn_model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=EPOCHS,
    criterion=criterion,
    optimizer=optimizer,
    scaler=scaler,
    device=device,
    l1_lambda=L1_LAMBDA,
    # L2_LAMBDA is already applied through the optimizer's weight_decay;
    # passing it here as well would regularise the weights twice.
    l2_lambda=0,
    patience=PATIENCE,
    evaluation_metric="val_rmse",
    mode='min',
    restore_best_weights=True,
    writer=writer,  # Pass None to disable TensorBoard logging
    verbose=1,
    experiment_name="direct_lstm_forecaster_50",
    scheduler=scheduler,  # Pass the scheduler to the fit function
)