In [None]:
# Step 1: Import necessary libraries
import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from scripts.data_utils import HistopathologyDataModule
from scripts.model_utils import BaselineCNN
from scripts.config import BATCH_SIZE, LEARNING_RATE, EPOCHS, TARGET_SIZE

In [None]:
# Step 2: Create a notebook-level MLflow logger that writes to a local file store.
# Note: train_model() constructs its own logger internally, so this instance is
# not the one attached to the Trainer during training.
mlflow_logger = MLFlowLogger(
    tracking_uri="file:./experiments/mlruns",
    experiment_name="Histopathology Cancer Detection - Training",
)

In [None]:
# Step 3: Define the function for training the model
def train_model(
    experiment_name: str = "Histopathology Cancer Detection - Training",
    tracking_uri: str = "file:./experiments/mlruns",
    checkpoint_dir: str = "checkpoints/",
    patience: int = 5,
    seed=None,
):
    """
    Train the histopathology CNN with PyTorch Lightning and return the best checkpoint path.

    Builds an MLflow logger, the data module, the model and the callbacks
    (checkpointing on ``val_loss``, early stopping on ``val_loss``, learning-rate
    monitoring), runs ``Trainer.fit`` on a single device, and records the best
    checkpoint path as an MLflow param on the run.

    Args:
        experiment_name: MLflow experiment the run is logged under.
        tracking_uri: MLflow tracking URI (defaults to a local file store).
        checkpoint_dir: Directory where ModelCheckpoint writes checkpoints.
        patience: Number of validation checks without ``val_loss`` improvement
            before EarlyStopping stops training.
        seed: Optional integer seed. When given, ``pl.seed_everything`` is
            called before any component is created so runs are repeatable.
            ``None`` (default) leaves RNG state untouched.

    Returns:
        str: ``checkpoint_callback.best_model_path``. This is an empty string
        when no checkpoint was written, in which case nothing is logged to MLflow.
    """
    print("Starting training...")

    # Seed first, so model initialisation and data shuffling are both covered.
    if seed is not None:
        pl.seed_everything(seed, workers=True)

    # Local logger: keeps the function self-contained and independent of any
    # notebook-level logger that may or may not exist in the kernel.
    mlflow_logger = MLFlowLogger(
        experiment_name=experiment_name,
        tracking_uri=tracking_uri,
    )

    # Initialize DataModule
    data_module = HistopathologyDataModule(
        batch_size=BATCH_SIZE,
        target_size=TARGET_SIZE,
    )

    # Initialize model; the leading 3 is presumably the RGB channel count
    # (NOTE(review): confirm against BaselineCNN's expected input_shape).
    model = BaselineCNN(input_shape=(3, *TARGET_SIZE), learning_rate=LEARNING_RATE)

    # Callbacks: keep only the single best checkpoint by validation loss,
    # stop early when validation loss stalls, and log the LR once per epoch.
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath=checkpoint_dir,
        filename="best_model-{epoch:02d}-{val_loss:.4f}",
        save_top_k=1,
        mode="min",
    )
    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=patience,
        mode="min",
    )
    lr_monitor = LearningRateMonitor(logging_interval="epoch")

    # Single-device trainer (no parallelization); GPU when CUDA is available.
    trainer = pl.Trainer(
        max_epochs=EPOCHS,
        logger=mlflow_logger,
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        devices=1,
        callbacks=[checkpoint_callback, early_stopping, lr_monitor],
        log_every_n_steps=50,
    )

    # Train the model
    trainer.fit(model, data_module)

    best_model_path = checkpoint_callback.best_model_path
    if best_model_path:
        print(f"Best model saved at: {best_model_path}")

        # Record where the best checkpoint lives alongside the run's metrics.
        mlflow_logger.experiment.log_param(
            run_id=mlflow_logger.run_id,
            key="best_model_path",
            value=best_model_path,
        )
    else:
        # best_model_path stays empty when no checkpoint was ever written;
        # avoid reporting/logging a meaningless empty path.
        print("No checkpoint was saved; skipping best_model_path logging.")

    print("Training completed.")
    return best_model_path

In [None]:
# Step 4: Run training only when this notebook/script is the entry point.
# __name__ is "__main__" inside a notebook kernel and when executed as a script,
# but not when the exported file is imported as a module, so importing it
# no longer starts a training run as a side effect.
if __name__ == "__main__":
    train_model()