# Notebook to train the model for Histopathology Cancer Detection

In [None]:
# Import libraries
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from PIL import Image
import mlflow
import mlflow.pytorch

In [None]:
# Import custom scripts from the GitHub repository
!git clone https://github.com/astoreyai/Histopathology-Cancer-Detection.git
from Histopathology-Cancer-Detection.scripts.data_utils import HistologyDataset
from Histopathology-Cancer-Detection.scripts.model_utils import BaselineCNN
from Histopathology-Cancer-Detection.scripts.train_utils import train_one_epoch, validate
from Histopathology-Cancer-Detection.scripts.config import TRAIN_DIR, LABELS_FILE, BATCH_SIZE, LEARNING_RATE, EPOCHS, TARGET_SIZE

In [None]:
# Set device: use CUDA when PyTorch reports it as available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Data preprocessing transformations

# Validation pipeline: deterministic resize, tensor conversion and
# per-channel normalization only (no augmentation).
val_transform = transforms.Compose(
    [
        transforms.Resize(size=TARGET_SIZE),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

# Training pipeline: same resize and normalization as validation, with random
# flip / rotation / colour augmentation applied before tensor conversion.
train_transform = transforms.Compose(
    [
        transforms.Resize(size=TARGET_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(degrees=20),
        transforms.ColorJitter(0.2, 0.2),  # brightness, contrast
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

In [None]:
# Load the label table and carve out a validation split.
labels_df = pd.read_csv(LABELS_FILE)
train_df, val_df = train_test_split(
    labels_df,
    test_size=0.2,  # hold out 20% of the rows for validation
    random_state=42,  # fixed seed so the split is reproducible
    stratify=labels_df["label"],  # keep the label ratio equal in both splits
)

# Training data: augmented transform, reshuffled every epoch.
train_dataset = HistologyDataset(
    dataframe=train_df,
    img_dir=TRAIN_DIR,
    transform=train_transform,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)

# Validation data: read from the same image directory, fixed order.
val_dataset = HistologyDataset(
    dataframe=val_df,
    img_dir=TRAIN_DIR,
    transform=val_transform,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)

In [None]:
# Initialize loss function, model, and optimizer

# NOTE(review): nn.BCELoss expects probabilities in [0, 1]. Confirm that
# BaselineCNN ends in a sigmoid; if it emits raw logits,
# nn.BCEWithLogitsLoss would be required instead.
criterion = nn.BCELoss()

# Build the network and move its parameters to the selected device.
model = BaselineCNN()
model = model.to(device)

# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Set up MLFlow tracking
# Runs are stored on the local filesystem under ./experiments. The tracking
# URI is set first so the experiment below is created/looked up in that store.
mlflow.set_tracking_uri(uri="file:./experiments")
mlflow.set_experiment(experiment_name="Histopathology Cancer Detection")

In [None]:
# Training loop with MLFlow tracking
with mlflow.start_run(run_name="Baseline CNN Training"):
    # Record the hyperparameters of this run in one call.
    mlflow.log_params(
        {
            "batch_size": BATCH_SIZE,
            "learning_rate": LEARNING_RATE,
            "epochs": EPOCHS,
        }
    )

    for epoch in range(EPOCHS):
        # One optimisation pass over the training loader, then evaluation on
        # the validation loader.
        train_loss = train_one_epoch(
            model, train_loader, optimizer, criterion, device
        )
        val_loss, val_auc = validate(model, val_loader, criterion, device)

        # Log this epoch's metrics to MLFlow, keyed by the zero-based epoch.
        mlflow.log_metrics(
            {
                "train_loss": train_loss,
                "val_loss": val_loss,
                "val_auc": val_auc,
            },
            step=epoch,
        )

        # Console progress line uses a one-based epoch number.
        print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val AUC: {val_auc:.4f}")

    # After the final epoch, store the model as an artifact of this run.
    mlflow.pytorch.log_model(model, "model")

In [None]:
# Save model locally
# Only the state_dict (parameter/buffer tensors) is written, not the module
# object itself.
torch.save(obj=model.state_dict(), f="baseline_cnn.pth")
print("Model saved to baseline_cnn.pth")