# ISIC 2018 Skin Lesion Classification - EfficientNet-B1

**Environment:** GitHub Codespaces (Linux)  
**Dataset:** ISIC 2018 Task 3 (7 classes)  
**Model:** EfficientNet-B1 with pretrained ImageNet weights


---
## Cell 1: Setup Environment

In [None]:
# Cell 1: Setup Environment

import sys
import os

# Work from the Codespaces checkout when it is present; otherwise keep the
# kernel's current directory and treat that as the repository root.
REPO_ROOT = "/workspaces/ISIC_2018"
if not os.path.exists(REPO_ROOT):
    REPO_ROOT = os.getcwd()
    print(f"⚠ Codespaces root not found, using current directory: {REPO_ROOT}")
else:
    os.chdir(REPO_ROOT)
    print(f"✓ Working directory: {os.getcwd()}")

# Put the repository root at the front of sys.path (once) so that the
# `src.*` imports used by later cells resolve against this checkout.
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)
print("✓ Python path updated")

# Report whether the `src` folder is visible from the working directory and,
# if so, list what it contains.
_src_dir = 'src'
if not os.path.exists(_src_dir):
    print("❌ 'src' folder NOT found!")
else:
    print("✓ 'src' folder found!")
    print("  Files:", os.listdir(_src_dir))

In [None]:
# Install the packages listed in requirements.txt (path is resolved relative
# to the current working directory). `%pip` is the IPython magic that runs pip
# for the environment of the running kernel; `-q` reduces pip's output.
%pip install -r requirements.txt -q

In [None]:
# Check PyTorch and CUDA
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    !nvidia-smi
else:
    print("⚠ Running on CPU")

---
## Cell 2: Unzip Dataset (Idempotent)

In [None]:
%%bash
set -e

# Resolve the repository root: prefer the Codespaces checkout, otherwise stay
# in the directory this cell was started from, so the cell also works outside
# Codespaces instead of aborting on a failed `cd` under `set -e`.
REPO_ROOT="/workspaces/ISIC_2018"
if [ ! -d "$REPO_ROOT" ]; then
    REPO_ROOT="$(pwd)"
    echo "⚠ Codespaces root not found, using current directory: $REPO_ROOT"
fi
cd "$REPO_ROOT"

# Create data directory
mkdir -p data/ISIC2018

# Marker file to check if already unzipped
MARKER="data/ISIC2018/.unzip_complete"

# normalize_dir TARGET CANDIDATE...
# Rename the first existing CANDIDATE directory to TARGET unless TARGET already
# exists. Written with `if` (not a bare `&&` chain) and an explicit `return 0`
# so that a missing candidate never makes the function call fail under `set -e`.
normalize_dir() {
    local target="$1"
    shift
    local candidate
    for candidate in "$@"; do
        if [ -d "$candidate" ] && [ ! -d "$target" ]; then
            mv "$candidate" "$target"
            echo "  Renamed $candidate -> $target"
        fi
    done
    return 0
}

if [ -f "$MARKER" ]; then
    echo "✓ Dataset already unzipped (marker file exists)"
    echo "  Delete $MARKER to re-extract"
else
    echo "Extracting dataset..."

    # Unzip each archive (-n = never overwrite existing files, -q = quiet).
    # Count successes so the marker is only written when something was extracted.
    extracted=0
    for zip in Training_Input.zip Validation_Input.zip Test_Input.zip \
               Training_GroundTruth.zip Validation_GroundTruth.zip Test_GroundTruth.zip; do
        if [ -f "$zip" ]; then
            echo "  Extracting $zip..."
            unzip -n -q "$zip" -d data/ISIC2018/
            extracted=$((extracted + 1))
        else
            echo "  ⚠ $zip not found, skipping"
        fi
    done

    echo ""
    echo "Normalizing folder names..."
    cd data/ISIC2018

    # Rename ISIC2018_Task3_* folders to simpler names if they exist.
    # The space-containing variants are quoted: unquoted they word-split into
    # "ISIC2018_Task3_<Split>" and a bare "Input", which would rename an
    # unrelated "Input" directory and never match the intended folder.
    normalize_dir Training_Input \
        ISIC2018_Task3_Training_Input "ISIC2018_Task3_Training Input"
    normalize_dir Validation_Input \
        ISIC2018_Task3_Validation_Input "ISIC2018_Task3_Validation Input"
    normalize_dir Test_Input \
        ISIC2018_Task3_Test_Input "ISIC2018_Task3_Test Input"
    normalize_dir Training_GroundTruth ISIC2018_Task3_Training_GroundTruth
    normalize_dir Validation_GroundTruth ISIC2018_Task3_Validation_GroundTruth
    normalize_dir Test_GroundTruth ISIC2018_Task3_Test_GroundTruth

    cd "$REPO_ROOT"

    # Only record completion when at least one archive was actually extracted;
    # otherwise a run with no zips present would block every later attempt.
    if [ "$extracted" -gt 0 ]; then
        touch "$MARKER"
        echo ""
        echo "✓ Extraction complete!"
    else
        echo ""
        echo "⚠ No archives were found in $REPO_ROOT; marker not written so this cell can be re-run."
    fi
fi

echo ""
echo "Directory structure:"
ls -la data/ISIC2018/ 2>/dev/null || echo "  (empty)"

---
## Cell 3: Verify Dataset

In [None]:
# Cell 3: Verify Dataset

from pathlib import Path

# Locate the dataset under the Codespaces checkout when it exists; otherwise
# look under the current working directory, mirroring the fallback the
# notebook uses when it sets up its working directory.
_codespaces_root = Path("/workspaces/ISIC_2018")
_repo_root = _codespaces_root if _codespaces_root.exists() else Path.cwd()
DATA_ROOT = _repo_root / "data" / "ISIC2018"

# Folders expected under DATA_ROOT: "*_Input" folders are checked for .jpg
# files, the remaining folders for .csv files.
required_dirs = [
    "Training_Input", "Validation_Input", "Test_Input",
    "Training_GroundTruth", "Validation_GroundTruth", "Test_GroundTruth"
]

print("=" * 60)
print("DATASET VERIFICATION")
print("=" * 60)

all_found = True
for d in required_dirs:
    path = DATA_ROOT / d
    if not path.is_dir():
        print(f"❌ {d}: NOT FOUND")
        all_found = False
        continue

    if "Input" in d:
        count = sum(1 for _ in path.glob("*.jpg"))
        if count:
            print(f"✓ {d}: {count} images")
        else:
            # An existing but empty folder is still a failed extraction.
            print(f"⚠ {d}: folder exists but contains no .jpg images")
            all_found = False
    else:
        csvs = sorted(path.glob("*.csv"))
        if csvs:
            print(f"✓ {d}: {len(csvs)} CSV(s) - {[c.name for c in csvs]}")
        else:
            print(f"⚠ {d}: folder exists but contains no CSV files")
            all_found = False

print("=" * 60)

if all_found:
    print("✓ All dataset files found!")
else:
    print("⚠ Some files missing. Check unzip step.")

# Show sample paths (sorted so the same two files are shown on every run)
print("\nSample image paths:")
for d in ["Training_Input", "Validation_Input", "Test_Input"]:
    path = DATA_ROOT / d
    if path.is_dir():
        for s in sorted(path.glob("*.jpg"))[:2]:
            print(f"  {s}")

---
## Cell 4: Import Modules

In [None]:
# Cell 4: Import Modules

import warnings
warnings.filterwarnings('ignore')

import torch
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# Import custom modules from src folder
from src.config import *
from src.data_processing import load_all_data
from src.dataset import ISICDataset
from src.transforms import get_train_transform, get_val_transform
from src.model import build_model, count_parameters, load_checkpoint, save_checkpoint, print_model_info
from src.train import train_one_epoch, create_dataloaders, get_optimizer, get_scheduler, get_criterion
from src.evaluate import evaluate, plot_confusion_matrix, print_classification_report, create_submission, visualize_predictions

# Apply the project's seeding helper with the configured SEED
# (both names arrive through the `src.config` star import).
set_seed(SEED)

# Print the active configuration using the project's own printer
print_config()

# Closing banner: confirm the imports succeeded and restate the runtime
for _line in (
    "\n✓ All modules imported successfully!",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
):
    print(_line)

---
## Cell 5: Load and Prepare Data

In [None]:
# Cell 5: Load and Prepare Data

# Unpack the seven values load_all_data returns for the configured CSV paths
# and image directories.
_loaded = load_all_data(
    PATH_TRAIN_CSV, PATH_VAL_CSV, PATH_TEST_CSV,
    DIR_TRAIN_IMG, DIR_VAL_IMG, DIR_TEST_IMG
)
(df_train, df_val, df_test,
 label2idx, idx2label, num_classes, use_weighted_sampler) = _loaded

# Upper-case aliases for the label mappings.
# NOTE(review): these bind names in the notebook namespace only; they do not
# set attributes on the src.config module — confirm whether checkpoint saving
# actually depends on them.
LABEL2IDX, IDX2LABEL = label2idx, idx2label

# The training split gets its own transform; validation and test share one.
train_transform = get_train_transform()
val_transform = get_val_transform()

# Build the three datasets in train / val / test order
train_dataset, val_dataset, test_dataset = (
    ISICDataset(_frame, transform=_tfm)
    for _frame, _tfm in (
        (df_train, train_transform),
        (df_val, val_transform),
        (df_test, val_transform),
    )
)

print("\n✓ Datasets created:")
for _split, _ds in (("Train", train_dataset), ("Val", val_dataset), ("Test", test_dataset)):
    print(f"  {_split}: {len(_ds)} samples")

# Wrap the datasets in loaders; use_weighted_sampler is forwarded unchanged
# from load_all_data.
train_loader, val_loader, test_loader = create_dataloaders(
    df_train, df_val, df_test,
    train_dataset, val_dataset, test_dataset,
    BATCH_SIZE, NUM_WORKERS, use_weighted_sampler
)

print("\n✓ Dataloaders created:")
for _split, _loader in (("Train", train_loader), ("Val", val_loader), ("Test", test_loader)):
    print(f"  {_split}: {len(_loader)} batches")

---
## Cell 6: Build Model and Training Components

In [None]:
# Cell 6: Build Model and Training Components

# Construct the network named by MODEL_NAME (pretrained=True) and move it to DEVICE
model = build_model(
    num_classes=num_classes,
    pretrained=True,
    model_name=MODEL_NAME,
).to(DEVICE)

# Print the project's model summary under the upper-cased model name
print_model_info(model, MODEL_NAME.upper())

# Optimizer, LR scheduler and loss, all driven by values from the config
optimizer = get_optimizer(model, LEARNING_RATE, WEIGHT_DECAY)
scheduler = get_scheduler(optimizer, NUM_EPOCHS, USE_COSINE_SCHEDULER)
criterion = get_criterion(USE_LABEL_SMOOTHING, LABEL_SMOOTHING)

# Gradient scaler for mixed precision: only created when CUDA is available.
# NOTE(review): `scaler` is not passed to train_model in the training cell —
# confirm whether it is still needed.
from torch.cuda.amp import GradScaler
if torch.cuda.is_available():
    scaler = GradScaler()
else:
    scaler = None

print("\n✓ Model and training components initialized!")

---
## Cell 7: Training

In [None]:
# Cell 7: Training

from src.engine import train_model
import src.config as config  # NOTE: Correct import (not scr.config)

# 1. Start with an empty per-epoch history and zeroed best-score tracking
_history_keys = (
    'epoch', 'train_loss', 'train_acc',
    'val_loss', 'val_acc', 'val_f1', 'val_bal_acc', 'lr',
)
history = {_key: [] for _key in _history_keys}
best_val_f1, best_epoch, start_epoch = 0.0, 0, 0

# 2. Resume from the checkpoint at MODEL_PATH when one is on disk.
# NOTE(review): the scheduler is not handed to load_checkpoint, so whether its
# state is restored on resume cannot be seen from this cell.
if not os.path.exists(MODEL_PATH):
    print("✓ Starting fresh training")
else:
    checkpoint = load_checkpoint(model, optimizer, MODEL_PATH, DEVICE)
    start_epoch = checkpoint.get('epoch', 0)
    best_val_f1 = checkpoint.get('best_val_f1', 0.0)
    best_epoch = checkpoint.get('best_epoch', 0)
    # Keep the fresh history unless the checkpoint carries its own
    history = checkpoint.get('history', history)
    print(f"✓ Resumed from epoch {start_epoch}")

# 3. Collect the settings the engine reads, straight from the config module,
# then attach the label mappings produced by the data-loading cell.
# NOTE(review): NUM_CLASSES comes from config, while the model was built with
# the `num_classes` returned by load_all_data — presumably equal; verify.
_engine_settings = (
    'VAL_EVERY_N_EPOCHS',
    'SAVE_EVERY_N_EPOCHS',
    'EARLY_STOP_PATIENCE',
    'USE_COSINE_SCHEDULER',
    'MODEL_PATH',
    'NUM_CLASSES',
)
cfg_dict = {_name: getattr(config, _name) for _name in _engine_settings}
cfg_dict['label2idx'] = label2idx
cfg_dict['idx2label'] = idx2label

# 4. Hand everything to the training engine; it returns the model together
# with the updated history and best-score bookkeeping.
_train_kwargs = dict(
    num_epochs=config.NUM_EPOCHS,
    device=DEVICE,
    config_dict=cfg_dict,
    start_epoch=start_epoch,
    best_val_f1=best_val_f1,
    best_epoch=best_epoch,
    history=history,
)
model, history, best_val_f1, best_epoch = train_model(
    model, train_loader, val_loader, criterion, optimizer, scheduler,
    **_train_kwargs
)

---
## Cell 8: Evaluate on Validation Set

In [None]:
# Cell 8: Evaluate on Validation Set

# Reload the checkpoint stored at MODEL_PATH into `model` (no optimizer passed)
print("Loading best model...")
checkpoint = load_checkpoint(model, None, MODEL_PATH, DEVICE)

# Run the evaluation helper over the validation loader and unpack its 8 outputs
print("\nEvaluating on validation set...")
_val_outputs = evaluate(model, val_loader, criterion, DEVICE)
(val_loss, val_acc, val_f1, val_bal_acc,
 val_preds, val_labels, val_probs, val_image_ids) = _val_outputs

print("\nValidation Results:")
for _metric, _score in (
    ("Loss", val_loss),
    ("Accuracy", val_acc),
    ("Macro F1", val_f1),
    ("Balanced Accuracy", val_bal_acc),
):
    print(f"  {_metric}: {_score:.4f}")

# Confusion matrix figure, written under DIR_FIGURES
_val_cm_path = os.path.join(DIR_FIGURES, 'val_confusion_matrix.png')
plot_confusion_matrix(
    val_labels, val_preds, idx2label,
    save_path=_val_cm_path,
    title='Validation Confusion Matrix'
)

# Per-class report
print_classification_report(val_labels, val_preds, idx2label)

---
## Cell 9: Evaluate on Test Set

In [None]:
# Cell 9: Evaluate on Test Set

# Run the evaluation helper over the test loader with the model in its current
# state and unpack its 8 outputs.
print("Evaluating on test set...")
_test_outputs = evaluate(model, test_loader, criterion, DEVICE)
(test_loss, test_acc, test_f1, test_bal_acc,
 test_preds, test_labels, test_probs, test_image_ids) = _test_outputs

print("\nTest Results:")
for _metric, _score in (
    ("Loss", test_loss),
    ("Accuracy", test_acc),
    ("Macro F1", test_f1),
    ("Balanced Accuracy", test_bal_acc),
):
    print(f"  {_metric}: {_score:.4f}")

# Confusion matrix figure, written under DIR_FIGURES
_test_cm_path = os.path.join(DIR_FIGURES, 'test_confusion_matrix.png')
plot_confusion_matrix(
    test_labels, test_preds, idx2label,
    save_path=_test_cm_path,
    title='Test Confusion Matrix'
)

# Per-class report
print_classification_report(test_labels, test_preds, idx2label)

# Write the predictions CSV under DIR_SUBMISSIONS
submission_path = os.path.join(DIR_SUBMISSIONS, 'test_predictions.csv')
submission_df = create_submission(
    test_image_ids, test_preds, test_probs, idx2label, submission_path
)

---
## Cell 10: Inference Demo (Optional)

In [None]:
# Cell 10: Inference Demo (Optional)

# Candidate demo images: one from the test folder, one from the validation folder
demo_paths = [
    os.path.join(DIR_TEST_IMG, "ISIC_0034524.jpg"),
    os.path.join(DIR_VAL_IMG, "ISIC_0034321.jpg"),
]

# Keep only the files that exist, paired with their image id
# (the file name with ".jpg" removed).
demo_images = [
    (_demo_path, os.path.basename(_demo_path).replace(".jpg", ""))
    for _demo_path in demo_paths
    if os.path.exists(_demo_path)
]

if not demo_images:
    print("\n⚠ Demo images not found, skipping inference demo.")
else:
    print("\n=== INFERENCE DEMO ===")

    # Render the top-3 predictions for each demo image and save the figure
    _demo_fig_path = os.path.join(DIR_FIGURES, 'inference_demo.png')
    visualize_predictions(
        model, demo_images, val_transform, DEVICE, idx2label,
        save_path=_demo_fig_path,
        top_k=3
    )

    print("\n✓ Inference demo completed!")

---
## Cell 11: Training History Visualization (Optional)

In [None]:
# Cell 11: Training History Visualization (Optional)

# Nothing to draw when there is no history or no recorded epochs.
_epochs = history.get('epoch', []) if history else []

if len(_epochs) == 0:
    print("\n⚠ No training history available to plot.")
else:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # Panels 1 and 2 share one layout: train vs. val curve for a metric.
    # NOTE(review): assumes every history list has one entry per recorded
    # epoch — confirm against the training engine.
    for _ax, _metric, _ylabel, _title in (
        (axes[0], 'loss', 'Loss', 'Training & Validation Loss'),
        (axes[1], 'acc', 'Accuracy', 'Training & Validation Accuracy'),
    ):
        _ax.plot(history['epoch'], history[f'train_{_metric}'], label='Train')
        _ax.plot(history['epoch'], history[f'val_{_metric}'], label='Val')
        _ax.set_xlabel('Epoch')
        _ax.set_ylabel(_ylabel)
        _ax.set_title(_title)
        _ax.legend()
        _ax.grid(True)

    # Panel 3: validation F1, with a marker at the best epoch when one is set
    _f1_ax = axes[2]
    _f1_ax.plot(history['epoch'], history['val_f1'], label='Val F1', color='green')
    if best_epoch > 0:
        _f1_ax.axvline(
            x=best_epoch, color='r', linestyle='--',
            label=f'Best (Epoch {best_epoch})'
        )
    _f1_ax.set_xlabel('Epoch')
    _f1_ax.set_ylabel('F1 Score')
    _f1_ax.set_title('Validation F1 Score')
    _f1_ax.legend()
    _f1_ax.grid(True)

    # Save under DIR_FIGURES, then display
    _history_fig_path = os.path.join(DIR_FIGURES, 'training_history.png')
    plt.tight_layout()
    plt.savefig(_history_fig_path, dpi=150, bbox_inches='tight')
    plt.show()

    print(f"\n✓ Training history saved: {_history_fig_path}")

---
## Summary

This notebook:
1. ✓ Sets up the environment for GitHub Codespaces
2. ✓ Extracts dataset from zip files (idempotent)
3. ✓ Verifies dataset structure
4. ✓ Loads data and creates dataloaders
5. ✓ Builds EfficientNet-B1 model
6. ✓ Trains with early stopping and checkpointing
7. ✓ Evaluates on validation and test sets
8. ✓ Creates submission file