# Land Cover Classification using Vision Transformers (SegFormer)

Optimized for NVIDIA P100 GPU

## 1. Setup Environment

Install and import necessary libraries.

**Note:** This notebook is optimized for NVIDIA P100 GPU with 16GB memory.

In [1]:
# Install base libraries with optimizations for full system utilization
!pip install --upgrade pip
!pip install -q transformers datasets evaluate accelerate Pillow torch torchvision torchaudio numpy matplotlib seaborn scikit-learn
# Install additional performance optimization libraries
!pip install -q ninja psutil gputil
!pip install -q pyarrow fastparquet # For faster data serialization
!pip install -q pytables h5py # For efficient data storage
!pip install -q albumentations # For optimized image augmentations

# Install NVIDIA Apex for mixed precision training optimization
!pip install -q ninja
!git clone --recursive https://github.com/NVIDIA/apex
!cd apex && pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./

Collecting pip
  Downloading pip-25.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m125.9 MB/s[0m eta [36m0:00:00[0m00:01[0

In [2]:
# Set environment variables for maximum performance
import os

# Maximum thread optimization
os.environ['OMP_NUM_THREADS'] = str(os.cpu_count())
os.environ['MKL_NUM_THREADS'] = str(os.cpu_count())
os.environ['NUMEXPR_NUM_THREADS'] = str(os.cpu_count())
os.environ['NUMEXPR_MAX_THREADS'] = str(os.cpu_count())
os.environ['OPENBLAS_NUM_THREADS'] = str(os.cpu_count()) 

# GPU optimization
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'false'  # Allocate all memory immediately
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

# Enable TF32 on Ampere and newer GPUs
os.environ['TORCH_ALLOW_TF32_CUBLAS_OVERRIDE'] = '1'
os.environ['TORCH_CUDNN_V8_API_ENABLED'] = '1'

print("Environment configured for maximum performance")

Environment configured for maximum performance


In [3]:
import os
import numpy as np
import pandas as pd
import torch
import psutil
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from datasets import Dataset, DatasetDict, Image as HFImage
from transformers import SegformerForSemanticSegmentation, SegformerFeatureExtractor, Trainer, TrainingArguments
import evaluate
from huggingface_hub import notebook_login
import random
import gc

# Maximum GPU optimization
torch.backends.cudnn.benchmark = True  # Enable cuDNN auto-tuner
torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for faster math
torch.backends.cudnn.deterministic = False  # Non-deterministic mode for better performance
torch.backends.cudnn.enabled = True  # Ensure cuDNN is used

# Set numpy to use all cores
np.set_printoptions(precision=4, suppress=True)

# Garbage collect to free memory
gc.collect()

# Check system resources
cpu_count = os.cpu_count()
ram_gb = psutil.virtual_memory().total / (1024 ** 3)

print(f"System resources: {cpu_count} CPU cores, {ram_gb:.1f}GB RAM")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    # Display detailed GPU information
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"CUDA Capability: {torch.cuda.get_device_capability()}")
    
    # Configure PyTorch to use all available GPU memory
    for i in range(torch.cuda.device_count()):
        torch.cuda.set_device(i)
        torch.cuda.empty_cache()
        if hasattr(torch.cuda, 'memory_reserved'):
            print(f"GPU {i}: Memory reserved: {torch.cuda.memory_reserved(i) / 1e9:.2f}GB")
            print(f"GPU {i}: Memory allocated: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB")
        if hasattr(torch.cuda, 'get_device_properties'):
            print(f"GPU {i}: Name: {torch.cuda.get_device_properties(i).name}")
else:
    print("Warning: No GPU detected. Running on CPU will be significantly slower.")

2025-04-27 13:15:15.768122: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745759715.946254      71 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745759715.996995      71 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


System resources: 4 CPU cores, 31.4GB RAM
PyTorch version: 2.5.1+cu124
CUDA available: True
CUDA version: 12.4
GPU: Tesla P100-PCIE-16GB
Number of GPUs: 1
GPU Memory: 17.06 GB
CUDA Capability: (6, 0)
GPU 0: Memory reserved: 0.00GB
GPU 0: Memory allocated: 0.00GB
GPU 0: Name: Tesla P100-PCIE-16GB


## 2. Load Dataset

Load the DeepGlobe Land Cover Classification dataset. 
You might need to download it from Kaggle first: https://www.kaggle.com/datasets/balraj98/deepglobe-land-cover-classification-dataset

**Important:** Ensure the dataset is placed in the Kaggle input directory.

In [4]:
# Define the standard Kaggle input directory
dataset_base_dir = '/kaggle/input/deepglobe-land-cover-classification-dataset'
# Check for different directory structures
deepglobe_subdir = os.path.join(dataset_base_dir, 'deepglobe')
metadata_path = os.path.join(deepglobe_subdir, 'metadata.csv')

# Check if the dataset path exists in either structure
if os.path.exists(dataset_base_dir):
    print(f"Dataset base directory found at: {dataset_base_dir}")
    
    # Check if we have the expected subdirectory structure with metadata
    if os.path.exists(deepglobe_subdir) and os.path.exists(metadata_path):
        print(f"Using metadata-based loading from: {metadata_path}")
        metadata_df = pd.read_csv(metadata_path)
        # Prepend the root directory to the paths in the CSV
        metadata_df['sat_image_path'] = metadata_df['sat_image_path'].apply(lambda x: os.path.join(deepglobe_subdir, x))
        metadata_df['mask_path'] = metadata_df['mask_path'].apply(lambda x: os.path.join(deepglobe_subdir, x))
        
        # Define function to load data paths from metadata
        def load_data_paths(df, split):
            split_df = df[df['split'] == split]
            image_paths = split_df['sat_image_path'].tolist()
            mask_paths = split_df['mask_path'].tolist()
            # Verify files exist
            image_paths = [p for p in image_paths if os.path.exists(p)]
            mask_paths = [p for p in mask_paths if os.path.exists(p)]
            print(f"Found {len(image_paths)} images and {len(mask_paths)} masks for split '{split}'.")
            return image_paths, mask_paths
    else:
        # No metadata.csv found - try direct directory structure
        print("No metadata.csv found. Using direct directory structure.")
        metadata_df = None
        
        # Function to find image-mask pairs in directories
        def find_image_mask_pairs(image_dir, mask_dir=None):
            """Find matching images and masks in the given directories."""
            if not os.path.exists(image_dir):
                print(f"Warning: Image directory {image_dir} not found.")
                return [], []
                
            # If mask_dir is not specified, try to infer it
            if mask_dir is None:
                # Check common mask directory naming patterns
                possible_mask_dirs = [
                    image_dir.replace('sat', 'mask').replace('image', 'mask'),
                    os.path.join(os.path.dirname(image_dir), 'mask'),
                    os.path.join(os.path.dirname(image_dir), 'masks'),
                    os.path.join(os.path.dirname(image_dir), 'label'),
                    os.path.join(os.path.dirname(image_dir), 'labels')
                ]
                
                # Try standard pattern where masks are in same directory with different extension
                mask_dir = image_dir  # Default: assume masks are in same directory
                
                # Check if any of the possible mask directories exist
                for dir_path in possible_mask_dirs:
                    if os.path.exists(dir_path) and dir_path != image_dir:
                        mask_dir = dir_path
                        print(f"Found mask directory at: {mask_dir}")
                        break
            
            # Get image files with common extensions
            image_files = []
            for ext in [".jpg", ".jpeg", ".png", ".tif", ".tiff"]:
                image_files.extend(glob.glob(os.path.join(image_dir, f"*{ext}")))
            
            if not image_files:
                print(f"No image files found in {image_dir}")
                return [], []
                
            print(f"Found {len(image_files)} potential image files in {image_dir}")
            
            # Find matching mask files
            image_paths = []
            mask_paths = []
            
            for img_path in image_files:
                img_name = os.path.basename(img_path)
                img_stem = os.path.splitext(img_name)[0]
                
                # Try different possible mask naming patterns
                mask_patterns = [
                    os.path.join(mask_dir, f"{img_stem}*.png"),
                    os.path.join(mask_dir, f"{img_stem}*.jpg"),
                    os.path.join(mask_dir, f"{img_stem.replace('sat', 'mask')}*.png"),
                    os.path.join(mask_dir, f"{img_stem}_mask.*"),
                    os.path.join(mask_dir, f"{img_stem}_label.*")
                ]
                
                # Try to find matching mask
                mask_found = False
                for pattern in mask_patterns:
                    matching_masks = glob.glob(pattern)
                    if matching_masks:
                        mask_path = matching_masks[0]  # Take the first match
                        image_paths.append(img_path)
                        mask_paths.append(mask_path)
                        mask_found = True
                        break
                        
            print(f"Found {len(image_paths)} image-mask pairs.")
            return image_paths, mask_paths

        # Function to load data paths directly from directories
        def load_data_paths(df, split):
            """Load data paths for the given split using directory structure."""
            # Check common directory naming patterns for this split
            possible_dirs = [
                os.path.join(dataset_base_dir, split),
                os.path.join(dataset_base_dir, f"{split}_set")
            ]
            
            split_dir = None
            for dir_path in possible_dirs:
                if os.path.exists(dir_path):
                    split_dir = dir_path
                    break
                    
            if split_dir is None:
                print(f"No directory found for split '{split}'")
                return [], []
                
            print(f"Loading {split} data from: {split_dir}")
            return find_image_mask_pairs(split_dir)
else:
    print(f"Error: Dataset directory not found at {dataset_base_dir}")
    print("Please ensure the DeepGlobe dataset is correctly placed in the Kaggle input directory.")
    metadata_df = pd.DataFrame(columns=['image_id', 'split', 'sat_image_path', 'mask_path']) # Dummy df

# Import glob for file pattern matching
import glob

# Define class names and their corresponding IDs
id2label = {
    0: 'urban_land',
    1: 'agriculture_land',
    2: 'rangeland',
    3: 'forest_land',
    4: 'water',
    5: 'barren_land',
    6: 'unknown'
}
label2id = {v: k for k, v in id2label.items()}
num_labels = len(id2label)
class_names = list(id2label.values())

# Load data for each split
train_image_paths, train_mask_paths = load_data_paths(metadata_df, 'train')
val_image_paths, val_mask_paths = load_data_paths(metadata_df, 'valid')
test_image_paths, test_mask_paths = load_data_paths(metadata_df, 'test')

# If no validation set found, use a portion of train or test as validation
if not val_image_paths and train_image_paths:
    # Use 15% of train data as validation
    val_size = max(1, int(len(train_image_paths) * 0.15))
    val_indices = random.sample(range(len(train_image_paths)), val_size)
    val_image_paths = [train_image_paths[i] for i in val_indices]
    val_mask_paths = [train_mask_paths[i] for i in val_indices]
    
    # Remove validation examples from train set
    val_set = set(val_image_paths)
    train_image_paths = [p for p in train_image_paths if p not in val_set]
    train_mask_paths = [p for i, p in enumerate(train_mask_paths) if train_image_paths[i] not in val_set]
    
    print(f"Created validation set with {len(val_image_paths)} samples from train data.")
    print(f"Updated train set has {len(train_image_paths)} samples.")

# Create Hugging Face Datasets
def create_hf_dataset(image_paths, mask_paths):
    if not image_paths or not mask_paths or len(image_paths) != len(mask_paths):
        print(f"Warning: Mismatch or empty paths. Creating empty dataset.")
        return Dataset.from_dict({'image': [], 'label': []}).cast_column('image', HFImage()).cast_column('label', HFImage())
    dataset = Dataset.from_dict({'image': image_paths, 'label': mask_paths})
    # Casting ensures the columns are treated as images
    dataset = dataset.cast_column('image', HFImage())
    dataset = dataset.cast_column('label', HFImage())
    return dataset

train_dataset = create_hf_dataset(train_image_paths, train_mask_paths)
val_dataset = create_hf_dataset(val_image_paths, val_mask_paths)
test_dataset = create_hf_dataset(test_image_paths, test_mask_paths)

ds = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print("\nDataset structure:")
print(ds)

Dataset base directory found at: /kaggle/input/deepglobe-land-cover-classification-dataset
No metadata.csv found. Using direct directory structure.
Loading train data from: /kaggle/input/deepglobe-land-cover-classification-dataset/train
Found 1606 potential image files in /kaggle/input/deepglobe-land-cover-classification-dataset/train
Found 1606 image-mask pairs.
Loading valid data from: /kaggle/input/deepglobe-land-cover-classification-dataset/valid
Found 171 potential image files in /kaggle/input/deepglobe-land-cover-classification-dataset/valid
Found 171 image-mask pairs.
Loading test data from: /kaggle/input/deepglobe-land-cover-classification-dataset/test
Found 172 potential image files in /kaggle/input/deepglobe-land-cover-classification-dataset/test
Found 172 image-mask pairs.

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 1606
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 

## 3. Preprocessing

Define feature extractor and transformations. The masks are converted to class ID masks using maximum performance optimization for Kaggle environment.

In [5]:
from transformers import SegformerImageProcessor  # Use the updated class instead of FeatureExtractor
import multiprocessing
import psutil
import tempfile
import os
from concurrent.futures import ThreadPoolExecutor
import time

# Use the recommended ImageProcessor instead of FeatureExtractor
try:
    feature_extractor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
    print("Using SegformerImageProcessor")
except:
    from transformers import SegformerFeatureExtractor
    feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
    print("Using SegformerFeatureExtractor (deprecated)")

# Define the RGB to Class ID mapping
rgb_to_id = {
    (0, 255, 255): 0,  # Urban land (Cyan)
    (255, 255, 0): 1,  # Agriculture land (Yellow)
    (255, 0, 255): 2,  # Rangeland (Magenta)
    (0, 255, 0): 3,    # Forest land (Green)
    (0, 0, 255): 4,    # Water (Blue)
    (255, 255, 255): 5,# Barren land (White)
    (0, 0, 0): 6       # Unknown (Black)
}
id_to_rgb = {v: k for k, v in rgb_to_id.items()} # Invert mapping for visualization

# Specialized RGB to class ID conversion optimized for Kaggle
def rgb_mask_to_class_id_mask_optimized(mask_img):
    """Highly optimized version that converts an RGB mask to class ID mask"""
    # Convert to numpy array with uint8 for memory efficiency
    mask_arr = np.array(mask_img.convert('RGB'), dtype=np.uint8)
    
    # Pre-allocate output array with default class (unknown)
    class_mask = np.full(mask_arr.shape[:2], 6, dtype=np.uint8)
    
    # Use memory-efficient operations - creates a single view instead of multiple copies
    # This specialized version shifts image data to create a single integer for comparison
    # which is much faster than comparing three channels separately
    rgb_packed = (mask_arr[:,:,0].astype(np.uint32) << 16) + \
                 (mask_arr[:,:,1].astype(np.uint32) << 8) + \
                 (mask_arr[:,:,2].astype(np.uint32))
    
    # Convert RGB tuples to packed integers for faster comparison
    for rgb, class_id in rgb_to_id.items():
        rgb_packed_val = (rgb[0] << 16) + (rgb[1] << 8) + rgb[2]
        class_mask[rgb_packed == rgb_packed_val] = class_id
    
    return Image.fromarray(class_mask)

def preprocess_data(examples):
    """Optimized preprocessing function"""
    # Convert all images to RGB mode
    images = [img.convert("RGB") for img in examples['image']]
    
    # Process masks in parallel for better performance
    # Note: This uses a thread pool, which works better for I/O-bound operations
    # than Python's multiprocessing due to the GIL
    with ThreadPoolExecutor(max_workers=8) as executor:
        labels = list(executor.map(rgb_mask_to_class_id_mask_optimized, examples['label']))
    
    # Process with feature extractor
    inputs = feature_extractor(images, labels, return_tensors="pt")
    inputs['labels'] = inputs['labels'].squeeze(1)  # Remove channel dimension
    
    return inputs

# Set up proper cache directory for Kaggle
# First try the shared memory location
if os.path.exists("/dev/shm"):
    cache_dir = "/dev/shm/segformer_cache"
else:
    # Fallback to temp directory
    cache_dir = os.path.join(tempfile.gettempdir(), 'segformer_cache')
    
os.makedirs(cache_dir, exist_ok=True)
print(f"Cache directory: {cache_dir}")

# Use all available CPU resources for Kaggle
num_cpu = os.cpu_count()
optimal_processes = max(1, min(8, num_cpu))  # Use between 1 and 8 processes
batch_size = 16  # Batch size that works reliably

print(f"\nApplying preprocessing using {optimal_processes} processes with batch size {batch_size}...")
print(f"Available CPU cores: {num_cpu}, Total RAM: {psutil.virtual_memory().total / 1e9:.1f}GB")

if len(ds['train']) > 0:
    # Create properly formatted cache file names with extensions to avoid the rindex error
    cache_files = {
        "train": os.path.join(cache_dir, "train_processed.arrow"),
        "validation": os.path.join(cache_dir, "val_processed.arrow"),
        "test": os.path.join(cache_dir, "test_processed.arrow"),
    }
    
    # Clear any existing cache files if they exist
    for cache_file in cache_files.values():
        if os.path.exists(cache_file):
            try:
                os.remove(cache_file)
                print(f"Removed existing cache file: {cache_file}")
            except:
                print(f"Could not remove existing cache file: {cache_file}")
    
    # Process datasets one by one to avoid memory issues
    processed_train = None
    processed_validation = None
    processed_test = None
    
    try:
        # Process train dataset if available
        if len(ds['train']) > 0:
            print("Processing training dataset...")
            processed_train = ds['train'].map(
                preprocess_data,
                batched=True,
                batch_size=batch_size,
                num_proc=optimal_processes,
                desc="Processing train images",
                cache_file_name=cache_files["train"],
                # Don't load from cache to force fresh processing
                load_from_cache_file=False
            )
            print(f"Train dataset processed: {len(processed_train)} items")
        
        # Process validation dataset if available
        if len(ds['validation']) > 0:
            print("Processing validation dataset...")
            processed_validation = ds['validation'].map(
                preprocess_data,
                batched=True,
                batch_size=batch_size,
                num_proc=optimal_processes,
                desc="Processing validation images",
                cache_file_name=cache_files["validation"],
                load_from_cache_file=False
            )
            print(f"Validation dataset processed: {len(processed_validation)} items")
        
        # Process test dataset if available
        if len(ds['test']) > 0:
            print("Processing test dataset...")
            processed_test = ds['test'].map(
                preprocess_data,
                batched=True,
                batch_size=batch_size,
                num_proc=optimal_processes,
                desc="Processing test images",
                cache_file_name=cache_files["test"],
                load_from_cache_file=False
            )
            print(f"Test dataset processed: {len(processed_test)} items")
        
        # Create processed dataset dictionary
        processed_ds = DatasetDict({
            'train': processed_train if processed_train is not None else Dataset.from_dict({'pixel_values': [], 'labels': []}),
            'validation': processed_validation if processed_validation is not None else Dataset.from_dict({'pixel_values': [], 'labels': []}),
            'test': processed_test if processed_test is not None else Dataset.from_dict({'pixel_values': [], 'labels': []})
        })
        
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        # Fallback to simpler processing without caching if error occurs
        print("Falling back to simple processing without caching...")
        
        # Process directly without caching for better compatibility
        processed_ds = {}
        for split in ['train', 'validation', 'test']:
            if split in ds and len(ds[split]) > 0:
                print(f"Processing {split} dataset...")
                processed_ds[split] = ds[split].map(
                    preprocess_data,
                    batched=True,
                    batch_size=batch_size,
                    num_proc=1,  # Use single process for compatibility
                    load_from_cache_file=False
                )
            else:
                processed_ds[split] = Dataset.from_dict({'pixel_values': [], 'labels': []})
        
        processed_ds = DatasetDict(processed_ds)
    
    # Force garbage collection after preprocessing
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
else:
   print("Skipping preprocessing as datasets are empty.")
   processed_ds = ds # Keep the empty structure

print("\nPreprocessing completed!")
print(f"Processed dataset sizes: Train: {len(processed_ds['train'])} images, ",
      f"Validation: {len(processed_ds['validation'])} images, ",
      f"Test: {len(processed_ds['test'])} images")

# Show sample of processed data
if len(processed_ds['train']) > 0:
    print("\nProcessed example structure:")
    for key, value in list(processed_ds['train'][0].items())[:5]:
        if isinstance(value, torch.Tensor):
            print(f"{key}: Tensor of shape {value.shape}, dtype {value.dtype}")
        else:
            print(f"{key}: {type(value)}")
else:
    print("Train dataset is empty.")

preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Using SegformerImageProcessor
Cache directory: /dev/shm/segformer_cache

Applying preprocessing using 4 processes with batch size 16...
Available CPU cores: 4, Total RAM: 33.7GB
Processing training dataset...


  return func(*args, **kwargs)


Processing train images (num_proc=4):   0%|          | 0/1606 [00:00<?, ? examples/s]

Train dataset processed: 1606 items
Processing validation dataset...


Processing validation images (num_proc=4):   0%|          | 0/171 [00:00<?, ? examples/s]

Validation dataset processed: 171 items
Processing test dataset...


Processing test images (num_proc=4):   0%|          | 0/172 [00:00<?, ? examples/s]

Test dataset processed: 172 items

Preprocessing completed!
Processed dataset sizes: Train: 1606 images,  Validation: 171 images,  Test: 172 images

Processed example structure:
image: <class 'PIL.JpegImagePlugin.JpegImageFile'>
label: <class 'PIL.JpegImagePlugin.JpegImageFile'>
pixel_values: <class 'list'>
labels: <class 'list'>


## 4. Model Definition

Load a pre-trained SegFormer model and configure it for our specific number of classes.

In [6]:
# Clear CUDA cache before loading model
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    
model = SegformerForSemanticSegmentation.from_pretrained(
    "nvidia/segformer-b0-finetuned-ade-512-512",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True, # Allow changing the classification head
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, # Use half precision if possible
    low_cpu_mem_usage=True, # Minimize CPU memory usage during loading
)

# Move model to GPU and optimize for inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Optimize memory usage on Kaggle
if torch.cuda.is_available():
    # Display GPU memory usage
    print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f}GB allocated, ",
          f"{torch.cuda.memory_reserved() / 1e9:.2f}GB reserved")

# Count model parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model loaded on {device} with {total_params/1e6:.2f}M parameters ({trainable_params/1e6:.2f}M trainable)")

config.json:   0%|          | 0.00/6.88k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/15.0M [00:00<?, ?B/s]

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/segformer-b0-finetuned-ade-512-512 and are newly initialized because the shapes did not match:
- decode_head.classifier.bias: found shape torch.Size([150]) in the checkpoint and torch.Size([7]) in the model instantiated
- decode_head.classifier.weight: found shape torch.Size([150, 256, 1, 1]) in the checkpoint and torch.Size([7, 256, 1, 1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU Memory: 0.01GB allocated,  0.01GB reserved
Model loaded on cuda with 3.72M parameters (3.72M trainable)


## 5. Training Configuration

Set up `TrainingArguments` with optimizations for P100 GPU and define the evaluation metric (Mean Intersection over Union - mIoU).

In [7]:
# Import needed modules first
import transformers
import torch
import os
import numpy as np
import evaluate

# Load mean_iou metric
metric = evaluate.load("mean_iou")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Logits shape: (batch_size, num_labels, height/4, width/4)
    # Labels shape: (batch_size, height, width)

    # Move logits to CPU for upsampling if needed
    if isinstance(logits, torch.Tensor) and logits.device.type == 'cuda':
        logits = logits.cpu()
        
    # Convert logits to torch tensor if they are numpy arrays
    if isinstance(logits, np.ndarray):
        logits = torch.from_numpy(logits)
        
    # Ensure labels are also tensors for interpolation size check
    if isinstance(labels, np.ndarray):
        labels_tensor = torch.from_numpy(labels)
    else:
        labels_tensor = labels
        
    # Move labels tensor to CPU if needed for size check
    if hasattr(labels_tensor, 'device') and labels_tensor.device.type == 'cuda':
        labels_tensor = labels_tensor.cpu()
        
    upsampled_logits = torch.nn.functional.interpolate(
        logits,
        size=labels_tensor.shape[-2:], # Target (height, width)
        mode='bilinear',
        align_corners=False
    )

    # Get predicted class IDs
    pred_labels = upsampled_logits.argmax(dim=1).detach().cpu().numpy()
    # Ensure labels are numpy arrays on CPU for metric computation
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()

    # Compute metrics
    metrics = metric.compute(
        predictions=pred_labels,
        references=labels,
        num_labels=num_labels,
        ignore_index=6, # Ignore the 'unknown' class if desired, otherwise use 255 or remove
        reduce_labels=False, # We are not reducing labels
    )

    # Add per-category IoU metrics for better interpretation
    # Handle potential KeyError if metric computation failed for some reason
    per_category_iou = metrics.pop('per_category_iou', [0.0] * num_labels)
    per_category_accuracy = metrics.pop('per_category_accuracy', [0.0] * num_labels)
    for i, label in id2label.items():
        metrics[f"iou_{label}"] = per_category_iou[i]
        metrics[f"accuracy_{label}"] = per_category_accuracy[i]

    # Return main metrics
    return {
        "mean_iou": metrics.get("mean_iou", 0.0),
        "mean_accuracy": metrics.get("mean_accuracy", 0.0),
        "overall_accuracy": metrics.get("overall_accuracy", 0.0),
        **metrics # Include per-category metrics as well
    }

# Maximizing batch sizes for Kaggle P100 - values tuned for best performance
train_batch_size = 24  # Maximum for P100 with this model
eval_batch_size = 48   # Can be larger for evaluation (no gradients stored)
gradient_accumulation_steps = 1  # No accumulation needed with large batch

print(f"Using Train Batch Size: {train_batch_size}, Eval Batch Size: {eval_batch_size}")
print(f"Transformers version: {transformers.__version__}")

# Create a data collator for image segmentation
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class SegmentationDataCollator:
    """Data collator for image segmentation that handles the conversion of data structures to tensors"""
    def __call__(self, batch: List[Dict]) -> Dict:
        # Initialize with empty lists
        pixel_values = []
        labels = []
        
        # Extract all values into lists
        for example in batch:
            if 'pixel_values' in example:
                if isinstance(example['pixel_values'], torch.Tensor):
                    pixel_values.append(example['pixel_values'])
                elif isinstance(example['pixel_values'], list):
                    # Convert nested lists to tensors
                    pixel_values.append(torch.tensor(example['pixel_values']))
            
            if 'labels' in example:
                if isinstance(example['labels'], torch.Tensor):
                    labels.append(example['labels'])
                elif isinstance(example['labels'], list):
                    # Convert nested lists to tensors
                    labels.append(torch.tensor(example['labels']))
        
        # Create the batch - check for empty lists to avoid errors
        batch_dict = {}
        if pixel_values:
            # Stack tensors into batches
            try:
                batch_dict['pixel_values'] = torch.stack(pixel_values)
            except:
                print("Error stacking pixel_values, shapes may be inconsistent")
                # Attempt to handle arrays of different shapes (not ideal, but fallback)
                batch_dict['pixel_values'] = pixel_values
                
        if labels:
            try:
                batch_dict['labels'] = torch.stack(labels)
            except:
                print("Error stacking labels, shapes may be inconsistent")
                batch_dict['labels'] = labels
                
        return batch_dict

# Import TrainingArguments after we ensure transformers is properly imported
from transformers import TrainingArguments

# Check if the model supports gradient checkpointing before enabling it
def supports_gradient_checkpointing(model):
    """Check if the model supports gradient checkpointing"""
    if hasattr(model, 'supports_gradient_checkpointing'):
        return model.supports_gradient_checkpointing
    # Try to access the gradient_checkpointing_enable method
    return hasattr(model, 'gradient_checkpointing_enable')

# Check the model supports gradient checkpointing
use_gradient_checkpointing = False
try:
    # This will raise an error if gradient_checkpointing is not supported
    if supports_gradient_checkpointing(model):
        use_gradient_checkpointing = True
        print("Model supports gradient checkpointing. Enabling it.")
    else:
        print("Model does NOT support gradient checkpointing. Will disable it.")
except Exception as e:
    print(f"Error checking gradient checkpointing support: {e}")
    print("Will disable gradient checkpointing to be safe.")

try:
    training_args = TrainingArguments(
        output_dir="./segformer-finetuned-deepglobe-kaggle",
        learning_rate=6e-5,
        num_train_epochs=15,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        save_total_limit=2,
        
        # Kaggle full system utilization optimizations
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=True,  # Use mixed precision
        fp16_opt_level="O2",  # Aggressive mixed precision
        half_precision_backend="auto",
        optim="adamw_torch",
        dataloader_num_workers=os.cpu_count(),  # Use ALL CPU cores
        dataloader_pin_memory=True,
        dataloader_persistent_workers=True,  # Keep workers alive between epochs
        
        # Performance tuning for Kaggle
        gradient_checkpointing=use_gradient_checkpointing,  # Only use if supported
        bf16=False,  # P100 doesn't support bfloat16
        label_smoothing_factor=0.1,  # Helps generalization
        weight_decay=0.01,          # Prevents overfitting
        max_grad_norm=1.0,          # Gradient clipping for stability
        warmup_ratio=0.1,           # Warmup for learning rate
        lr_scheduler_type="cosine_with_restarts",  # Better convergence
        
        # Evaluation and save strategies
        eval_strategy="steps", 
        save_strategy="steps", 
        logging_strategy="steps",
        eval_steps=300,  # More frequent evaluation 
        save_steps=300,  # More frequent saving
        logging_steps=50,  # More frequent logging
        
        # IMPORTANT: DISABLE group_by_length for image data
        group_by_length=False,  # This was causing the error
        
        # Other training parameters
        load_best_model_at_end=True,
        metric_for_best_model="mean_iou",
        greater_is_better=True,
        push_to_hub=False,
        remove_unused_columns=False
    )
except TypeError as e:
    print(f"First attempt failed with error: {e}")
    # Fallback to compatible parameters
    training_args = TrainingArguments(
        output_dir="./segformer-finetuned-deepglobe-kaggle",
        learning_rate=6e-5,
        num_train_epochs=15,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        save_total_limit=2,
        
        # Kaggle full system utilization optimizations (compatible subset)
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=True,
        dataloader_num_workers=os.cpu_count(),
        dataloader_pin_memory=True,
        
        # Explicitly disable gradient checkpointing in fallback
        gradient_checkpointing=False,
        
        # Try appropriate parameter names
        evaluation_strategy="steps",
        save_strategy="steps",
        logging_strategy="steps",
        
        eval_steps=300,
        save_steps=300,
        logging_steps=50,
        
        # IMPORTANT: DISABLE group_by_length for image data
        group_by_length=False,  # This was causing the error
        
        load_best_model_at_end=True,
        metric_for_best_model="mean_iou",
        greater_is_better=True,
        push_to_hub=False,
        remove_unused_columns=False
    )

# Calculate effective batch size
effective_batch_size = train_batch_size * gradient_accumulation_steps
print("\nTraining arguments set for 100% Kaggle system utilization:")
print(f"- Output directory: {training_args.output_dir}")
print(f"- Learning rate: {training_args.learning_rate}")
print(f"- Number of epochs: {training_args.num_train_epochs}")
print(f"- Per device batch size: {training_args.per_device_train_batch_size}")
print(f"- Effective batch size: {effective_batch_size}")
print(f"- Dataloader workers: {training_args.dataloader_num_workers} (using all CPU cores)")
print(f"- FP16 (mixed precision): {training_args.fp16}")
print(f"- Group by length: {training_args.group_by_length} (disabled for image segmentation)")
print(f"- Gradient checkpointing: {getattr(training_args, 'gradient_checkpointing', False)}")

Downloading builder script:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Using Train Batch Size: 24, Eval Batch Size: 48
Transformers version: 4.51.1
Model does NOT support gradient checkpointing. Will disable it.

Training arguments set for 100% Kaggle system utilization:
- Output directory: ./segformer-finetuned-deepglobe-kaggle
- Learning rate: 6e-05
- Number of epochs: 15
- Per device batch size: 24
- Effective batch size: 24
- Dataloader workers: 4 (using all CPU cores)
- FP16 (mixed precision): True
- Group by length: False (disabled for image segmentation)
- Gradient checkpointing: False


In [None]:
# Import Trainer explicitly here to avoid any issues
from transformers import Trainer

# Clean up memory before training
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"GPU Memory before training: {torch.cuda.memory_allocated() / 1e9:.2f}GB allocated, ",
          f"{torch.cuda.memory_reserved() / 1e9:.2f}GB reserved")

# Check if datasets are valid before creating Trainer
train_data_available = 'train' in processed_ds and len(processed_ds['train']) > 0
eval_data_available = 'validation' in processed_ds and len(processed_ds['validation']) > 0

# Display dataset shapes - safely check the structure
if train_data_available:
    print(f"Training dataset size: {len(processed_ds['train'])} samples")

    # Get the first element to inspect
    sample = processed_ds['train'][0]

    # Safely check and print the structure
    for key in list(sample.keys()):
        value = sample[key]
        if isinstance(value, torch.Tensor):
            print(f"- {key}: Tensor of shape {value.size()}, dtype {value.dtype}")
        elif hasattr(value, 'shape'):  # For numpy arrays or other objects with shape attribute
            print(f"- {key}: shape {value.shape}, dtype {type(value)}")
        elif isinstance(value, list):
            print(f"- {key}: list of length {len(value)}, element type {type(value[0]) if value else 'empty list'}")
        else:
            print(f"- {key}: type {type(value)}")

# Instantiate the custom data collator defined previously
# This ensures the Trainer uses our custom logic for batching segmentation data
data_collator = SegmentationDataCollator()

trainer = None
if train_data_available and eval_data_available:
    # Initialize Trainer with maximum performance settings
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=processed_ds["train"],
        eval_dataset=processed_ds["validation"],
        compute_metrics=compute_metrics,
        data_collator=data_collator  # Pass the custom collator here
    )

    # Start training with full system utilization
    # NOTE: If you encounter Out-of-Memory (OOM) errors, try reducing
    # per_device_train_batch_size and per_device_eval_batch_size in the TrainingArguments cell above.
    print("\nStarting training with 100% Kaggle system utilization...")
    try:
        train_results = trainer.train()

        # Show memory usage during training
        if torch.cuda.is_available():
            print(f"Peak GPU Memory: {torch.cuda.max_memory_allocated() / 1e9:.2f}GB")
            torch.cuda.reset_peak_memory_stats()

        # Save the best model and training state
        print("Saving model and state...")
        trainer.save_model()
        trainer.save_state()
        print("Model and state saved.")

        # Clean memory after training
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

        print("Training finished.")
        print("Training Results:", train_results)

        # Log metrics
        metrics = train_results.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)

        # Evaluate after training (on validation set)
        print("\nEvaluating final model on validation set...")
        eval_metrics = trainer.evaluate(eval_dataset=processed_ds["validation"])
        trainer.log_metrics("eval", eval_metrics)
        trainer.save_metrics("eval", eval_metrics)
    except Exception as e:
        print(f"Error during training: {e}")
        # It might be helpful to raise the error here to stop execution if training fails critically
        # raise e
        print("\nAttempting to continue with evaluation or visualization if possible...")
        # Even if training fails, we might still be able to evaluate the model
        # or perform visualization with the current model state
else:
    print("Skipping training as train or validation dataset is empty or invalid.")


GPU Memory before training: 0.01GB allocated,  0.01GB reserved
Training dataset size: 1606 samples
- image: type <class 'PIL.JpegImagePlugin.JpegImageFile'>
- label: type <class 'PIL.JpegImagePlugin.JpegImageFile'>
- pixel_values: list of length 3, element type <class 'list'>
- labels: list of length 512, element type <class 'list'>





Starting training with 100% Kaggle system utilization...


<IPython.core.display.Javascript object>

## 7. Evaluation & Visualization

Evaluate the fine-tuned model on the test set and visualize some predictions.

In [None]:
test_data_available = 'test' in processed_ds and len(processed_ds['test']) > 0
all_preds = []
all_labels = []
test_metrics = None

if trainer is not None and test_data_available:
    print("\nEvaluating on the test set...")
    # Use the predict method to get raw predictions and labels
    test_results = trainer.predict(processed_ds['test'])
    test_metrics = test_results.metrics # Metrics are computed based on aggregated predictions/labels
    
    print("\nTest Set Evaluation Results:")
    print(test_metrics)
    trainer.log_metrics("test", test_metrics)
    trainer.save_metrics("test", test_metrics)
    
    # Extract predictions and labels for confusion matrix and visualization
    logits = test_results.predictions
    labels = test_results.label_ids
    
    # Upsample logits and get predicted labels (needs to be done on CPU)
    if isinstance(logits, torch.Tensor) and logits.device.type == 'cuda':
        logits = logits.cpu()
    if isinstance(logits, np.ndarray):
        logits = torch.from_numpy(logits)
        
    if isinstance(labels, torch.Tensor) and labels.device.type == 'cuda':
        labels = labels.cpu()
    if isinstance(labels, torch.Tensor):
        labels = labels.numpy() # Ensure labels are numpy array
        
    upsampled_logits = torch.nn.functional.interpolate(
        logits,
        size=labels.shape[-2:], # Target (height, width)
        mode='bilinear',
        align_corners=False
    )
    all_preds = upsampled_logits.argmax(dim=1).detach().cpu().numpy().flatten()
    all_labels = labels.flatten()
    
else:
    print("Skipping test set evaluation as trainer was not initialized or test dataset is empty.")

# --- Confusion Matrix ---
if len(all_preds) > 0 and len(all_labels) > 0:
    # Filter out ignored labels if necessary (e.g., 'unknown' class with id 6)
    ignore_idx = 6 
    valid_indices = all_labels != ignore_idx
    filtered_labels = all_labels[valid_indices]
    filtered_preds = all_preds[valid_indices]
    
    print(f"\nGenerating Confusion Matrix (ignoring class {ignore_idx}: '{id2label.get(ignore_idx, 'N/A')}')")
    if len(filtered_labels) > 0: # Ensure there are valid labels left
        cm = confusion_matrix(filtered_labels, filtered_preds, labels=list(range(num_labels-1))) # Exclude ignored class from labels
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names[:-1], yticklabels=class_names[:-1])
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix (Test Set)')
        plt.show()
    else:
        print("Skipping confusion matrix: No valid labels after filtering.")
else:
    print("Skipping confusion matrix generation (no predictions/labels available).")

# --- Visualization ---
def visualize_predictions(num_samples=5):
    if trainer is None or not test_data_available or len(test_image_paths) == 0:
        print("Skipping visualization: Trainer not available, test data missing, or no test image paths.")
        return
        
    print(f"\nVisualizing predictions for {num_samples} random test samples...")
    
    # Ensure model is on CPU for visualization
    # Load the *saved* best model from disk onto CPU for consistent visualization
    print(f"Loading best model from {training_args.output_dir} for visualization...")
    try:
        viz_model = SegformerForSemanticSegmentation.from_pretrained(training_args.output_dir).cpu()
        viz_model.eval() # Set model to evaluation mode
    except Exception as e:
        print(f"Error loading saved model for visualization: {e}. Using current model state on CPU.")
        # Fallback to using the current model state moved to CPU
        viz_model = model.cpu()
        viz_model.eval()
    
    # Get random indices
    num_available = len(test_image_paths)
    indices = random.sample(range(num_available), min(num_samples, num_available))
    
    for i in indices:
        image_path = test_image_paths[i]
        mask_path = test_mask_paths[i]
        
        try:
            image = Image.open(image_path).convert("RGB")
            true_mask_rgb = Image.open(mask_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image/mask {i}: {e}")
            continue

        # Preprocess image for model
        # Feature extractor runs on CPU for visualization consistency
        encoding = feature_extractor(image, return_tensors="pt")
        pixel_values = encoding.pixel_values # Already on CPU

        # Predict using the CPU model
        with torch.no_grad():
            outputs = viz_model(pixel_values=pixel_values)
            logits = outputs.logits # Shape: (1, num_labels, H/4, W/4)

        # Upsample logits to original image size
        upsampled_logits = torch.nn.functional.interpolate(
            logits,
            size=image.size[::-1], # (height, width)
            mode="bilinear",
            align_corners=False,
        )
        pred_mask_id = upsampled_logits.argmax(dim=1).squeeze().numpy()

        # Convert predicted IDs back to RGB
        pred_mask_rgb = np.zeros((*pred_mask_id.shape, 3), dtype=np.uint8)
        for class_id, color in id_to_rgb.items():
            pred_mask_rgb[pred_mask_id == class_id] = color
            
        # Create color legend patches
        legend_patches = [plt.Rectangle((0,0),1,1, fc=np.array(color)/255.0) for color in id_to_rgb.values()]
        legend_labels = [f"{idx}: {name}" for idx, name in id2label.items()]

        # Plot
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        fig.suptitle(f"Sample {i}: {os.path.basename(image_path)}")
        axes[0].imshow(image)
        axes[0].set_title("Input Image")
        axes[0].axis('off')
        axes[1].imshow(true_mask_rgb)
        axes[1].set_title("True Mask (RGB)")
        axes[1].axis('off')
        axes[2].imshow(pred_mask_rgb)
        axes[2].set_title("Predicted Mask (RGB)")
        axes[2].axis('off')
        
        # Add legend to the figure
        fig.legend(legend_patches, legend_labels, loc='lower center', ncol=len(id_to_rgb), bbox_to_anchor=(0.5, -0.05))
        
        plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust layout to make space for legend
        plt.show()
        
    # Move the original model back to GPU after visualization if needed
    if torch.cuda.is_available():
        model.to('cuda')

# Visualize predictions
visualize_predictions(num_samples=5)

## 8. Performance Analysis (P100 GPU)

Analyze the training performance on P100 GPU.

In [None]:
# Load training logs for performance analysis if available
import json
import glob
from datetime import datetime
import matplotlib.pyplot as plt

def analyze_training_performance():
    log_files = glob.glob(os.path.join(training_args.output_dir, "trainer_state.json"))
    
    if not log_files:
        print("No training logs found.")
        return
        
    try:
        with open(log_files[0], 'r') as f:
            logs = json.load(f)
            
        # Extract relevant metrics
        steps = [log['step'] for log in logs['log_history'] if 'loss' in log]
        losses = [log['loss'] for log in logs['log_history'] if 'loss' in log]
        learning_rates = [log['learning_rate'] for log in logs['log_history'] if 'learning_rate' in log and 'loss' in log]
        
        # Calculate training time and performance metrics
        if 'created_at' in logs and logs['created_at'] and 'last_update' in logs and logs['last_update']:
            created_at = datetime.fromisoformat(logs['created_at'].replace('Z', '+00:00'))
            last_update = datetime.fromisoformat(logs['last_update'].replace('Z', '+00:00'))
            training_duration = (last_update - created_at).total_seconds() / 60  # in minutes
            
            total_train_steps = logs['global_step']
            steps_per_second = total_train_steps / (training_duration * 60)
            
            print(f"\nP100 GPU Performance Analysis:")
            print(f"- Total training time: {training_duration:.2f} minutes")
            print(f"- Steps per second: {steps_per_second:.2f}")
            print(f"- Initial loss: {losses[0]:.4f}")
            print(f"- Final loss: {losses[-1]:.4f}")
            
        # Plot training progress
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)
        
        ax1.plot(steps, losses, 'b-', label='Training Loss')
        ax1.set_title('Training Loss on P100 GPU')
        ax1.set_ylabel('Loss')
        ax1.grid(True)
        ax1.legend()
        
        if learning_rates:
            ax2.plot(steps[:len(learning_rates)], learning_rates, 'g-', label='Learning Rate')
            ax2.set_title('Learning Rate Schedule')
            ax2.set_xlabel('Training Steps')
            ax2.set_ylabel('Learning Rate')
            ax2.grid(True)
            ax2.legend()
            
        plt.tight_layout()
        plt.show()
        
    except Exception as e:
        print(f"Error analyzing training logs: {e}")

# Run performance analysis if training was completed
if trainer is not None:
    analyze_training_performance()
else:
    print("No training was performed, skipping performance analysis.")

In [None]:
# Monitor system usage
import psutil
import GPUtil

def print_system_usage():
    """Print current system resource usage"""
    # CPU usage
    cpu_usage = psutil.cpu_percent(interval=0.1)
    
    # Memory usage
    mem = psutil.virtual_memory()
    mem_usage = mem.percent
    mem_used_gb = mem.used / (1024 ** 3)
    mem_total_gb = mem.total / (1024 ** 3)
    
    # Disk usage
    disk = psutil.disk_usage('/')
    disk_usage = disk.percent
    
    print(f"System Resource Usage:")
    print(f"- CPU: {cpu_usage}% used across {psutil.cpu_count()} cores")
    print(f"- RAM: {mem_usage}% used ({mem_used_gb:.1f}GB / {mem_total_gb:.1f}GB)")
    print(f"- Disk: {disk_usage}% used")
    
    # GPU usage if available
    if torch.cuda.is_available():
        try:
            for i in range(torch.cuda.device_count()):
                gpu_usage = torch.cuda.memory_allocated(i) / torch.cuda.get_device_properties(i).total_memory * 100
                print(f"- GPU {i}: {gpu_usage:.1f}% allocated")
                print(f"  Memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB / {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f}GB")
        except:
            try:
                # Alternative using GPUtil
                gpus = GPUtil.getGPUs()
                for i, gpu in enumerate(gpus):
                    print(f"- GPU {i}: {gpu.memoryUtil*100:.1f}% used")
                    print(f"  Memory: {gpu.memoryUsed:.2f}GB / {gpu.memoryTotal:.2f}GB")
            except:
                print("- GPU: Info not available")

# Print current system usage
print_system_usage()