In [1]:
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import os
from PIL import Image
import json
import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.cuda.amp import autocast, GradScaler

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# for dirname, _, filenames in os.walk('/kaggle/input/imagenet100'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
def collect_all_data(data_dir, json_mapping_path):
    """
    Collect all image paths and labels in a deterministic way
    """
    data_dir = (data_dir)
    
    # Load class mapping
    with open(json_mapping_path, 'r') as f:
        folder_to_class = json.load(f)
    
    # Create class to index mapping
    classes = sorted(list(set(folder_to_class.values())))
    class_to_idx = {cls: idx for idx, cls in enumerate(classes)}
    
    # Collect all image paths and labels
    all_image_paths = []
    all_labels = []
    directories = os.listdir(data_dir)[1:]

    # Sort folder names for deterministic ordering
    folder_names = []
    folder_paths = []

    for d in directories:
        folder_names.extend([f for f in os.listdir(f'{data_dir}/{d}')
                              if f in folder_to_class])
        folder_paths.extend([os.path.join(f'{data_dir}/{d}', f) for f in os.listdir(f'{data_dir}/{d}')
                              if f in folder_to_class])
    # print(folder_names[0])
    # print(folder_paths[0])
    for i, folder_name in enumerate(folder_names):
        if os.path.isdir(folder_paths[i]):
            class_name = folder_to_class[folder_name]
            class_idx = class_to_idx[class_name]
            
            # Get all image files and sort them for deterministic ordering
            image_files = sorted([
                os.path.join(folder_paths[i], img_file) for img_file in os.listdir(folder_paths[i])
                if img_file.endswith('.JPEG')
            ])
            # print(image_files)
            # for img_file in image_files:
            all_image_paths.extend(image_files)
            all_labels.extend([class_idx]*len(image_files))
    
    return all_image_paths, all_labels, classes, class_to_idx, folder_to_class
# collect_all_data(root, f'{root}/Labels.json')

In [3]:
# split data paths and labels
def split(all_image_paths, all_labels, test_size=0.2, val_size=0.1, random_state=42):
    """
    Split data once to ensure mutual exclusivity
    """
    # print(f"Splitting: {all_image_paths[0]}")
    # First split: separate test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        all_image_paths, all_labels,
        test_size=test_size,
        stratify=all_labels,
        random_state=random_state
    )
    
    # Second split: separate train and validation from remaining data
    val_size_adjusted = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp,
        test_size=val_size_adjusted,
        stratify=y_temp,
        random_state=random_state
    )
    # print(X_train[0])
    # Verify mutual exclusivity
    train_set = set(X_train)
    val_set = set(X_val)
    test_set = set(X_test)
    # print(train_set[0])
    
    assert len(train_set.intersection(val_set)) == 0, "Train and validation sets overlap!"
    assert len(train_set.intersection(test_set)) == 0, "Train and test sets overlap!"
    assert len(val_set.intersection(test_set)) == 0, "Validation and test sets overlap!"
    
    print("✓ Data splits verified as mutually exclusive")
    print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
    
    return {
        'train': {'paths': X_train, 'labels': y_train},
        'val': {'paths': X_val, 'labels': y_val},
        'test': {'paths': X_test, 'labels': y_test}
    }
# img_path_splits = split(img_paths, labels)
# img_path_splits['train']['paths'][0]

In [4]:
class imagenet100(Dataset):
    def __init__(self, image_paths, labels, classes, classes_to_idx, transform = None, split=None):
        self.transform = transform
        self.image_paths = image_paths
        self.labels = labels
        self.classes = classes
        self.classes_to_idx = classes_to_idx
        self.num_classes = len(classes)
        
    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label


In [5]:
from torchvision import transforms

def get_transforms(input_size):
    return transforms.Compose([
        transforms.Resize((input_size, input_size)),     # Match ResNet18 input
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

In [6]:
# train_size = int(0.8*len(dataset))
# val_size = len(dataset) - train_size
# train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

def create_data_loaders(data_dir, json_mapping, batch_size=8, num_workers=4, input_size=224):
    img_paths, labels, classes, class_to_idx, folder_to_class = collect_all_data(data_dir, json_mapping)
    # print(img_paths[0]) 
    splits = split(img_paths, labels)
    transform = get_transforms(input_size)
    print(splits['train']['paths'][0])
    # Create datasets
    train_dataset = imagenet100(
        splits['train']['paths'], 
        splits['train']['labels'], 
        classes,
        class_to_idx,
        transform=transform, 
    )
    
    val_dataset = imagenet100(
        splits['val']['paths'], 
        splits['val']['labels'], 
        classes,
        class_to_idx,
        transform=transform, 
    )
    
    test_dataset = imagenet100(
        splits['test']['paths'], 
        splits['test']['labels'], 
        classes,
        class_to_idx,
        transform=transform, 
    )
    # print(train_dataset.image_paths)
    train_loader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        num_workers=num_workers,
        pin_memory=True  # Faster GPU transfer
    )
    
    val_loader = DataLoader(
        val_dataset, 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=num_workers,
        pin_memory=True
    )
    
    test_loader = DataLoader(
        test_dataset, 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=num_workers,
        pin_memory=True
    )
    print(f"Dataset sizes:")
    print(f"Train: {len(train_dataset)} images")
    print(f"Validation: {len(val_dataset)} images")
    print(f"Test: {len(test_dataset)} images")
    print(f"Number of classes: {len(classes)}")
    
    return train_loader, val_loader, test_loader, len(classes)


In [7]:
root = f"/kaggle/input/imagenet100"
train_loader, val_loader, test_loader, num_classes = create_data_loaders(root, os.path.join(root, 'Labels.json'))

✓ Data splits verified as mutually exclusive
Train: 94500, Val: 13500, Test: 27000
/kaggle/input/imagenet100/train.X4/n01860187/n01860187_1819.JPEG
Dataset sizes:
Train: 94500 images
Validation: 13500 images
Test: 27000 images
Number of classes: 100


In [8]:
print("\nTesting DataLoaders...")
for batch_idx, (images, labels) in enumerate(train_loader):
    print(f"Batch {batch_idx}: Images shape: {images.shape}, Labels shape: {labels.shape}")
    print(f"Label range: {labels.min().item()} to {labels.max().item()}")
    if batch_idx == 2:  # Just show first few batches
        break


# Show some class information
print(f"\nFirst 10 classes: {train_loader.dataset.classes[:10]}")
print(f"Class to index mapping (first 5): {dict(list(train_loader.dataset.classes_to_idx.items())[:5])}")
print(f"\n first 10 image paths: {train_loader.dataset.image_paths[:10]}")


Testing DataLoaders...
Batch 0: Images shape: torch.Size([8, 3, 224, 224]), Labels shape: torch.Size([8])
Label range: 14 to 73
Batch 1: Images shape: torch.Size([8, 3, 224, 224]), Labels shape: torch.Size([8])
Label range: 11 to 92
Batch 2: Images shape: torch.Size([8, 3, 224, 224]), Labels shape: torch.Size([8])
Label range: 15 to 90

First 10 classes: ['American alligator, Alligator mississipiensis', 'American coot, marsh hen, mud hen, water hen, Fulica americana', 'Dungeness crab, Cancer magister', 'Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis', 'agama', 'albatross, mollymawk', 'axolotl, mud puppy, Ambystoma mexicanum', 'bald eagle, American eagle, Haliaeetus leucocephalus', 'banded gecko', 'barn spider, Araneus cavaticus']
Class to index mapping (first 5): {'American alligator, Alligator mississipiensis': 0, 'American coot, marsh hen, mud hen, water hen, Fulica americana': 1, 'Dungeness crab, Cancer magister': 2, 'Komodo dragon, Komodo lizard, dr

In [9]:
def conv3x3(in_channels, out_channels, stride, dilation=1):
    return nn.Conv2d(
        in_channels=in_channels, 
        out_channels=out_channels, 
        kernel_size=3, 
        stride=stride, 
        dilation=dilation, 
        padding=dilation, 
        bias=False,
    )
    
class block(nn.Module):
    '''
    Basic Block: 3x3 Conv -> Batch Norm 1 -> ReLU -> 3x3 Conv -> Batch Norm 2 -> += initial -> ReLU
    '''
    
    def __init__(self, in_channels, out_channels, stride=1):
        super(block, self).__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.ReLU = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels, stride)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # add another layer if channel
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
    def forward(self, x):
        
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.ReLU(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # skip connection / identity matching
        out += self.shortcut(x)
        out = self.ReLU(out)
        
        return out

class resnet18(nn.Module):
    def __init__(self, stride=1):
        super(resnet18, self).__init__()
        self.in_channels = 64
        self.conv1 = conv3x3(3, self.in_channels, stride)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.ReLU = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)

        self.layer1 = self.make_layer(64,  2, stride=1)
        self.layer2 = self.make_layer(128, 2, stride=1)
        self.layer3 = self.make_layer(256, 2, stride=1)
        self.layer4 = self.make_layer(512, 2, stride=1)

        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512, num_classes)
    
        
    def make_layer(self, out_channels, num_blocks, stride=1):
        strides = [stride] + [1] * (num_blocks-1)
        layers = []
        for s in strides:
            layers.append(block(self.in_channels, out_channels, stride=stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.ReLU(x)
        x = self.maxpool(x)

        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)

        out = self.avgpool(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        
        return out
        


In [10]:
def get_model_size(model):
    # Count total parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    # Calculate memory usage (assumes float32 = 4 bytes)
    total_size = sum(p.numel() * p.element_size() for p in model.parameters())
    
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Model size: {total_size / 1024**2:.2f} MB")
    print(f"Model size: {total_size / 1024**3:.3f} GB")
    
    return total_params, total_size

# Usage
total_params, model_size = get_model_size(resnet18())

Total parameters: 11,220,132
Trainable parameters: 11,220,132
Model size: 42.80 MB
Model size: 0.042 GB


In [11]:
import gc 
# Check current usage"
def print_memory():
    print(f"Allocated: {torch.cuda.memory_allocated()/1024**2:.0f} MB")
    print(f"Reserved: {torch.cuda.memory_reserved()/1024**2:.0f} MB")
    # Get detailed memory info
    print(torch.cuda.memory_summary())

def clean_memory():
    # Check current usage
    del model, optimizer, criterion  # Replace with your variable names
    gc.collect()
    torch.cuda.empty_cache()
print_memory()
clean_memory()


Allocated: 0 MB
Reserved: 0 MB
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|--------------------------------

UnboundLocalError: cannot access local variable 'model' where it is not associated with a value

In [None]:
model = resnet18()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

for epoch in (range(10)):
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(ouputs,labels)
        loss.backward()
        optimizer.step()
        
        running_loss+=loss.item()
        
        del outputs, loss
    torch.cuda.empty_cache()
    print(f'Epoch {epoch} loss: {running_loss / len(train_loader)}')

In [None]:
def memory_optimized_training():
    # Model setup
    model = resnet18()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    # Enable gradient checkpointing if available (saves memory during backprop)
    if hasattr(model, 'gradient_checkpointing_enable'):
        model.gradient_checkpointing_enable()
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    
    # Mixed precision training setup (uses FP16, saves ~50% memory)
    scaler = GradScaler()
    
    # Monitor memory usage
    def print_memory_usage(stage=""):
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 1024**2
            reserved = torch.cuda.memory_reserved() / 1024**2
            print(f"{stage} - Allocated: {allocated:.1f}MB, Reserved: {reserved:.1f}MB")
    
    print_memory_usage("Initial")
    
    # Training loop with memory optimizations
    for epoch in range(10):
        running_loss = 0.0
        model.train()
        
        for i, (images, labels) in enumerate(train_loader):
            # Move data to GPU
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Mixed precision forward pass
            with autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)  # Fixed typo: ouputs -> outputs
            
            # Mixed precision backward pass
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            # Accumulate loss
            running_loss += loss.item()

            # Explicit cleanup (helps with memory fragmentation)
            del outputs, loss, images, labels
            
            # Optional: Clear cache every N iterations to prevent fragmentation
            if i % 50 == 0:  # Adjust frequency as needed
                torch.cuda.empty_cache()
                if i % 100 == 0:  # Less frequent memory reporting
                    print_memory_usage(f"Epoch {epoch+1}, Batch {i}")
        
        # End of epoch cleanup
        torch.cuda.empty_cache()
        gc.collect()  # Python garbage collection
        
        avg_loss = running_loss / len(train_loader)
        print(f'Epoch {epoch+1}/10 - Loss: {avg_loss:.4f}')
        print_memory_usage(f"End Epoch {epoch+1}")
    
    print("Training completed!")
    print_memory_usage("Final")
    return model


In [None]:
model = memory_optimized_training()

torch.save(model.state_dict(), "resnet18_1.pth")