In [1]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import os
from PIL import Image
import glob
from loguru import logger

from paths import DATA_DIR, RAW_DATA, SRC_DIR
from data.utils import (
   
    PROCESSED_DATA,
    TRANSFORMS_FOLDER,
    GOOGLENET_TRAIN_IMAGES_NPY,
    GOOGLENET_TRAIN_LABELS_NPY,

    GOOGLENET_TEST_IMAGES_NPY,
    GOOGLENET_TEST_LABELS_NPY,
    
    CLASSES_NAMES,
    CLASSES_INDEX,
    DATA_TRAIN_PATH,
    DATA_TESTA_PATH,

    show_images,
    save_npy,
    load_images_from_folders
)
from data.data_augmentation_transforms import ImageData
import sys
print(sys.path)

['/home/yopparay/Desktop/jaziri/cellule/project/data', '/home/yopparay/Desktop/jaziri/cellule/project/src', '/home/yopparay/Desktop/jaziri/cellule/project', '/home/yopparay/miniconda3/envs/rocm/lib/python312.zip', '/home/yopparay/miniconda3/envs/rocm/lib/python3.12', '/home/yopparay/miniconda3/envs/rocm/lib/python3.12/lib-dynload', '', '/home/yopparay/miniconda3/envs/rocm/lib/python3.12/site-packages', '/home/yopparay/miniconda3/envs/rocm/lib/python3.12/site-packages/setuptools/_vendor', '/tmp/tmptk7bhk5c']


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import numpy as np

In [7]:
import torch
import subprocess
import os
from loguru import logger

logger.info("="*70)
logger.info("ROCm DIAGNOSTIC TOOL")
logger.info("="*70)

# ============================================================================
# 1. CHECK IF PYTORCH RECOGNIZES GPU
# ============================================================================

logger.info("\n1. PyTorch GPU Detection:")
logger.info(f"   CUDA Available: {torch.cuda.is_available()}")
logger.info(f"   Device Count: {torch.cuda.device_count()}")

if torch.cuda.is_available():
    logger.info(f"   Current Device: {torch.cuda.current_device()}")
    logger.info(f"   Device Name: {torch.cuda.get_device_name(0)}")
else:
    logger.warning("   ⚠ PyTorch cannot detect GPU!")

# ============================================================================
# 2. CHECK PYTORCH VERSION AND HIP SUPPORT
# ============================================================================

logger.info("\n2. PyTorch Version & HIP:")
logger.info(f"   PyTorch Version: {torch.__version__}")

try:
    hip_version = torch.version.hip
    logger.info(f"   HIP Version: {hip_version}")
    if hip_version:
        logger.info("   ✓ HIP is available (ROCm support)")
    else:
        logger.warning("   ⚠ HIP is NOT available (GPU support missing)")
except:
    logger.warning("   ⚠ Could not determine HIP version")

# ============================================================================
# 3. CHECK SYSTEM ENVIRONMENT VARIABLES
# ============================================================================

logger.info("\n3. Environment Variables:")

important_vars = [
    'HIP_VISIBLE_DEVICES',
    'ROCM_HOME',
    'HSA_OVERRIDE_GFX_VERSION',
    'PATH',
    'LD_LIBRARY_PATH'
]

for var in important_vars:
    value = os.environ.get(var, "NOT SET")
    if value != "NOT SET":
        logger.info(f"   {var}: {value[:100]}...")
    else:
        logger.warning(f"   {var}: NOT SET")

# ============================================================================
# 4. CHECK SYSTEM GPU (rocm-smi)
# ============================================================================

logger.info("\n4. System GPU Detection (rocm-smi):")

try:
    result = subprocess.run(['rocm-smi'], capture_output=True, text=True, timeout=5)
    if result.returncode == 0:
        logger.info("   rocm-smi output:")
        for line in result.stdout.split('\n')[:10]:
            logger.info(f"   {line}")
        logger.info("   ✓ GPU detected by rocm-smi")
    else:
        logger.warning("   ⚠ rocm-smi failed")
except FileNotFoundError:
    logger.warning("   ⚠ rocm-smi not found (ROCm not installed?)")
except subprocess.TimeoutExpired:
    logger.warning("   ⚠ rocm-smi timeout")

# ============================================================================
# 5. CHECK GPU MEMORY
# ============================================================================

logger.info("\n5. GPU Memory:")

try:
    result = subprocess.run(['rocm-smi', '--showmeminfo'], capture_output=True, text=True, timeout=5)
    if result.returncode == 0:
        logger.info("   GPU Memory Info:")
        for line in result.stdout.split('\n')[:15]:
            if line.strip():
                logger.info(f"   {line}")
except:
    logger.warning("   Could not get GPU memory info")

# ============================================================================
# 6. TEST PYTORCH WITH GPU
# ============================================================================

logger.info("\n6. PyTorch GPU Test:")

try:
    # Create tensor on GPU
    if torch.cuda.is_available():
        x = torch.randn(10, 10).cuda()
        logger.info(f"   ✓ Tensor created on GPU: {x.device}")
        
        # Test computation
        y = x @ x.T
        logger.info(f"   ✓ Computation successful")
    else:
        logger.warning("   ⚠ No GPU available for testing")
except Exception as e:
    logger.error(f"   ✗ GPU test failed: {e}")

# ============================================================================
# SUMMARY & RECOMMENDATIONS
# ============================================================================

logger.info("\n" + "="*70)
logger.info("SUMMARY & RECOMMENDATIONS")
logger.info("="*70)

if torch.cuda.is_available() and torch.version.hip:
    logger.info("""
    ✓ ROCm is properly configured!
    ✓ You can use GPU for training
    
    In your training script, use:
        device = torch.device("cuda")
    """)
else:
    logger.warning("""
    ⚠ ROCm is NOT properly configured
    
    FIXES TO TRY (in order):
    
    1. INSTALL/UPDATE ROCm:
       pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7
       
    2. SET ENVIRONMENT VARIABLES:
       export HIP_VISIBLE_DEVICES=0
       export ROCM_HOME=/opt/rocm
       export PATH=/opt/rocm/bin:$PATH
       export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
       
    3. CHECK ROCM INSTALLATION:
       rocm-smi
       
    4. VERIFY HIP:
       python -c "import torch; print(torch.version.hip)"
       
    5. REINSTALL PYTORCH FOR ROCM:
       pip uninstall torch torchvision torchaudio
       pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7
       
    6. RESTART PYTHON/TERMINAL:
       Environment variables need to be reloaded
    """)

# ============================================================================
# FULL FIX COMMANDS
# ============================================================================

logger.info("\n" + "="*70)
logger.info("FULL FIX - RUN THESE COMMANDS")
logger.info("="*70)

logger.info("""
# Step 1: Update pip
pip install --upgrade pip setuptools

# Step 2: Install PyTorch for ROCm
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7

# Step 3: Verify installation
python -c "import torch; print('CUDA:', torch.cuda.is_available()); print('HIP:', torch.version.hip)"

# Step 4: Test GPU
python -c "x = torch.randn(10).cuda(); print('GPU Test:', x.device)"

# Step 5: Run training
python -m src.models
""")

logger.info("\n" + "="*70)
logger.info("FOR NOW: Use CPU Mode")
logger.info("="*70)

logger.info("""
While you fix ROCm, you can still train on CPU (slower):

In training script:
    device = torch.device("cpu")
    batch_size = 2 (reduce for CPU)
    num_workers = 0

Training will be slow but will work!
""")

[32m2025-12-09 19:32:17.284[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mROCm DIAGNOSTIC TOOL[0m
[32m2025-12-09 19:32:17.286[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1m
1. PyTorch GPU Detection:[0m
[32m2025-12-09 19:32:17.287[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1m   CUDA Available: False[0m
[32m2025-12-09 19:32:17.288[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1m   Device Count: 0[0m
[32m2025-12-09 19:32:17.291[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1m
2. PyTorch Version & HIP:[0m
[32m2025-12-09 19:32:17.292[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1m   PyTorch Version: 2.6.0+rocm6.1[0m
[32m2025-12-09 19:32:17.293[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m33[0m - [1m   HIP Version: 6.1.40091-a8dbc0c19[0m
[32m2025-12-09 19:32:17.294

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from loguru import logger

import os

# ============================================================================
# ROCm SPECIFIC SETTINGS
# ============================================================================

# Set environment variables for ROCm stability
os.environ['HSA_OVERRIDE_GFX_VERSION'] = '0'  # Auto-detect GPU
os.environ['PYTORCH_HIP_ALLOC_CONF'] = ':16:8'  # Memory pooling for stability

logger.info("="*70)
logger.info("ROCm (AMD GPU) TRAINING")
logger.info("="*70)

# ============================================================================
# TRAINING FUNCTIONS
# ============================================================================

def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    batch_count = 0

    for batch_idx, (images, labels) in enumerate(loader):
        try:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            batch_count += 1
            
            # Log every 5 batches
            if (batch_idx + 1) % 5 == 0:
                logger.info(f"    Batch {batch_idx + 1}: Loss = {loss.item():.4f}")
        
        except RuntimeError as e:
            logger.error(f"ERROR in batch {batch_idx}: {e}")
            logger.error("Try reducing batch_size or using CPU mode")
            raise

    return total_loss / batch_count if batch_count > 0 else 0


def validate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    batch_count = 0

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            batch_count += 1

    accuracy = correct / total if total > 0 else 0
    return total_loss / batch_count if batch_count > 0 else 0, accuracy


# ============================================================================
# MAIN TRAINING
# ============================================================================

if __name__ == "__main__":

    # ====================================================================
    # STEP 1: SETUP DEVICE (ROCm)
    # ====================================================================
    
    logger.info("\nSetting up ROCm device...")
    
    # Check for ROCm/AMD GPU
    if torch.cuda.is_available():
        logger.info("✓ CUDA/ROCm is available")
        device = torch.device("cuda")
        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
        
        # Get GPU properties for ROCm
        props = torch.cuda.get_device_properties(0)
        logger.info(f"GPU Memory: {props.total_memory / 1e9:.2f} GB")
        logger.info(f"GPU Architecture: {props.name}")
    else:
        logger.warning("⚠ No GPU detected! Using CPU (training will be SLOW)")
        device = torch.device("cpu")
    
    # Clear GPU cache
    torch.cuda.empty_cache()
    
    # ====================================================================
    # STEP 2: LOAD DATA
    # ====================================================================
    
    logger.info("\nLoading data...")
    
    # ROCm optimal settings:
    # - batch_size: 8-16 (smaller than CUDA due to memory management differences)
    # - num_workers: 2-4 (helps with data loading on ROCm)
    
    data = ImageData(
        train_dir=DATA_TRAIN_PATH,
        test_dir=DATA_TESTA_PATH,
        batch_size=8,           # ← Smaller for ROCm stability
        num_workers=2,          # ← Help ROCm with data loading
        balance_classes=True
    )
    
    train_loader = data.train_loader()
    val_loader = data.val_loader()
    test_loader = data.test_loader()
    
    logger.info(f"✓ Data loaded")
    logger.info(f"  Train batches: {len(train_loader)}")
    logger.info(f"  Val batches: {len(val_loader)}")
    logger.info(f"  Test batches: {len(test_loader)}")
    
    # ====================================================================
    # STEP 3: CREATE MODEL
    # ====================================================================
    
    logger.info("\nCreating model...")
    
    # For ROCm, ResNet50 might be too heavy
    # Start with ResNet18 for stability
    logger.info("Loading ResNet18 (lightweight for ROCm)...")
    
    try:
        model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        
        # Modify last layer for 5 classes
        num_features = model.fc.in_features
        model.fc = nn.Sequential(
            nn.Linear(num_features, 256),  # Smaller hidden layer
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 5)
        )
        
        model = model.to(device)
        logger.info("✓ Model created and moved to device")
        
        # Count parameters
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        logger.info(f"  Total parameters: {total_params:,}")
        logger.info(f"  Trainable parameters: {trainable_params:,}")
        
    except RuntimeError as e:
        logger.error(f"Error loading model: {e}")
        logger.error("Trying ResNet18 without pretrained weights...")
        model = models.resnet18(weights=None)
        num_features = model.fc.in_features
        model.fc = nn.Sequential(
            nn.Linear(num_features, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 5)
        )
        model = model.to(device)
    
    # ====================================================================
    # STEP 4: SETUP TRAINING
    # ====================================================================
    
    logger.info("\nSetting up training...")
    
    criterion = nn.CrossEntropyLoss()
    
    # Optimizer for ROCm
    optimizer = optim.Adam(
        model.parameters(),
        lr=1e-4,
        weight_decay=1e-4,
        amsgrad=False  # ROCm sometimes has issues with AMSGrad
    )
    
    # Learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=2,
        verbose=True
    )
    
    logger.info(f"Optimizer: Adam (lr=1e-4)")
    logger.info(f"Loss: CrossEntropyLoss")
    
    # ====================================================================
    # STEP 5: TRAINING LOOP
    # ====================================================================
    
#     logger.info("\n" + "="*70)
#     logger.info("STARTING TRAINING ON ROCm")
#     logger.info("="*70)
    
#     EPOCHS = 10
#     best_val_loss = float('inf')
#     patience = 3
#     patience_counter = 0
    
#     for epoch in range(EPOCHS):
#         logger.info(f"\nEpoch {epoch+1}/{EPOCHS}")
        
#         # Clear GPU cache before epoch (important for ROCm)
#         torch.cuda.empty_cache()
        
#         if torch.cuda.is_available():
#             allocated = torch.cuda.memory_allocated() / 1e9
#             logger.info(f"GPU Memory at start: {allocated:.2f} GB")
        
#         # Train
#         try:
#             train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
#             logger.info(f"  Train Loss: {train_loss:.4f}")
#         except Exception as e:
#             logger.error(f"Training error: {e}")
#             break
        
#         # Validate
#         try:
#             val_loss, val_acc = validate(model, val_loader, criterion, device)
#             logger.info(f"  Val Loss: {val_loss:.4f}")
#             logger.info(f"  Val Accuracy: {val_acc*100:.2f}%")
#         except Exception as e:
#             logger.error(f"Validation error: {e}")
#             break
        
#         # Learning rate scheduling
#         scheduler.step(val_loss)
        
#         # Early stopping with patience
#         if val_loss < best_val_loss:
#             best_val_loss = val_loss
#             patience_counter = 0
#             torch.save(model.state_dict(), 'best_model.pth')
#             logger.info("  ✓ Best model saved")
#         else:
#             patience_counter += 1
#             if patience_counter >= patience:
#                 logger.info(f"Early stopping at epoch {epoch+1}")
#                 break
        
#         # Clear cache between epochs
#         torch.cuda.empty_cache()
    
#     # ====================================================================
#     # STEP 6: TEST
#     # ====================================================================
    
#     logger.info("\n" + "="*70)
#     logger.info("TESTING ON ROCm")
#     logger.info("="*70)
    
#     # Load best model
#     try:
#         model.load_state_dict(torch.load('best_model.pth'))
#         logger.info("✓ Loaded best model")
#     except:
#         logger.warning("Could not load best model, using current")
    
#     test_loss, test_acc = validate(model, test_loader, criterion, device)
#     logger.info(f"Test Loss: {test_loss:.4f}")
#     logger.info(f"Test Accuracy: {test_acc*100:.2f}%")
    
#     logger.info("\n✓ Training complete!")


# # ============================================================================
# # ROCm TROUBLESHOOTING
# # ============================================================================

# """
# If you still get segmentation fault on ROCm:

# 1. REDUCE BATCH SIZE FURTHER:
#    batch_size = 4
   
# 2. USE CPU MODE (for debugging):
#    device = torch.device("cpu")
   
# 3. UPDATE ROCm:
#    pip install --upgrade torch torchvision torchaudio
   
# 4. CHECK VRAM:
#    rocm-smi
   
# 5. DISABLE PRETRAINED WEIGHTS:
#    model = models.resnet18(weights=None)
   
# 6. MONOLITHIC KERNEL (for ROCm stability):
#    export PYTORCH_KERNEL_TYPE=monolithic
   
# 7. USE SIMPLER MODEL:
#    Instead of ResNet, try:
#    model = models.mobilenet_v2(weights=None)
   
# 8. CPU OFFLOADING:
#    optimizer = optim.SGD(model.parameters(), lr=1e-4)
#    # SGD is more stable than Adam on ROCm
# """

[32m2025-12-09 19:31:09.666[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mROCm (AMD GPU) TRAINING[0m
[32m2025-12-09 19:31:09.668[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m92[0m - [1m
Setting up ROCm device...[0m
[32m2025-12-09 19:31:09.669[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m115[0m - [1m
Loading data...[0m
[32m2025-12-09 19:31:09.708[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m133[0m - [1m✓ Data loaded[0m
[32m2025-12-09 19:31:09.709[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m134[0m - [1m  Train batches: 1272[0m
[32m2025-12-09 19:31:09.710[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m135[0m - [1m  Val batches: 109[0m
[32m2025-12-09 19:31:09.710[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m136[0m - [1m  Test batches: 433[0m
[32m2025-12-09 19:31:09.711[0m | [1mINFO    [0m | [36m__main__

In [3]:
def train_one_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0

    for images, labels in loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)

def validate(model, loader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(loader), accuracy


In [4]:
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    

data = ImageData(
    train_dir = DATA_TRAIN_PATH,
    test_dir = DATA_TESTA_PATH,
    batch_size=32,
    num_workers=4,
    balance_classes=True
)

train_loader = data.train_loader()
val_loader = data.val_loader()
test_loader = data.test_loader()

dataset = data.train_dataset


num_features = model.fc.in_features

model.fc = nn.Sequential(
    nn.Linear(num_features, 512),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(512, 5)
)
model = model.to(device)

for param in model.parameters():
    param.requires_grad = True

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.fc.parameters(),
    lr=1e-4,
    weight_decay=1e-4
)

EPOCHS = 1

for epoch in range(EPOCHS):
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_acc = validate(model, val_loader, criterion)

    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss:   {val_loss:.4f}")
    print(f"  Val Acc:    {val_acc*100:.2f}%")

: 

In [5]:
import torch
torch.version.hip  # should be set for ROCm
torch.cuda.is_available()  # should be True


True

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize your data
    data = ImageData(
        train_dir=DATA_TRAIN_PATH,
        test_dir=DATA_TESTA_PATH,
        batch_size=32,
        num_workers=4,
        balance_classes=True
    )

    train_loader = data.train_loader()
    val_loader = data.val_loader()
    test_loader = data.test_loader()

    # -------------------------------
    # Initialize model
    # -------------------------------
    model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

    # Replace the final fully connected layer
    num_features = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Linear(num_features, 512),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(512, 5)  # Replace 5 with your number of classes
    )

    model = model.to(device)  # Move entire model to device

    # Set all layers trainable (optional)
    for param in model.parameters():
        param.requires_grad = True

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(
        model.parameters(),  # can optimize all parameters
        lr=1e-4,
        weight_decay=1e-4
    )

    # -------------------------------
    # Training loop
    # -------------------------------
    EPOCHS = 1

    for epoch in range(EPOCHS):
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc = validate(model, val_loader, criterion, device)

        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss:   {val_loss:.4f}")
        print(f"  Val Acc:    {val_acc*100:.2f}%")