# Baseline Models Training

This notebook implements training for baseline models:
- ResNet-50
- DenseNet-121
- Swin Transformer

These models serve as comparison baselines for the kidney ultrasound diagnosis task. Note that the dataset loaded below is a CT kidney dataset (Normal / Cyst / Tumor / Stone).

In [None]:
import sys
from pathlib import Path
# Make Python modules stored in the attached Kaggle input directory importable.
# NOTE(review): presumably this is where the `dataloader` and `utils` modules
# imported in the next cell live — confirm.
sys.path.append("/kaggle/input/renalfiles")

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import sys

from dataloader import KidneyDataset, create_datasets, create_dataloaders, analyze_dataset
from utils import calculate_metrics

In [None]:
# Configuration
BATCH_SIZE = 32
LEARNING_RATE = 0.001
EPOCHS = 50
NUM_CLASSES = 4  # Adjust based on your dataset
SEED = 42  # fixed so that repeated runs start from the same random state
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed the global NumPy and PyTorch generators before any model or loader is
# created. This makes weight initialisation and shuffling repeatable; it does
# not by itself guarantee bit-identical results on GPU.
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"Using device: {DEVICE}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Epochs: {EPOCHS}")
print(f"Seed: {SEED}")

In [None]:
# Load datasets using the existing dataloader functions
# Option 1: Use the existing create_datasets function
try:
    # This assumes your data is organized in the standard format.
    # The returned mapping is indexed with the keys 'train', 'val' and 'test'.
    datasets = create_datasets('/kaggle/input/ct-kidney-dataset-normal-cyst-tumor-and-stone/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone')  # Adjust path as needed
    train_dataset, val_dataset, test_dataset = datasets['train'], datasets['val'], datasets['test']

    print("Using existing data splits:")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    print(f"Test samples: {len(test_dataset)}")

except Exception as e:
    # Deliberate best-effort: report the problem and fall back to placeholder
    # datasets that the user is expected to fill in.
    print(f"Could not load existing datasets: {e}")
    print("Please update the data path or create your own dataset loading code")

    # Fallback: Manual dataset creation with transforms.
    # Training images get light augmentation; evaluation images are only
    # resized and normalised.
    train_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    val_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Create datasets manually - replace paths with your actual data paths
    train_dataset = KidneyDataset([], [], transform=train_transform)
    val_dataset = KidneyDataset([], [], transform=val_transform)
    # The evaluation cells further down build a loader from `test_dataset`, so
    # the fallback must define it as well (same non-augmenting transform as
    # validation).
    test_dataset = KidneyDataset([], [], transform=val_transform)

    print("Please update the dataset paths and labels in the cell above")

In [None]:
# Create data loaders
# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)

print(
    f"Training batches: {len(train_loader)}",
    f"Validation batches: {len(val_loader)}",
    sep="\n",
)

In [None]:
# Define baseline models
def get_resnet50(num_classes):
    """Build the ResNet-50 baseline.

    Instantiates torchvision's ResNet-50 with ``pretrained=True`` and swaps
    its final fully connected layer for a new linear layer that produces
    ``num_classes`` outputs.
    """
    backbone = models.resnet50(pretrained=True)
    # Keep the feature width of the stock head, change only the output size.
    in_features = backbone.fc.in_features
    backbone.fc = nn.Linear(in_features, num_classes)
    return backbone

def get_densenet121(num_classes):
    """Build the DenseNet-121 baseline.

    Instantiates torchvision's DenseNet-121 with ``pretrained=True`` and
    replaces its ``classifier`` layer with a new linear layer that produces
    ``num_classes`` outputs.
    """
    backbone = models.densenet121(pretrained=True)
    # Keep the feature width of the stock classifier, change only the output size.
    in_features = backbone.classifier.in_features
    backbone.classifier = nn.Linear(in_features, num_classes)
    return backbone

def get_swin_transformer(num_classes):
    """Swin Transformer (tiny variant) baseline model.

    Builds torchvision's ``swin_t`` with its ImageNet-1k V1 weights and
    replaces the ``head`` layer with a new linear layer producing
    ``num_classes`` outputs.

    If this torchvision build does not provide Swin, a message is printed and
    the result of ``get_resnet50(num_classes)`` is returned instead, so the
    caller always receives a usable model.
    """
    try:
        # torchvision introduced swin_t in the same release as the `weights=`
        # enum API, so whenever this import succeeds the non-deprecated
        # `weights=` argument can be used instead of `pretrained=True`.
        from torchvision.models import Swin_T_Weights, swin_t
    except ImportError:
        print("Swin Transformer not available in this torchvision version")
        print("Using ResNet-50 as fallback")
        return get_resnet50(num_classes)

    # IMAGENET1K_V1 is the checkpoint the legacy `pretrained=True` flag selects.
    model = swin_t(weights=Swin_T_Weights.IMAGENET1K_V1)
    model.head = nn.Linear(model.head.in_features, num_classes)
    return model

# Model selection
# One freshly built network per baseline; the key is the name later used in
# checkpoint file names and plot legends.
models_dict = dict(
    resnet50=get_resnet50(NUM_CLASSES),
    densenet121=get_densenet121(NUM_CLASSES),
    swin_transformer=get_swin_transformer(NUM_CLASSES),
)

print("Available models:")
for name in models_dict:
    print(f"- {name}")

In [None]:
# Presumably the early-stopping patience, in epochs.
# NOTE(review): train_model() declares its own default (patience=7), so this
# value only takes effect where it is passed explicitly — confirm which of the
# two is intended.
patience = 5

In [None]:
# Training function
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

def train_model(model, train_loader, val_loader, epochs, device, model_name, patience=7,
                learning_rate=None):
    """Train a baseline model with early stopping.

    Every epoch performs one optimisation pass over ``train_loader`` followed
    by an evaluation pass over ``val_loader``.  The validation loss drives both
    the ``ReduceLROnPlateau`` scheduler and early stopping.  Whenever the
    validation accuracy improves, the weights are saved to
    ``best_{model_name}_model.pth`` in the current working directory.

    Args:
        model: Network to train; it is moved to ``device``.
        train_loader: Iterable of ``(inputs, targets)`` batches supporting ``len()``.
        val_loader: Iterable of ``(inputs, targets)`` batches supporting ``len()``.
        epochs: Maximum number of epochs to run.
        device: Device that the model and every batch are moved to.
        model_name: Name used in the checkpoint file name.
        patience: Number of consecutive epochs without a validation-loss
            improvement of more than 1e-4 after which training stops.
        learning_rate: Adam learning rate.  ``None`` (the default) falls back
            to the notebook-level ``LEARNING_RATE``.

    Returns:
        ``(train_losses, val_losses, val_accuracies, best_val_acc)``.  The
        three lists hold one entry per completed epoch (losses are means over
        batches, accuracies are percentages); ``best_val_acc`` is the highest
        validation accuracy seen.

    Raises:
        ValueError: If either loader contains no batches.
    """
    # Fail fast: an empty loader would otherwise surface as a ZeroDivisionError
    # only after a whole training epoch has already run.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")
    if learning_rate is None:
        learning_rate = LEARNING_RATE

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)

    train_losses = []
    val_losses = []
    val_accuracies = []

    best_val_acc = 0.0
    best_val_loss = float("inf")
    epochs_no_improve = 0

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0

        for data, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            data, targets = data.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation phase
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for data, targets in val_loader:
                data, targets = data.to(device), targets.to(device)
                outputs = model(data)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

                predicted = outputs.argmax(dim=1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

        # Losses are averaged over batches, accuracy over samples.
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_acc = 100.0 * correct / total if total else 0.0

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

        scheduler.step(val_loss)

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss: {val_loss:.4f}")
        print(f"  Val Acc: {val_acc:.2f}%")

        # Save best model (selected by validation accuracy)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), f'best_{model_name}_model.pth')
            print(f"  New best model saved with accuracy: {best_val_acc:.2f}%")

        # Early stopping logic (driven by validation loss)
        if val_loss < best_val_loss - 1e-4:  # delta to avoid tiny fluctuations
            best_val_loss = val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs.")
            break

        print("-" * 50)

    return train_losses, val_losses, val_accuracies, best_val_acc


In [None]:
# Train all baseline models
# Per-model training history, keyed by model name.
results = {}

for model_name, model in models_dict.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name.upper()}")
    print(f"{'='*60}")

    # Use the configured epoch budget and the notebook-level early-stopping
    # patience rather than a hard-coded debug value.
    train_losses, val_losses, val_accuracies, best_acc = train_model(
        model, train_loader, val_loader, EPOCHS, DEVICE, model_name, patience=patience
    )

    results[model_name] = {
        'train_losses': train_losses,
        'val_losses': val_losses,
        'val_accuracies': val_accuracies,
        'best_accuracy': best_acc
    }

    print(f"\nCompleted training {model_name}")
    print(f"Best validation accuracy: {best_acc:.2f}%")

In [None]:
# Plot training results
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# The three curve panels share one layout: one line per model, labelled axes,
# a legend and a grid. Each entry is (axis, history key, title, y label).
curve_panels = (
    (axes[0, 0], 'train_losses', 'Training Loss', 'Loss'),
    (axes[0, 1], 'val_losses', 'Validation Loss', 'Loss'),
    (axes[1, 0], 'val_accuracies', 'Validation Accuracy', 'Accuracy (%)'),
)
for ax, history_key, title, y_label in curve_panels:
    for model_name, result in results.items():
        ax.plot(result[history_key], label=model_name)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)

# Best accuracies comparison (bottom-right panel)
bar_ax = axes[1, 1]
model_names = list(results)
best_accs = [results[name]['best_accuracy'] for name in model_names]
bar_ax.bar(model_names, best_accs)
bar_ax.set_title('Best Validation Accuracy Comparison')
bar_ax.set_ylabel('Accuracy (%)')
bar_ax.tick_params(axis='x', rotation=45)

# Write each bar's value just above it
for position, accuracy in enumerate(best_accs):
    bar_ax.text(position, accuracy + 0.5, f'{accuracy:.2f}%', ha='center', va='bottom')

plt.tight_layout()
plt.savefig('baseline_training_results.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Summary of results
print("\n" + "="*60, "BASELINE MODELS TRAINING SUMMARY", "="*60, sep="\n")

# One short section per model: best accuracy plus the last recorded losses.
for model_name, result in results.items():
    print(
        f"\n{model_name.upper()}:",
        f"  Best Validation Accuracy: {result['best_accuracy']:.2f}%",
        f"  Final Training Loss: {result['train_losses'][-1]:.4f}",
        f"  Final Validation Loss: {result['val_losses'][-1]:.4f}",
        sep="\n",
    )

# Find best performing model (highest best validation accuracy)
best_model = max(results, key=lambda name: results[name]['best_accuracy'])
print(f"\nBEST PERFORMING MODEL: {best_model.upper()}")
print(f"Accuracy: {results[best_model]['best_accuracy']:.2f}%")

# Baseline Models Evaluation

This section evaluates the trained baseline models on the test set:
- ResNet-50
- DenseNet-121
- Swin Transformer

Comprehensive evaluation including:
- Test accuracy
- Confusion matrix
- Classification report
- ROC curves
- Model comparison

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, classification_report, 
    roc_curve, auc, accuracy_score,
    precision_recall_fscore_support
)
from sklearn.preprocessing import label_binarize
import pandas as pd
import os
import sys
from tqdm import tqdm

# Project-local modules (no path change is made here; the notebook's first cell already extends sys.path)
from dataloader import KidneyDataset, create_datasets, create_dataloaders, analyze_dataset
from utils import calculate_metrics

# Set style
# Use matplotlib's 'default' style sheet and seaborn's "husl" colour palette
# for the figures drawn in the evaluation cells below.
plt.style.use('default')
sns.set_palette("husl")

In [None]:
# Configuration
BATCH_SIZE = 32
NUM_CLASSES = 4  # Adjust based on your dataset
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Class names (adjust based on your dataset).
# The CT kidney dataset loaded in this notebook has the classes Cyst, Normal,
# Stone and Tumor. The order follows the CLASS_NAMES assignment that appears
# later in this notebook.
# NOTE(review): presumably this is the label-index order produced by the
# dataloader — confirm.
CLASS_NAMES = ['Cyst', 'Normal', 'Stone', 'Tumor']
assert len(CLASS_NAMES) == NUM_CLASSES, "CLASS_NAMES must list exactly NUM_CLASSES names"

print(f"Using device: {DEVICE}")
print(f"Number of classes: {NUM_CLASSES}")
print(f"Class names: {CLASS_NAMES}")

In [None]:
# Create test data loader
# Evaluation keeps the dataset order fixed (no shuffling).
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)
print(f"Test batches: {len(test_loader)}")

In [None]:
# Define model architectures (same as training)
def get_resnet50(num_classes):
    """Build an uninitialised ResNet-50 with a ``num_classes``-way head.

    Created with ``pretrained=False``: the trained weights are loaded from a
    checkpoint afterwards, so only the architecture has to match.
    """
    backbone = models.resnet50(pretrained=False)
    in_features = backbone.fc.in_features
    backbone.fc = nn.Linear(in_features, num_classes)
    return backbone

def get_densenet121(num_classes):
    """Build an uninitialised DenseNet-121 with a ``num_classes``-way classifier.

    Created with ``pretrained=False``: the trained weights are loaded from a
    checkpoint afterwards, so only the architecture has to match.
    """
    backbone = models.densenet121(pretrained=False)
    in_features = backbone.classifier.in_features
    backbone.classifier = nn.Linear(in_features, num_classes)
    return backbone

def get_swin_transformer(num_classes):
    """Build an uninitialised Swin Transformer (tiny) with a ``num_classes``-way head.

    No pretrained weights are requested; the trained weights are loaded from a
    checkpoint afterwards.  If this torchvision build does not provide Swin, a
    message is printed and ``get_resnet50(num_classes)`` is returned instead.
    """
    try:
        from torchvision.models import swin_t
    except ImportError:
        print("Swin Transformer not available, using ResNet-50")
        return get_resnet50(num_classes)

    # `weights=None` is the non-deprecated spelling of `pretrained=False`.
    # torchvision introduced swin_t in the same release as the `weights=`
    # argument, so it is available whenever the import above succeeds.
    model = swin_t(weights=None)
    model.head = nn.Linear(model.head.in_features, num_classes)
    return model

# Load trained models
# Fresh, untrained architectures; their weights come from the checkpoints below.
models_dict = dict(
    resnet50=get_resnet50(NUM_CLASSES),
    densenet121=get_densenet121(NUM_CLASSES),
    swin_transformer=get_swin_transformer(NUM_CLASSES),
)

# Load model weights
# Only models whose checkpoint file exists end up in `loaded_models`; a missing
# file is reported and skipped.
# Note: torch.load unpickles the file, so only load checkpoints you produced yourself.
loaded_models = {}
for model_name, model in models_dict.items():
    try:
        model.load_state_dict(torch.load(f'best_{model_name}_model.pth', map_location=DEVICE))
    except FileNotFoundError:
        print(f"Warning: {model_name} model weights not found")
    else:
        model.to(DEVICE)
        model.eval()
        loaded_models[model_name] = model
        print(f"Loaded {model_name} model successfully")

print(f"\nModels available for evaluation: {list(loaded_models.keys())}")

In [None]:
# Class names used by the reports and plot labels in the cells below.
# This assignment replaces the CLASS_NAMES value set in the evaluation
# configuration cell above.
# NOTE(review): presumably listed in the label-index order produced by the
# dataloader — confirm.
CLASS_NAMES = ['Cyst', 'Normal', 'Stone', 'Tumor']

In [None]:
# Evaluation function
def evaluate_model(model, test_loader, device, model_name):
    """Run ``model`` over every batch of ``test_loader`` and collect its outputs.

    The model is put in eval mode and gradients are disabled.  ``model_name``
    is only used in the progress message.

    Returns:
        Three NumPy arrays, in loader order: the predicted class indices, the
        target class indices, and the softmax probabilities for each sample.
    """
    model.eval()
    print(f"Evaluating {model_name}...")

    predicted_labels, true_labels, class_probabilities = [], [], []
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="Evaluating"):
            inputs = inputs.to(device)
            labels = labels.to(device)

            logits = model(inputs)

            # Index of the largest logit is the predicted class.
            predicted_labels.extend(torch.max(logits, 1)[1].cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            class_probabilities.extend(F.softmax(logits, dim=1).cpu().numpy())

    return (
        np.array(predicted_labels),
        np.array(true_labels),
        np.array(class_probabilities),
    )

# Evaluate all models
# Raw outputs and summary metrics per model, keyed by model name.
evaluation_results = {}

for model_name, model in loaded_models.items():
    predictions, targets, probabilities = evaluate_model(model, test_loader, DEVICE, model_name)

    # Summary metrics come from the imported project helper
    metrics = calculate_metrics(targets, predictions, probabilities, CLASS_NAMES)

    evaluation_results[model_name] = dict(
        predictions=predictions,
        targets=targets,
        probabilities=probabilities,
        metrics=metrics,
    )

    print(f"{model_name} - Test Accuracy: {metrics['accuracy']:.4f}")

In [None]:
# Generate classification reports
print("\n" + "="*80)
print("CLASSIFICATION REPORTS")
print("="*80)

# Pin the label set to one index per entry in CLASS_NAMES. Without it,
# classification_report raises when a class is absent from the data, because
# the inferred labels no longer line up with `target_names`.
report_labels = list(range(len(CLASS_NAMES)))

# `eval_result` (not `results`) so the training-history dict named `results`
# from the training section is not overwritten.
for model_name, eval_result in evaluation_results.items():
    print(f"\n{model_name.upper()}:")
    print("-" * 50)
    print(classification_report(
        eval_result['targets'],
        eval_result['predictions'],
        labels=report_labels,
        target_names=CLASS_NAMES,
        digits=4
    ))

In [None]:
# Plot confusion matrices (row-normalised: each row shows how the samples of
# one actual class were distributed over the predicted classes)
if evaluation_results:
    # Fix the label set so every matrix has one row/column per CLASS_NAMES
    # entry, even when a class is missing from the targets or predictions.
    cm_labels = list(range(len(CLASS_NAMES)))

    # squeeze=False always returns a 2-D array of axes, so a single model needs
    # no special case.
    fig, axes = plt.subplots(
        1, len(evaluation_results),
        figsize=(5*len(evaluation_results), 4),
        squeeze=False,
    )
    axes = axes[0]

    # `eval_result` (not `results`) so the training-history dict named
    # `results` from the training section is not overwritten.
    for ax, (model_name, eval_result) in zip(axes, evaluation_results.items()):
        cm = confusion_matrix(eval_result['targets'], eval_result['predictions'], labels=cm_labels)

        # Normalize confusion matrix; rows with no samples stay at 0 instead of
        # producing 0/0 = NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_normalized = np.divide(
            cm, row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

        sns.heatmap(cm_normalized,
                    annot=True,
                    fmt='.3f',
                    cmap='Blues',
                    xticklabels=CLASS_NAMES,
                    yticklabels=CLASS_NAMES,
                    ax=ax)

        ax.set_title(f'{model_name.upper()}\nAccuracy: {eval_result["metrics"]["accuracy"]:.3f}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')

    plt.tight_layout()
    plt.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("No models available for evaluation. Please check model loading.")

In [None]:
# ROC Curves (one-vs-rest; one micro-averaged curve is drawn per model)
if evaluation_results:
    plt.figure(figsize=(12, 8))

    # `eval_result` (not `results`) so the training-history dict named
    # `results` from the training section is not overwritten.
    for model_name, eval_result in evaluation_results.items():
        # One-hot encode the targets: one column per class. Indexing an
        # identity matrix always yields NUM_CLASSES columns, including the
        # two-class case where label_binarize returns a single column.
        target_indices = np.asarray(eval_result['targets'], dtype=int)
        targets_bin = np.eye(NUM_CLASSES, dtype=int)[target_indices]
        probabilities = eval_result['probabilities']

        # Compute ROC curve and AUC for each class (not plotted; the values
        # stay available in the notebook namespace for the last model processed)
        fpr = {}
        tpr = {}
        roc_auc = {}

        for i in range(NUM_CLASSES):
            fpr[i], tpr[i], _ = roc_curve(targets_bin[:, i], probabilities[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and AUC: every (sample, class) pair
        # is treated as one binary decision
        fpr["micro"], tpr["micro"], _ = roc_curve(targets_bin.ravel(), probabilities.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        # Plot micro-average ROC curve
        plt.plot(fpr["micro"], tpr["micro"],
                 label=f'{model_name} (AUC = {roc_auc["micro"]:.3f})',
                 linewidth=2)

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves - Baseline Models Comparison')
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.savefig('roc_curves_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("No models available for ROC curve analysis.")

In [None]:
# Per-class performance analysis
if evaluation_results:
    # Fix the label set so the metric arrays always have one entry per
    # CLASS_NAMES item, even when a class is absent from the data.
    class_labels = list(range(len(CLASS_NAMES)))
    performance_data = []

    # `eval_result` (not `results`) so the training-history dict named
    # `results` from the training section is not overwritten.
    for model_name, eval_result in evaluation_results.items():
        precision, recall, f1, support = precision_recall_fscore_support(
            eval_result['targets'], eval_result['predictions'],
            labels=class_labels, average=None
        )

        # One record per (model, class) pair
        for class_name, class_precision, class_recall, class_f1, class_support in zip(
            CLASS_NAMES, precision, recall, f1, support
        ):
            performance_data.append({
                'Model': model_name,
                'Class': class_name,
                'Precision': class_precision,
                'Recall': class_recall,
                'F1-Score': class_f1,
                'Support': class_support
            })

    performance_df = pd.DataFrame(performance_data)

    # Plot per-class performance: one panel per metric, bars grouped by class
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    metric_names = ['Precision', 'Recall', 'F1-Score']

    for ax, metric in zip(axes, metric_names):
        pivot_df = performance_df.pivot(index='Class', columns='Model', values=metric)
        pivot_df.plot(kind='bar', ax=ax, width=0.8)
        ax.set_title(f'{metric} by Class')
        ax.set_xlabel('Class')
        ax.set_ylabel(metric)
        ax.legend(title='Model')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('per_class_performance.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Display performance table
    print("\nPER-CLASS PERFORMANCE TABLE:")
    print(performance_df.round(4))
else:
    print("No models available for per-class analysis.")

In [None]:
# Overall model comparison
if evaluation_results:
    # One row per model, built from the metrics stored during evaluation.
    # Loop names are `eval_result` / `model_metrics` so the training-history
    # dict named `results` and the `metrics` value from earlier cells are not
    # overwritten.
    comparison_data = []

    for model_name, eval_result in evaluation_results.items():
        model_metrics = eval_result['metrics']

        comparison_data.append({
            'Model': model_name,
            'Accuracy': model_metrics['accuracy'],
            'Precision (Macro)': model_metrics['macro_precision'],
            'Recall (Macro)': model_metrics['macro_recall'],
            'F1-Score (Macro)': model_metrics['macro_f1'],
            'Precision (Weighted)': model_metrics['weighted_precision'],
            'Recall (Weighted)': model_metrics['weighted_recall'],
            'F1-Score (Weighted)': model_metrics['weighted_f1']
        })

    comparison_df = pd.DataFrame(comparison_data).round(4)

    print("\n" + "="*100)
    print("OVERALL MODEL COMPARISON")
    print("="*100)
    print(comparison_df.to_string(index=False))

    # Save results. The per-class table only exists if the per-class analysis
    # cell has been run, so check once and reuse the answer.
    has_per_class_table = 'performance_df' in locals()

    comparison_df.to_csv('baseline_models_comparison.csv', index=False)
    if has_per_class_table:
        performance_df.to_csv('baseline_models_per_class_performance.csv', index=False)

    print("\nResults saved to CSV files:")
    print("- baseline_models_comparison.csv")
    if has_per_class_table:
        print("- baseline_models_per_class_performance.csv")
else:
    print("No models available for comparison.")

In [None]:
# Model ranking and best model identification
if evaluation_results:
    print("\n" + "="*80)
    print("MODEL RANKING")
    print("="*80)

    # Rank by different metrics
    ranking_metrics = ['Accuracy', 'F1-Score (Macro)', 'F1-Score (Weighted)']

    for metric in ranking_metrics:
        if metric in comparison_df.columns:
            ranked = comparison_df.sort_values(metric, ascending=False)
            print(f"\nRanking by {metric}:")
            for position, row in enumerate(ranked.to_dict('records'), 1):
                print(f"  {position}. {row['Model'].upper()}: {row[metric]:.4f}")

    # Overall best model: highest mean of the three ranking scores.
    # Work on a copy (`assign`) so the comparison table that was already
    # printed and saved is not modified, and use a dedicated name so the
    # training section's `best_model` is not overwritten.
    ranking_df = comparison_df.assign(
        Average_Score=comparison_df[ranking_metrics].mean(axis=1)
    )
    best_row = ranking_df.loc[ranking_df['Average_Score'].idxmax()]
    best_eval_model = best_row['Model']

    print(f"\n OVERALL BEST PERFORMING MODEL: {best_eval_model.upper()}")
    print(f"Average Score: {best_row['Average_Score']:.4f}")

    # Performance insights
    print("\n" + "="*80)
    print("PERFORMANCE INSIGHTS")
    print("="*80)

    best_acc_model = comparison_df.loc[comparison_df['Accuracy'].idxmax(), 'Model']
    best_f1_model = comparison_df.loc[comparison_df['F1-Score (Macro)'].idxmax(), 'Model']

    print(f" Highest Accuracy: {best_acc_model.upper()} ({comparison_df['Accuracy'].max():.4f})")
    print(f" Best F1-Score (Macro): {best_f1_model.upper()} ({comparison_df['F1-Score (Macro)'].max():.4f})")

    # Performance differences (spread between the best and the worst model)
    acc_diff = comparison_df['Accuracy'].max() - comparison_df['Accuracy'].min()
    f1_diff = comparison_df['F1-Score (Macro)'].max() - comparison_df['F1-Score (Macro)'].min()

    print(f"\n Performance Range:")
    print(f"   Accuracy difference: {acc_diff:.4f}")
    print(f"   F1-Score difference: {f1_diff:.4f}")

    # An accuracy spread below 0.05 is reported as "similar"
    if acc_diff < 0.05:
        print("   → Models show similar performance levels")
    else:
        print("   → Significant performance differences observed")
else:
    print("No models available for ranking analysis.")

In [None]:
# Archive the whole current working directory, recursively, into results.zip.
!zip -r results ./