# AIPI 590 - XAI | Assignment #05
### GradCAM & Computer Vision
### Peter Banyas

#### Include the button below. Change the link to the location in your GitHub repository:
#### Example: https://colab.research.google.com/github/banyasp/aipi590_hw5_gradCAM/blob/main/report.ipynb


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/banyasp/aipi590_hw5_gradCAM/blob/main/report.ipynb)

## Setup and Installation

This notebook demonstrates various Class Activation Mapping (CAM) techniques on facial emotion images using a pretrained ResNet50 model.


In [None]:
# Install required packages (quiet mode) into the notebook kernel's environment.
# NOTE(review): the import cell uses the `pytorch_grad_cam` module; presumably only one of
# `grad-cam` / `pytorch-grad-cam` is needed to provide it — confirm, since two distributions
# shipping the same module name could overwrite each other.
%pip install -q grad-cam pytorch-grad-cam torch torchvision


In [None]:
# Clone the repository to get the data folder
import os
import subprocess

repo_url = 'https://github.com/banyasp/aipi590_hw5_gradCAM.git'
repo_dir = 'aipi590_hw5_gradCAM'

if not os.path.exists(repo_dir):
    # Run git without a shell and inspect its exit status, so that a failed
    # clone (no network, git missing, bad URL) is reported instead of being
    # followed by a misleading success message.
    clone = subprocess.run(['git', 'clone', repo_url, repo_dir], check=False)
    if clone.returncode != 0:
        raise RuntimeError(
            f"git clone of {repo_url} failed with exit code {clone.returncode}"
        )
    print("Repository cloned successfully!")
else:
    print("Repository already exists!")


In [None]:
# Import required libraries
from pytorch_grad_cam import GradCAM, GradCAMPlusPlus, ScoreCAM, EigenCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
from torchvision.models import resnet50, ResNet50_Weights
from PIL import Image
import torch
import numpy as np
from torchvision import transforms
import matplotlib.pyplot as plt
import ssl
import time

# Swap the default HTTPS context factory for the unverified one so that
# certificate errors do not abort HTTPS requests made later in the notebook.
# NOTE(review): this disables TLS certificate verification for the whole
# process, not just one download — acceptable for a throwaway notebook,
# but confirm it is still needed before reusing this code elsewhere.
ssl._create_default_https_context = ssl._create_unverified_context

# Folder that receives every saved visualization
os.makedirs('output', exist_ok=True)

device_label = 'GPU' if torch.cuda.is_available() else 'CPU'
print("✓ All libraries imported successfully!")
print(f"✓ Using device: {device_label}")


## Load Model and Define CAM Processing Function


In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ResNet50 with the IMAGENET1K_V2 weights, moved to the device and switched to eval mode
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2).to(device).eval()

# CAMs are computed on the last block of layer4
target_layers = [model.layer4[-1]]

# One instance per CAM variant, all sharing the same model and target layer.
# The display label doubles as the subplot title and the output-file prefix.
cam_classes = (
    ('GradCAM', GradCAM),
    ('GradCAM++', GradCAMPlusPlus),
    ('EigenCAM', EigenCAM),
    ('Score-CAM', ScoreCAM),
)
cam_algorithms = {
    label: cam_cls(model=model, target_layers=target_layers)
    for label, cam_cls in cam_classes
}

print("✓ Model loaded successfully!")
print(f"✓ Initialized {len(cam_algorithms)} CAM algorithms")


In [None]:
# Per-channel statistics used to normalise inputs for torchvision's
# ImageNet-pretrained classifiers.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]


def process_single_image(img_name: str, display: bool = True) -> dict:
    """
    Run every CAM method on one image, save the visualizations, and report
    the model's top prediction.

    The image is read from ``aipi590_hw5_gradCAM/data/<img_name>.jpg``,
    resized to 224x224 and normalised with the ImageNet mean/std before it
    is passed to the model. Each CAM overlay is written to ``output/`` as an
    individual JPEG, and a side-by-side comparison figure is written as PNG.

    Parameters:
    -----------
    img_name : str
        Name of the image file (without .jpg extension)
    display : bool
        Whether to display the comparison figure inline (default: True)

    Returns:
    --------
    dict : ``image_name`` (str), ``predicted_class`` (int class index) and
        ``confidence`` (float softmax probability of that class)

    Raises:
    -------
    FileNotFoundError
        If the image file does not exist in the data folder.
    """
    print(f"\n{'='*60}")
    print(f"Processing image: {img_name}")
    print(f"{'='*60}")

    # Load the image; the context manager releases the file handle as soon
    # as the pixels have been converted and resized.
    img_path = f'aipi590_hw5_gradCAM/data/{img_name}.jpg'
    with Image.open(img_path) as raw_img:
        img_resized = raw_img.convert('RGB').resize((224, 224))

    # Un-normalised float32 copy in [0, 1]: the base image for the heatmap overlay.
    img_array = np.asarray(img_resized, dtype=np.float32) / 255.0

    # The pretrained weights expect ImageNet-normalised input; feeding raw
    # [0, 1] pixels would skew both the prediction and the CAMs.
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    img_tensor = preprocess(img_resized).unsqueeze(0).to(device)

    # One panel for the original image plus one per CAM method.
    n_panels = len(cam_algorithms) + 1
    fig, axes = plt.subplots(1, n_panels, figsize=(4 * n_panels, 4), squeeze=False)
    axes = axes[0]

    # Original image
    axes[0].imshow(img_resized)
    axes[0].set_title('Original Image', fontsize=14, fontweight='bold')
    axes[0].axis('off')

    # Generate CAM for each method
    for idx, (name, cam) in enumerate(cam_algorithms.items(), 1):
        print(f"  Generating {name}...")
        # targets=None lets the CAM library choose the target class itself.
        grayscale_cam = cam(input_tensor=img_tensor, targets=None)
        visualization = show_cam_on_image(img_array, grayscale_cam[0], use_rgb=True)

        axes[idx].imshow(visualization)
        axes[idx].set_title(name, fontsize=14, fontweight='bold')
        axes[idx].axis('off')

        # Save individual visualizations
        output_path = f'output/{name.lower().replace("-", "")}_output_{img_name}.jpg'
        Image.fromarray(visualization).save(output_path)
        print(f"    Saved {output_path}")

    # Save comparison
    fig.tight_layout()
    comparison_path = f'output/cam_comparison_{img_name}.png'
    fig.savefig(comparison_path, dpi=150, bbox_inches='tight')
    print(f"  Comparison saved as '{comparison_path}'")

    if display:
        plt.show()
    # Always release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    # Get model prediction
    with torch.no_grad():
        output = model(img_tensor)
        pred_class = output.argmax(dim=1).item()
        confidence = torch.softmax(output, dim=1)[0][pred_class].item()
        print(f"  Predicted class index: {pred_class}")
        print(f"  Confidence: {confidence:.2%}")

    return {
        'image_name': img_name,
        'predicted_class': pred_class,
        'confidence': confidence
    }

print("✓ Processing function defined!")


## Process All Images

Now we'll process all 12 images from the dataset using all four CAM methods.


In [None]:
# Base names (no extension) of the images to run through every CAM method,
# two per emotion category
img_list = [
    'happy_01932', 'happy_01419',
    'disgust_00024', 'disgust_00045',
    'angry_00009', 'angry_00021',
    'surprise_00007', 'surprise_00019',
    'sad_00007', 'sad_00018',
    'neutral_00017', 'neutral_00025',
]

banner = '=' * 60

# Announce the run
print("\n" + banner)
print("Starting CAM processing")
print(f"Total images to process: {len(img_list)}")
print(banner + "\n")

# Process the images one after another, timing the whole batch
start_time = time.time()
results = []
for img_name in img_list:
    results.append(process_single_image(img_name, display=True))
elapsed_time = time.time() - start_time

# Report totals
average_time = elapsed_time / len(img_list)
print("\n" + banner)
print(f"✓ All {len(img_list)} images processed successfully!")
print(f"Total time: {elapsed_time:.2f} seconds")
print(f"Average time per image: {average_time:.2f} seconds")
print(banner)


## Results Summary

Let's view a summary of all processed images and their predictions.


In [None]:
# Display results summary
import pandas as pd

# Result-dict key -> display heading. Columns are selected and renamed by
# key rather than by position, so a change in the key order of the result
# dicts cannot silently attach a heading to the wrong column, and an empty
# result list still yields a well-formed (empty) table.
column_labels = {
    'image_name': 'Image Name',
    'predicted_class': 'Predicted Class',
    'confidence': 'Confidence',
}

df = pd.DataFrame(results, columns=list(column_labels)).rename(columns=column_labels)
df.index = df.index + 1  # number the rows from 1 instead of 0
df['Confidence'] = df['Confidence'].map("{:.2%}".format)

print("\nSummary of All Predictions:")
print("="*60)
print(df.to_string())
print("="*60)


## Analysis and Conclusions

### CAM Methods Compared:

1. **GradCAM**: Uses gradients to weight the importance of feature maps
2. **GradCAM++**: Improved version with better localization for multiple instances
3. **EigenCAM**: Uses principal components of activations
4. **Score-CAM**: Gradient-free method that weights each activation map by the model's score on the input masked with that map

### Key Observations:

- All methods successfully highlight the facial regions in the emotion images
- Different CAM methods may emphasize different facial features
- The ResNet50 model (trained on ImageNet) provides reasonable feature extraction even for facial emotion images
- Score-CAM tends to be slower but provides gradient-free explanations

### Output Files:

All visualizations are saved in the `output/` directory:
- Individual CAM outputs for each method and image
- Comparison images showing all 4 methods side-by-side

### Notes:

- The model was pretrained on ImageNet (1000 classes), so predicted class indices refer to ImageNet classes
- For emotion-specific classification, fine-tuning on emotion datasets would be recommended
- CAM visualizations help understand which parts of the image the model focuses on
