# Data Exploration for Skin Cancer Detection

**Team**: Dr. Homi Jehangir Bhabha  
**Problem Statement**: PS 18

This notebook explores the skin lesion dataset used for training the classification model.

In [None]:
# Import necessary libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# Add src to path
sys.path.insert(0, '../src')

from config import *

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'.
# Fall back to the pre-3.6 name so the notebook also runs on older installs.
_style_name = 'seaborn-v0_8-darkgrid'
if _style_name not in plt.style.available:
    _style_name = 'seaborn-darkgrid'
plt.style.use(_style_name)
sns.set_palette('husl')

print("Libraries imported successfully!")

## 1. Dataset Overview

In [None]:
# Count images in each category
def count_images(directory):
    """Count image files in each class subdirectory of ``directory``.

    Args:
        directory: Path whose immediate subdirectories are treated as
            class folders.

    Returns:
        A dict mapping each subdirectory name to the number of entries in it
        whose name ends in .jpg, .jpeg or .png (any letter case). Keys are
        inserted in sorted order. The dict is empty when ``directory`` is not
        an existing directory.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    counts = {}
    if not os.path.isdir(directory):
        return counts

    # os.listdir returns entries in arbitrary order; sort for a stable result.
    for class_name in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_name)
        if os.path.isdir(class_path):
            # Lower-case the name so files such as 'IMG_01.JPG' are counted.
            counts[class_name] = sum(
                1 for f in os.listdir(class_path)
                if f.lower().endswith(image_extensions)
            )
    return counts

# Tally images per class for the train, val, and test sets (in that order)
train_counts, val_counts, test_counts = (
    count_images(split_dir) for split_dir in (TRAIN_DIR, VAL_DIR, TEST_DIR)
)

# Report each split's per-class counts
for split_label, split_counts in (
    ("Training set:", train_counts),
    ("Validation set:", val_counts),
    ("Test set:", test_counts),
):
    print(split_label, split_counts)

## 2. Class Distribution

In [None]:
# Plot one bar chart per split showing how many images each class has
if not train_counts:
    print("No training data found. Please add images to data/raw/train/")
else:
    # (panel title, per-class counts) for each split, left to right
    split_panels = (
        ('Training Set Distribution', train_counts),
        ('Validation Set Distribution', val_counts),
        ('Test Set Distribution', test_counts),
    )
    fig, axes = plt.subplots(1, len(split_panels), figsize=(15, 5))

    for ax, (panel_title, counts) in zip(axes, split_panels):
        # A split with no counts keeps its empty, untitled axes
        if not counts:
            continue
        ax.bar(counts.keys(), counts.values())
        ax.set_title(panel_title)
        ax.set_ylabel('Number of Images')
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## 3. Sample Images

In [None]:
# Display sample images from each class
def display_samples(directory, n_samples=5):
    """Show a grid of sample images with one row per class.

    Each immediate subdirectory of ``directory`` is treated as a class. The
    first ``n_samples`` image files (sorted by name; .jpg/.jpeg/.png in any
    letter case) of each class are drawn left to right, and the first image
    of each row is titled with the class name.

    Args:
        directory: Path containing one subdirectory per class.
        n_samples: Number of columns, i.e. the maximum images shown per class.

    Returns:
        None. Prints a message and returns early if ``directory`` does not
        exist or has no subdirectories; otherwise displays the figure.
    """
    if not os.path.exists(directory):
        print(f"Directory not found: {directory}")
        return

    # Sort so rows appear in a stable order (os.listdir order is arbitrary).
    classes = sorted(
        d for d in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, d))
    )

    if not classes:
        print("No class directories found")
        return

    image_extensions = ('.jpg', '.jpeg', '.png')

    # squeeze=False always returns a 2-D array of Axes, so axes[i, j] is valid
    # even with a single class or n_samples=1.
    fig, axes = plt.subplots(len(classes), n_samples,
                             figsize=(15, 3 * len(classes)), squeeze=False)

    # Hide every axis up front so cells without an image stay blank instead
    # of showing empty ticks and frames.
    for ax in axes.flat:
        ax.axis('off')

    for i, class_name in enumerate(classes):
        class_path = os.path.join(directory, class_name)
        images = sorted(
            f for f in os.listdir(class_path)
            if f.lower().endswith(image_extensions)
        )

        for j, image_name in enumerate(images[:n_samples]):
            # Context manager closes the file handle once the image is drawn.
            with Image.open(os.path.join(class_path, image_name)) as img:
                axes[i, j].imshow(img)
            if j == 0:
                axes[i, j].set_title(f'{class_name}', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

# Preview up to five images per class from TRAIN_DIR
display_samples(TRAIN_DIR, n_samples=5)

## 4. Image Properties Analysis

In [None]:
# Analyze image dimensions
def _iter_image_paths(directory):
    """Yield paths of .jpg/.jpeg/.png files under ``directory`` in sorted order."""
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend into subdirectories in a
        # stable order.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                yield os.path.join(root, file)


def analyze_image_properties(directory, max_samples=100):
    """Print and plot width, height and aspect-ratio statistics of images.

    Walks ``directory`` recursively and reads the size of image files until
    ``max_samples`` images have been read successfully. Files that cannot be
    processed are reported and skipped.

    Args:
        directory: Root folder to search for .jpg/.jpeg/.png files (any
            letter case).
        max_samples: Stop after this many images have been measured.

    Returns:
        None. Prints min/max/mean statistics and shows three histograms, or
        prints a message if the directory is missing or no image was read.
    """
    widths = []
    heights = []
    aspects = []

    if not os.path.exists(directory):
        print(f"Directory not found: {directory}")
        return

    # NOTE: images are taken in sorted traversal order, so a small
    # max_samples may only cover the first folders visited.
    for img_path in _iter_image_paths(directory):
        try:
            # Context manager releases the file handle; only the header is
            # needed to read the size.
            with Image.open(img_path) as img:
                width, height = img.size
            aspect = width / height
        except Exception as e:
            # Best-effort scan: report the unreadable file and keep going.
            print(f"Error processing {os.path.basename(img_path)}: {e}")
            continue

        # Append only after every value is known so the lists stay aligned.
        widths.append(width)
        heights.append(height)
        aspects.append(aspect)
        if len(widths) >= max_samples:
            break

    if widths:
        print(f"Analyzed {len(widths)} images")
        print(f"Width - Min: {min(widths)}, Max: {max(widths)}, Mean: {np.mean(widths):.2f}")
        print(f"Height - Min: {min(heights)}, Max: {max(heights)}, Mean: {np.mean(heights):.2f}")
        print(f"Aspect Ratio - Min: {min(aspects):.2f}, Max: {max(aspects):.2f}, Mean: {np.mean(aspects):.2f}")

        # Visualize distributions
        fig, axes = plt.subplots(1, 3, figsize=(15, 4))

        axes[0].hist(widths, bins=30, edgecolor='black')
        axes[0].set_title('Width Distribution')
        axes[0].set_xlabel('Width (pixels)')
        axes[0].set_ylabel('Frequency')

        axes[1].hist(heights, bins=30, edgecolor='black')
        axes[1].set_title('Height Distribution')
        axes[1].set_xlabel('Height (pixels)')
        axes[1].set_ylabel('Frequency')

        axes[2].hist(aspects, bins=30, edgecolor='black')
        axes[2].set_title('Aspect Ratio Distribution')
        axes[2].set_xlabel('Aspect Ratio')
        axes[2].set_ylabel('Frequency')

        plt.tight_layout()
        plt.show()
    else:
        print("No images found to analyze")

# Analyze images under TRAIN_DIR using the default cap of 100 images
analyze_image_properties(TRAIN_DIR)

## 5. Summary

This notebook provides an overview of the dataset structure and properties. It covers:

- Dataset structure with training, validation, and test splits
- Class distribution and balance
- Sample visualizations
- Image property analysis

Next steps:
- Proceed to model training (see `train.py`)
- Experiment with different architectures (see `model_experiments.ipynb`)