<a href="https://colab.research.google.com/github/atharv-arya/3D-Brain-Tumor-Segmentation/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the files under MyDrive are reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/

/content/drive


In [None]:
# List the dataset directory on the mounted Drive to see which folders are present.
!ls /content/drive/MyDrive/CSCI566/data/data/


ASNR-MICCAI-BraTS2023-GLI-Challenge-TrainingData
ASNR-MICCAI-BraTS2023-GLI-Challenge-ValidationData


In [None]:
import os
import numpy as np
import nibabel as nib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from glob import glob

In [None]:
# Root directory of the training data on the mounted Drive
root_dir = "/content/drive/MyDrive/CSCI566/data/data/ASNR-MICCAI-BraTS2023-GLI-Challenge-TrainingData/ASNR-MICCAI-BraTS2023-GLI-Challenge-TrainingData/"

# Collect all subject folders. glob() returns matches in arbitrary order, so the
# list is sorted to make slices such as subjects[:10] select the same subjects
# on every run.
subjects = sorted(glob(os.path.join(root_dir, "BraTS-GLI-*")))

print(f"Total number of subjects: {len(subjects)}")

# Filename suffixes of the modalities; scan files are located as "*<suffix>.nii"
modalities = ["t1", "t1ce", "t2", "flair"]

Total number of subjects: 262


## Image Properties Analysis

In [None]:
from glob import glob
import os
import numpy as np
import nibabel as nib
from tqdm import tqdm

# Filename suffixes of the modalities; each scan is located by globbing "*<suffix>.nii"
modalities = ["t1", "t1ce", "t2", "flair"]

def analyze_scan_properties(subject_paths):
    """Collect image geometry and intensity statistics for a list of subjects.

    For every subject folder, one ``*<mod>.nii`` file is looked up per entry of
    the module-level ``modalities`` list and loaded with nibabel.

    Parameters
    ----------
    subject_paths : list of str
        Paths of the subject folders to analyse.

    Returns
    -------
    dict
        ``'subject_id'``: folder basename of every subject visited.
        ``'shape'`` / ``'spacing'``: array shape and ``header.get_zooms()`` of
        the first modality that could be loaded for a subject. A subject with
        no modality file at all contributes no entry to these two lists.
        ``'max_intensity'``, ``'min_intensity'``, ``'mean_intensity'``,
        ``'std_intensity'``: dicts keyed by modality, each holding one value
        per loaded scan. Missing scans are skipped, not padded.
    """
    properties = {
        'shape': [],
        'max_intensity': {mod: [] for mod in modalities},
        'min_intensity': {mod: [] for mod in modalities},
        'mean_intensity': {mod: [] for mod in modalities},
        'std_intensity': {mod: [] for mod in modalities},
        'spacing': [],
        'subject_id': []
    }

    for subject in tqdm(subject_paths):
        subj_id = os.path.basename(subject)
        properties['subject_id'].append(subj_id)

        # Geometry is stored once per subject, taken from the first modality
        # that is actually present. Tying it to modalities[0] would drop the
        # subject's shape/spacing whenever only that one file is missing.
        geometry_recorded = False

        for mod in modalities:
            nii_files = glob(os.path.join(subject, f"*{mod}.nii"))  # Look directly inside subject folder
            if not nii_files:
                print(f"Warning: No NIfTI file found for modality '{mod}' in {subject}")
                continue  # Skip to next modality

            img = nib.load(nii_files[0])  # First file found
            data = img.get_fdata()

            if not geometry_recorded:
                properties['shape'].append(data.shape)
                properties['spacing'].append(img.header.get_zooms())
                geometry_recorded = True

            # Per-scan intensity summary over the whole volume (background included)
            properties['max_intensity'][mod].append(np.max(data))
            properties['min_intensity'][mod].append(np.min(data))
            properties['mean_intensity'][mod].append(np.mean(data))
            properties['std_intensity'][mod].append(np.std(data))

    return properties

# Only the first 10 subject folders are analysed here; pass `subjects` for the full dataset.
properties = analyze_scan_properties(subjects[:10])  # Start with a subset for testing


100%|██████████| 10/10 [01:19<00:00,  7.99s/it]


## Visualizing Scan Properties

In [None]:
# Drive folder that receives the summary plots (created on demand)
distributions_folder = "/content/drive/MyDrive/CSCI566/distribution"
os.makedirs(distributions_folder, exist_ok=True)

# Figure 1: 2x2 grid, one histogram (with KDE) of per-scan mean intensity per modality
intensity_fig = plt.figure(figsize=(15, 10))
for panel, mod in enumerate(modalities, start=1):
    panel_ax = intensity_fig.add_subplot(2, 2, panel)
    sns.histplot(properties['mean_intensity'][mod], kde=True, ax=panel_ax)
    panel_ax.set_title(f"Mean Intensity Distribution - {mod}")
intensity_fig.tight_layout()
intensity_fig.savefig(os.path.join(distributions_folder, "intensity_distributions.png"))
plt.close(intensity_fig)

# Figure 2: how many subjects share each "AxBxC" image shape
dims_fig, dims_ax = plt.subplots(figsize=(10, 6))
shapes = ["{}x{}x{}".format(s[0], s[1], s[2]) for s in properties['shape']]
shape_counts = pd.Series(shapes).value_counts()
sns.barplot(x=shape_counts.index, y=shape_counts.values, ax=dims_ax)
dims_ax.set_title("Distribution of Image Dimensions")
plt.setp(dims_ax.get_xticklabels(), rotation=45)
dims_fig.tight_layout()
dims_fig.savefig(os.path.join(distributions_folder, "dimension_distribution.png"))
plt.close(dims_fig)

print(f"Plots saved in: {distributions_folder}")

Plots saved in: /content/drive/MyDrive/CSCI566/distribution


## Sample Visualization

In [None]:
import random

# Drive folder that receives the per-subject visualization PNGs
save_dir = "/content/drive/MyDrive/CSCI566/visualizations"
os.makedirs(save_dir, exist_ok=True)  # Create the folder if it does not exist yet

def visualize_subject(subject_path, save_dir=save_dir):
    """Save a 4x3 grid of mid-volume slices (one row per modality) for a subject.

    Parameters
    ----------
    subject_path : str
        Folder that holds the subject's ``*<mod>.nii`` files.
    save_dir : str
        Directory the PNG is written to (defaults to the module-level ``save_dir``).

    Returns
    -------
    str
        Path of the saved ``<subject_id>_visualization.png``.

    Notes
    -----
    Plane names assume the first three array axes run left-right,
    anterior-posterior and inferior-superior respectively (the usual layout of
    a canonically oriented NIfTI volume) -- TODO confirm against the image
    affine. Under that layout, fixing axis 1 yields a coronal slice and fixing
    axis 0 a sagittal slice.
    """
    subject_id = os.path.basename(subject_path)

    plt.figure(figsize=(15, 15))
    for i, mod in enumerate(modalities):
        nii_files = glob(os.path.join(subject_path, f"*{mod}.nii"))
        if not nii_files:
            print(f"Warning: No NIfTI file found for modality '{mod}' in {subject_path}")
            continue  # Leave this modality's row empty

        data = nib.load(nii_files[0]).get_fdata()  # First matching file

        # Middle index along each of the three array axes
        x_mid = data.shape[0] // 2
        y_mid = data.shape[1] // 2
        z_mid = data.shape[2] // 2

        views = [
            ("Axial", data[:, :, z_mid]),     # fixed axis 2
            ("Coronal", data[:, y_mid, :]),   # fixed axis 1
            ("Sagittal", data[x_mid, :, :]),  # fixed axis 0
        ]
        for col, (plane, slice_2d) in enumerate(views, start=1):
            plt.subplot(4, 3, i*3 + col)
            plt.imshow(slice_2d, cmap='gray')
            plt.title(f"{mod} - {plane}")
            plt.axis('off')

    plt.suptitle(f"Subject: {subject_id}", fontsize=16)
    plt.tight_layout()
    plt.subplots_adjust(top=0.95)  # Keep room for the suptitle

    save_path = os.path.join(save_dir, f"{subject_id}_visualization.png")
    plt.savefig(save_path)
    plt.close()

    return save_path  # Path of the saved visualization

# Draw up to 10 subjects with a fixed seed so the same sample (and therefore the
# same saved figures) is produced on every run. The size is clamped because
# sampling more items than the list holds raises ValueError.
sample_subjects = random.Random(42).sample(subjects, min(10, len(subjects)))
for subject in sample_subjects:
    save_path = visualize_subject(subject)
    print(f"Saved visualization to: {save_path}")


Saved visualization to: /content/drive/MyDrive/CSCI566/visualizations/BraTS-GLI-00018-000_visualization.png
Saved visualization to: /content/drive/MyDrive/CSCI566/visualizations/BraTS-GLI-00545-000_visualization.png
Saved visualization to: /content/drive/MyDrive/CSCI566/visualizations/BraTS-GLI-00533-000_visualization.png
Saved visualization to: /content/drive/MyDrive/CSCI566/visualizations/BraTS-GLI-00046-000_visualization.png
Saved visualization to: /content/drive/MyDrive/CSCI566/visualizations/BraTS-GLI-00078-000_visualization.png
Saved visualization to: /content/drive/MyDrive/CSCI566/visualizations/BraTS-GLI-00528-000_visualization.png
Saved visualization to: /content/drive/MyDrive/CSCI566/visualizations/BraTS-GLI-00714-000_visualization.png
Saved visualization to: /content/drive/MyDrive/CSCI566/visualizations/BraTS-GLI-00663-000_visualization.png
Saved visualization to: /content/drive/MyDrive/CSCI566/visualizations/BraTS-GLI-00479-000_visualization.png
Saved visualization to: /con

## Intensity Histogram Comparison

In [None]:
def compare_intensity_histograms(subjects):
    """Overlay per-subject intensity histograms, one panel per modality.

    At most the first five entries of ``subjects`` are used. For each modality
    the ``*<mod>.nii`` file of every subject is loaded and its 100-bin density
    histogram is drawn as a line. The 2x2 figure is written to a fixed PNG
    path on Drive and that path is printed.
    """
    out_file = "/content/drive/MyDrive/CSCI566/visualizations/intensity_histogram_comparison.png"
    shown = subjects[:5]  # Limit to 5 subjects for clarity

    plt.figure(figsize=(15, 10))

    for panel, mod in enumerate(modalities, start=1):
        plt.subplot(2, 2, panel)

        for idx, subject in enumerate(shown):
            matches = glob(os.path.join(subject, f"*{mod}.nii"))
            if len(matches) == 0:
                print(f"Warning: No NIfTI file found for modality '{mod}' in {subject}")
                continue  # Nothing to draw for this subject/modality pair

            volume = nib.load(matches[0]).get_fdata()

            # Density histogram over every voxel, plotted at the left bin edges
            density, edges = np.histogram(volume.flatten(), bins=100, density=True)
            plt.plot(edges[:-1], density, label=f"Subject {idx+1}")

        plt.title(f"Intensity Histogram - {mod}")
        plt.legend()

    plt.tight_layout()

    os.makedirs(os.path.dirname(out_file), exist_ok=True)  # Ensure folder exists
    plt.savefig(out_file)
    plt.close()

    print(f"Histogram comparison saved at: {out_file}")

# Run the comparison on the first five sampled subjects only.
compare_intensity_histograms(sample_subjects[:5])  # Compare first 5 subjects


Histogram comparison saved at: /content/drive/MyDrive/CSCI566/visualizations/intensity_histogram_comparison.png


In [None]:
def compare_intensity_histograms(subjects, save_dir="/content/drive/MyDrive/CSCI566/visualizations/",
                                figsize=(15, 10), num_subjects=100, num_bins=100,
                                x_range=None, y_range=None):
    """
    Plot intensity histograms of several subjects side by side, per modality.

    Parameters:
    -----------
    subjects : list
        Subject folder paths; only the first ``num_subjects`` are used
    save_dir : str
        Output directory for the PNG (created if missing)
    figsize : tuple
        Figure size (width, height) in inches
    num_subjects : int
        Maximum number of subjects drawn in each panel
    num_bins : int
        Number of histogram bins per scan
    x_range : tuple or None
        If truthy, passed to ``plt.xlim`` for every panel
    y_range : tuple or None
        If truthy, passed to ``plt.ylim`` for every panel

    Returns:
    --------
    str
        Path of the saved ``intensity_histogram_comparison_3.png``
    """
    os.makedirs(save_dir, exist_ok=True)
    selected = subjects[:num_subjects]

    plt.figure(figsize=figsize)

    for panel, mod in enumerate(modalities, start=1):
        plt.subplot(2, 2, panel)

        for subject in selected:
            matches = glob(os.path.join(subject, f"*{mod}.nii"))
            if len(matches) == 0:
                print(f"Warning: No NIfTI file found for modality '{mod}' in {subject}")
                continue  # Skip to the next subject

            subject_id = os.path.basename(subject)
            volume = nib.load(matches[0]).get_fdata()

            # Density histogram over every voxel, drawn at the left bin edges
            density, edges = np.histogram(volume.flatten(), bins=num_bins, density=True)
            plt.plot(edges[:-1], density, label=f"{subject_id}")

        plt.title(f"Intensity Histogram - {mod}")

        # Axis limits only clip the view; the bins above span the full data range
        if x_range:
            plt.xlim(x_range)
        if y_range:
            plt.ylim(y_range)

        plt.xlabel("Intensity")
        plt.ylabel("Frequency (density)")
        plt.legend(fontsize='small')

    plt.tight_layout()

    save_path = os.path.join(save_dir, "intensity_histogram_comparison_3.png")
    plt.savefig(save_path, dpi=300)
    plt.close()

    print(f"Saved intensity histogram to: {save_path}")
    return save_path

# Overlay the first 3 sampled subjects using 150 bins per histogram.
# Note: x_range only limits the plotted x-axis; the bin edges are still
# computed over each volume's full intensity range.
save_path = compare_intensity_histograms(
    sample_subjects,
    figsize=(18, 12),
    num_subjects=3,  # Compare fewer subjects for clarity
    num_bins=150,    # Finer histogram bins
    x_range=(0, 2000)  # Focus on relevant intensity range
)

Saved intensity histogram to: /content/drive/MyDrive/CSCI566/visualizations/intensity_histogram_comparison_3.png


## Advanced Analysis - Brain Volume Statistics

In [None]:
def analyze_brain_volumes(subjects, threshold_factor=0.1):
    """Estimate a foreground ("brain") volume per subject and modality.

    A voxel counts as foreground when its intensity exceeds
    ``threshold_factor * mean(volume)``. The voxel count is multiplied by the
    product of the three spatial voxel sizes from the NIfTI header.

    Parameters
    ----------
    subjects : list of str
        Subject folder paths.
    threshold_factor : float, optional
        Fraction of the volume's mean intensity used as the foreground
        threshold (default 0.1).

    Returns
    -------
    pandas.DataFrame
        One row per subject with a ``subject_id`` column and one
        ``<mod>_volume`` column per modality; NaN where the scan is missing.
        Values are in cubic header units (presumably mm^3 -- TODO confirm via
        the header's xyzt_units).
    """
    volumes = {mod: [] for mod in modalities}
    subject_ids = []

    for subject in tqdm(subjects):
        subject_ids.append(os.path.basename(subject))

        for mod in modalities:
            nii_files = glob(os.path.join(subject, f"*{mod}.nii"))
            if not nii_files:
                print(f"Warning: No NIfTI file found for modality '{mod}' in {subject}")
                volumes[mod].append(np.nan)  # Keep rows aligned with subject_ids
                continue

            img = nib.load(nii_files[0])
            data = img.get_fdata()

            # Simple global threshold to separate foreground from background
            threshold = np.mean(data) * threshold_factor
            brain_mask = data > threshold

            # Use only the three spatial zooms: for images with more than three
            # dimensions get_zooms() also returns the step of the extra axes,
            # which must not enter a volume computation.
            voxel_volume = np.prod(img.header.get_zooms()[:3])
            brain_volume = np.sum(brain_mask) * voxel_volume
            volumes[mod].append(brain_volume)

    df = pd.DataFrame({
        'subject_id': subject_ids,
        **{f"{mod}_volume": volumes[mod] for mod in modalities}
    })

    return df

# Estimate volumes on a small subset (first 20 subject folders)
volume_df = analyze_brain_volumes(subjects[:20])

# Output file on Google Drive
save_path = "/content/drive/MyDrive/CSCI566/visualizations/brain_volume_distribution.png"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Box plot of the estimated volume per modality (long format: one row per subject/modality)
plt.figure(figsize=(8, 6))
volume_data = volume_df.melt(id_vars="subject_id", var_name="Modality", value_name="Volume")
sns.boxplot(x="Modality", y="Volume", data=volume_data)
plt.title("Brain Volume Distribution by Modality")
plt.ylabel("Volume (mm³)")
plt.xticks(rotation=45)
plt.tight_layout()
# Write to save_path itself so the file name matches both the message printed
# below and the `brain_volume_distribution.png` reference in the EDA report.
plt.savefig(save_path, dpi=300)
plt.close()

print(f"Brain volume distribution plot saved at: {save_path}")


100%|██████████| 20/20 [01:24<00:00,  4.21s/it]


Brain volume distribution plot saved at: /content/drive/MyDrive/CSCI566/visualizations/brain_volume_distribution.png


## Checking for Missing or Corrupted Files

In [None]:
def check_data_integrity(subjects):
    """Scan every subject/modality pair for missing or suspicious NIfTI files.

    Parameters
    ----------
    subjects : list of str
        Subject folder paths.

    Returns
    -------
    list of str
        One human-readable message per problem found: missing file, load
        error, empty volume, NaN values, or all-zero data. Empty when nothing
        was flagged.
    """
    issues = []

    for subject in tqdm(subjects):
        subject_id = os.path.basename(subject)

        for mod in modalities:
            nii_files = glob(os.path.join(subject, f"*{mod}.nii"))
            if not nii_files:
                issues.append(f"❌ Missing {mod}.nii file for subject {subject_id}")
                continue  # Nothing to load for this modality

            nii_file = nii_files[0]  # Use the first found file

            # Any failure while reading or checking the volume is recorded
            # rather than raised, so one bad file does not abort the scan.
            try:
                data = nib.load(nii_file).get_fdata()

                if data.size == 0:
                    # Checked first: np.all() is True on an empty array, so an
                    # empty volume would otherwise also be reported as all-zero.
                    issues.append(f"⚠️ Empty image file detected in {mod} for subject {subject_id}")
                else:
                    if np.isnan(data).any():
                        issues.append(f"⚠️ NaN values found in {mod} for subject {subject_id}")

                    # All-zero data could indicate a processing error
                    if not np.any(data):
                        issues.append(f"⚠️ All-zero data in {mod} for subject {subject_id}")

            except Exception as e:
                issues.append(f"🚨 Error loading {mod} for subject {subject_id}: {str(e)}")

    return issues

# Check every subject folder (this loads all scans, so it is slow)
integrity_issues = check_data_integrity(subjects)

# Summarise: a single success line, or a header followed by one bullet per issue
if not integrity_issues:
    print("✅ No data integrity issues found!")
else:
    print(f"\n🚨 Found {len(integrity_issues)} data integrity issues:")
    print("\n".join(f"- {issue}" for issue in integrity_issues))


100%|██████████| 262/262 [31:00<00:00,  7.10s/it]

✅ No data integrity issues found!





## Report Generation

In [None]:
def generate_eda_report(properties, volume_df, integrity_issues):
    """Write a Markdown summary of the EDA results to Google Drive.

    Parameters
    ----------
    properties : dict
        Must provide ``'subject_id'`` and ``'shape'`` lists and the
        ``'min_intensity'``, ``'max_intensity'``, ``'mean_intensity'`` and
        ``'std_intensity'`` dicts keyed by modality.
    volume_df : pandas.DataFrame
        Table with ``<mod>_volume`` columns; modalities without a column are
        left out of the volume table.
    integrity_issues : list of str
        Messages to list in the "Data Integrity" section (may be empty).

    The intensity table aggregates per-scan statistics: the lowest minimum,
    the highest maximum, the mean of the means and the mean of the per-scan
    standard deviations. A modality without any recorded scan gets an ``n/a``
    row. The report path is fixed and is printed on completion.
    """
    save_path = "/content/drive/MyDrive/CSCI566/reports/BraTS_2023_EDA_Report.md"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Explicit UTF-8: the report contains emoji and "³", which fail to encode
    # under non-UTF-8 platform default encodings.
    with open(save_path, "w", encoding="utf-8") as f:
        f.write("# BraTS 2023 Dataset Exploratory Data Analysis\n\n")

        # Dataset overview
        f.write("## Dataset Overview\n\n")
        f.write(f"- **Total subjects:** {len(properties['subject_id'])}\n")
        f.write(f"- **Modalities:** {', '.join(modalities)}\n")

        # Image properties (sorted so the output is stable between runs)
        f.write("\n## Image Properties\n\n")
        unique_shapes = sorted({str(s) for s in properties['shape']})
        f.write(f"- **Image dimensions:** {', '.join(unique_shapes)}\n")

        # Intensity statistics
        f.write("\n## Intensity Statistics\n\n")
        f.write("| Modality | Min | Max | Mean | Std |\n")
        f.write("|----------|-----|-----|------|-----|\n")
        for mod in modalities:
            if not properties['mean_intensity'][mod]:
                # No scan was recorded for this modality; np.min/np.max would
                # raise on an empty list.
                f.write(f"| {mod} | n/a | n/a | n/a | n/a |\n")
                continue
            min_val = np.min(properties['min_intensity'][mod])
            max_val = np.max(properties['max_intensity'][mod])
            mean_val = np.mean(properties['mean_intensity'][mod])
            # Typical within-scan spread: average the per-scan standard
            # deviations (np.std of them would measure how much the stds
            # differ between scans, not the intensity variability).
            std_val = np.mean(properties['std_intensity'][mod])
            f.write(f"| {mod} | {min_val:.2f} | {max_val:.2f} | {mean_val:.2f} | {std_val:.2f} |\n")

        # Brain volume statistics
        f.write("\n## Brain Volume Statistics\n\n")
        f.write("| Modality | Min Volume (mm³) | Max Volume (mm³) | Mean Volume (mm³) | Std Volume (mm³) |\n")
        f.write("|----------|------------------|------------------|--------------------|------------------|\n")
        for mod in modalities:
            vol_col = f"{mod}_volume"
            if vol_col in volume_df.columns:
                vols = volume_df[vol_col].dropna()  # Ignore subjects with a missing scan
                f.write(f"| {mod} | {vols.min():.2f} | {vols.max():.2f} | {vols.mean():.2f} | {vols.std():.2f} |\n")

        # Data integrity issues
        f.write("\n## Data Integrity\n\n")
        if integrity_issues:
            f.write(f"⚠️ **Found {len(integrity_issues)} issues:**\n\n")
            for issue in integrity_issues:
                f.write(f"- {issue}\n")
        else:
            f.write("✅ No data integrity issues found.\n")

        # Visualization references
        f.write("\n## Visualizations\n\n")
        f.write("- 🖼️ See `intensity_distributions.png` for intensity distributions.\n")
        f.write("- 📏 See `dimension_distribution.png` for image dimension distribution.\n")
        f.write("- 📊 See `intensity_histogram_comparison.png` for intensity histogram comparison.\n")
        f.write("- 🧠 See `brain_volume_distribution.png` for brain volume distribution.\n")
        f.write("- 🔍 Individual subject visualizations saved as `<subject_id>_visualization.png`\n")

    print(f"📄 EDA report saved at: {save_path}")

# Write the Markdown report from the results computed in the cells above.
generate_eda_report(properties, volume_df, integrity_issues)


📄 EDA report saved at: /content/drive/MyDrive/CSCI566/reports/BraTS_2023_EDA_Report.md
