# Brain Tumor MRI Dataset - Data Exploration & Validation

**Objective**: Comprehensive analysis to determine dataset readiness for preprocessing and model training.

**Dataset**: 253 MRI brain scans (binary classification: tumor vs no tumor)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from pathlib import Path
import warnings
from collections import Counter
from scipy import stats
import json

# Silence every Python warning for the rest of the session to keep cell output compact.
# NOTE(review): this blanket filter also hides numeric RuntimeWarnings emitted through
# the warnings module by the analysis cells below -- consider narrowing it while debugging.
warnings.filterwarnings('ignore')
# Start from matplotlib's default style, then switch seaborn's colour palette to "husl".
plt.style.use('default')
sns.set_palette("husl")

## 1. Dataset Structure Analysis

In [2]:
# Dataset root plus one sub-folder per class ('yes' holds tumor scans, 'no' holds the rest)
DATA_DIR = Path('./Data')
TUMOR_DIR, NO_TUMOR_DIR = (DATA_DIR.joinpath(class_dir) for class_dir in ('yes', 'no'))

# Collect all image files
def get_image_files(directory):
    """Return the image files directly inside *directory*, sorted by path.

    A file counts as an image when its suffix is ``.jpg``, ``.jpeg`` or
    ``.png``, compared case-insensitively. The directory is scanned once, so
    each file is returned exactly once even on case-insensitive filesystems
    (where globbing for both ``*.jpg`` and ``*.JPG`` would yield the same file
    twice), and mixed-case suffixes such as ``.Jpeg`` are included.

    Args:
        directory: Folder to scan (``pathlib.Path`` or path string). Only its
            immediate children are considered.

    Returns:
        A sorted list of ``pathlib.Path`` objects; empty when *directory*
        does not exist or contains no matching files.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    directory = Path(directory)

    # Path.iterdir() raises on a missing folder; report "no images" instead.
    if not directory.is_dir():
        return []

    return sorted(
        entry
        for entry in directory.iterdir()
        if entry.suffix.lower() in image_suffixes and entry.is_file()
    )

tumor_files = get_image_files(TUMOR_DIR)
no_tumor_files = get_image_files(NO_TUMOR_DIR)

# Computed once, up front, so the printed total and the ratios below share one value
total_images = len(tumor_files) + len(no_tumor_files)

print("Dataset Structure:")
print(f"├── Tumor images: {len(tumor_files)}")
print(f"├── No tumor images: {len(no_tumor_files)}")
print(f"└── Total images: {total_images}")

# Every ratio below divides by the image count; stop with an actionable
# message instead of a bare ZeroDivisionError when nothing was found.
if total_images == 0:
    raise FileNotFoundError(
        f"No images found in '{TUMOR_DIR}' or '{NO_TUMOR_DIR}'; "
        "check that DATA_DIR points at the dataset."
    )

# Class balance analysis
tumor_ratio = len(tumor_files) / total_images
no_tumor_ratio = len(no_tumor_files) / total_images

print("\nClass Distribution:")
print(f"├── Tumor: {tumor_ratio:.1%}")
print(f"└── No tumor: {no_tumor_ratio:.1%}")

# Check for class imbalance: minority share over majority share (1.0 = perfectly balanced)
imbalance_ratio = min(tumor_ratio, no_tumor_ratio) / max(tumor_ratio, no_tumor_ratio)
print(f"\nImbalance Ratio: {imbalance_ratio:.2f} (>0.8 is balanced, >0.6 is acceptable)")

Dataset Structure:
├── Tumor images: 155
├── No tumor images: 98
└── Total images: 253

Class Distribution:
├── Tumor: 61.3%
└── No tumor: 38.7%

Imbalance Ratio: 0.63 (>0.8 is balanced, >0.6 is acceptable)


## 2. Image Quality Assessment

In [None]:
def analyze_image_properties(image_files, label):
    """Measure size and intensity statistics for every image in *image_files*.

    Args:
        image_files: Sequence of ``pathlib.Path`` objects to inspect.
        label: Class name; used only in the progress message.

    Returns:
        A ``(properties, corrupted_files)`` tuple. ``properties`` maps each of
        ``'widths'``, ``'heights'``, ``'channels'``, ``'file_sizes'`` (KB),
        ``'mean_intensity'``, ``'std_intensity'`` and ``'contrast'`` to a
        list with one entry per successfully analysed image, in input order.
        ``corrupted_files`` holds, as strings, the paths that could not be
        loaded or processed.
    """
    properties = {
        'widths': [], 'heights': [], 'channels': [],
        'file_sizes': [], 'mean_intensity': [],
        'std_intensity': [], 'contrast': []
    }

    corrupted_files = []
    total_files = len(image_files)

    print(f"Processing {total_files} {label} images...")

    for i, file_path in enumerate(image_files):
        # Progress indicator every 10 images or for last image
        if (i + 1) % 10 == 0 or (i + 1) == total_files:
            progress = ((i + 1) / total_files) * 100
            print(f"  Progress: {i+1}/{total_files} ({progress:.1f}%)")

        try:
            # Basic file properties
            file_size = file_path.stat().st_size / 1024  # KB

            # cv2.imread reports an unreadable file by returning None
            img = cv2.imread(str(file_path))
            if img is None:
                corrupted_files.append(str(file_path))
                print(f"     Failed to load: {file_path.name}")
                continue

            # Validate image dimensions
            if img.ndim < 2:
                corrupted_files.append(str(file_path))
                print(f"     Invalid dimensions: {file_path.name}")
                continue

            # Convert to grayscale for analysis
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            height, width = img.shape[:2]
            # NOTE(review): imread is called with its default flags, which the
            # OpenCV docs describe as decoding to 3-channel BGR, so this is
            # expected to be 3 for every readable file -- confirm if the
            # on-disk channel count is what is wanted here.
            channels = img.shape[2] if img.ndim > 2 else 1
            mean_intensity = np.mean(gray)
            std_intensity = np.std(gray)

            # Michelson contrast. The extremes are converted to float first:
            # as uint8 scalars their sum wraps around past 255 and produces
            # contrast values far outside the metric's [0, 1] range.
            darkest = float(np.min(gray))
            brightest = float(np.max(gray))
            contrast = (brightest - darkest) / (brightest + darkest + 1e-8)

        except Exception as e:
            # Per-file boundary: record the failure and keep scanning the rest
            corrupted_files.append(str(file_path))
            print(f"     Error processing {file_path.name}: {str(e)}")
            continue

        # Store properties only once every measurement succeeded, so all
        # lists stay the same length and index-aligned.
        properties['widths'].append(width)
        properties['heights'].append(height)
        properties['channels'].append(channels)
        properties['file_sizes'].append(file_size)
        properties['mean_intensity'].append(mean_intensity)
        properties['std_intensity'].append(std_intensity)
        properties['contrast'].append(contrast)

    # Summary
    valid_images = total_files - len(corrupted_files)
    print(f"  Successfully analyzed: {valid_images}/{total_files} images")
    if corrupted_files:
        print(f"  Corrupted/Invalid: {len(corrupted_files)} files")

    return properties, corrupted_files

# Run the quality analysis once per class, then print a combined integrity report
print("=" * 60, "STARTING IMAGE QUALITY ANALYSIS", "=" * 60, sep="\n")

print("\nPHASE 1: TUMOR CLASS ANALYSIS")
tumor_props, tumor_corrupted = analyze_image_properties(image_files=tumor_files, label='tumor')

print("\nPHASE 2: NO TUMOR CLASS ANALYSIS")
no_tumor_props, no_tumor_corrupted = analyze_image_properties(image_files=no_tumor_files, label='no_tumor')

print("\n" + "=" * 60, "ANALYSIS SUMMARY", "=" * 60, sep="\n")
print(f"Total images processed: {len(tumor_files) + len(no_tumor_files)}")
print(f"Tumor images: {len(tumor_files)} ({len(tumor_files) - len(tumor_corrupted)} valid)")
print(f"No tumor images: {len(no_tumor_files)} ({len(no_tumor_files) - len(no_tumor_corrupted)} valid)")

print("\nFile integrity check:")
print(
    f"Tumor corrupted: {len(tumor_corrupted)}",
    f"No tumor corrupted: {len(no_tumor_corrupted)}",
    sep="\n",
)

# List every unreadable file, tumor class first; otherwise confirm a clean run
if not (tumor_corrupted or no_tumor_corrupted):
    print("\nALL FILES SUCCESSFULLY VALIDATED")
else:
    print("\nCORRUPTED FILES DETECTED:")
    for file_path in [*tumor_corrupted, *no_tumor_corrupted]:
        print(f"   - {file_path}")

In [None]:
# Create comprehensive statistics summary
def create_stats_summary(props, label):
    """Create statistical summary for image properties"""
    stats_data = {}
    
    for key, values in props.items():
        if values:  # Check if list is not empty
            stats_data[key] = {
                'mean': np.mean(values),
                'std': np.std(values),
                'min': np.min(values),
                'max': np.max(values),
                'median': np.median(values)
            }
    
    return pd.DataFrame(stats_data).round(2)

# Build one summary table per class, then print each under its heading
tumor_stats = create_stats_summary(props=tumor_props, label='Tumor')
no_tumor_stats = create_stats_summary(props=no_tumor_props, label='No Tumor')

print("TUMOR CLASS STATISTICS:", tumor_stats, sep="\n")
print("\nNO TUMOR CLASS STATISTICS:", no_tumor_stats, sep="\n")

## 3. Resolution and Format Consistency Analysis

In [None]:
# Analyze resolution consistency
all_widths = tumor_props['widths'] + no_tumor_props['widths']
all_heights = tumor_props['heights'] + no_tumor_props['heights']

width_counts = Counter(all_widths)
height_counts = Counter(all_heights)
# A resolution is a (width, height) pair. The most common width and the most
# common height need not come from the same image, so count the pairs.
resolution_counts = Counter(zip(all_widths, all_heights))

print("RESOLUTION ANALYSIS:")
print(f"Unique widths: {len(width_counts)}")
print(f"Unique heights: {len(height_counts)}")
if resolution_counts:
    (top_width, top_height), top_count = resolution_counts.most_common(1)[0]
    print(f"Most common resolution: {top_width}x{top_height}")
    # Share of analysed images that have exactly the most common resolution
    print(f"Resolution consistency: {top_count / len(all_widths):.1%}")
else:
    # Nothing was analysed; most_common(1)[0] would raise IndexError
    print("Most common resolution: n/a (no images analyzed)")
    print("Resolution consistency: n/a")

# Visualize resolution distribution: per-class histograms of width (left) and height (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for ax, prop_key, dimension in zip(axes, ('widths', 'heights'), ('Width', 'Height')):
    ax.hist(
        [tumor_props[prop_key], no_tumor_props[prop_key]],
        bins=20,
        alpha=0.7,
        label=['Tumor', 'No Tumor'],
    )
    ax.set_xlabel(f'Image {dimension} (pixels)')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Image {dimension} Distribution')
    ax.legend()

plt.tight_layout()
plt.show()

## 4. Intensity Distribution Analysis

In [None]:
# Analyze intensity distributions for preprocessing readiness:
# a 2x2 grid of per-class histograms, one panel per measured property
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (property key, x-axis label, panel title), listed in row-major panel order
panels = [
    ('mean_intensity', 'Mean Intensity', 'Mean Intensity Distribution'),
    ('std_intensity', 'Intensity Standard Deviation', 'Intensity Variance Distribution'),
    ('contrast', 'Contrast (Michelson)', 'Image Contrast Distribution'),
    ('file_sizes', 'File Size (KB)', 'File Size Distribution'),
]

for ax, (prop_key, x_label, panel_title) in zip(axes.flat, panels):
    ax.hist(
        [tumor_props[prop_key], no_tumor_props[prop_key]],
        bins=30,
        alpha=0.7,
        label=['Tumor', 'No Tumor'],
    )
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title)
    ax.legend()

plt.tight_layout()
plt.show()

# Statistical tests for distribution differences:
# one Mann-Whitney U test per metric, comparing the tumor and no-tumor samples
print("STATISTICAL TESTS FOR CLASS DIFFERENCES:")
intensity_result = stats.mannwhitneyu(
    tumor_props['mean_intensity'], no_tumor_props['mean_intensity']
)
intensity_pvalue = intensity_result.pvalue
contrast_result = stats.mannwhitneyu(
    tumor_props['contrast'], no_tumor_props['contrast']
)
contrast_pvalue = contrast_result.pvalue

print(f"Mean intensity difference p-value: {intensity_pvalue:.6f}")
print(f"Contrast difference p-value: {contrast_pvalue:.6f}")
# Flagged when at least one of the two metrics differs at the 5% level
either_significant = any(p < 0.05 for p in (intensity_pvalue, contrast_pvalue))
print(f"Significant difference (p<0.05): {either_significant}")

## 5. Sample Images Visualization

In [None]:
# Display sample images from both classes: first five of each, tumor on the top row
fig, axes = plt.subplots(2, 5, figsize=(20, 8))

for row, (class_files, class_name) in enumerate(
    [(tumor_files, 'Tumor'), (no_tumor_files, 'No Tumor')]
):
    for col in range(5):
        ax = axes[row, col]
        # Hide ticks and frame on every cell, including ones that stay empty
        ax.axis('off')
        if col >= len(class_files):
            continue

        img = cv2.imread(str(class_files[col]))
        if img is None:
            # cv2.imread returns None for an unreadable file; label the cell
            # instead of letting cvtColor raise and abort the whole figure.
            ax.set_title(f'{class_name} {col+1} (unreadable)')
            continue

        # OpenCV loads BGR; matplotlib expects RGB
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(f'{class_name} {col+1}')

plt.suptitle('Sample Images from Dataset', fontsize=16)
plt.tight_layout()
plt.show()

## 6. Preprocessing Readiness Assessment

In [None]:
def assess_preprocessing_readiness():
    """Score the dataset's readiness for preprocessing on a 10-point scale.

    Reads the module-level results of the earlier cells (``imbalance_ratio``,
    ``total_images``, ``tumor_corrupted``, ``no_tumor_corrupted``,
    ``all_widths``, ``all_heights``, ``tumor_props``, ``no_tumor_props``) and
    prints a verdict for each of six checks: class balance (2 points), file
    integrity (1), resolution consistency (2), dataset size (1), intensity
    consistency (2) and image contrast (2).

    Returns:
        A ``(readiness_score, status, issues, recommendations)`` tuple: the
        integer score, the status string derived from it, and two lists of
        human-readable strings collected while scoring.
    """

    readiness_score = 0
    max_score = 10
    issues = []
    recommendations = []

    print("PREPROCESSING READINESS ASSESSMENT:")
    print("=" * 50)

    # 1. Class balance (2 points)
    if imbalance_ratio > 0.6:
        readiness_score += 2
        print("✓ Class balance: ACCEPTABLE")
    else:
        issues.append("Severe class imbalance detected")
        recommendations.append("Apply class balancing techniques (SMOTE, class weights)")
        print("✗ Class balance: POOR")

    # 2. Corrupted files (1 point)
    total_corrupted = len(tumor_corrupted) + len(no_tumor_corrupted)
    if total_corrupted == 0:
        readiness_score += 1
        print("✓ File integrity: EXCELLENT")
    else:
        issues.append(f"{total_corrupted} corrupted files found")
        recommendations.append("Remove or repair corrupted image files")
        print(f"⚠ File integrity: {total_corrupted} corrupted files")

    # 3. Resolution consistency (2 points)
    # Share of images that have the single most common (width, height) pair.
    # Counting widths alone would treat images that differ only in height as
    # having the same resolution.
    resolution_counts = Counter(zip(all_widths, all_heights))
    if resolution_counts:
        top_count = resolution_counts.most_common(1)[0][1]
        resolution_consistency = top_count / sum(resolution_counts.values())
    else:
        # Nothing was analysed: fall into the lowest tier instead of raising IndexError
        resolution_consistency = 0.0

    if resolution_consistency > 0.8:
        readiness_score += 2
        print("✓ Resolution consistency: GOOD")
    elif resolution_consistency > 0.5:
        readiness_score += 1
        print("⚠ Resolution consistency: MODERATE")
        recommendations.append("Consider standardizing image resolution")
    else:
        issues.append("High resolution variance")
        recommendations.append("Mandatory resolution normalization required")
        print("✗ Resolution consistency: POOR")

    # 4. Dataset size (1 point)
    if total_images >= 200:
        readiness_score += 1
        print("✓ Dataset size: ADEQUATE")
    else:
        issues.append("Small dataset size")
        recommendations.append("Consider data augmentation strategies")
        print("⚠ Dataset size: SMALL")

    # 5. Intensity distribution (2 points)
    # Spread of the per-image mean intensity within each class
    tumor_mean_std = np.std(tumor_props['mean_intensity'])
    no_tumor_mean_std = np.std(no_tumor_props['mean_intensity'])

    if tumor_mean_std < 50 and no_tumor_mean_std < 50:
        readiness_score += 2
        print("✓ Intensity consistency: GOOD")
    elif tumor_mean_std < 80 and no_tumor_mean_std < 80:
        readiness_score += 1
        print("⚠ Intensity consistency: MODERATE")
        recommendations.append("Apply intensity normalization")
    else:
        issues.append("High intensity variance")
        recommendations.append("Mandatory intensity normalization required")
        print("✗ Intensity consistency: POOR")

    # 6. Image quality (2 points)
    # Average contrast over both classes pooled together
    avg_contrast = np.mean(tumor_props['contrast'] + no_tumor_props['contrast'])
    if avg_contrast > 0.3:
        readiness_score += 2
        print("✓ Image quality: GOOD")
    elif avg_contrast > 0.15:
        readiness_score += 1
        print("⚠ Image quality: MODERATE")
        recommendations.append("Consider histogram equalization")
    else:
        issues.append("Low image contrast")
        recommendations.append("Apply contrast enhancement techniques")
        print("✗ Image quality: POOR")

    print("\n" + "=" * 50)
    print(f"FINAL READINESS SCORE: {readiness_score}/{max_score}")

    # Map the score onto one of four status bands
    if readiness_score >= 8:
        status = "READY FOR PREPROCESSING"
    elif readiness_score >= 6:
        status = "READY WITH MINOR ADJUSTMENTS"
    elif readiness_score >= 4:
        status = "REQUIRES SIGNIFICANT PREPROCESSING"
    else:
        status = "NOT READY - MAJOR ISSUES DETECTED"

    print(f"STATUS: {status}")

    if issues:
        print("\nISSUES IDENTIFIED:")
        for i, issue in enumerate(issues, 1):
            print(f"{i}. {issue}")

    if recommendations:
        print("\nRECOMMENDATIONS:")
        for i, rec in enumerate(recommendations, 1):
            print(f"{i}. {rec}")

    return readiness_score, status, issues, recommendations

# Run the assessment once and keep its four results at module level;
# the report-generation cell reads them when building the JSON report.
readiness_score, status, issues, recommendations = assess_preprocessing_readiness()

## 7. Final Report Generation

In [None]:
# Generate comprehensive report
# Resolution figures come from (width, height) pairs: joining the most common
# width to the most common height can describe a resolution no image has.
resolution_pairs = Counter(zip(all_widths, all_heights))
if resolution_pairs:
    (common_width, common_height), common_count = resolution_pairs.most_common(1)[0]
    report_resolution_consistency = common_count / sum(resolution_pairs.values())
    report_common_resolution = f"{common_width}x{common_height}"
else:
    # Nothing was analysed; most_common(1)[0] would raise IndexError
    report_resolution_consistency = 0.0
    report_common_resolution = None

report = {
    "dataset_summary": {
        "total_images": total_images,
        "tumor_images": len(tumor_files),
        "no_tumor_images": len(no_tumor_files),
        "class_imbalance_ratio": imbalance_ratio,
        "corrupted_files": len(tumor_corrupted) + len(no_tumor_corrupted)
    },
    "technical_specs": {
        "resolution_consistency": report_resolution_consistency,
        "most_common_resolution": report_common_resolution,
        # float() turns the NumPy scalars into plain Python floats for the JSON encoder
        "avg_file_size_kb": float(np.mean(tumor_props['file_sizes'] + no_tumor_props['file_sizes'])),
        "avg_contrast": float(np.mean(tumor_props['contrast'] + no_tumor_props['contrast']))
    },
    "readiness_assessment": {
        "score": f"{readiness_score}/10",
        "status": status,
        "issues": issues,
        "recommendations": recommendations
    }
}

# Serialize once so the saved file and the printed report are the same text
report_json = json.dumps(report, indent=2)

# Save report as JSON
with open('data_exploration_report.json', 'w', encoding='utf-8') as f:
    f.write(report_json)

print("FINAL DATA EXPLORATION REPORT")
print("=" * 50)
print(report_json)
print("\nReport saved as: data_exploration_report.json")

## Conclusion

This comprehensive data exploration provides essential metrics to determine preprocessing requirements. The dataset shows the following characteristics:

- **Dataset Size**: 253 images (adequate for deep learning with augmentation)
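- **Class Balance**: Moderately imbalanced (155 tumor vs. 98 no-tumor images, imbalance ratio 0.63), which is just above the 0.6 threshold this notebook treats as acceptable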
- **Image Quality**: MRI scans with varying contrast and resolution
- **Preprocessing Needs**: The readiness assessment in Section 6 scores the dataset and lists the specific preprocessing steps it recommends

**Next Steps**: Proceed to preprocessing phase based on the recommendations above.