# Component 2: Exploratory Data Analysis

Comprehensive analysis including sample grids, intensity distributions, and duplicate detection.

In [None]:
# --- Imports: standard library first, then third-party packages ---
import os
from collections import defaultdict

import imagehash
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from tqdm.auto import tqdm

# Directory that receives the artifacts written by this notebook;
# created up front so later cells can save into it unconditionally.
OUTPUT_DIR = '../outputs/eda'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load manifest
df = pd.read_csv('../outputs/dataset_manifest.csv')

n_images = len(df)
n_classes = len(df["class_name"].unique())
print(f'Loaded {n_images} images from {n_classes} classes')

# Keep this as the cell's final expression so the notebook renders a preview.
df.head()

## 2.1 Sample Image Grids (8 per class)

In [None]:
# Generate one sample grid per class: up to GRID_ROWS * GRID_COLS randomly
# chosen images (fixed seed), each titled with its "<width>x<height>" taken
# from the manifest, saved as a PNG under OUTPUT_DIR.
GRID_ROWS, GRID_COLS = 2, 4
SAMPLES_PER_GRID = GRID_ROWS * GRID_COLS

# `classes` is reused by later cells (histograms and the summary report).
classes = sorted(df['class_name'].unique())

for class_name in classes:
    class_df = df[df['class_name'] == class_name]
    samples = class_df.sample(
        n=min(SAMPLES_PER_GRID, len(class_df)),
        random_state=42,
    )

    fig, axes = plt.subplots(GRID_ROWS, GRID_COLS, figsize=(16, 8))
    axes = axes.flatten()

    # Switch every axis off up front: classes with fewer images than grid
    # slots would otherwise show empty tick frames in the unused positions.
    for ax in axes:
        ax.axis('off')

    for ax, (_, row) in zip(axes, samples.iterrows()):
        # Open inside a context manager so the file handle is released as
        # soon as the pixels have been handed to matplotlib.
        with Image.open(row['filepath']) as img:
            ax.imshow(img, cmap='gray')
        ax.set_title(f"{row['width']}x{row['height']}", fontsize=10)

    fig.suptitle(f'Sample Images: {class_name}', fontsize=14, fontweight='bold')
    fig.tight_layout()

    save_path = f'{OUTPUT_DIR}/sample_grid_{class_name.replace(" ", "_")}.png'
    fig.savefig(save_path, dpi=200, bbox_inches='tight')
    plt.show()
    print(f"✓ Saved: {save_path}")

## 2.2 Pixel Intensity Analysis

In [None]:
# Pixel-intensity histograms per class.
#
# The same number of images is drawn from every class: at most
# MAX_SAMPLES_PER_CLASS, capped by the size of the smallest class, so the
# per-class frequency axes are based on equal image counts.
MAX_SAMPLES_PER_CLASS = 100
HIST_COLS = 2

class_sizes = df.groupby('class_name').size()
samples_per_class = min(MAX_SAMPLES_PER_CLASS, class_sizes.min())
sample_df = df.groupby('class_name').sample(n=samples_per_class, random_state=42)

# Collect one flat uint8 array per image. Extending a Python list pixel by
# pixel would box every value as a separate object and use far more memory.
pixel_chunks_by_class = defaultdict(list)

print("Analyzing pixel intensities...")
for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    with Image.open(row['filepath']) as img:
        # Convert to 8-bit grayscale so every image contributes 0-255 values.
        pixels = np.asarray(img.convert('L')).ravel()
    pixel_chunks_by_class[row['class_name']].append(pixels)

# One contiguous array of grayscale values per class.
intensities_by_class = {
    class_name: np.concatenate(chunks)
    for class_name, chunks in pixel_chunks_by_class.items()
}

# Plot histograms. The grid grows with the number of classes instead of
# assuming exactly four; with four classes this is the original 2x2 layout
# at figsize (14, 10).
n_rows = max(1, (len(classes) + HIST_COLS - 1) // HIST_COLS)
fig, axes = plt.subplots(
    n_rows, HIST_COLS, figsize=(14, 5 * n_rows), squeeze=False
)
axes = axes.flatten()

for ax, class_name in zip(axes, classes):
    values = intensities_by_class[class_name]

    ax.hist(values, bins=50, alpha=0.7, color='steelblue', edgecolor='black')
    ax.set_title(f'{class_name}', fontweight='bold', fontsize=12)
    ax.set_xlabel('Pixel Intensity (0-255)')
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

    # Mark the mean intensity of the class with a dashed vertical line.
    mean_intensity = np.mean(values)
    ax.axvline(mean_intensity, color='red', linestyle='--', linewidth=2,
               label=f'Mean: {mean_intensity:.1f}')
    ax.legend()

# Hide the trailing axis when the number of classes is odd.
for ax in axes[len(classes):]:
    ax.axis('off')

fig.suptitle('Pixel Intensity Distributions by Class', fontsize=16, fontweight='bold')
fig.tight_layout()
fig.savefig(f'{OUTPUT_DIR}/intensity_histograms.png', dpi=200)
plt.show()
print("✓ Saved intensity histograms")

## 2.3 Image Size Analysis

In [None]:
# Summarise image dimensions, rank the (width, height) combinations by how
# often they occur, and list images that deviate from the most frequent one.
# `size_counts`, `most_common_size` and `outliers` are read again by the
# summary-report cell.
print("Image Dimensions Summary:")
dimension_stats = df[['width', 'height']].describe()
print(dimension_stats)

# Number of images per (width, height) pair, most frequent first.
size_counts = df.groupby(['width', 'height']).size().sort_values(ascending=False)
print("\nMost common image sizes:")
print(size_counts.head(10))

# An image is an outlier when either dimension differs from the most
# frequent (width, height) pair.
most_common_size = size_counts.idxmax()
common_width, common_height = most_common_size
has_common_size = (df['width'] == common_width) & (df['height'] == common_height)
outliers = df[~has_common_size]

if outliers.empty:
    print("\n✓ All images have consistent dimensions")
else:
    print(f"\n⚠️  Found {len(outliers)} images with non-standard sizes:")
    print(outliers[['filepath', 'class_name', 'width', 'height']].head(10))

## 2.4 Duplicate Detection

In [None]:
# Flag potential duplicates: two images are reported as duplicates when
# their perceptual hashes (imagehash.phash) are identical. The first image
# seen with a given hash is treated as the original; each later image with
# the same hash is recorded against it.
print("Computing perceptual hashes for duplicate detection...")

hashes = {}      # phash string -> filepath of the first image with that hash
duplicates = []  # (filepath, filepath of the first image with the same hash)

for _, row in tqdm(df.iterrows(), total=len(df), desc="Hashing"):
    filepath = row['filepath']

    # Only opening and hashing are guarded; the bookkeeping below cannot
    # fail because of a bad image. The context manager releases the file
    # handle even when hashing raises.
    try:
        with Image.open(filepath) as img:
            hash_value = str(imagehash.phash(img))
    except Exception as e:
        # Deliberately broad: one unreadable or corrupt file should be
        # reported and skipped, not abort the scan of the whole dataset.
        print(f"Error processing {filepath}: {e}")
        continue

    if hash_value in hashes:
        duplicates.append((filepath, hashes[hash_value]))
    else:
        hashes[hash_value] = filepath

print(f"\nDuplicate Detection Results:")
print(f"  Unique images: {len(hashes)}")
print(f"  Potential duplicates: {len(duplicates)}")

if duplicates:
    dup_df = pd.DataFrame(duplicates, columns=['image', 'duplicate_of'])
    dup_path = f'{OUTPUT_DIR}/duplicates_report.csv'
    dup_df.to_csv(dup_path, index=False)
    print(f"✓ Saved duplicates report: {dup_path}")
    print(f"\nFirst 5 duplicates:")
    print(dup_df.head())

## 2.5 Summary Report

In [None]:
# Generate the Markdown EDA summary from the results of the earlier cells
# (`classes`, `size_counts`, `most_common_size`, `outliers`, `duplicates`)
# and write it to OUTPUT_DIR/eda_report.md.

# Heuristic cut-off: the dataset is reported as significantly imbalanced
# when the largest class has at least this many times the images of the
# smallest one. Tune to the project's needs.
IMBALANCE_RATIO_THRESHOLD = 1.5

total_images = len(df)
class_counts = df['class_name'].value_counts()

report_lines = [
    "# Exploratory Data Analysis Report\n",
    "\n## Dataset Overview\n",
    f"- **Total images**: {total_images}\n",
    f"- **Classes**: {len(classes)}\n",
    f"- **Unique dimensions**: {len(size_counts)}\n",
    f"- **Potential duplicates**: {len(duplicates)}\n",
    "\n## Class Distribution\n",
]

for class_name in classes:
    count = int(class_counts.get(class_name, 0))
    percentage = count / total_images * 100
    report_lines.append(f"- **{class_name}**: {count} images ({percentage:.1f}%)\n")

# Derive the imbalance findings from the data instead of hard-coding class
# names and percentages, so the report stays correct for any manifest.
largest_class = class_counts.idxmax()
smallest_class = class_counts.idxmin()
largest_pct = class_counts.max() / total_images * 100
smallest_pct = class_counts.min() / total_images * 100
imbalance_ratio = class_counts.max() / class_counts.min()

if imbalance_ratio >= IMBALANCE_RATIO_THRESHOLD:
    imbalance_lines = [
        f"- Significant class imbalance detected (largest-to-smallest ratio {imbalance_ratio:.1f}:1)\n",
        f"- {largest_class} class: ~{largest_pct:.0f}%, {smallest_class} class: ~{smallest_pct:.0f}%\n",
        "- **Recommendation**: Use class weights or stratified sampling\n",
    ]
else:
    imbalance_lines = [
        f"- No significant class imbalance detected (largest-to-smallest ratio {imbalance_ratio:.1f}:1)\n",
        f"- {largest_class} class: ~{largest_pct:.0f}%, {smallest_class} class: ~{smallest_pct:.0f}%\n",
    ]

report_lines.extend([
    "\n## Key Findings\n",
    "\n### Imbalance\n",
    *imbalance_lines,
    "\n### Consistency\n",
    f"- Most common size: {most_common_size}\n",
    f"- Images with different sizes: {len(outliers)}\n",
    "- **Recommendation**: Resize all images to consistent dimensions\n",
    "\n### Data Quality\n",
    f"- Potential duplicates found: {len(duplicates)}\n",
    "- **Recommendation**: Review and remove duplicates if necessary\n",
])

report_path = f'{OUTPUT_DIR}/eda_report.md'
# Explicit encoding: class names may contain non-ASCII characters and the
# platform default encoding is not guaranteed to handle them.
with open(report_path, 'w', encoding='utf-8') as f:
    f.writelines(report_lines)

print("\n" + "="*60)
print("✅ EDA COMPLETE")
print("="*60)
print(f"\nArtifacts saved to: {OUTPUT_DIR}/")
for filename in sorted(os.listdir(OUTPUT_DIR)):
    print(f"  - {filename}")