# Visual Concept Analysis: Exploring Semantic Structures in Image Collections

An initial analysis of the dataset to uncover visual patterns, clusters, and relationships among image categories. The focus is mainly on understanding and organizing the visual space.

## Section 1: Environment Setup and Data Loading

In this section, we will:
- Import all necessary libraries for data manipulation, visualization, and machine learning
- Load the dataset
- Perform initial data inspection to understand the structure, dimensions, and basic characteristics of our dataset

The libraries selected cover essential functionalities:
- **Data handling**: pandas, numpy for efficient data structures and numerical operations
- **Visualization**: matplotlib, seaborn, plotly for exploratory and publication-quality visualizations
- **Machine Learning**: scikit-learn for preprocessing, modeling, and evaluation; TensorFlow/Keras (optional) for neural-network feature extraction

In [None]:
# Import core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import os
from pathlib import Path
import glob

# Image processing libraries
from PIL import Image
import cv2

# Sklearn preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer

# Sklearn model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold

# Sklearn metrics
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, 
    precision_score, recall_score, f1_score, roc_auc_score, roc_curve,
    silhouette_score, davies_bouldin_score, calinski_harabasz_score
)

# Models for classification/clustering
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Deep Learning for feature extraction
try:
    import tensorflow as tf
    from tensorflow.keras.applications import VGG16, ResNet50, MobileNetV2
    from tensorflow.keras.applications.vgg16 import preprocess_input as vgg16_preprocess
    from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
    from tensorflow.keras.applications.mobilenet_v2 import preprocess_input as mobilenet_preprocess
    from tensorflow.keras.preprocessing import image
    print("TensorFlow/Keras available for deep feature extraction")
except ImportError:
    print("TensorFlow/Keras not available - will use traditional feature extraction")

# Configuration
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Set random seed for reproducibility
RANDOM_STATE = 69  # we're aware 42 is the industry standard, but 69 is cooler
np.random.seed(RANDOM_STATE)

print("Libraries imported successfully!")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"OpenCV version: {cv2.__version__}")
print(f"PIL version: {Image.__version__}")

### 1.1 Load Image Dataset

The patternmind_dataset is organized as a folder-based image collection where:
- Each subfolder in `.data/` represents a distinct visual category
- Images within each folder are labeled by their parent folder name
- This is a hierarchical structure typical of image classification datasets

We will:
- Scan the `.data/` directory to identify all categories
- Build a catalog (DataFrame) containing image paths and their corresponding labels
- Calculate dataset statistics (number of categories, images per category, total images)

Expected output: A DataFrame with columns for image paths and category labels, along with a summary showing the distribution of images across categories. This catalog will serve as the foundation for all subsequent analysis.

In [None]:
# Build the image catalog: one DataFrame row per image file, labelled with
# the name of the folder that contains it.

# Root of the dataset; every immediate subfolder is treated as one category.
DATA_DIR = Path('.data')

# Extensions recognised as images (jpg, jpeg, png, bmp). Matching is done on
# the lower-cased suffix so that files such as 'photo.JPG' are not silently
# skipped on case-sensitive filesystems.
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp'}

# Fail early with an actionable message rather than an error raised from
# inside the directory scan.
if not DATA_DIR.exists():
    raise FileNotFoundError(
        f"Dataset directory '{DATA_DIR}' not found "
        f"(resolved to '{DATA_DIR.resolve()}')"
    )
if not DATA_DIR.is_dir():
    raise NotADirectoryError(f"Dataset path '{DATA_DIR}' is not a directory")

# Parallel lists: image_paths[k] belongs to category labels[k].
image_paths = []
labels = []

print("Scanning dataset directory...")
print("="*50)

# Category folders, sorted so the catalog order is stable between runs.
category_folders = sorted(f for f in DATA_DIR.iterdir() if f.is_dir())

print(f"Found {len(category_folders)} categories")
print("\nScanning images in each category...")

for category_folder in category_folders:
    category_name = category_folder.name

    # Sorted as well: directory listing order is filesystem-dependent, and
    # the seeded random sampling further down indexes rows by position, so
    # an unstable row order would make those samples irreproducible.
    image_files = sorted(
        entry for entry in category_folder.iterdir()
        if entry.is_file() and entry.suffix.lower() in IMAGE_EXTENSIONS
    )

    image_paths.extend(str(img_path) for img_path in image_files)
    labels.extend([category_name] * len(image_files))

# Catalog used by the rest of the notebook.
df = pd.DataFrame({
    'image_path': image_paths,
    'category': labels
})

if df.empty:
    # Keep going (the statistics below simply show NaN) but make the cause obvious.
    print(f"\n⚠️ No image files found under '{DATA_DIR}' - check the dataset location")

# Display basic information
print("\n" + "="*50)
print("Dataset loaded successfully!")
print("="*50)
print(f"\nDataset shape: {df.shape}")
print(f"Total number of images: {df.shape[0]:,}")
print(f"Number of categories: {df['category'].nunique()}")

print("\n" + "="*50)
print("Category Distribution:")
print("="*50)
# Image count per category, ordered alphabetically by category name.
category_counts = df['category'].value_counts().sort_index()
print(f"\nImages per category (first 10):")
display(category_counts.head(10))

print(f"\nStatistics:")
print(f"  Mean images per category: {category_counts.mean():.1f}")
print(f"  Median images per category: {category_counts.median():.1f}")
print(f"  Min images per category: {category_counts.min()}")
print(f"  Max images per category: {category_counts.max()}")

print("\n" + "="*50)
print("First few rows of the dataset:")
print("="*50)
display(df.head(10))

### 1.2 Dataset Overview and Validation

Now we will validate the integrity of our image dataset by:
- Checking for any corrupted or unreadable images
- Verifying image dimensions and formats
- Analyzing the distribution of images across categories
- Identifying potential class imbalance issues

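The integrity check is run on a random sample of up to 100 images rather than on the full dataset. A sample visualization showing example images from random categories follows in Section 1.3, to visually confirm the dataset quality and diversity.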

Expected output: Summary statistics about image properties (dimensions and formats of the sampled images), tables of the most and least common categories, and the class imbalance ratio.

In [None]:
# Validate a random sample of images (readability, dimensions, formats) and
# summarise how images are distributed across categories.
print("Analyzing image properties...")
print("="*50)

# Inspect at most 100 images, drawn without replacement from the catalog.
sample_size = min(100, len(df))
if sample_size > 0:
    sample_indices = np.random.choice(len(df), size=sample_size, replace=False)
else:
    # Nothing to sample from; avoid calling choice() on an empty population.
    sample_indices = np.array([], dtype=int)

widths = []
heights = []
formats = []
corrupted_images = []

for img_path in df['image_path'].iloc[sample_indices]:
    try:
        # The context manager releases the file handle even if a check fails.
        with Image.open(img_path) as img:
            width, height, img_format = img.width, img.height, img.format
            # open() only parses the header; verify() additionally checks
            # the file for breakage without decoding the pixel data.
            img.verify()
    except Exception as e:
        # Deliberately broad: Pillow signals unreadable files with several
        # unrelated exception types, and one bad file must not stop the scan.
        corrupted_images.append(img_path)
        print(f"Warning: Could not read {img_path}: {e}")
    else:
        widths.append(width)
        heights.append(height)
        formats.append(img_format)

if corrupted_images:
    print(f"\n⚠️ Found {len(corrupted_images)} corrupted images")
else:
    print("\n✓ All sampled images are valid")

if widths:
    # Report the number of images actually measured, which excludes any
    # that failed to load.
    print(f"\nImage Dimensions (from {len(widths)} samples):")
    print(f"  Width  - Min: {min(widths)}px, Max: {max(widths)}px, Mean: {np.mean(widths):.1f}px")
    print(f"  Height - Min: {min(heights)}px, Max: {max(heights)}px, Mean: {np.mean(heights):.1f}px")
    print(f"\nImage Formats: {set(formats)}")
else:
    print("\n⚠️ No readable images in the sample - skipping dimension statistics")

# Analyze category distribution
print("\n" + "="*50)
print("Category Distribution Analysis:")
print("="*50)

# value_counts() orders categories from most to least frequent, so head()
# and tail() give the largest and smallest classes respectively.
category_counts = df['category'].value_counts()
print(f"\nTop 10 most common categories:")
display(category_counts.head(10))

print(f"\nTop 10 least common categories:")
display(category_counts.tail(10))

# Class imbalance: size of the largest class relative to the smallest.
if category_counts.empty:
    imbalance_ratio = float('nan')
    print("\n⚠️ No categories found - cannot compute class imbalance")
else:
    imbalance_ratio = category_counts.max() / category_counts.min()
    print(f"\nClass Imbalance Ratio: {imbalance_ratio:.2f}x")
    if imbalance_ratio > 3:
        print("⚠️ Significant class imbalance detected - consider stratified sampling")
    else:
        print("✓ Relatively balanced dataset")

# Overall summary
print("\n" + "="*50)
print("Dataset Summary:")
print("="*50)
n_categories = df['category'].nunique()
# Guard the division so an empty catalog reports 0.0 instead of raising.
avg_images_per_category = len(df) / n_categories if n_categories else 0.0
print(f"Total Images: {len(df):,}")
print(f"Total Categories: {n_categories}")
print(f"Average Images per Category: {avg_images_per_category:.1f}")

### 1.3 Visualize Sample Images

To better understand our dataset, we'll display a grid of sample images from different categories. This visual inspection helps us:
- Verify that images are loading correctly
- Understand the visual diversity within and across categories
- Identify any obvious data quality issues
- Get an intuitive sense of the classification challenge ahead

The output will show a grid of randomly selected images with their category labels, giving us a qualitative view of what the model will need to learn to distinguish.

In [None]:
# Show a grid of sample images (one row per randomly chosen category), save
# it to disk, and print a short analysis of the category names.
n_categories_to_show = 8
n_images_per_category = 3

# Pick the categories to display; fewer are used if the dataset has fewer.
random_categories = np.random.choice(df['category'].unique(),
                                     size=min(n_categories_to_show, df['category'].nunique()),
                                     replace=False)

# Size the grid by the number of categories actually selected so that no
# empty rows are drawn when the dataset has fewer categories than requested.
n_rows = len(random_categories)

if n_rows == 0:
    print("⚠️ No categories available - skipping sample image grid")
else:
    # squeeze=False always returns a 2-D array of axes, so axes[i, j] is
    # valid even when the grid has a single row or a single column.
    fig, axes = plt.subplots(n_rows, n_images_per_category,
                             figsize=(15, 2.5 * n_rows),
                             squeeze=False)

    # Hide every axis up front: cells that end up without an image (small
    # category, unreadable file) then stay blank instead of showing ticks.
    for ax in axes.flat:
        ax.axis('off')

    print("Displaying sample images from random categories...")
    print("="*50)

    for i, category in enumerate(random_categories):
        # All catalog rows belonging to this category.
        category_df = df[df['category'] == category]

        # Random images from the category (all of them if it has too few).
        sampled_images = category_df.sample(n=min(n_images_per_category, len(category_df)))

        for j, img_path in enumerate(sampled_images['image_path']):
            ax = axes[i, j]

            try:
                # The context manager closes the file even if drawing fails.
                with Image.open(img_path) as img:
                    ax.imshow(img)
            except Exception as e:
                # Best effort: report the failure and leave this cell blank.
                print(f"Error loading {img_path}: {e}")
                continue

            if j == 0:  # Label each row on its first image only
                ax.set_title(f"{category}\n({len(category_df)} images)",
                             fontsize=10, fontweight='bold')

    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs('images', exist_ok=True)
    plt.savefig('images/sample_images_grid.png', dpi=150, bbox_inches='tight')
    print("✓ Sample grid saved to 'images/sample_images_grid.png'")
    plt.show()

# Additional: Show category name statistics
print("\n" + "="*50)
print("Category Name Analysis:")
print("="*50)
# Computed once and reused by every statistic below.
unique_categories = df['category'].unique()
if len(unique_categories) > 0:
    shortest_name = min(unique_categories, key=len)
    longest_name = max(unique_categories, key=len)
    print(f"Shortest category name: '{shortest_name}' ({len(shortest_name)} chars)")
    print(f"Longest category name: '{longest_name}' ({len(longest_name)} chars)")
else:
    print("No categories found - skipping name length statistics")
print(f"\nAll categories ({len(unique_categories)} total):")
print(sorted(unique_categories))