# Image Analysis for Instagram Photos

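This notebook analyzes downloaded Instagram images and extracts, for each image:
1. **Luminosity** - Statistics (mean, median, standard deviation, min, max) of the grayscale intensity values
2. **Saturation** - Statistics of the S channel in HSV color space, with the median used as the headline value
3. **Predominant Color** - The center of the largest cluster found by K-means clustering of the pixel colors

The per-image metrics are then summarized with histograms and a scatter plot.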

## Setup and Imports

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from pathlib import Path
from sklearn.cluster import KMeans
from collections import Counter
import seaborn as sns

# Set style for better visualizations.
# The Matplotlib style sheet is applied first and the seaborn palette second;
# keep this order in case the style sheet also touches the color cycle.
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require
# Matplotlib >= 3.6 - confirm against the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Plain confirmation that the import/setup cell ran to completion.
print("Libraries imported successfully!")

## Configuration

In [None]:
# Folder that holds the downloaded images
IMAGE_DIR = "images"

# How many K-means clusters to fit when looking for the predominant color
N_COLORS = 5

# File suffixes (with leading dot) that are treated as images
VALID_EXTENSIONS = (
    '.jpg',
    '.jpeg',
    '.png',
    '.webp',
)

# Echo the active settings so they are visible in the cell output
print(f"Image directory: {IMAGE_DIR}")
print(f"Number of color clusters: {N_COLORS}")

## Helper Functions

In [None]:
def load_image(image_path):
    """
    Read an image file from disk with OpenCV.

    Args:
        image_path: Location of the image file (string or path-like;
            it is converted with str() before being handed to OpenCV)

    Returns:
        The decoded image as returned by cv2.imread (BGR channel order,
        OpenCV's default)

    Raises:
        ValueError: If cv2.imread returns None for the given path
    """
    path_text = str(image_path)
    loaded = cv2.imread(path_text)

    # cv2.imread signals failure by returning None rather than raising
    if loaded is not None:
        return loaded
    raise ValueError(f"Failed to load image: {image_path}")


def calculate_luminosity(image):
    """
    Summarize the brightness of an image via its grayscale conversion.

    Args:
        image: BGR image

    Returns:
        Dictionary with the keys 'mean_luminosity', 'median_luminosity',
        'std_luminosity' (floats) and 'min_luminosity', 'max_luminosity'
        (ints), all computed over the grayscale pixel values
    """
    gray_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Central tendency and spread are reported as floats
    mean_value = float(np.mean(gray_pixels))
    median_value = float(np.median(gray_pixels))
    spread = float(np.std(gray_pixels))

    # Extremes are reported as plain ints
    darkest = int(np.min(gray_pixels))
    brightest = int(np.max(gray_pixels))

    return dict(
        mean_luminosity=mean_value,
        median_luminosity=median_value,
        std_luminosity=spread,
        min_luminosity=darkest,
        max_luminosity=brightest,
    )


def calculate_saturation(image):
    """
    Summarize the color saturation of an image.

    The image is converted to HSV and the statistics are taken over the
    S channel (index 1 of the last axis).

    Args:
        image: BGR image

    Returns:
        Dictionary with the keys 'median_saturation', 'mean_saturation',
        'std_saturation' (floats) and 'min_saturation', 'max_saturation'
        (ints)
    """
    s_channel = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)[..., 1]

    # (result key, reducing function, output type) in the order reported
    summary_spec = (
        ('median_saturation', np.median, float),
        ('mean_saturation', np.mean, float),
        ('std_saturation', np.std, float),
        ('min_saturation', np.min, int),
        ('max_saturation', np.max, int),
    )

    return {key: cast(fn(s_channel)) for key, fn, cast in summary_spec}


def get_predominant_colors(image, n_colors=5):
    """
    Get predominant colors using K-means clustering.

    Every pixel of the image (converted to RGB) is clustered into
    ``n_colors`` groups; each cluster center is reported as a color
    together with the share of pixels assigned to it.

    Args:
        image: BGR image
        n_colors: Number of predominant colors to extract

    Returns:
        Dictionary with:
            'predominant_color': (R, G, B) tuple of the largest cluster
            'predominant_color_percentage': share of pixels (0-100) in
                that cluster
            'all_colors': list of {'color_rgb', 'percentage'} dicts for
                every cluster, sorted by percentage in descending order

        Color components are plain Python ints and percentages are plain
        Python floats, so the values print and serialize without NumPy
        scalar wrappers.
    """
    # Convert BGR to RGB so the reported colors use the conventional order
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Flatten to one row per pixel: shape (num_pixels, 3)
    pixels = rgb_image.reshape(-1, 3)

    # Fixed random_state keeps the clustering reproducible between runs
    kmeans = KMeans(n_clusters=n_colors, random_state=42, n_init=10)
    kmeans.fit(pixels)

    # Cluster centers are floats; truncate them to integer channel values
    centers = kmeans.cluster_centers_.astype(int)

    # Vectorized tally of pixels per cluster. minlength guarantees one slot
    # per cluster even if a cluster ended up with no pixels.
    counts = np.bincount(kmeans.labels_, minlength=n_colors)
    total_pixels = len(pixels)

    color_info = [
        {
            # int(...) strips the NumPy scalar type: with NumPy >= 2,
            # str() of a tuple of NumPy ints renders as
            # "(np.int64(12), ...)", which would leak into exported text.
            'color_rgb': tuple(int(channel) for channel in center),
            'percentage': (int(count) / total_pixels) * 100,
        }
        for center, count in zip(centers, counts)
    ]

    # Sort by percentage (descending); the sort is stable, so ties keep
    # their cluster-index order
    color_info.sort(key=lambda entry: entry['percentage'], reverse=True)

    return {
        'predominant_color': color_info[0]['color_rgb'],
        'predominant_color_percentage': color_info[0]['percentage'],
        'all_colors': color_info
    }


def analyze_image(image_path, n_colors=5):
    """
    Run every analysis step on a single image and merge the results.

    Args:
        image_path: Path to the image file
        n_colors: Number of predominant colors to extract

    Returns:
        One flat dictionary holding the file name, the image dimensions and
        channel count, and all luminosity, saturation and color entries;
        None if any step raised (the error is printed, not re-raised)
    """
    try:
        image = load_image(image_path)
        height, width, channels = image.shape

        # Run the three analyses before assembling the record
        luminosity_stats = calculate_luminosity(image)
        saturation_stats = calculate_saturation(image)
        color_stats = get_predominant_colors(image, n_colors)

        # File metadata first, then each group of metrics in turn
        record = {
            'filename': os.path.basename(image_path),
            'width': width,
            'height': height,
            'channels': channels,
        }
        for metrics in (luminosity_stats, saturation_stats, color_stats):
            record.update(metrics)
    except Exception as e:
        # Best-effort: report the failing file and let the caller skip it
        print(f"Error analyzing {image_path}: {e}")
        return None

    return record


# Plain confirmation that the helper-function cell ran to completion.
print("Helper functions defined successfully!")

## Load and Analyze Images

In [None]:
# Get list of image files
image_dir = Path(IMAGE_DIR)

# Always (re)bind image_files. Without this, a missing directory leaves the
# name undefined (NameError in the next cell) or, when the cell is re-run,
# silently keeps a stale list from an earlier execution.
image_files = []

if not image_dir.is_dir():
    print(f"Error: Directory '{IMAGE_DIR}' not found.")
    print("Please run photo_downloader.py first to download images.")
else:
    # iterdir() yields entries in arbitrary order; sorting makes the
    # processing order, and therefore the row order of the results,
    # reproducible. is_file() skips directories whose names happen to end
    # in an image suffix.
    image_files = sorted(
        f for f in image_dir.iterdir()
        if f.is_file() and f.suffix.lower() in VALID_EXTENSIONS
    )
    print(f"Found {len(image_files)} images to analyze.")

In [None]:
# Run the analysis over every discovered image, keeping only the successes
results = []
total_images = len(image_files)

for position, image_path in enumerate(image_files, start=1):
    print(f"Analyzing image {position}/{total_images}: {image_path.name}")
    analysis = analyze_image(image_path, n_colors=N_COLORS)
    if not analysis:
        # analyze_image already reported the failure; skip this file
        continue
    results.append(analysis)

print(f"\nSuccessfully analyzed {len(results)} images.")

## Create Results DataFrame

In [None]:
# Build one row per successfully analyzed image
df = pd.DataFrame(results)

# Quick overview of what was collected
print(f"Total images analyzed: {len(df)}")
print(f"\nDataFrame shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

# Preview only the scalar columns; the nested color data is left out
display_columns = [
    'filename',
    'width',
    'height',
    'mean_luminosity',
    'median_saturation',
    'predominant_color_percentage',
]
print("\nFirst few rows:")
df.loc[:, display_columns].head()

## Statistical Summary

In [None]:
# Descriptive statistics for the headline luminosity and saturation metrics
stats_columns = [
    'mean_luminosity',
    'median_luminosity',
    'median_saturation',
    'mean_saturation',
]
print("Statistical Summary:")
df.loc[:, stats_columns].describe()

## Visualizations

### Luminosity Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One entry per panel, left to right:
# (column, x-axis label, title, bar color, marker-line color)
luminosity_panels = [
    ('mean_luminosity', 'Mean Luminosity',
     'Distribution of Mean Luminosity', 'skyblue', 'red'),
    ('median_luminosity', 'Median Luminosity',
     'Distribution of Median Luminosity', 'lightcoral', 'blue'),
]

for ax, (column, x_label, title, bar_color, line_color) in zip(axes, luminosity_panels):
    values = df[column]
    average = values.mean()

    ax.hist(values, bins=20, color=bar_color, edgecolor='black')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    # Dashed vertical line marks the average of the plotted column
    ax.axvline(average, color=line_color, linestyle='--', label=f'Mean: {average:.2f}')
    ax.legend()

plt.tight_layout()
plt.show()

### Saturation Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One entry per panel, left to right:
# (column, x-axis label, title, bar color, marker-line color)
saturation_panels = [
    ('median_saturation', 'Median Saturation',
     'Distribution of Median Saturation (HSV)', 'lightgreen', 'red'),
    ('mean_saturation', 'Mean Saturation',
     'Distribution of Mean Saturation (HSV)', 'plum', 'blue'),
]

for ax, (column, x_label, title, bar_color, line_color) in zip(axes, saturation_panels):
    values = df[column]
    average = values.mean()

    ax.hist(values, bins=20, color=bar_color, edgecolor='black')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    # Dashed vertical line marks the average of the plotted column
    ax.axvline(average, color=line_color, linestyle='--', label=f'Mean: {average:.2f}')
    ax.legend()

plt.tight_layout()
plt.show()

### Luminosity vs Saturation Scatter Plot

In [None]:
# One point per image: brightness on the x-axis, saturation on the y-axis
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(df['mean_luminosity'], df['median_saturation'], alpha=0.6, s=100)
scatter_ax.set_xlabel('Mean Luminosity')
scatter_ax.set_ylabel('Median Saturation')
scatter_ax.set_title('Luminosity vs Saturation')
scatter_ax.grid(True, alpha=0.3)
plt.show()

### Predominant Colors Visualization

In [None]:
def plot_image_with_colors(image_path, result, n_colors=5):
    """
    Show an image next to a strip of its predominant colors.

    The left panel displays the image; the right panel displays one swatch
    per color, labeled with the share of pixels in that cluster, plus a text
    box with the image's mean luminosity and median saturation.

    Args:
        image_path: Path to the image file
        result: Analysis result dictionary (reads 'filename', 'all_colors',
            'mean_luminosity' and 'median_saturation')
        n_colors: Number of colors to display
    """
    # Matplotlib expects RGB, OpenCV loads BGR
    img_rgb = cv2.cvtColor(load_image(image_path), cv2.COLOR_BGR2RGB)

    fig, (image_ax, palette_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: the image itself
    image_ax.imshow(img_rgb)
    image_ax.set_title(f"Image: {result['filename']}")
    image_ax.axis('off')

    # Right panel: the leading n_colors entries of the color list
    top_colors = result['all_colors'][:n_colors]
    swatches = [entry['color_rgb'] for entry in top_colors]
    shares = [entry['percentage'] for entry in top_colors]

    # A 1 x k "image" of the colors, scaled from 0-255 to 0-1 for imshow
    swatch_strip = np.array(swatches).reshape(1, -1, 3)
    swatch_strip = swatch_strip / 255.0
    palette_ax.imshow(swatch_strip, aspect='auto')
    palette_ax.set_title('Predominant Colors')
    palette_ax.set_yticks([])
    palette_ax.set_xticks(range(len(swatches)))
    palette_ax.set_xticklabels([f"{share:.1f}%" for share in shares])

    # Text box below the palette with the headline metrics
    stats_text = f"Luminosity: {result['mean_luminosity']:.2f}\nSaturation: {result['median_saturation']:.2f}"
    box_style = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    palette_ax.text(
        0.5, -0.2, stats_text,
        transform=palette_ax.transAxes,
        ha='center', fontsize=10, bbox=box_style,
    )

    plt.tight_layout()
    plt.show()


# Show up to three analyzed images next to their color palettes
n_samples = min(3, len(results))
for sample in results[:n_samples]:
    image_path = image_dir / sample['filename']
    plot_image_with_colors(image_path, sample, n_colors=N_COLORS)

## Save Results to CSV

In [None]:
# Prepare DataFrame for export (exclude complex nested data)
export_df = df.drop(columns=['all_colors'], errors='ignore').copy()

# Convert predominant_color tuple to an "(R, G, B)" string.
# Each channel is cast to a plain int first: with NumPy >= 2, str() of a
# tuple holding NumPy integers renders as "(np.int64(12), ...)", which would
# end up verbatim in the CSV.
# The column check mirrors errors='ignore' above, so the export does not
# raise a KeyError when no image was analyzed.
if 'predominant_color' in export_df.columns:
    export_df['predominant_color'] = export_df['predominant_color'].apply(
        lambda rgb: str(tuple(int(channel) for channel in rgb))
    )

# Save to CSV
output_file = 'image_analysis_results.csv'
export_df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

## Summary Statistics

In [None]:
# Horizontal rule used above and below the report
banner = "=" * 60

print(banner)
print("IMAGE ANALYSIS SUMMARY")
print(banner)
print(f"\nTotal images analyzed: {len(df)}")

# Headline metrics: mean luminosity and median saturation per image
luminosity = df['mean_luminosity']
print(f"\nAverage Luminosity: {luminosity.mean():.2f}")
saturation = df['median_saturation']
print(f"Average Saturation: {saturation.mean():.2f}")

# idxmax/idxmin give the row label of the extreme value; look up its file name
brightest = df.loc[luminosity.idxmax(), 'filename']
print(f"\nBrightest image: {brightest}")
darkest = df.loc[luminosity.idxmin(), 'filename']
print(f"Darkest image: {darkest}")
most_saturated = df.loc[saturation.idxmax(), 'filename']
print(f"\nMost saturated image: {most_saturated}")
least_saturated = df.loc[saturation.idxmin(), 'filename']
print(f"Least saturated image: {least_saturated}")
print("\n" + banner)