<a href="https://colab.research.google.com/github/ashwin-yedte/visual-intelligence-travel-finance/blob/main/notebooks/Step_1_Image_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Image Analysis & Feature Extraction

**Version**: 1.0  
**Author**: Ashwin Kumar Y (2023AC05628)  
**Project**: Visual Intelligence for Travel & Finance Optimization  
**Institution**: BITS Pilani  
**Date**: January 2026

---

## Overview

This notebook implements:
- Image preprocessing and validation
- CLIP-based feature extraction (512-dim embeddings)
- Zero-shot scene classification for Indian seashores
- Batch processing for 1-5 images

## Outputs
- Scene classification scores
- Visual descriptors
- Color statistics
- Dominant themes

---

In [3]:
# ============================================================================
# SETUP & INSTALLATION
# ============================================================================

print("Installing required packages...")
print("=" * 70)

# Install dependencies
!pip install -q transformers torch torchvision pillow scikit-learn

print("Installation complete!")
print("=" * 70)

# Import libraries
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np
from typing import List, Dict, Tuple
import io
import json
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from google.colab import files
import os
import warnings
warnings.filterwarnings('ignore')

# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"\n PyTorch version: {torch.__version__}")
print(f" Device: {device}")

# The GPU name is only reported when a CUDA device was selected
if device == "cuda":
    print(f" GPU: {torch.cuda.get_device_name(0)}")

Installing required packages...
Installation complete!

 PyTorch version: 2.9.0+cpu
 Device: cpu


## Configuration Constants

In [5]:
"""
================================================================================
CELL 3: CONFIGURATION CONSTANTS
================================================================================
"""

# Opening banner for the configuration step (rule, title, rule)
print("=" * 80, "CONFIGURATION: Setting up system parameters", "=" * 80, sep="\n")

class Config:
    """
    Single place holding the tunable settings of the image analysis step.

    All values are class attributes, so they are read as ``Config.NAME``
    without creating an instance.
    """

    # --- Model ---------------------------------------------------------------
    CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"
    EMBEDDING_DIMENSION = 512

    # --- Image processing ----------------------------------------------------
    TARGET_IMAGE_SIZE = (224, 224)
    MAX_IMAGE_SIZE_MB = 10.0
    SUPPORTED_FORMATS = ['jpg', 'jpeg', 'png']

    # --- Batch processing ----------------------------------------------------
    MIN_IMAGES = 1
    MAX_IMAGES = 5

    # --- Color analysis ------------------------------------------------------
    NUM_DOMINANT_COLORS = 3
    COLOR_SAMPLE_SIZE = 10000

    # --- Scene classification ------------------------------------------------
    TOP_K_SCENES = 5
    HIGH_CONFIDENCE_THRESHOLD = 0.75
    MEDIUM_CONFIDENCE_THRESHOLD = 0.60
    HIGH_CONFIDENCE_GAP = 0.15
    MEDIUM_CONFIDENCE_GAP = 0.10

    # --- Thresholds ----------------------------------------------------------
    BRIGHTNESS_BRIGHT_THRESHOLD = 150
    BRIGHTNESS_MODERATE_THRESHOLD = 100
    THEME_CONSISTENCY_THRESHOLD = 0.5

    # --- Output --------------------------------------------------------------
    OUTPUT_JSON_FILE = "step1_analysis_results.json"
    OUTPUT_VISUALIZATION_FILE = "step1_visualization.png"
    VISUALIZATION_DPI = 150

    # --- Metadata ------------------------------------------------------------
    VERSION = "1.0.0"
    STEP_NAME = "Step 1: Image Analysis and Feature Extraction"
    AUTHOR = "Ashwin Kumar Y (2023AC05628)"
    PROJECT = "Visual Intelligence for Travel and Finance Optimization"

    @classmethod
    def display_config(cls):
        """Print a short summary of the most relevant settings."""
        # (label, value) pairs in the order they are printed
        summary_rows = (
            ("Model", cls.CLIP_MODEL_NAME),
            ("Target Image Size", cls.TARGET_IMAGE_SIZE),
            ("Max Images per Batch", cls.MAX_IMAGES),
            ("Embedding Dimension", cls.EMBEDDING_DIMENSION),
            ("Supported Formats", ', '.join(cls.SUPPORTED_FORMATS)),
        )

        print("\nCurrent Configuration:")
        for label, value in summary_rows:
            print(f"  {label}: {value}")

# Show the active settings, then close the step with a banner
Config.display_config()

print(f"\n{'=' * 80}")
print("CONFIGURATION COMPLETE: System parameters set")
print("=" * 80)


CONFIGURATION: Setting up system parameters

Current Configuration:
  Model: openai/clip-vit-base-patch32
  Target Image Size: (224, 224)
  Max Images per Batch: 5
  Embedding Dimension: 512
  Supported Formats: jpg, jpeg, png

CONFIGURATION COMPLETE: System parameters set


## Prompt Library

In [6]:
"""
================================================================================
CELL 4: SEASHORE PROMPT LIBRARY
================================================================================
"""

# Opening banner for the prompt library step (rule, title, rule)
print("=" * 80, "PROMPT LIBRARY: Loading scene classification prompts", "=" * 80, sep="\n")

class SeashorePromptLibrary:
    """
    Text prompts used for Indian seashore scene classification.

    The prompts are kept in four category lists (primary, regional,
    activity, aesthetic); the class methods expose them flat, grouped,
    or looked up by prompt text.
    """

    # General seashore characteristics
    PRIMARY_PROMPTS = [
        "a tropical beach with palm trees and golden sand",
        "a pristine white sand beach with clear turquoise water",
        "a rocky coastline with cliffs and crashing waves",
        "a serene beach with calm waters and gentle waves",
        "a beach with traditional fishing boats and nets",
        "a secluded cove with crystal clear water",
        "a sunset beach with orange and pink sky",
        "a beach with water sports activities and equipment",
    ]

    # Characteristics tied to specific Indian regions
    INDIAN_REGIONAL_PROMPTS = [
        "a Goa style beach with colorful shacks and palm trees",
        "a Kerala beach with coconut groves and backwaters",
        "an Andaman island beach with coral reefs and pristine water",
        "a Konkan coast beach with rocky cliffs and coconut trees",
        "a beach with traditional fishing village and local culture",
        "a beach near a temple or coastal religious site",
        "a mangrove lined coastal area with dense vegetation",
        "a Tamil Nadu beach with rocky shores",
    ]

    # Activities and atmosphere
    ACTIVITY_PROMPTS = [
        "a peaceful beach ideal for relaxation and meditation",
        "an adventure beach with water sports and activities",
        "a beach with beach parties and nightlife",
        "a family friendly beach with safe shallow waters",
        "a romantic beach with scenic sunset views",
        "an offbeat secluded beach with minimal crowds",
    ]

    # Visual look and feel
    AESTHETIC_PROMPTS = [
        "a beach with dramatic landscape and scenic views",
        "a beach with clear blue sky and bright sunlight",
        "a beach with golden hour lighting and warm tones",
        "a beach with lush green vegetation and natural beauty",
        "a beach with unique rock formations and natural features",
    ]

    @classmethod
    def get_all_prompts(cls) -> List[str]:
        """
        Flatten every category into one list.

        Returns:
            A new list with the primary, regional, activity and aesthetic
            prompts, in that order.
        """
        # Categories are stored in insertion order, which is the same order
        # as the documented concatenation.
        return [
            text
            for group in cls.get_prompt_categories().values()
            for text in group
        ]

    @classmethod
    def get_prompt_categories(cls) -> Dict[str, List[str]]:
        """
        Group the prompts by category.

        Returns:
            Dictionary mapping each category name ('primary', 'regional',
            'activity', 'aesthetic') to its prompt list.
        """
        grouped = {}
        grouped['primary'] = cls.PRIMARY_PROMPTS
        grouped['regional'] = cls.INDIAN_REGIONAL_PROMPTS
        grouped['activity'] = cls.ACTIVITY_PROMPTS
        grouped['aesthetic'] = cls.AESTHETIC_PROMPTS
        return grouped

    @classmethod
    def get_category_for_prompt(cls, prompt: str) -> str:
        """
        Look up the category that contains a prompt.

        Args:
            prompt: The exact prompt string to look up

        Returns:
            Name of the first category containing the prompt, or 'unknown'
            when no category contains it.
        """
        matches = (
            name
            for name, group in cls.get_prompt_categories().items()
            if prompt in group
        )
        return next(matches, 'unknown')

    @classmethod
    def display_statistics(cls):
        """Print the total number of prompts and the count per category."""
        print("\nPrompt Library Statistics:")
        print(f"  Total prompts: {len(cls.get_all_prompts())}")
        for name, group in cls.get_prompt_categories().items():
            print(f"    {name.capitalize()}: {len(group)} prompts")


# Display prompt library statistics
SeashorePromptLibrary.display_statistics()

# The count in the closing banner is computed from the library so it cannot
# drift out of sync when prompts are added to or removed from a category.
print("\n" + "="*80)
print(f"PROMPT LIBRARY COMPLETE: {len(SeashorePromptLibrary.get_all_prompts())} specialized prompts loaded")
print("="*80)


PROMPT LIBRARY: Loading scene classification prompts

Prompt Library Statistics:
  Total prompts: 27
    Primary: 8 prompts
    Regional: 8 prompts
    Activity: 6 prompts
    Aesthetic: 5 prompts

PROMPT LIBRARY COMPLETE: 27 specialized prompts loaded


## Image Preprocessor Class

**Purpose**: Handles image validation, preprocessing, and color-statistics extraction
before images are fed to the CLIP model.


In [8]:
# Opening banner for the image preprocessor step (rule, title, rule)
print("=" * 80, "IMAGE PREPROCESSOR: Initializing image processing pipeline", "=" * 80, sep="\n")

from typing import Any # Added this import

class ImagePreprocessor:
    """
    Image preprocessing pipeline for beach/seashore images.

    Responsibilities:
    - Image validation (file size, format, integrity)
    - Image standardization (EXIF orientation fix, RGB conversion,
      aspect-preserving resize onto a white canvas)
    - Color statistics extraction (brightness, variance)
    - Dominant color identification using K-means clustering

    Attributes:
        target_size (Tuple[int, int]): Target (width, height) for resized images
        max_size_mb (float): Maximum allowed file size in megabytes
        supported_formats (List[str]): Accepted image format names

    Example:
        >>> preprocessor = ImagePreprocessor()
        >>> validation = preprocessor.validate_image(image_bytes, "beach.jpg")
        >>> if validation['valid']:
        ...     processed_img = preprocessor.preprocess_image(image_bytes)
        ...     color_stats = preprocessor.extract_color_statistics(processed_img)
    """

    # 'jpg' and 'jpeg' name the same format. Pillow reports such files as
    # "JPEG", so both spellings are folded together during the format check.
    _FORMAT_ALIASES = {'jpg': 'jpeg'}

    # Seed shared by pixel sampling and K-means so that repeated runs on the
    # same image return the same dominant colors.
    _RANDOM_SEED = 42

    def __init__(self,
                 target_size: Tuple[int, int] = None,
                 max_size_mb: float = None,
                 supported_formats: List[str] = None):
        """
        Initialize the preprocessor, falling back to Config for omitted values.

        Args:
            target_size: Target (width, height) for resized images.
                        Defaults to Config.TARGET_IMAGE_SIZE.
            max_size_mb: Maximum file size in MB. Defaults to Config.MAX_IMAGE_SIZE_MB.
            supported_formats: Accepted format names. Defaults to Config.SUPPORTED_FORMATS.

        Note:
            Any falsy argument (None, 0, empty list) is replaced by the
            corresponding Config default.
        """
        self.target_size = target_size or Config.TARGET_IMAGE_SIZE
        self.max_size_mb = max_size_mb or Config.MAX_IMAGE_SIZE_MB
        self.supported_formats = supported_formats or Config.SUPPORTED_FORMATS

        print("ImagePreprocessor initialized")
        print(f"  Target size: {self.target_size}")
        print(f"  Max file size: {self.max_size_mb} MB")
        print(f"  Supported formats: {', '.join(self.supported_formats)}")

    def validate_image(self, image_bytes: bytes, filename: str) -> Dict[str, Any]:
        """
        Validate raw image data before processing.

        Checks, in order:
        1. File size (must not exceed max_size_mb)
        2. Format (must be one of supported_formats; 'jpg' and 'jpeg' are
           treated as the same format, compared case-insensitively)
        3. Integrity (Pillow's verify() must not raise)

        Args:
            image_bytes: Raw image data as bytes
            filename: Original filename. Not used by any of the checks;
                      accepted so callers can pass it alongside the bytes.

        Returns:
            Dictionary with keys:
                - valid (bool): Whether the image passed all checks
                - error (str or None): Error message if validation failed
                - size_mb (float): File size in megabytes
                - format (str or None): Lower-cased format reported by Pillow
                - dimensions (Tuple[int, int] or None): (width, height),
                  only set when the image is valid

        Example:
            >>> result = preprocessor.validate_image(img_bytes, "beach.jpg")
            >>> if result['valid']:
            ...     print(f"Valid image: {result['dimensions']}")
            ... else:
            ...     print(f"Invalid: {result['error']}")
        """
        size_mb = len(image_bytes) / (1024 * 1024)

        # Check 1: file size
        if size_mb > self.max_size_mb:
            return {
                'valid': False,
                'error': f"File size {size_mb:.2f}MB exceeds maximum {self.max_size_mb}MB",
                'size_mb': size_mb,
                'format': None,
                'dimensions': None
            }

        try:
            img = Image.open(io.BytesIO(image_bytes))
            img_format = img.format.lower() if img.format else 'unknown'

            # Check 2: format. Both sides are normalised (lower case, 'jpg' ->
            # 'jpeg') so a configured list such as ['jpg', 'png'] still
            # accepts files that Pillow identifies as "JPEG".
            aliases = self._FORMAT_ALIASES
            accepted = {aliases.get(fmt.lower(), fmt.lower())
                        for fmt in self.supported_formats}
            if aliases.get(img_format, img_format) not in accepted:
                return {
                    'valid': False,
                    'error': f"Unsupported format '{img_format}'. Supported: {', '.join(self.supported_formats)}",
                    'size_mb': size_mb,
                    'format': img_format,
                    'dimensions': None
                }

            # The size is known as soon as the header has been read, so it is
            # captured before verify(), after which the image object must not
            # be used for anything else.
            dimensions = img.size  # (width, height)

            # Check 3: integrity - verify() raises if the file is broken
            img.verify()

            return {
                'valid': True,
                'error': None,
                'size_mb': size_mb,
                'format': img_format,
                'dimensions': dimensions
            }

        except Exception as e:
            # Any failure while opening or verifying marks the image invalid
            return {
                'valid': False,
                'error': f"Image validation failed: {str(e)}",
                'size_mb': size_mb,
                'format': None,
                'dimensions': None
            }

    def preprocess_image(self, image_bytes: bytes) -> Image.Image:
        """
        Preprocess raw image data for CLIP model input.

        Steps, in order:
        1. Load image from bytes
        2. Apply the EXIF orientation (auto-rotate if needed)
        3. Convert to RGB if the image is in any other mode
        4. Resize to fit target_size while keeping the aspect ratio,
           centred on a white canvas of exactly target_size

        Args:
            image_bytes: Raw image data as bytes

        Returns:
            PIL Image in RGB mode with dimensions equal to target_size

        Example:
            >>> processed_img = preprocessor.preprocess_image(img_bytes)
            >>> print(processed_img.size)  # target_size
            >>> print(processed_img.mode)  # 'RGB'
        """
        img = Image.open(io.BytesIO(image_bytes))

        # Orientation is fixed first so the resize sees the displayed shape
        img = self._fix_orientation(img)

        if img.mode != 'RGB':
            img = img.convert('RGB')

        return self._resize_with_padding(img, self.target_size)

    def _fix_orientation(self, img: Image.Image) -> Image.Image:
        """
        Rotate/flip the image according to its EXIF orientation tag.

        Args:
            img: PIL Image object

        Returns:
            Result of ImageOps.exif_transpose(img). If that call raises for
            any reason, the original image is returned unchanged so the
            pipeline keeps going (deliberate best-effort behaviour).
        """
        from PIL import ImageOps

        try:
            return ImageOps.exif_transpose(img)
        except Exception:
            # Best effort: a problem reading EXIF must not abort preprocessing
            return img

    def _resize_with_padding(self, img: Image.Image, target_size: Tuple[int, int]) -> Image.Image:
        """
        Resize image to fit target_size without distortion or cropping.

        Algorithm:
        1. Compare the aspect ratio of the image with that of the target
        2. Scale the image so its longer relative side matches the target
        3. Resize with LANCZOS resampling
        4. Paste the result centred on a white canvas of target_size

        Args:
            img: PIL Image to resize
            target_size: Desired output size (width, height)

        Returns:
            RGB image of exactly target_size; areas not covered by the
            resized image are white.

        Note:
            The scaled side is clamped to at least 1 pixel. Without the clamp
            a very elongated image (e.g. 1000x1 into 224x224) would truncate
            to a zero-sized dimension and the resize would raise.

        Example:
            Input:  1920x1080 image, target 224x224
            Output: 224x224 image with 224x126 content and white bars top/bottom
        """
        target_width, target_height = target_size

        img_aspect_ratio = img.width / img.height
        target_aspect_ratio = target_width / target_height

        if img_aspect_ratio > target_aspect_ratio:
            # Image is relatively wider than the target - fit to width
            new_width = target_width
            new_height = max(1, int(new_width / img_aspect_ratio))
        else:
            # Image is relatively taller than (or same as) the target - fit to height
            new_height = target_height
            new_width = max(1, int(new_height * img_aspect_ratio))

        img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

        # White canvas of the exact target size
        canvas = Image.new('RGB', target_size, (255, 255, 255))

        # Offsets that centre the resized image on the canvas
        offset_x = (target_width - new_width) // 2
        offset_y = (target_height - new_height) // 2
        canvas.paste(img, (offset_x, offset_y))

        return canvas

    def extract_color_statistics(self, img: Image.Image) -> Dict[str, Any]:
        """
        Extract color statistics from an image.

        Computes:
        1. Dominant colors using K-means clustering
        2. Brightness (mean over all pixel values of all channels)
        3. Color variance (variance over all pixel values of all channels)

        Args:
            img: PIL Image object. Images that are not in RGB mode are
                 converted to RGB first so every statistic is computed on
                 three color channels.

        Returns:
            Dictionary with keys:
                - dominant_colors: List of [R, G, B] lists, most frequent first
                - brightness: Mean pixel value as float
                - color_variance: Variance of pixel values as float

        Example:
            >>> stats = preprocessor.extract_color_statistics(img)
            >>> print(f"Top color: RGB{stats['dominant_colors'][0]}")
            >>> print(f"Brightness: {stats['brightness']:.1f}")
        """
        # Grayscale/RGBA/palette input would otherwise yield an array that
        # cannot be interpreted as RGB triplets.
        if img.mode != 'RGB':
            img = img.convert('RGB')

        img_array = np.array(img)

        return {
            'dominant_colors': self._get_dominant_colors(img_array),
            'brightness': float(np.mean(img_array)),
            'color_variance': float(np.var(img_array))
        }

    def _get_dominant_colors(self, img_array: np.ndarray, n_colors: int = None) -> List[List[int]]:
        """
        Identify dominant colors in an image using K-means clustering.

        Algorithm:
        1. Flatten the image to a list of RGB pixels
        2. If there are more than Config.COLOR_SAMPLE_SIZE pixels, draw that
           many without replacement using a fixed seed
        3. Cluster the pixels with K-means (fixed seed)
        4. Order the cluster centers by cluster size, largest first

        Args:
            img_array: Image as numpy array, normally (height, width, 3).
                       A 2-D (grayscale) array is expanded to three equal
                       channels, and channels beyond the third (e.g. alpha)
                       are ignored.
            n_colors: Number of dominant colors to extract. Falsy values
                      fall back to Config.NUM_DOMINANT_COLORS.

        Returns:
            List of [R, G, B] integer lists sorted by dominance
            Example: [[135, 206, 235], [255, 245, 200], [34, 139, 34]]
            The list has n_colors entries unless the image has fewer pixels
            than that; an image with no pixels gives an empty list.
        """
        n_colors = n_colors or Config.NUM_DOMINANT_COLORS

        pixel_data = np.asarray(img_array)
        if pixel_data.ndim == 2:
            # Grayscale: replicate the single channel into R, G and B
            pixel_data = np.stack([pixel_data] * 3, axis=-1)
        elif pixel_data.ndim >= 3 and pixel_data.shape[-1] > 3:
            # Drop alpha (or any extra channel) so rows stay true RGB triplets
            pixel_data = pixel_data[..., :3]

        # Reshape from (height, width, 3) to (num_pixels, 3)
        pixels = pixel_data.reshape(-1, 3)

        if len(pixels) == 0:
            return []

        # Sample pixels if too many (for performance). A seeded generator is
        # used so the sample, and therefore the result, is reproducible.
        if len(pixels) > Config.COLOR_SAMPLE_SIZE:
            rng = np.random.default_rng(self._RANDOM_SEED)
            indices = rng.choice(len(pixels), Config.COLOR_SAMPLE_SIZE, replace=False)
            pixels = pixels[indices]

        # K-means cannot build more clusters than there are samples
        n_clusters = min(n_colors, len(pixels))

        # n_init=10 runs the algorithm from 10 initialisations and keeps the best
        kmeans = KMeans(n_clusters=n_clusters, random_state=self._RANDOM_SEED, n_init=10)
        kmeans.fit(pixels)

        # Cluster centers are the dominant colors (truncated to integers)
        colors = kmeans.cluster_centers_.astype(int)

        # minlength keeps one count per cluster even when trailing clusters
        # received no pixels (e.g. a single-color image); without it those
        # colors would be dropped from the result.
        counts = np.bincount(kmeans.labels_, minlength=n_clusters)

        # Most populated cluster first
        sorted_indices = np.argsort(-counts)
        return colors[sorted_indices].tolist()


# Create the shared preprocessor instance with the Config defaults
print("\nInitializing ImagePreprocessor...")
preprocessor = ImagePreprocessor()

# Closing banner for the step
print(f"\n{'=' * 80}")
print("IMAGE PREPROCESSOR COMPLETE: Ready to process images")
print("=" * 80)


IMAGE PREPROCESSOR: Initializing image processing pipeline

Initializing ImagePreprocessor...
ImagePreprocessor initialized
  Target size: (224, 224)
  Max file size: 10.0 MB
  Supported formats: jpg, jpeg, png

IMAGE PREPROCESSOR COMPLETE: Ready to process images
