# Handwritten Character Recognition Dataset Generator

This notebook generates a synthetic dataset for OCR focusing on uppercase letters A-Z with advanced data augmentation techniques for robust CNN training.

## Pipeline Overview:
1. **Font Acquisition**: Automatic download of handwriting fonts via Fontsource API
2. **Image Generation**: Synthetic character images (28x28 grayscale) with augmentation
3. **Class Balancing**: Automatic handling of underrepresented/overrepresented characters
4. **Export**: Organized dataset structure ready for CNN training

## Output Specifications:
- Image size: 28x28 pixels (grayscale mode 'L')
- Background: Light (220-255), Text: Dark (0-50)
- Augmentations: Rotation, translation, scale, stroke width, blur, noise, morphological erosion/dilation

In [None]:
# =============================================================================
# MODULE: AUTOMATIC HANDWRITTEN FONT ACQUISITION VIA FONTSOURCE API
# =============================================================================
# This module automatically downloads handwritten-style fonts from the public
# Fontsource API without requiring an API key.
#
# Workflow:
#   1. GET request to Fontsource API to retrieve font catalog
#   2. Filter fonts by 'handwriting' category
#   3. Download .ttf files (prioritizing Latin subset)
#   4. Save to ./handwritten_fonts directory
#
# Dependencies: requests, tqdm
# =============================================================================

import os
import json
import requests
from pathlib import Path
from typing import List, Dict, Optional, Any
from concurrent.futures import ThreadPoolExecutor, as_completed

# Import tqdm, installing it on demand when it is missing.
try:
    from tqdm import tqdm
except ImportError:
    import subprocess
    import sys

    # Run pip through the interpreter that executes this code so the package
    # lands in the environment actually in use; a bare `pip` found on PATH may
    # belong to a different Python installation.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tqdm'])
    from tqdm import tqdm


# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------

# Fontsource API endpoint; the catalog is fetched from this URL and per-font
# details from "<this URL>/<font id>"
FONTSOURCE_API_URL = "https://api.fontsource.org/v1/fonts"

# Output directory for downloaded fonts (created on demand by the downloader)
FONTS_OUTPUT_DIR = "./handwritten_fonts"

# Maximum number of fonts to download (None = all)
MAX_FONTS_TO_DOWNLOAD = None

# Preferred font subsets (in priority order); other subsets are only used as
# a fallback when none of these is offered
PREFERRED_SUBSETS = ['latin', 'latin-ext']

# Preferred font weight (regular = 400); looked up as the string key "400"
PREFERRED_WEIGHT = 400

# Number of threads for parallel downloading (default worker count of the
# thread pool used by download_handwriting_fonts)
DOWNLOAD_THREADS = 4


# -----------------------------------------------------------------------------
# FONT ACQUISITION FUNCTIONS
# -----------------------------------------------------------------------------

def fetch_fonts_catalog() -> List[Dict[str, Any]]:
    """
    Download the full font catalog from the Fontsource API.

    Issues a single GET request (30 second timeout) against
    FONTSOURCE_API_URL and returns the decoded JSON payload.

    Returns:
        List[Dict]: One metadata dictionary per font in the catalog.

    Raises:
        requests.RequestException: Re-raised after logging when the request
            times out, fails, or yields an HTTP error status.
    """
    print("[INFO] Connecting to Fontsource API...")

    try:
        api_response = requests.get(FONTSOURCE_API_URL, timeout=30)
        api_response.raise_for_status()
        catalog = api_response.json()
    except requests.Timeout:
        # Timeout is handled first because it is a RequestException subclass
        print("[ERROR] Timeout while connecting to Fontsource API")
        raise
    except requests.RequestException as error:
        print(f"[ERROR] API request failed: {error}")
        raise

    print(f"[OK] Catalog retrieved: {len(catalog)} fonts available")
    return catalog


def filter_handwriting_fonts(fonts_catalog: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Filter catalog to keep only handwriting-style fonts.

    A font is kept when its 'category' value is a string equal to
    'handwriting', compared case-insensitively. Entries whose category is
    missing, None, or not a string are skipped instead of raising.

    Args:
        fonts_catalog (List[Dict]): Complete font catalog.

    Returns:
        List[Dict]: Filtered list of handwriting fonts only, in catalog order.
    """
    def _is_handwriting(font: Dict[str, Any]) -> bool:
        category = font.get('category')
        # Guard against null / non-string categories in the API payload
        return isinstance(category, str) and category.lower() == 'handwriting'

    handwriting_fonts = [font for font in fonts_catalog if _is_handwriting(font)]

    print(f"[INFO] Handwriting fonts found: {len(handwriting_fonts)}")

    return handwriting_fonts


def get_font_download_url(font_id: str) -> Optional[str]:
    """
    Retrieve TTF file download URL for a given font.

    Queries the detailed font endpoint and walks the
    variants -> weight -> style -> subset -> url mapping it returns.

    Selection order:
        1. Weight: PREFERRED_WEIGHT if offered, otherwise the first weight listed.
        2. Style: 'normal', falling back to 'italic'.
        3. Subset: each entry of PREFERRED_SUBSETS in order, then every
           remaining subset, until one provides a TTF URL.

    Args:
        font_id (str): Unique font identifier on Fontsource.

    Returns:
        Optional[str]: TTF file URL, or None when the request fails, the
        response is not valid JSON, the payload has an unexpected shape, or
        no subset offers a TTF file.
    """
    detail_url = f"{FONTSOURCE_API_URL}/{font_id}"

    try:
        response = requests.get(detail_url, timeout=15)
        response.raise_for_status()
        font_detail = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding failures on requests versions where
        # the decode error is not a RequestException subclass.
        return None

    # Get available variants (guard against non-dict payloads)
    variants = font_detail.get('variants') if isinstance(font_detail, dict) else None
    if not isinstance(variants, dict) or not variants:
        return None

    # Look for preferred weight (400 = regular), else take the first available
    weight_key = str(PREFERRED_WEIGHT)
    if weight_key not in variants:
        weight_key = next(iter(variants))

    weight_variants = variants[weight_key]
    if not isinstance(weight_variants, dict):
        return None

    # Look for normal style, then italic
    style_variants = weight_variants.get('normal') or weight_variants.get('italic')
    if not isinstance(style_variants, dict) or not style_variants:
        return None

    # Preferred subsets first, then every other subset as a fallback. Trying
    # all remaining subsets (not just the first) avoids giving up when the
    # first listed subset happens to lack a TTF file.
    preferred = [subset for subset in PREFERRED_SUBSETS if subset in style_variants]
    remaining = [subset for subset in style_variants if subset not in preferred]

    for subset in preferred + remaining:
        subset_data = style_variants[subset]
        url_map = subset_data.get('url') if isinstance(subset_data, dict) else None
        ttf_url = url_map.get('ttf') if isinstance(url_map, dict) else None
        if ttf_url:
            return ttf_url

    return None


def download_font_file(
    font_info: Dict[str, Any],
    output_dir: str
) -> Optional[str]:
    """
    Fetch one font's TTF file and write it into the output directory.

    The file is named after the font family (falling back to the font id),
    with every character other than alphanumerics, '-' and '_' replaced by
    an underscore.

    Args:
        font_info (Dict): Catalog entry for the font ('id' and 'family' are read).
        output_dir (str): Directory the .ttf file is written to.

    Returns:
        Optional[str]: Path of the saved file, or None when no download URL
        is available, the request fails, or the file cannot be written.
    """
    font_id = font_info.get('id', '')
    font_family = font_info.get('family', font_id)

    try:
        # Resolve the TTF location first; nothing to do without one
        ttf_url = get_font_download_url(font_id)
        if not ttf_url:
            return None

        ttf_response = requests.get(ttf_url, timeout=30)
        ttf_response.raise_for_status()

        # Build a filesystem-safe file name from the family name
        sanitized = [
            ch if (ch.isalnum() or ch in '-_') else '_'
            for ch in font_family
        ]
        target = Path(output_dir) / ("".join(sanitized) + ".ttf")

        with open(target, 'wb') as font_file:
            font_file.write(ttf_response.content)
    except (requests.RequestException, IOError):
        return None

    return str(target)


def download_handwriting_fonts(
    max_fonts: Optional[int] = None,
    output_dir: str = FONTS_OUTPUT_DIR,
    num_threads: int = DOWNLOAD_THREADS
) -> Dict[str, Any]:
    """
    Download handwriting fonts end to end and report what happened.

    Steps performed:
    1. Fetch the Fontsource catalog
    2. Keep only handwriting fonts
    3. Download the TTF files in parallel, tracking progress with tqdm

    Any exception raised along the way is printed as a critical error and
    the statistics gathered so far are returned.

    Args:
        max_fonts (Optional[int]): Maximum number of fonts to download.
        output_dir (str): Destination directory (created if missing).
        num_threads (int): Number of threads for parallel downloading.

    Returns:
        Dict: Download statistics with keys 'total_available',
        'total_attempted', 'successful', 'failed', 'downloaded_fonts'
        and 'failed_fonts'.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("HANDWRITTEN FONT ACQUISITION VIA FONTSOURCE API")
    print(banner)

    # Make sure the destination exists before any worker writes to it
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    stats = {
        'total_available': 0,
        'total_attempted': 0,
        'successful': 0,
        'failed': 0,
        'downloaded_fonts': [],
        'failed_fonts': []
    }

    try:
        # Steps 1 and 2: catalog retrieval and category filter
        candidates = filter_handwriting_fonts(fetch_fonts_catalog())
        stats['total_available'] = len(candidates)

        if not candidates:
            print("[WARNING] No handwriting fonts found in catalog")
            return stats

        # Optional cap on the number of fonts
        selected = candidates
        if max_fonts and max_fonts < len(candidates):
            selected = candidates[:max_fonts]
            print(f"[INFO] Limited to {max_fonts} fonts (out of {len(candidates)} available)")

        stats['total_attempted'] = len(selected)

        # Step 3: parallel download with progress bar
        print(f"\n[DOWNLOAD] Starting download of {len(selected)} fonts...")
        print(f"[INFO] Using {num_threads} parallel threads")
        print(f"[INFO] Destination: {destination.absolute()}\n")

        with ThreadPoolExecutor(max_workers=num_threads) as pool:
            # Map each submitted task back to the font it downloads
            pending = {}
            for font in selected:
                pending[pool.submit(download_font_file, font, output_dir)] = font

            with tqdm(total=len(selected), desc="Download", unit="font") as progress:
                for finished in as_completed(pending):
                    font = pending[finished]
                    label = font.get('family', font.get('id', 'Unknown'))

                    # A task that raised counts the same as one returning None
                    try:
                        saved_path = finished.result()
                    except Exception:
                        saved_path = None

                    if saved_path:
                        stats['successful'] += 1
                        stats['downloaded_fonts'].append({
                            'name': label,
                            'path': saved_path
                        })
                    else:
                        stats['failed'] += 1
                        stats['failed_fonts'].append(label)

                    progress.update(1)

        # Summary report
        print("\n" + "-" * 70)
        print("[DOWNLOAD SUMMARY]")
        print(f"  - Handwriting fonts available: {stats['total_available']}")
        print(f"  - Download attempts: {stats['total_attempted']}")
        print(f"  - Successful downloads: {stats['successful']}")
        print(f"  - Failed downloads: {stats['failed']}")
        print(f"  - Output directory: {destination.absolute()}")

        if stats['successful'] > 0:
            print(f"\n[OK] {stats['successful']} handwriting fonts ready for use")
        else:
            print("\n[WARNING] No fonts could be downloaded")

        print(banner + "\n")

        return stats

    except Exception as e:
        print(f"\n[CRITICAL ERROR] {e}")
        return stats


# -----------------------------------------------------------------------------
# EXECUTE DOWNLOAD
# -----------------------------------------------------------------------------

# Look for .ttf files left in the font directory by a previous run
_fonts_dir = Path(FONTS_OUTPUT_DIR)
existing_fonts = list(_fonts_dir.glob("*.ttf")) if _fonts_dir.exists() else []

# Download by default; when fonts are already present, ask the user first
_run_download = True
if existing_fonts:
    print(f"[INFO] {len(existing_fonts)} fonts already present in '{FONTS_OUTPUT_DIR}'")
    user_choice = input("Do you want to download additional fonts? (y/n): ").strip().lower()
    _run_download = user_choice == 'y'

if _run_download:
    download_stats = download_handwriting_fonts(
        max_fonts=MAX_FONTS_TO_DOWNLOAD,
        output_dir=FONTS_OUTPUT_DIR
    )
else:
    # Reuse what is on disk and report it as the successful count
    print("[INFO] Download skipped. Using existing fonts.")
    download_stats = {'successful': len(existing_fonts), 'downloaded_fonts': []}

In [None]:
# =============================================================================
# DEPENDENCY INSTALLATION
# =============================================================================
# This cell installs the libraries used by this notebook:
#   - Font downloading via API (requests, tqdm)
#   - Image generation (Pillow, numpy, opencv-python)
#   - TRDG (TextRecognitionDataGenerator) and its text-shaping helpers
#
# NOTE(review): scipy was listed here previously, but no command in this cell
# installs it explicitly; it is only present if requirements.txt provides it.
# NOTE(review): the font-acquisition cell above already imports requests and
# tqdm, so on a fresh environment this cell may need to run first; confirm
# the intended cell order.

# Install the packages listed in requirements.txt
# (presumably the TRDG requirements; verify against the file's contents)
%pip install -r requirements.txt

# Core dependencies
%pip install pillow numpy opencv-python requests tqdm matplotlib

# Optional dependencies for TRDG (if using legacy generator)
%pip install "arabic-reshaper>=2.1.4" python-bidi

# Install TRDG itself from GitHub; --no-deps keeps pip from installing the
# package's own declared dependencies, which are handled by the steps above
%pip install git+https://github.com/Belval/TextRecognitionDataGenerator.git --no-deps

print("\n[OK] All dependencies are installed.")

In [None]:
# Import Required Libraries
import os
import string
from pathlib import Path
from trdg.generators import GeneratorFromStrings
import multiprocessing
import random

In [None]:
# =============================================================================
# GLOBAL CONFIGURATION PARAMETERS
# =============================================================================
# These parameters control the overall behavior of dataset generation.
# Modify them according to your requirements before running generation.

# Per-letter sample count before any class-balancing multiplier is applied
BASE_SAMPLES_PER_LETTER = 10000

# Where the generated dataset is written
OUTPUT_ROOT = "dataset_handwritten_28x28"

# Where the handwritten .ttf fonts are read from
FONT_DIR = "./handwritten_fonts"

# Characters to generate: uppercase A-Z
LETTERS = list(string.ascii_uppercase)

# Toggle for automatic class balancing
USE_CLASS_BALANCING = True

# =============================================================================
# DISPLAY CONFIGURATION
# =============================================================================
_RULE = "=" * 60

print(_RULE)
print("DATASET GENERATOR CONFIGURATION")
print(_RULE)
print("\n[GENERAL PARAMETERS]")
print(f"  - Base samples per letter: {BASE_SAMPLES_PER_LETTER}")
print(f"  - Output directory: {OUTPUT_ROOT}")
print(f"  - Font directory: {FONT_DIR}")
print(f"  - Number of characters: {len(LETTERS)}")
print(f"  - Class balancing: {'Enabled' if USE_CLASS_BALANCING else 'Disabled'}")

if not USE_CLASS_BALANCING:
    print(f"\n  Total images: {BASE_SAMPLES_PER_LETTER * len(LETTERS):,}")
else:
    print("\n[CLASS BALANCING]")
    print("  Underrepresented classes (x2.5): I, F, G, K, Q, X, Z")
    print("  Overrepresented classes (x0.7): O, S")

    # Estimate the dataset size using the same multipliers as printed above
    _boosted = ('I', 'F', 'G', 'K', 'Q', 'X', 'Z')
    _reduced = ('O', 'S')

    total_estimated = 0
    for letter in LETTERS:
        if letter in _boosted:
            _count = int(BASE_SAMPLES_PER_LETTER * 2.5)
        elif letter in _reduced:
            _count = int(BASE_SAMPLES_PER_LETTER * 0.7)
        else:
            _count = BASE_SAMPLES_PER_LETTER
        total_estimated += _count
    print(f"\n  Estimated total images: {total_estimated:,}")

print("\n" + _RULE)

In [None]:
def get_handwritten_fonts(font_dir):
    """
    Load all .ttf font files from the specified directory.

    The paths are returned in sorted order. Directory listing order is not
    guaranteed, and callers that pick fonts by index would otherwise get a
    different font assignment from one run or machine to the next.

    Args:
        font_dir (str): Path to directory containing .ttf font files

    Returns:
        list: Sorted list of font file paths (as strings)

    Raises:
        FileNotFoundError: If the directory does not exist or contains no
            .ttf files.
    """
    font_path = Path(font_dir)

    if not font_path.exists():
        raise FileNotFoundError(f"Font directory '{font_dir}' does not exist. Please create it and add .ttf files.")

    # Sort for a deterministic, reproducible font order
    font_paths = sorted(str(font) for font in font_path.glob("*.ttf"))

    if not font_paths:
        raise FileNotFoundError(f"No .ttf files found in '{font_dir}'. Please add handwritten font files.")

    print(f"Found {len(font_paths)} handwritten fonts:")
    for font in font_paths:
        print(f"  - {Path(font).name}")

    return font_paths

In [None]:
"""
===============================================================================
MODULE: SYNTHETIC IMAGE GENERATOR FOR OCR (ADVANCED VERSION)
===============================================================================
This module implements an optimized synthetic image generation system for
training CNN models for character recognition (OCR).

Key Features:
    - Class imbalance handling (underrepresented/overrepresented classes)
    - Realistic geometric augmentation (rotation, translation, scale)
    - Handwritten stroke simulation (variable thickness, blur, morphology)
    - 28x28 grayscale image generation
    - Modular architecture with configurable parameters

Dependencies:
    - PIL (Pillow): Image manipulation and text rendering
    - NumPy: Matrix operations and Gaussian noise
    - OpenCV (cv2): Morphological operations and filters
    - multiprocessing: CPU parallelization

Author: Mohamed
Date: December 2024
Version: 2.0
===============================================================================
"""

import os
import random
import multiprocessing
from multiprocessing import Pool
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import numpy as np
from PIL import Image, ImageDraw, ImageFont, ImageFilter

# Attempt to import OpenCV (optional but recommended)
try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False
    print("[WARNING] OpenCV not available. Morphological operations will be disabled.")


# =============================================================================
# SECTION 1: CONFIGURATION AND GLOBAL PARAMETERS
# =============================================================================

class AugmentationConfig:
    """
    Single home for every tunable used by the image augmentation pipeline.

    The generation functions read these class attributes (by default through
    the class itself), so hyperparameters can be adjusted here without
    editing any function body.

    Attributes:
        IMAGE_SIZE (int): Side length of the square output image, in pixels.
        BACKGROUND_RANGE (tuple): Inclusive (min, max) of light background values.
        TEXT_COLOR_RANGE (tuple): Inclusive (min, max) of dark text values.
        ROTATION_RANGE (tuple): (min, max) rotation angle in degrees.
        TRANSLATION_RANGE (tuple): Inclusive (min, max) offset from center, in pixels.
        SCALE_RANGE (tuple): (min, max) fraction of the image the glyph targets.
        STROKE_WIDTH_RANGE (tuple): Inclusive (min, max) stroke width.
        BLUR_RADIUS_RANGE (tuple): (min, max) Gaussian blur radius.
        GAUSSIAN_NOISE_STD (float): Standard deviation of the additive Gaussian noise.
        MORPHOLOGY_KERNEL_SIZE (int): Side length of the square morphology kernel.
        MORPHOLOGY_PROBABILITY (float): Chance that a morphological operation is applied.
    """

    # --- Output image ---------------------------------------------------------
    IMAGE_SIZE: int = 28
    BACKGROUND_RANGE: Tuple[int, int] = (220, 255)
    TEXT_COLOR_RANGE: Tuple[int, int] = (0, 50)

    # --- Geometry -------------------------------------------------------------
    ROTATION_RANGE: Tuple[float, float] = (-15.0, 15.0)
    TRANSLATION_RANGE: Tuple[int, int] = (-3, 3)
    SCALE_RANGE: Tuple[float, float] = (0.60, 0.85)

    # --- Stroke rendering, blur and noise -------------------------------------
    STROKE_WIDTH_RANGE: Tuple[int, int] = (1, 4)
    BLUR_RADIUS_RANGE: Tuple[float, float] = (0.0, 1.5)
    GAUSSIAN_NOISE_STD: float = 5.0

    # --- Morphology (erosion / dilation) --------------------------------------
    MORPHOLOGY_KERNEL_SIZE: int = 2
    MORPHOLOGY_PROBABILITY: float = 0.3


class ClassBalanceConfig:
    """
    Per-character sample volume rules used for class balancing.

    Characters are split into three groups; each group has a multiplier that
    scales the base number of samples generated for its members.

    Attributes:
        UNDERREPRESENTED_CHARS (list): Characters that get more samples
            (I, F, G, K, Q, X, Z).
        OVERREPRESENTED_CHARS (list): Characters that get fewer samples (O, S).
        UNDERREPRESENTED_MULTIPLIER (float): Factor for the first group (2.5x).
        OVERREPRESENTED_MULTIPLIER (float): Factor for the second group (0.7x).
        DEFAULT_MULTIPLIER (float): Factor for every other character (1.0x).
    """

    # Characters boosted with extra samples
    UNDERREPRESENTED_CHARS: List[str] = ['I', 'F', 'G', 'K', 'Q', 'X', 'Z']

    # Characters reduced to fewer samples
    OVERREPRESENTED_CHARS: List[str] = ['O', 'S']

    # Scaling factors applied to the base sample count
    UNDERREPRESENTED_MULTIPLIER: float = 2.5
    OVERREPRESENTED_MULTIPLIER: float = 0.7
    DEFAULT_MULTIPLIER: float = 1.0

    @classmethod
    def get_multiplier(cls, letter: str) -> float:
        """
        Look up the volume multiplier that applies to a character.

        Args:
            letter (str): Character to look up.

        Returns:
            float: The group multiplier, or DEFAULT_MULTIPLIER when the
            character belongs to neither group.
        """
        # Checked in order: the underrepresented group takes precedence
        groups = (
            (cls.UNDERREPRESENTED_CHARS, cls.UNDERREPRESENTED_MULTIPLIER),
            (cls.OVERREPRESENTED_CHARS, cls.OVERREPRESENTED_MULTIPLIER),
        )
        for members, factor in groups:
            if letter in members:
                return factor
        return cls.DEFAULT_MULTIPLIER

    @classmethod
    def get_samples_count(cls, letter: str, base_samples: int) -> int:
        """
        Compute how many samples to generate for a character.

        Args:
            letter (str): Target character.
            base_samples (int): Sample count before balancing.

        Returns:
            int: base_samples scaled by the character's multiplier,
            truncated to an integer.
        """
        return int(base_samples * cls.get_multiplier(letter))


# =============================================================================
# SECTION 2: IMAGE AUGMENTATION FUNCTIONS
# =============================================================================

def create_background(size: int, config: AugmentationConfig = AugmentationConfig) -> Image.Image:
    """
    Build a square grayscale canvas filled with random light pixel values.

    Each pixel is drawn independently from the configured background range
    (both ends inclusive), giving a non-uniform, paper-like surface.

    Args:
        size (int): Side length of the square image in pixels.
        config (AugmentationConfig): Source of BACKGROUND_RANGE.

    Returns:
        Image.Image: PIL image in 'L' mode (grayscale) with light background.
    """
    lowest, highest = config.BACKGROUND_RANGE

    # randint excludes the upper bound, hence the +1 to keep `highest` reachable
    pixels = np.random.randint(lowest, highest + 1, size=(size, size), dtype=np.uint8)

    return Image.fromarray(pixels, mode='L')


def select_text_color(config: AugmentationConfig = AugmentationConfig) -> int:
    """
    Draw a random dark gray level for the text.

    Args:
        config (AugmentationConfig): Source of TEXT_COLOR_RANGE.

    Returns:
        int: Pixel value within TEXT_COLOR_RANGE, both ends inclusive.
    """
    bounds = config.TEXT_COLOR_RANGE
    return random.randint(bounds[0], bounds[1])


def calculate_font_size(
    target_scale: float,
    image_size: int,
    font_path: str,
    character: str
) -> int:
    """
    Calculate optimal font size to achieve target scale.

    Runs a binary search over font sizes 8..100 for the largest size at which
    the character's bounding box (its larger dimension) is still strictly
    smaller than ``image_size * target_scale`` pixels.

    Args:
        target_scale (float): Fraction of the image side the character should occupy (0.6-0.85).
        image_size (int): Image size in pixels.
        font_path (str): Path to TTF font file.
        character (str): The character to render.

    Returns:
        int: The largest fitting font size. If even the smallest candidate
        is too large, the smallest candidate (8) is returned. If the font
        cannot be loaded or measured, the best size found so far is returned,
        or the default of 20 when none was found yet.
    """
    target_size = int(image_size * target_scale)

    min_size, max_size = 8, 100
    fallback_size = 20  # used only when the font cannot be loaded/measured

    low, high = min_size, max_size
    best_size = None  # no fitting size found yet

    # Binary search to find appropriate font size
    while low <= high:
        mid = (low + high) // 2
        try:
            font = ImageFont.truetype(font_path, mid)
            left, top, right, bottom = font.getbbox(character)
        except Exception:
            # Best effort: keep what the search established so far, else default
            return best_size if best_size is not None else fallback_size

        if max(right - left, bottom - top) < target_size:
            best_size = mid
            low = mid + 1
        else:
            high = mid - 1

    # When nothing fits, the smallest candidate overflows the least; returning
    # the error default (20) here would pick an even larger glyph.
    return best_size if best_size is not None else min_size


def apply_rotation(
    image: Image.Image,
    angle: float,
    fill_color: int
) -> Image.Image:
    """
    Rotate the image while keeping its size and filling uncovered corners.

    Args:
        image (Image.Image): Source image.
        angle (float): Rotation angle in degrees.
        fill_color (int): Value used for regions exposed by the rotation.

    Returns:
        Image.Image: Rotated image (bicubic resampling, canvas not expanded).
    """
    rotation_options = {
        'resample': Image.BICUBIC,
        'expand': False,
        'fillcolor': fill_color,
    }
    return image.rotate(angle, **rotation_options)


def apply_gaussian_blur(
    image: Image.Image,
    radius: float
) -> Image.Image:
    """
    Soften the image with a Gaussian blur.

    Blurring smooths the hard digital edges of rendered glyphs so that they
    look closer to natural handwriting.

    Args:
        image (Image.Image): Source image.
        radius (float): Gaussian blur radius.

    Returns:
        Image.Image: Blurred image, or the input image itself when the
        radius is not positive.
    """
    # Nothing to do unless the radius is strictly positive
    if not radius > 0:
        return image

    blur = ImageFilter.GaussianBlur(radius=radius)
    return image.filter(blur)


def add_gaussian_noise(
    image: Image.Image,
    std_dev: float
) -> Image.Image:
    """
    Overlay zero-mean Gaussian noise on the image.

    The noise imitates natural imperfections and helps models trained on the
    data generalize.

    Args:
        image (Image.Image): Source image.
        std_dev (float): Standard deviation of the noise.

    Returns:
        Image.Image: Noisy 'L' mode image with values clipped to 0-255, or
        the input image itself when std_dev is zero or negative.
    """
    if std_dev <= 0:
        return image

    # Work in float so the noise can push values below 0 / above 255 before clipping
    pixels = np.array(image, dtype=np.float32)
    perturbed = pixels + np.random.normal(0, std_dev, pixels.shape)
    clipped = np.clip(perturbed, 0, 255).astype(np.uint8)

    return Image.fromarray(clipped, mode='L')


def apply_morphological_operation(
    image: Image.Image,
    kernel_size: int,
    operation: str = 'random'
) -> Image.Image:
    """
    Erode or dilate the character strokes with a square kernel.

    These operations slightly thin or thicken the glyph to imitate natural
    variation in handwriting.

    Args:
        image (Image.Image): Source image.
        kernel_size (int): Side length of the square structuring element.
        operation (str): 'erode', 'dilate', or 'random' to pick one of the
            two at random. Any other value leaves the pixels unchanged.

    Returns:
        Image.Image: Image after morphological operation.

    Note:
        OpenCV is required. Without it the input image is returned as is.
    """
    if not CV2_AVAILABLE:
        return image

    pixels = np.array(image)
    structuring_element = np.ones((kernel_size, kernel_size), np.uint8)

    chosen = random.choice(['erode', 'dilate']) if operation == 'random' else operation

    # The operations act on bright regions, so flip to light-text-on-dark
    # first and flip back afterwards.
    foreground = 255 - pixels

    if chosen in ('erode', 'dilate'):
        morph = cv2.erode if chosen == 'erode' else cv2.dilate
        processed = morph(foreground, structuring_element, iterations=1)
    else:
        processed = foreground

    return Image.fromarray(255 - processed, mode='L')


def simulate_stroke_width(
    draw: ImageDraw.Draw,
    position: Tuple[int, int],
    character: str,
    font: ImageFont.FreeTypeFont,
    color: int,
    stroke_width: int
) -> None:
    """
    Render a character, thickening its strokes by overdrawing.

    A width of 1 or less draws the character once. Larger widths redraw it
    at every integer offset inside a disc of radius ``stroke_width // 2``
    around the requested position, which gives a marker-like stroke.

    Args:
        draw (ImageDraw.Draw): PIL drawing object.
        position (tuple): Position (x, y) of the character.
        character (str): The character to draw.
        font (ImageFont.FreeTypeFont): Font to use.
        color (int): Text color.
        stroke_width (int): Stroke width (1-4).
    """
    base_x, base_y = position

    # Thin stroke: a single pass is enough
    if stroke_width <= 1:
        draw.text((base_x, base_y), character, font=font, fill=color)
        return

    # Thick stroke: one pass per offset within the disc
    radius = stroke_width // 2
    span = range(-radius, radius + 1)
    disc_offsets = [
        (off_x, off_y)
        for off_x in span
        for off_y in span
        if off_x * off_x + off_y * off_y <= radius ** 2
    ]

    for off_x, off_y in disc_offsets:
        draw.text((base_x + off_x, base_y + off_y), character, font=font, fill=color)


# =============================================================================
# SECTION 3: MAIN IMAGE GENERATION FUNCTION
# =============================================================================

def generate_augmented_character_image(
    character: str,
    font_path: str,
    config: AugmentationConfig = AugmentationConfig
) -> Image.Image:
    """
    Produce one fully augmented image of a character.

    Pipeline, in execution order:
    1. Random light background
    2. Random draw of scale, rotation, translation, stroke width, blur
       radius and text color
    3. Font size search and font loading (default font on failure)
    4. Centered placement shifted by the drawn translation
    5. Character rendering with the drawn stroke width
    6. Rotation, Gaussian blur, Gaussian noise
    7. Morphological operation, applied with the configured probability

    Args:
        character (str): The character to generate.
        font_path (str): Path to TTF font file.
        config (AugmentationConfig): Augmentation parameter configuration.

    Returns:
        Image.Image: Grayscale image of the augmented character, sized
        config.IMAGE_SIZE x config.IMAGE_SIZE.
    """
    def draw_float(bounds):
        return random.uniform(bounds[0], bounds[1])

    def draw_int(bounds):
        return random.randint(bounds[0], bounds[1])

    canvas_size = config.IMAGE_SIZE

    # --- Background -----------------------------------------------------------
    canvas = create_background(canvas_size, config)
    # Mean background level, used later to fill corners exposed by rotation
    fill_value = int(np.mean(np.array(canvas)))

    # --- Random parameters (draw order is part of the behavior) ---------------
    target_scale = draw_float(config.SCALE_RANGE)
    rotation_angle = draw_float(config.ROTATION_RANGE)
    shift_x = draw_int(config.TRANSLATION_RANGE)
    shift_y = draw_int(config.TRANSLATION_RANGE)
    stroke_width = draw_int(config.STROKE_WIDTH_RANGE)
    blur_radius = draw_float(config.BLUR_RADIUS_RANGE)
    ink = select_text_color(config)

    # --- Font -----------------------------------------------------------------
    font_size = calculate_font_size(target_scale, canvas_size, font_path, character)
    try:
        font = ImageFont.truetype(font_path, font_size)
    except Exception:
        # Fall back to PIL's built-in font when the TTF cannot be loaded
        font = ImageFont.load_default()

    # --- Placement ------------------------------------------------------------
    left, top, right, bottom = font.getbbox(character)
    glyph_width = right - left
    glyph_height = bottom - top

    # Center the glyph box, apply the translation, then compensate for the
    # glyph's own offset inside its bounding box
    origin_x = (canvas_size - glyph_width) // 2 + shift_x - left
    origin_y = (canvas_size - glyph_height) // 2 + shift_y - top

    # --- Rendering ------------------------------------------------------------
    simulate_stroke_width(
        ImageDraw.Draw(canvas),
        (origin_x, origin_y),
        character,
        font,
        ink,
        stroke_width
    )

    # --- Post-processing chain ------------------------------------------------
    result = apply_rotation(canvas, rotation_angle, fill_value)
    result = apply_gaussian_blur(result, blur_radius)
    result = add_gaussian_noise(result, config.GAUSSIAN_NOISE_STD)

    if random.random() < config.MORPHOLOGY_PROBABILITY:
        result = apply_morphological_operation(result, config.MORPHOLOGY_KERNEL_SIZE)

    return result


# =============================================================================
# SECTION 4: DATASET GENERATION FUNCTIONS
# =============================================================================

def generate_single_image_worker(args: Tuple) -> Optional[str]:
    """
    Render one augmented character image and write it to disk.

    Module-level worker meant to be mapped over a list of argument tuples
    by ``multiprocessing.Pool``.

    Args:
        args (tuple): Packed job description, in this order:
            - letter (str): Character to render.
            - index (int): Zero-based position of the sample in its sequence.
            - font_paths (list): Candidate font file paths.
            - output_dir (str): Directory that receives the PNG file.
            - total_samples (int): Sample count shown in progress messages.

    Returns:
        Optional[str]: A progress line on every 50th sample, otherwise None.
    """
    letter, index, font_paths, output_dir, total_samples = args
    sample_number = index + 1

    # Fonts are assigned round-robin so each one is used about equally often
    font_for_sample = font_paths[index % len(font_paths)]

    rendered = generate_augmented_character_image(
        character=letter,
        font_path=font_for_sample,
        config=AugmentationConfig,
    )

    # File names are 1-based and zero-padded, e.g. "A_00001.png"
    target = Path(output_dir) / f"{letter}_{sample_number:05d}.png"
    rendered.save(target, format='PNG')

    if sample_number % 50 != 0:
        return None
    return f"    - Progress: {sample_number}/{total_samples} images generated"


def generate_handwritten_dataset(
    letters: List[str],
    base_samples_per_letter: int,
    output_root: str,
    font_paths: List[str],
    use_class_balancing: bool = True
) -> Dict[str, int]:
    """
    Generate a complete synthetic handwritten character dataset.

    For every letter a subdirectory ``<output_root>/<letter>`` is created and
    filled with PNG files written by ``generate_single_image_worker``. The
    jobs of each letter are distributed over a single process pool that is
    created once and reused for all letters.

    Progress lines returned by the workers are printed after the whole batch
    of a letter has finished, because ``pool.map`` blocks until completion.

    NOTE(review): if the augmentation helpers draw from numpy's global RNG,
    forked workers may start from identical RNG state - confirm and reseed
    per worker if needed.

    Args:
        letters (List[str]): List of characters to generate.
        base_samples_per_letter (int): Base number of samples per character.
        output_root (str): Root output directory (created with parents).
        font_paths (List[str]): List of paths to TTF fonts.
        use_class_balancing (bool): Enable class balancing via
            ``ClassBalanceConfig``.

    Returns:
        Dict[str, int]: Dictionary {letter: number of images requested for
        that letter}.

    Raises:
        ValueError: If ``font_paths`` is empty.

    Example:
        >>> letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
        >>> fonts = get_handwritten_fonts('./fonts')
        >>> stats = generate_handwritten_dataset(letters, 1000, './output', fonts)
        >>> print(f"Total: {sum(stats.values())} images")
    """
    # Fail early with a clear message instead of a ZeroDivisionError raised
    # inside a worker by the cyclic font selection.
    if not font_paths:
        raise ValueError("font_paths is empty: at least one font is required.")

    # -------------------------------------------------------------------------
    # MULTIPROCESSING CONFIGURATION
    # -------------------------------------------------------------------------
    num_cores = multiprocessing.cpu_count()
    # Leave one core free on machines with more than two cores
    num_processes = max(1, num_cores - 1) if num_cores > 2 else num_cores

    # Create root directory (including missing parents for nested paths)
    root_dir = Path(output_root)
    root_dir.mkdir(parents=True, exist_ok=True)

    # Generation statistics
    generation_stats: Dict[str, int] = {}

    # -------------------------------------------------------------------------
    # DISPLAY CONFIGURATION
    # -------------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("OCR DATASET GENERATOR - ADVANCED VERSION")
    print("=" * 70)
    print(f"\n[SYSTEM CONFIGURATION]")
    print(f"  - CPU cores detected: {num_cores}")
    print(f"  - Parallel processes: {num_processes}")
    print(f"  - Available fonts: {len(font_paths)}")

    print(f"\n[IMAGE PARAMETERS]")
    print(f"  - Output size: {AugmentationConfig.IMAGE_SIZE}x{AugmentationConfig.IMAGE_SIZE} pixels")
    print(f"  - Format: Grayscale (mode 'L')")
    print(f"  - Background: {AugmentationConfig.BACKGROUND_RANGE[0]}-{AugmentationConfig.BACKGROUND_RANGE[1]}")
    print(f"  - Text: {AugmentationConfig.TEXT_COLOR_RANGE[0]}-{AugmentationConfig.TEXT_COLOR_RANGE[1]}")

    print(f"\n[GEOMETRIC AUGMENTATION]")
    print(f"  - Rotation: {AugmentationConfig.ROTATION_RANGE[0]} to {AugmentationConfig.ROTATION_RANGE[1]} degrees")
    print(f"  - Translation: {AugmentationConfig.TRANSLATION_RANGE[0]} to {AugmentationConfig.TRANSLATION_RANGE[1]} pixels")
    print(f"  - Scale: {AugmentationConfig.SCALE_RANGE[0]*100:.0f}% to {AugmentationConfig.SCALE_RANGE[1]*100:.0f}%")

    print(f"\n[STROKE SIMULATION]")
    print(f"  - Thickness: {AugmentationConfig.STROKE_WIDTH_RANGE[0]} to {AugmentationConfig.STROKE_WIDTH_RANGE[1]} pixels")
    print(f"  - Gaussian blur: {AugmentationConfig.BLUR_RADIUS_RANGE[0]} to {AugmentationConfig.BLUR_RADIUS_RANGE[1]}")
    print(f"  - Gaussian noise: sigma={AugmentationConfig.GAUSSIAN_NOISE_STD}")
    print(f"  - Morphological operations: {'Enabled' if CV2_AVAILABLE else 'Disabled (OpenCV missing)'}")

    if use_class_balancing:
        print(f"\n[CLASS BALANCING]")
        print(f"  - Underrepresented ({ClassBalanceConfig.UNDERREPRESENTED_MULTIPLIER}x): {ClassBalanceConfig.UNDERREPRESENTED_CHARS}")
        print(f"  - Overrepresented ({ClassBalanceConfig.OVERREPRESENTED_MULTIPLIER}x): {ClassBalanceConfig.OVERREPRESENTED_CHARS}")

    print("\n" + "-" * 70)

    # -------------------------------------------------------------------------
    # GENERATION LOOP BY LETTER
    # -------------------------------------------------------------------------
    total_images = 0

    # One pool for the whole run: starting and stopping worker processes for
    # every letter only adds overhead.
    with Pool(processes=num_processes) as pool:
        for letter in letters:
            # Calculate number of samples with balancing
            if use_class_balancing:
                samples_count = ClassBalanceConfig.get_samples_count(letter, base_samples_per_letter)
                multiplier = ClassBalanceConfig.get_multiplier(letter)
                balance_info = f" (x{multiplier})"
            else:
                samples_count = base_samples_per_letter
                balance_info = ""

            # Create subdirectory
            letter_dir = root_dir / letter
            letter_dir.mkdir(parents=True, exist_ok=True)

            print(f"\n[GENERATION] Letter '{letter}'{balance_info}: {samples_count} images")

            # One argument tuple per image; the index drives both the cyclic
            # font choice and the file name inside the worker.
            args_list = [
                (letter, i, font_paths, str(letter_dir), samples_count)
                for i in range(samples_count)
            ]

            # Parallel generation (blocks until every image of this letter is done)
            results = pool.map(generate_single_image_worker, args_list)

            # Display progress messages
            for result in results:
                if result:
                    print(result)

            # Update statistics
            generation_stats[letter] = samples_count
            total_images += samples_count

            print(f"  [OK] Letter '{letter}' completed: {samples_count} images in {letter_dir}")

    # -------------------------------------------------------------------------
    # FINAL SUMMARY
    # -------------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("GENERATION COMPLETED")
    print("=" * 70)
    print(f"\n[FINAL STATISTICS]")
    print(f"  - Total images generated: {total_images}")
    print(f"  - Number of classes: {len(letters)}")
    print(f"  - Location: {root_dir.absolute()}")

    if use_class_balancing:
        print(f"\n[CLASS DISTRIBUTION]")
        # Computed once; a peak of 0 (nothing requested) yields empty bars
        # instead of a division by zero.
        peak_count = max(generation_stats.values(), default=0)
        for letter, count in sorted(generation_stats.items()):
            bar_length = int(count / peak_count * 30) if peak_count else 0
            bar = "#" * bar_length
            print(f"  {letter}: {count:5d} |{bar}")

    print("\n" + "=" * 70)

    return generation_stats


# =============================================================================
# SECTION 5: UTILITY FUNCTION FOR VISUALIZATION
# =============================================================================

def preview_augmentation(
    character: str,
    font_path: str,
    num_samples: int = 10
) -> None:
    """
    Generate and display a preview of several augmented variants.

    Utility function to visualize the effect of augmentations on a given
    character before launching a complete generation. The grid holds up to
    five variants per row and grows by rows, so every requested sample is
    shown (the default of 10 gives a 2x5 grid).

    Args:
        character (str): The character to visualize.
        font_path (str): Path to the font to use.
        num_samples (int): Number of variants to generate. Values below 1
            display nothing.
    """
    # Only the import is guarded, so an ImportError raised while rendering is
    # not misreported as "matplotlib is not installed".
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print("[ERROR] matplotlib is not installed. Install it with: pip install matplotlib")
        return

    if num_samples < 1:
        print("[INFO] num_samples must be at least 1; nothing to preview.")
        return

    max_columns = 5
    n_cols = min(max_columns, num_samples)
    n_rows = -(-num_samples // max_columns)  # ceiling division

    # squeeze=False keeps `axes` two-dimensional even for a single row/column
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(2.4 * n_cols, 2.5 * n_rows),
        squeeze=False
    )
    fig.suptitle(f"Augmentation Preview for '{character}'", fontsize=14)

    for idx, ax in enumerate(axes.flat):
        # Cells beyond num_samples (last row padding) stay empty
        if idx < num_samples:
            img = generate_augmented_character_image(character, font_path)
            ax.imshow(img, cmap='gray', vmin=0, vmax=255)
            ax.set_title(f"Variant {idx + 1}", fontsize=10)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## Multiprocessing Optimization and Augmentation Pipeline

**Synthetic Image Generation System Architecture:**

The image generation pipeline includes the following steps for each character:

1. **Background Creation**: Random pixel values between 220-255 (light background)
2. **Parameter Selection**: Scale, rotation, translation, stroke thickness
3. **Character Rendering**: Drawing with variable thickness (1-4 pixels)
4. **Rotation**: Apply random rotation (-15 to +15 degrees)
5. **Gaussian Blur**: Edge softening (radius 0-1.5)
6. **Gaussian Noise**: Add noise for robustness (sigma=5)
7. **Morphological Operations**: Random erosion/dilation (30% probability)

**Class Imbalance Management:**

| Category | Characters | Multiplier |
|----------|------------|------------|
| Underrepresented | I, F, G, K, Q, X, Z | x2.5 |
| Standard | A, B, C, D, E, H, J, L, M, N, P, R, T, U, V, W, Y | x1.0 |
| Overrepresented | O, S | x0.7 |

**Multiprocessing Performance:**

- Free Colab (2 cores): 2x faster
- Colab Pro (4+ cores): 3-4x faster
- Local machine (8+ cores): 6-8x faster

In [None]:
# =============================================================================
# AUGMENTATION PREVIEW (OPTIONAL)
# =============================================================================
# Execute this cell to visualize the effect of augmentations on a character
# before launching the complete dataset generation.
# This allows visual validation that parameters are correct.

import matplotlib.pyplot as plt

def visualize_augmentation_samples(
    characters: Optional[list] = None,
    samples_per_char: int = 5
) -> None:
    """
    Display a preview grid of augmentation variations.

    One row is drawn per character and one column per variant; each cell
    uses a randomly chosen font. Afterwards basic pixel statistics of one
    freshly generated 'A' sample are printed.

    Args:
        characters (list): List of characters to visualize. Defaults to
            ['A', 'B', 'I', 'O', 'Q'] when omitted.
        samples_per_char (int): Number of variants per character.
    """
    # None sentinel instead of a mutable list as the default value
    if characters is None:
        characters = ['A', 'B', 'I', 'O', 'Q']

    if not characters or samples_per_char < 1:
        print("[ERROR] Nothing to preview: provide at least one character and one sample per character.")
        return

    # Load available fonts
    try:
        font_paths = get_handwritten_fonts(FONT_DIR)
    except FileNotFoundError as e:
        print(f"[ERROR] {e}")
        return

    # random.choice below would raise IndexError on an empty list
    if not font_paths:
        print(f"[ERROR] No fonts available in '{FONT_DIR}'")
        return

    num_chars = len(characters)
    # squeeze=False keeps `axes` two-dimensional for any grid shape, including
    # a single row or a single column.
    fig, axes = plt.subplots(
        num_chars,
        samples_per_char,
        figsize=(12, 2.5 * num_chars),
        squeeze=False
    )
    fig.suptitle("Augmentation Preview - 28x28 Images", fontsize=14, fontweight='bold')

    for row, char in enumerate(characters):
        for col in range(samples_per_char):
            # Random font selection
            font_path = random.choice(font_paths)

            # Generate augmented image
            img = generate_augmented_character_image(char, font_path)

            # Display
            ax = axes[row, col]
            ax.imshow(img, cmap='gray', vmin=0, vmax=255)

            # Label each row once, on its first column
            if col == 0:
                ax.set_ylabel(f"'{char}'", fontsize=12, fontweight='bold')
            ax.set_xticks([])
            ax.set_yticks([])

    plt.tight_layout()
    plt.subplots_adjust(top=0.92)
    plt.show()

    # Image statistics
    print("\n[IMAGE STATISTICS]")
    sample_img = generate_augmented_character_image('A', font_paths[0])
    img_array = np.array(sample_img)
    print(f"  - Size: {sample_img.size}")
    print(f"  - Mode: {sample_img.mode}")
    print(f"  - Mean pixel value: {img_array.mean():.2f}")
    print(f"  - Normalized mean [0-1]: {img_array.mean() / 255:.3f}")
    print(f"  - Standard deviation: {img_array.std():.2f}")


# Execute preview
# Runs as soon as this cell is executed: draws one row per listed character
# with six augmented variants each, then prints statistics of a sample image.
print("[PREVIEW] Generating augmentation samples...")
visualize_augmentation_samples(
    characters=['A', 'I', 'O', 'Q', 'Z'],  # Mix of different characters
    samples_per_char=6
)

In [None]:
# =============================================================================
# EXECUTE DATASET GENERATION
# =============================================================================
# This cell launches the complete dataset generation process.
# Ensure that:
#   1. Fonts are present in the FONT_DIR directory
#   2. Configuration parameters are correctly defined
#   3. You have sufficient disk space available

if __name__ == "__main__":
    # Imported here so the cell is self-contained; replaces the inline
    # __import__('datetime') lookup used when writing the report.
    from datetime import datetime

    try:
        # ---------------------------------------------------------------------
        # STEP 1: Load handwritten fonts
        # ---------------------------------------------------------------------
        print("[INITIALIZATION] Loading handwritten fonts...")
        font_paths = get_handwritten_fonts(FONT_DIR)

        if not font_paths:
            raise FileNotFoundError(
                f"No .ttf fonts found in '{FONT_DIR}'. "
                "Please add handwritten fonts."
            )

        # ---------------------------------------------------------------------
        # STEP 2: Generate complete dataset
        # ---------------------------------------------------------------------
        print("\n[START] Launching generation...")

        generation_stats = generate_handwritten_dataset(
            letters=LETTERS,
            base_samples_per_letter=BASE_SAMPLES_PER_LETTER,
            output_root=OUTPUT_ROOT,
            font_paths=font_paths,
            use_class_balancing=USE_CLASS_BALANCING
        )

        # ---------------------------------------------------------------------
        # STEP 3: Save statistics
        # ---------------------------------------------------------------------
        print("\n[STATISTICS] Saving generation report...")

        # Assemble the report first, then write it in a single call
        report_lines = [
            "DATASET GENERATION REPORT\n",
            "=" * 50 + "\n\n",
            f"Generation date: {datetime.now()}\n",
            f"Class balancing: {'Enabled' if USE_CLASS_BALANCING else 'Disabled'}\n\n",
            "Distribution by class:\n",
            "-" * 30 + "\n",
        ]
        report_lines.extend(
            f"  {letter}: {count}\n"
            for letter, count in sorted(generation_stats.items())
        )
        report_lines.append("-" * 30 + "\n")
        report_lines.append(f"  TOTAL: {sum(generation_stats.values())}\n")

        stats_file = Path(OUTPUT_ROOT) / "generation_stats.txt"
        with open(stats_file, 'w', encoding='utf-8') as f:
            f.writelines(report_lines)

        print(f"  Report saved: {stats_file}")

    except FileNotFoundError as e:
        # Raised above when no fonts are found (and possibly by the font loader)
        print(f"\n[ERROR] {e}")
        print("\nSolution: Create the directory and add handwritten .ttf fonts")

    except Exception as e:
        # Top-level boundary of the cell: report the failure with a traceback
        print(f"\n[UNEXPECTED ERROR] {e}")
        import traceback
        traceback.print_exc()

## 📊 Verify Font Distribution

After generation, execute this cell to verify that all fonts have been used equitably:

In [None]:
# =============================================================================
# FONT DISTRIBUTION VERIFICATION
# =============================================================================
# This function verifies that all fonts are used equitably during
# dataset generation.

def verify_font_distribution(samples_per_letter: int, num_fonts: int) -> None:
    """
    Print the predicted number of samples each font receives per letter.

    The prediction mirrors the cyclic rotation used during generation
    (font_paths[i % len(font_paths)]). If you have 5 fonts and 100 samples:
        - Font 0: samples 0, 5, 10, 15, ... (20 times)
        - Font 1: samples 1, 6, 11, 16, ... (20 times)
        - etc.

    Fonts are listed in numeric order, including fonts that receive no
    samples at all (fewer samples than fonts).

    Args:
        samples_per_letter (int): Number of samples per letter.
        num_fonts (int): Number of available fonts.
    """
    # Guard against division/modulo by zero and meaningless percentages
    if samples_per_letter <= 0 or num_fonts <= 0:
        print("[WARNING] Font distribution needs at least one sample and one font.")
        return

    # Counting i % num_fonts for i in range(samples_per_letter): every font
    # gets `base` samples and the first `remainder` fonts get one more.
    base, remainder = divmod(samples_per_letter, num_fonts)
    font_usage = [base + (1 if idx < remainder else 0) for idx in range(num_fonts)]

    print("=" * 60)
    print("FONT DISTRIBUTION (PREDICTION)")
    print("=" * 60)

    # Iterating by index keeps numeric order ("Font 2" before "Font 10")
    for idx, count in enumerate(font_usage):
        percentage = (count / samples_per_letter) * 100
        bar = "#" * int(percentage / 2)
        print(f"  Font {idx}: {count:5d} samples ({percentage:5.1f}%) |{bar}")

    print("=" * 60)
    if base == 0:
        unused = num_fonts - remainder
        print(f"[WARNING] {unused} font(s) receive no samples: fewer samples than fonts.")
    else:
        print("[OK] All fonts are used equitably.")
    print("=" * 60)


# -----------------------------------------------------------------------------
# EXECUTE VERIFICATION
# -----------------------------------------------------------------------------

# Run the prediction only once the configuration cell has defined the sample
# count; fonts are loaded on demand when no earlier cell has done so.
if 'BASE_SAMPLES_PER_LETTER' not in globals():
    print("[INFO] Execute configuration cells first to define parameters.")
elif 'font_paths' in globals():
    verify_font_distribution(BASE_SAMPLES_PER_LETTER, len(font_paths))
else:
    # Fonts not loaded yet: try to discover them now
    try:
        font_paths = get_handwritten_fonts(FONT_DIR)
        verify_font_distribution(BASE_SAMPLES_PER_LETTER, len(font_paths))
    except Exception as e:
        print(f"[WARNING] Unable to load fonts: {e}")

## Download Dataset

In [None]:
import os

# Path to your generated data folder
OUTPUT_ROOT = "dataset_handwritten_28x28"

# Recursive count of every file below OUTPUT_ROOT. os.walk yields nothing for
# a missing directory, so the result is 0 when the dataset was not generated.
total_files = sum(len(files) for _, _, files in os.walk(OUTPUT_ROOT))

print(f"✅ Total number of files found: {total_files}")
# Expected value: the sum of the per-class image counts, plus one for
# generation_stats.txt when the generation cell wrote its report there.
# With class balancing enabled the per-class counts differ, so this is not
# simply 26 * BASE_SAMPLES_PER_LETTER.

In [None]:
# Create ZIP archive on local Colab disk.
# shutil.make_archive replaces the `!zip -r -q` shell call: it needs no
# external binary, and the success message is only printed when an archive
# was actually written.
import os
import shutil

dataset_dir = "dataset_handwritten_28x28"
archive_base = "dataset_complet_v2"  # make_archive appends the ".zip" suffix

if os.path.isdir(dataset_dir):
    # base_dir keeps the top-level folder name inside the archive, matching
    # the layout produced by `zip -r <archive> <folder>`.
    shutil.make_archive(archive_base, "zip", root_dir=".", base_dir=dataset_dir)
    print("✅ Compression completed. The archive 'dataset_complet_v2.zip' is ready.")
else:
    print(f"❌ Error: dataset folder '{dataset_dir}' not found. Nothing was archived.")

---

## 📋 Handwritten Character Simulation: Technical Specification

### TRDG Generator Parameters for Handwriting Simulation:

#### 1. **Skewing (Slant Effect)**
```python
skewing_angle=10          # Maximum skew angle in degrees (±10°)
random_skew=True          # Randomizes skew for each image
```
- **Purpose**: Simulates natural handwriting slant/tilt
- **Effect**: Each letter is randomly tilted left or right (mimicking individual writing styles)
- **Range**: Angle randomly chosen between -10° and +10°

#### 2. **Distortion (Natural Hand Movement)**
```python
distorsion_type=1         # Type 1 = Sine wave distortion (TRDG numbering)
distorsion_orientation=2  # 2 = Both (vertical and horizontal)
```
- **Purpose**: Mimics natural irregularity and waviness of human handwriting
- **Distortion Types** (TRDG numbering):
  - `0`: No distortion
  - `1`: Sine wave distortion (optimal for handwriting)
  - `2`: Cosine wave distortion
  - `3`: Random distortion
- **Distortion Orientations**: `0` = vertical, `1` = horizontal, `2` = both
- **Effect**: Creates wavy, non-linear characters that appear hand-drawn rather than printed

#### 3. **Blur (Ink Bleeding Effect)**
```python
blur=2                    # Blur intensity (0-3)
random_blur=True          # Randomizes blur for each image
```
- **Purpose**: Simulates pen/pencil ink bleeding and imperfect writing instruments
- **Effect**: Adds slight blur to edges, creating more organic appearance
- **Range**: 0 (no blur) to 3 (maximum blur)

#### 4. **Background Variety**
```python
background_type=3         # Image background (pictures supplied to TRDG)
```
- **Background Types** (TRDG numbering):
  - `0`: Gaussian noise
  - `1`: Plain white
  - `2`: Quasicrystal pattern
  - `3`: Image (picture taken from TRDG's image directory)
- **Purpose**: Enhances model robustness to different paper textures and scanning conditions

#### 5. **Font Selection**
```python
fonts=font_paths          # Uses handwritten .ttf fonts from local directory
```
- **Purpose**: Foundation of handwriting simulation
- **Critical**: MUST use actual handwritten-style fonts, not standard fonts
- **Effect**: Each font represents a different "handwriting style"

---

## 📁 Font Directory Structure

### Required Setup:

```
project_root/
├── PD1_ICR_V3.ipynb              # This notebook
├── handwritten_fonts/            # CREATE THIS FOLDER
│   ├── handwriting1.ttf
│   ├── handwriting2.ttf
│   ├── cursive_font.ttf
│   └── script_style.ttf
└── dataset_handwritten_28x28/    # Generated automatically
    ├── A/
    │   ├── A_00001.png
    │   ├── A_00002.png
    │   └── ...
    ├── B/
    │   ├── B_00001.png
    │   └── ...
    └── ...
```

### Recommended Handwritten Font Sources:

1. **Google Fonts** (Free):
   - Caveat
   - Permanent Marker
   - Indie Flower
   - Shadows Into Light
   - Patrick Hand
   - Kalam
   - Architect's Daughter

2. **Download Process**:
   ```bash
   # Example: Download from Google Fonts
   # 1. Visit: https://fonts.google.com/
   # 2. Search for "handwriting" or "script" fonts
   # 3. Download the font
   # 4. Extract the .ttf file
   # 5. Place it in ./handwritten_fonts/
   ```

3. **Naming Convention**:
   - Font files can have any name (e.g., `my_handwriting.ttf`)
   - The script automatically discovers all `.ttf` files in the folder
   - Multiple fonts = greater handwriting variety in your dataset

---

## 🎯 Comparison: Handwritten vs Printed Simulation

| Aspect | Printed (Default TRDG) | Handwritten (This Implementation) |
|--------|------------------------|------------------------------------|
| **Font** | System fonts (Arial, Times) | Handwritten .ttf fonts |
| **Skewing** | None | ±10° random slant |
| **Distortion** | None | Sine wave distortion |
| **Blur** | Sharp edges | Random blur (ink bleeding simulation) |
| **Appearance** | Perfectly uniform | Irregular, organic |
| **Real-world Use** | Typed documents | Human handwriting recognition |

---

## 💡 Optimization Tips for Best Results:

1. **Use 5-10 different handwritten fonts** for maximum variety
2. **Adjust `BASE_SAMPLES_PER_LETTER`** based on requirements (1000-10000 recommended)
3. **Image size**: 28x28 pixels optimal for single characters
4. **Increase `ROTATION_RANGE`** to ±20° for more dramatic slant variation
5. **Increase `BLUR_RADIUS_RANGE`** to (0.5, 2.0) for older/low-quality document simulation
6. **Enable OpenCV** for morphological operations (erosion/dilation) to enhance realism

## 📊 Optional: Visualize Sample Images

In [None]:
# =============================================================================
# DATASET VISUALIZATION AND FINAL REPORT
# =============================================================================
# This cell displays samples from the generated dataset and produces a
# comprehensive report on generation including font download statistics
# and class distribution.

import matplotlib.pyplot as plt
from PIL import Image
import random
from datetime import datetime


def visualize_dataset_samples(
    output_root: str,
    letters: list,
    num_samples: int = 5
) -> None:
    """
    Display random samples from the generated dataset.

    Up to five letters are picked at random; each picked letter gets one row
    with up to ``num_samples`` randomly chosen images from its subdirectory.
    Rows whose directory is missing or empty stay blank and a warning is
    printed.

    Args:
        output_root (str): Root directory of the dataset.
        letters (list): List of characters.
        num_samples (int): Number of samples to display per letter.
    """
    # Select up to 5 random letters for display
    sample_letters = random.sample(letters, min(5, len(letters)))

    # plt.subplots cannot build a grid with zero rows or columns
    if not sample_letters or num_samples < 1:
        print("[INFO] Nothing to display: no letters or no samples requested.")
        return

    # squeeze=False keeps `axes` two-dimensional for any grid shape, so
    # axes[i, j] is valid even with a single letter or a single sample.
    fig, axes = plt.subplots(
        len(sample_letters),
        num_samples,
        figsize=(15, 3 * len(sample_letters)),
        squeeze=False
    )
    fig.suptitle('Handwritten Character Dataset Samples', fontsize=16, fontweight='bold')

    # Hide all frames up front so cells that receive no image (missing
    # directory, fewer images than num_samples) do not show empty axes.
    for ax in axes.flat:
        ax.axis('off')

    for i, letter in enumerate(sample_letters):
        letter_dir = Path(output_root) / letter

        if not letter_dir.exists():
            print(f"[WARNING] Directory {letter_dir} not found")
            continue

        # Get all images (PNG or JPG)
        images = list(letter_dir.glob("*.png")) + list(letter_dir.glob("*.jpg"))

        if not images:
            print(f"[WARNING] No images in {letter_dir}")
            continue

        # Select random samples
        sample_images = random.sample(images, min(num_samples, len(images)))

        for j, img_path in enumerate(sample_images):
            ax = axes[i, j]

            # The context manager closes the file once the pixels are drawn
            with Image.open(img_path) as img:
                ax.imshow(img, cmap='gray', vmin=0, vmax=255)

            # Label each row once, on its first image
            if j == 0:
                ax.set_title(f"'{letter}'", fontsize=12, fontweight='bold', loc='left')

    plt.tight_layout()
    plt.subplots_adjust(top=0.92)
    plt.show()


def generate_final_report(
    output_root: str,
    letters: list,
    font_stats: Optional[dict] = None
) -> None:
    """
    Print a comprehensive report on the produced dataset.

    The report has five sections:
    1. Font acquisition: fonts found on disk plus optional download counts.
    2. Class distribution: PNG/JPG files counted per letter directory.
    3. Augmentation parameters: fixed descriptive text (the values are
       written out literally here, they are not read from the configuration).
    4. Generated files: dataset location and subdirectory count.
    5. Training recommendations: fixed descriptive text.

    The font directory is taken from the module-level ``FONTS_OUTPUT_DIR``
    when it is defined, otherwise ``./handwritten_fonts`` is used.

    Args:
        output_root (str): Root directory of the dataset.
        letters (list): List of generated characters.
        font_stats (dict): Font download statistics (optional); the keys
            'successful' and 'failed' are reported when present.
    """
    print("\n" + "=" * 80)
    print("DATASET GENERATION FINAL REPORT")
    print("=" * 80)
    print(f"Generation date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Section 1: Font statistics
    print("\n" + "-" * 80)
    print("SECTION 1: FONT ACQUISITION")
    print("-" * 80)

    # dir() inside a function lists only local names, so the module-level
    # setting has to be looked up through globals().
    fonts_dir = Path(globals().get('FONTS_OUTPUT_DIR', "./handwritten_fonts"))
    available_fonts = list(fonts_dir.glob("*.ttf")) if fonts_dir.exists() else []

    print(f"  Font directory: {fonts_dir.absolute()}")
    print(f"  Available fonts: {len(available_fonts)}")

    if font_stats:
        print(f"  Successful downloads: {font_stats.get('successful', 'N/A')}")
        print(f"  Failed downloads: {font_stats.get('failed', 'N/A')}")

    if available_fonts:
        print("\n  Font list:")
        # Only the first ten names are listed to keep the report short
        for font_path in available_fonts[:10]:
            print(f"    - {font_path.name}")
        if len(available_fonts) > 10:
            print(f"    ... and {len(available_fonts) - 10} more")

    # Section 2: Class distribution
    print("\n" + "-" * 80)
    print("SECTION 2: CLASS DISTRIBUTION")
    print("-" * 80)

    output_path = Path(output_root)
    class_distribution = {}
    total_images = 0

    if output_path.exists():
        for letter in letters:
            letter_dir = output_path / letter
            if letter_dir.exists():
                count = len(list(letter_dir.glob("*.png"))) + len(list(letter_dir.glob("*.jpg")))
                class_distribution[letter] = count
                total_images += count

    if class_distribution:
        max_count = max(class_distribution.values())
        min_count = min(class_distribution.values())
        avg_count = total_images / len(class_distribution)

        print(f"  Total images: {total_images:,}")
        print(f"  Number of classes: {len(class_distribution)}")
        print(f"  Average per class: {avg_count:.1f}")
        print(f"  Min/Max per class: {min_count} / {max_count}")
        # An existing but empty class directory gives min_count == 0
        if min_count > 0:
            print(f"  Balance ratio: {max_count / min_count:.2f}x")
        else:
            print("  Balance ratio: N/A (at least one class has no images)")

        # Text-based histogram
        print("\n  Distribution:")
        for letter in sorted(class_distribution.keys()):
            count = class_distribution[letter]
            # max_count == 0 means every class is empty: draw empty bars
            bar_length = int((count / max_count) * 40) if max_count else 0
            bar = "#" * bar_length
            marker = ""
            if letter in ['I', 'F', 'G', 'K', 'Q', 'X', 'Z']:
                marker = " [AUGMENTED]"
            elif letter in ['O', 'S']:
                marker = " [REDUCED]"
            print(f"    {letter}: {count:5d} |{bar}{marker}")
    else:
        print("  [INFO] Dataset not yet generated")

    # Section 3: Augmentation parameters
    print("\n" + "-" * 80)
    print("SECTION 3: AUGMENTATION PARAMETERS")
    print("-" * 80)

    print("  Image specifications:")
    print("    - Size: 28x28 pixels")
    print("    - Mode: Grayscale ('L')")
    print("    - Background: 220-255 (light)")
    print("    - Text: 0-50 (dark)")

    print("\n  Geometric augmentation:")
    print("    - Rotation: -15 to +15 degrees")
    print("    - Translation: -3 to +3 pixels")
    print("    - Scale: 60% to 85%")

    print("\n  Stroke simulation:")
    print("    - Thickness: 1 to 4 pixels")
    print("    - Gaussian blur: 0.0 to 1.5")
    print("    - Gaussian noise: sigma=5.0")
    print("    - Morphological operations: 30% probability")

    print("\n  Class balancing:")
    print("    - Underrepresented (x2.5): I, F, G, K, Q, X, Z")
    print("    - Overrepresented (x0.7): O, S")
    print("    - Standard (x1.0): Other characters")

    # Section 4: Generated files
    print("\n" + "-" * 80)
    print("SECTION 4: GENERATED FILES")
    print("-" * 80)

    print(f"  Dataset directory: {output_path.absolute()}")

    if output_path.exists():
        subdirs = [d for d in output_path.iterdir() if d.is_dir()]
        print(f"  Subdirectories: {len(subdirs)}")

        stats_file = output_path / "generation_stats.txt"
        if stats_file.exists():
            print(f"  Statistics file: {stats_file}")

    # Section 5: Recommendations
    print("\n" + "-" * 80)
    print("SECTION 5: TRAINING RECOMMENDATIONS")
    print("-" * 80)

    print("  Recommended preprocessing:")
    print("    - Normalization: pixels / 255.0 (range [0, 1])")
    print("    - Reshape for CNN: (N, 28, 28, 1)")
    print("    - Label encoding: One-hot or LabelEncoder")

    print("\n  Suggested CNN architecture:")
    print("    - Input: (28, 28, 1)")
    print("    - Conv2D -> BatchNorm -> ReLU -> MaxPool (x2-3)")
    print("    - Dropout: 0.25-0.5")
    print("    - Dense -> Softmax (26 classes)")

    print("\n  Initial hyperparameters:")
    print("    - Optimizer: Adam (lr=0.001)")
    print("    - Loss: categorical_crossentropy")
    print("    - Batch size: 32-64")
    print("    - Epochs: 20-50 with EarlyStopping")

    print("\n" + "=" * 80)
    print("END OF REPORT")
    print("=" * 80)


# -----------------------------------------------------------------------------
# EXECUTE REPORT
# -----------------------------------------------------------------------------

# Download statistics exist only if the font download cell was executed;
# a missing name simply yields None.
font_download_stats = globals().get('download_stats')

# Print the textual report
generate_final_report(OUTPUT_ROOT, LETTERS, font_stats=font_download_stats)

# Show image samples only when the dataset directory exists and is not empty
output_dataset_path = Path(OUTPUT_ROOT)
if not (output_dataset_path.exists() and any(output_dataset_path.iterdir())):
    print("\n[INFO] Dataset not yet generated. Execute generation cells first.")
else:
    print("\n[VISUALIZATION] Displaying dataset samples...")
    visualize_dataset_samples(OUTPUT_ROOT, LETTERS, num_samples=6)

## Copy .zip File to Google Drive

In [None]:
from google.colab import drive
import shutil
import os

# 1. Mount Google Drive (authorization window will appear)
drive.mount('/content/drive')

# --- CONFIGURATION ---
# Path to the archive created in Colab
# Example: '/content/dataset.zip' or just 'dataset.zip'
source_file = 'dataset_complet_v2.zip'

# Folder inside your Drive that receives the copy
# '/content/drive/My Drive/' is the root of your Drive
destination_folder = '/content/drive/My Drive/'
# ---------------------

# Create destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    print(f"Folder created: {destination_folder}")

# Copy file (status messages use properly encoded symbols)
print(f"Copying {source_file} to Drive in progress...")
try:
    shutil.copy(source_file, destination_folder)
    print("✅ Success! File copied to your Google Drive.")
except FileNotFoundError:
    print(f"❌ Error: Source file '{source_file}' not found. Check the name.")
except Exception as e:
    # Best-effort cell: report any other copy failure without a traceback
    print(f"❌ An error occurred: {e}")