# Data Generation Pipeline — Fine-Grained Sans-Serif Recognition

This notebook generates synthetic text images using **TextRecognitionDataGenerator (TRDG)** for **20 popular sans-serif font families**. Each image contains a short text phrase rendered in one of the target fonts on a clean white background — simulating web / digital poster screenshots.

**Pipeline steps:**
1. Install dependencies & configure paths
2. Build a reproducible text corpus
3. Generate images per font class using TRDG (varied sizes & colors)
4. Split into **80 / 20** train / test sets
5. Verify dataset statistics & preview samples

In [None]:
import os
import random
import shutil
from glob import glob
from collections import defaultdict

import numpy as np
from PIL import Image
from tqdm import tqdm

from trdg.generators import GeneratorFromStrings
import matplotlib.pyplot as plt

## 1 · Configuration & Setup

Define the font registry, output paths, and generation hyper-parameters.  
The notebook **auto-detects** whether it is running on Kaggle or locally and selects the matching paths.

In [None]:
# Environment Detection
# The presence of a top-level /kaggle directory is used as the Kaggle marker.
IS_KAGGLE = os.path.exists("/kaggle")
print("Running on: " + ("Kaggle" if IS_KAGGLE else "Local"))

# (font source directory, output root) for the detected environment.
FONT_DIR, BASE_OUTPUT = (
    ("/kaggle/input/sans-serif-raw-fonts", "/kaggle/working/processed")  # ← Kaggle dataset name
    if IS_KAGGLE
    else ("../data/raw_fonts", "../data/processed")
)

# Final split folders, plus the staging folder that holds every image before the split.
TRAIN_DIR, TEST_DIR, TEMP_DIR = (
    os.path.join(BASE_OUTPUT, leaf) for leaf in ("train", "test", "_all")
)

# Font Registry
# Maps each class label (font family) to its font file. Every family uses its
# "<Family>-Regular.ttf" file, so the file names are derived from the labels.
FONTS = {
    family: f"{family}-Regular.ttf"
    for family in (
        "DMSans", "Figtree", "FiraSans", "Heebo", "Hind",
        "Inter", "Karla", "Lato", "Montserrat", "Mulish",
        "NotoSans", "OpenSans", "Poppins", "PTSans", "Raleway",
        "Roboto", "Rubik", "SourceSans3", "Ubuntu", "WorkSans",
    )
}

# Generation Parameters
IMAGES_PER_CLASS = 5000                    # images rendered for every font family
TRAIN_RATIO      = 0.8                     # fraction of each class that goes to train/
RANDOM_SEED      = 42                      # seed reused by the corpus, generation and split cells
SIZES       = [32, 48, 64, 80, 96]         # values passed to TRDG as `size`, one batch per entry
TEXT_COLORS = ["#000000", "#1a1a1a", "#333333", "#4a4a4a"]

# Verify Fonts
# Fail fast, listing every registered font file that is absent from FONT_DIR.
missing = [
    font_file
    for font_file in FONTS.values()
    if not os.path.isfile(os.path.join(FONT_DIR, font_file))
]
if missing:
    raise FileNotFoundError(f"Missing font files in {FONT_DIR}: {missing}")

# Summary of the run that is about to start.
print(
    f"✓ All {len(FONTS)} font files found in {FONT_DIR}",
    f"  Images per class : {IMAGES_PER_CLASS}",
    f"  Total images     : {IMAGES_PER_CLASS * len(FONTS):,}",
    f"  Train / Test     : {TRAIN_RATIO:.0%} / {1 - TRAIN_RATIO:.0%}",
    sep="\n",
)

## 2 · Text Corpus

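A pool of **about 130 common English words** (web / UI / tech themed) is randomly combined into 1–5 word phrases. Roughly 35 % of the phrases are title-cased, 15 % are upper-cased, and the rest stay lower-case.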

In [None]:
# Vocabulary for the synthetic phrases, assembled from themed groups.
# The concatenation order is significant: phrases are drawn by position, so
# reordering the groups or their words changes the generated corpus.

# Pangram and everyday function words.
_COMMON_WORDS = [
    "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
    "hello", "world", "from", "with", "this", "that", "have", "been",
    "will", "your", "about", "more", "when", "which", "their", "what",
]

# Website / UI vocabulary.
_WEB_UI_WORDS = [
    "design", "modern", "creative", "digital", "website", "portfolio",
    "services", "products", "features", "pricing", "contact", "gallery",
    "home", "blog", "news", "events", "team", "careers", "support",
    "learn", "explore", "discover", "subscribe", "download", "search",
    "settings", "profile", "login", "register", "welcome", "dashboard",
    "analytics", "navigation", "header", "footer", "sidebar", "content",
]

# Technology vocabulary.
_TECH_WORDS = [
    "innovation", "technology", "platform", "system", "software",
    "algorithm", "database", "framework", "interface", "network",
    "security", "cloud", "machine", "learning", "artificial",
    "intelligence", "automation", "optimize", "scalable", "robust",
]

# Business vocabulary.
_BUSINESS_WORDS = [
    "company", "business", "startup", "enterprise", "agency",
    "strategy", "marketing", "branding", "growth", "revenue",
    "customer", "solution", "premium", "standard", "professional",
]

# Descriptive adjectives.
_ADJECTIVES = [
    "beautiful", "minimal", "elegant", "simple", "clean", "bold",
    "dynamic", "responsive", "seamless", "powerful", "intuitive",
    "efficient", "reliable", "flexible", "innovative", "sustainable",
]

# Abstract nouns and action verbs.
_CONCEPT_WORDS = [
    "experience", "performance", "development", "engineering", "research",
    "education", "community", "global", "future", "vision", "mission",
    "quality", "excellence", "progress", "success", "journey",
    "transform", "connect", "build", "create", "launch", "deliver",
]

WORD_POOL = (
    _COMMON_WORDS
    + _WEB_UI_WORDS
    + _TECH_WORDS
    + _BUSINESS_WORDS
    + _ADJECTIVES
    + _CONCEPT_WORDS
)

def build_text_corpus(
    n: int,
    seed: int = 42,
    word_pool: "list[str] | None" = None,
) -> list[str]:
    """Build a reproducible list of ``n`` short random phrases.

    Each phrase is 1–5 words drawn with replacement from the vocabulary and
    joined by single spaces. About 35 % of phrases are title-cased, about
    15 % are upper-cased, and the remainder keep the vocabulary's casing.

    Args:
        n: Number of phrases to produce; values <= 0 yield an empty list.
        seed: Seed for the private ``random.Random`` instance, so the result
            does not depend on (or disturb) the global ``random`` state.
        word_pool: Vocabulary to draw from. Defaults to the module-level
            ``WORD_POOL`` when omitted.

    Returns:
        A list of ``n`` phrases; identical for identical arguments.

    Raises:
        IndexError: If the vocabulary is empty and ``n`` is positive.
    """
    pool = WORD_POOL if word_pool is None else word_pool
    rng = random.Random(seed)
    texts = []
    for _ in range(n):
        # Draw order (length, words, casing) is fixed: changing it would
        # change the corpus produced for an existing seed.
        num_words = rng.randint(1, 5)
        phrase = " ".join(rng.choices(pool, k=num_words))
        r = rng.random()
        if r < 0.35:
            phrase = phrase.title()
        elif r < 0.50:
            phrase = phrase.upper()
        texts.append(phrase)
    return texts

# Sanity check: print a handful of generated phrases, one per line, in quotes.
_preview = build_text_corpus(8, seed=RANDOM_SEED)
print("\n".join(f"  \"{phrase}\"" for phrase in _preview))

## 3 · Image Generation with TRDG

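For each of the 20 font classes, a total of **5,000 images** are generated, distributed evenly across **5 size batches** (1,000 images per height variant: `32`, `48`, `64`, `80`, and `96 px`). Each size batch is rendered in a single text colour, cycling through the four near-black shades in `TEXT_COLORS`.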

In [None]:
# Seed the global RNGs before rendering so repeated runs start from the same state.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# One shared corpus: every font renders the same phrases.
texts = build_text_corpus(IMAGES_PER_CLASS, seed=RANDOM_SEED)

# Base batch length per size. Any remainder is spread over the first batches so
# that exactly IMAGES_PER_CLASS images are produced even when the class size is
# not a multiple of len(SIZES) (previously the remainder was silently dropped).
BATCH_SIZE, _remainder = divmod(IMAGES_PER_CLASS, len(SIZES))
batch_sizes = [BATCH_SIZE + (1 if i < _remainder else 0) for i in range(len(SIZES))]

print(f"Generating {IMAGES_PER_CLASS} images × {len(FONTS)} fonts "
      f"= {IMAGES_PER_CLASS * len(FONTS):,} total")
print(f"  {len(SIZES)} size variants × {BATCH_SIZE} images each\n")
if _remainder:
    print(f"  (+1 image in the first {_remainder} size batches to reach {IMAGES_PER_CLASS})\n")

total_saved = 0

for font_name, font_file in tqdm(FONTS.items(), desc="Fonts", unit="font"):
    font_path = os.path.join(FONT_DIR, font_file)
    out_dir   = os.path.join(TEMP_DIR, font_name)
    # Start from an empty staging folder: files left behind by an earlier run
    # (e.g. one with a larger IMAGES_PER_CLASS) must not leak into the split.
    shutil.rmtree(out_dir, ignore_errors=True)
    os.makedirs(out_dir, exist_ok=True)

    img_idx = 0      # images written for this font; also the file-name counter
    skipped = 0      # items for which no image was returned
    start = 0        # offset of the current batch inside `texts`
    for size_i, (size, batch_len) in enumerate(zip(SIZES, batch_sizes)):
        # Slice the text corpus for this batch
        batch_texts = texts[start:start + batch_len]
        start += batch_len
        if not batch_texts:
            continue  # nothing to render (fewer images requested than sizes)

        # One colour per size batch, cycling through TEXT_COLORS.
        color = TEXT_COLORS[size_i % len(TEXT_COLORS)]

        generator = GeneratorFromStrings(
            strings=batch_texts,
            fonts=[font_path],
            count=len(batch_texts),
            size=size,
            skewing_angle=0,
            random_skew=False,
            blur=0,
            random_blur=False,
            background_type=1,        # plain white
            text_color=color,
            margins=(5, 5, 5, 5),
            fit=True,
        )

        for img, _ in generator:
            # NOTE(review): some TRDG versions appear to return None for
            # images they reject — confirm against the installed version.
            if img is None:
                skipped += 1
                continue
            img.save(os.path.join(out_dir, f"{font_name}_{img_idx:04d}.png"))
            img_idx += 1

    total_saved += img_idx
    tqdm.write(f"  ✓ {font_name:<14s} — {img_idx} images saved")
    if skipped:
        tqdm.write(f"    ! {font_name:<14s} — {skipped} items produced no image")

# Report what was actually written rather than the theoretical total.
print(f"\n✓ Generation complete: {total_saved:,} images → {TEMP_DIR}")

## 4 · Train / Test Split

Shuffle each class and split **80 / 20** into `train/` and `test/` directories.  
The temporary `_all/` staging folder is removed afterwards to save disk space.

In [None]:
random.seed(RANDOM_SEED)

# Fail early — before any output folder is touched — when there is nothing to
# split (e.g. this cell is re-run after the staging folder was already removed).
if not os.path.isdir(TEMP_DIR):
    raise FileNotFoundError(
        f"Staging directory not found: {TEMP_DIR} (run the generation cell first)"
    )

# Pass 1 — plan the split for every class. Fonts are visited in sorted order and
# shuffled with the seeded global RNG, so the assignment is reproducible.
split_plan = {}
for font_name in sorted(FONTS.keys()):
    src_dir = os.path.join(TEMP_DIR, font_name)
    images  = sorted(glob(os.path.join(src_dir, "*.png")))
    if not images:
        raise FileNotFoundError(f"No staged images for font '{font_name}' in {src_dir}")
    random.shuffle(images)

    split_idx = int(len(images) * TRAIN_RATIO)
    split_plan[font_name] = (images[:split_idx], images[split_idx:])

print("Splitting dataset …\n")

train_total = 0
test_total  = 0

# Pass 2 — materialise the plan.
for font_name, (train_imgs, test_imgs) in split_plan.items():
    for dest_root, subset in ((TRAIN_DIR, train_imgs), (TEST_DIR, test_imgs)):
        dest_dir = os.path.join(dest_root, font_name)
        # Reset the class folder first: leftovers from a previous run with a
        # different seed / ratio / class size could otherwise put the same
        # image in both train/ and test/.
        shutil.rmtree(dest_dir, ignore_errors=True)
        os.makedirs(dest_dir, exist_ok=True)
        for img_path in subset:
            shutil.copy2(img_path, os.path.join(dest_dir, os.path.basename(img_path)))

    train_total += len(train_imgs)
    test_total  += len(test_imgs)
    print(f"  {font_name:<14s} — train: {len(train_imgs):>4d}   test: {len(test_imgs):>3d}")

# Clean up staging directory
shutil.rmtree(TEMP_DIR)

print("\n✓ Split complete")
print(f"  Train : {train_total:,} images → {TRAIN_DIR}")
print(f"  Test  : {test_total:,} images  → {TEST_DIR}")

## 5 · Dataset Verification

Count images per class in both splits and display one sample per font (the first training image in sorted filename order).

In [None]:
# Per-class counts
print(" DATASET SUMMARY")

for split_name, split_dir in (("Train", TRAIN_DIR), ("Test", TEST_DIR)):
    print(f"\n  {split_name} set:")
    # Number of PNG files found in each class folder of this split.
    per_class = {
        font_name: len(glob(os.path.join(split_dir, font_name, "*.png")))
        for font_name in sorted(FONTS.keys())
    }
    for font_name, count in per_class.items():
        print(f"    {font_name:<14s}: {count}")
    total = sum(per_class.values())
    print(f"    {'TOTAL':<14s}: {total}")

# Sample grid
# The grid is sized from the registry (5 columns, as many rows as needed)
# instead of a fixed 4 × 5, so changing the number of fonts cannot index past it.
font_names = sorted(FONTS.keys())
n_cols = 5
n_rows = max(1, -(-len(font_names) // n_cols))   # ceiling division

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows), squeeze=False)
fig.suptitle("Sample Generated Images  (one per font family)", fontsize=16, y=1.01)

for idx, font_name in enumerate(font_names):
    ax = axes[idx // n_cols, idx % n_cols]
    sample_dir  = os.path.join(TRAIN_DIR, font_name)
    sample_imgs = sorted(glob(os.path.join(sample_dir, "*.png")))
    if sample_imgs:
        # Show the first file in sorted order; the context manager closes the
        # file handle once the image has been handed to matplotlib.
        with Image.open(sample_imgs[0]) as img:
            ax.imshow(img)
    ax.set_title(font_name, fontsize=12, fontweight="bold")
    ax.axis("off")

# Hide the cells of the last row that have no font assigned.
for ax in axes.ravel()[len(font_names):]:
    ax.axis("off")

plt.tight_layout()
plt.show()

## 6 · Export (Kaggle only)

Zip the processed dataset so it can be downloaded from Kaggle or attached to another notebook as a dataset.

In [None]:
# Archive the processed dataset when running on Kaggle; otherwise do nothing.
if not IS_KAGGLE:
    print("Skipping zip — not running on Kaggle.")
else:
    import zipfile

    zip_path = "/kaggle/working/sans_serif_dataset.zip"
    print("Creating zip archive …")

    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for folder, _subdirs, filenames in os.walk(BASE_OUTPUT):
            for filename in filenames:
                file_path = os.path.join(folder, filename)
                # Store entries relative to BASE_OUTPUT so the archive does
                # not embed the absolute working path.
                archive.write(file_path, os.path.relpath(file_path, BASE_OUTPUT))

    size_mb = os.path.getsize(zip_path) / (1024 * 1024)
    print(f"✓ Saved {zip_path}  ({size_mb:.1f} MB)")