# Create Weather-Augmented Text Dataset

This notebook creates a combined dataset by applying weather effects to text images.
This bridges the domain gap between weather classification and text recognition.


In [None]:
import cv2
import numpy as np
from pathlib import Path
import random


# ---------------------------------------------------------
# INPUT / OUTPUT FOLDERS
# ---------------------------------------------------------
input_folder = Path(r"C:\Users\HP\Downloads\nlp-all\NLP-20251121T193818Z-1-001\NLP\ICDAR_update\Test\ch4_test_images")
output_folder = input_folder.parent / "ICDAR_update_augmented"

for sub in ["snow", "fog", "rain", "haze"]:
    (output_folder / sub).mkdir(parents=True, exist_ok=True)


# ---------------------------------------------------------
# REALISTIC EFFECT FUNCTIONS
# ---------------------------------------------------------

# -------------------------- SNOW -------------------------
def add_realistic_snow(img):
    h, w = img.shape[:2]
    snow = img.copy()

    snow_layer = np.zeros((h, w, 3), dtype=np.uint8)

    # Generate snowflakes
    num_flakes = int((h * w) / 800)  # density
    for _ in range(num_flakes):
        x = random.randint(0, w - 1)
        y = random.randint(0, h - 1)
        radius = random.randint(1, 3)
        cv2.circle(snow_layer, (x, y), radius, (255, 255, 255), -1)

    # Add motion blur (falling effect)
    k = random.randint(5, 15)
    kernel_motion = np.zeros((k, k))
    kernel_motion[:, int((k - 1) / 2)] = np.ones(k)
    kernel_motion /= k

    snow_layer = cv2.filter2D(snow_layer, -1, kernel_motion)

    return cv2.addWeighted(snow, 0.8, snow_layer, 0.4, 0)


# -------------------------- FOG -------------------------
def generate_perlin_noise(h, w, scale=50):
    noise = np.zeros((h, w))
    for y in range(h):
        for x in range(w):
            noise[y][x] = random.uniform(0, 1)

    noise = cv2.resize(noise, (w, h))
    return cv2.GaussianBlur(noise, (0, 0), sigmaX=scale)


def add_realistic_fog(img):
    h, w = img.shape[:2]
    fog = img.copy().astype(np.float32)

    fog_map = generate_perlin_noise(h, w, scale=80)
    fog_map = cv2.normalize(fog_map, None, 0.4, 1.0, cv2.NORM_MINMAX)

    fog_layer = np.dstack([fog_map] * 3) * 255

    return cv2.addWeighted(fog, 0.6, fog_layer, 0.4, 0).astype(np.uint8)


# -------------------------- RAIN -------------------------
def add_realistic_rain(img):
    h, w = img.shape[:2]
    rain_layer = np.zeros((h, w, 3), dtype=np.uint8)

    num_drops = int((h * w) / 600)

    for _ in range(num_drops):
        x = random.randint(0, w - 1)
        y = random.randint(0, h - 1)
        
        length = random.randint(10, 20)
        thickness = random.randint(1, 2)

        end_x = x + random.randint(-2, 2)
        end_y = y + length

        cv2.line(rain_layer, (x, y), (end_x, end_y), (255, 255, 255), thickness)

    # motion blur for rain streak direction
    kernel = np.zeros((15, 15))
    kernel[7, :] = np.ones(15)
    kernel /= 15
    rain_layer = cv2.filter2D(rain_layer, -1, kernel)

    return cv2.addWeighted(img, 0.75, rain_layer, 0.25, 0)


# -------------------------- HAZE -------------------------
def add_realistic_haze(img):
    haze = img.astype(np.float32)
    h, w = haze.shape[:2]

    haze_factor = random.uniform(0.3, 0.6)
    light_color = np.array([255, 255, 215], dtype=np.float32)  # warm haze

    haze_layer = np.ones_like(haze) * light_color

    result = cv2.addWeighted(haze, 1 - haze_factor, haze_layer, haze_factor, 0)
    return result.astype(np.uint8)


# ---------------------------------------------------------
# PROCESS ALL IMAGES
# ---------------------------------------------------------

image_files = list(input_folder.glob("*.jpg"))
print(f"Found {len(image_files)} images.")

for img_path in image_files:
    img = cv2.imread(str(img_path))
    if img is None:
        continue

    name = img_path.stem

    snow = add_realistic_snow(img)
    fog = add_realistic_fog(img)
    rain = add_realistic_rain(img)
    haze = add_realistic_haze(img)

    cv2.imwrite(str(output_folder / "snow" / f"{name}_snow.jpg"), snow)
    cv2.imwrite(str(output_folder / "fog" / f"{name}_fog.jpg"), fog)
    cv2.imwrite(str(output_folder / "rain" / f"{name}_rain.jpg"), rain)
    cv2.imwrite(str(output_folder / "haze" / f"{name}_haze.jpg"), haze)

print("✨ Realistic weather augmentation completed!")
print("Images saved to:", output_folder)


Found 0 images.
✨ Realistic weather augmentation completed!
Images saved to: C:\Users\HP\Downloads\nlp-all\NLP-20251121T193818Z-1-001\NLP\ICDAR_update_augmented


In [5]:
import sys
sys.path.append('src')
from pathlib import Path
import json
import cv2
import numpy as np
from tqdm import tqdm
from collections import Counter

sys.path.append(str(Path.cwd().parent))
import config
from src.weather_augmentation import apply_weather_effect
from src.data_loader import ICDARDataset

# Output directory
output_dir = config.PROCESSED_DATA_DIR / 'weather_augmented_text'
output_dir.mkdir(parents=True, exist_ok=True)

# Weather types to apply
weather_types = ['clear', 'rain', 'fog', 'snow', 'haze']
intensities = {'clear': 0.3, 'rain': 0.5, 'fog': 0.6, 'snow': 0.5, 'haze': 0.5}

print(f"Creating weather-augmented text dataset in: {output_dir}")


Creating weather-augmented text dataset in: c:\Users\HP\Downloads\nlp-all\NLP-20251121T193818Z-1-001\NLP\outputs\processed_data\weather_augmented_text


In [6]:
# Load ICDAR dataset (without transform to get raw images)
icdar_dataset = ICDARDataset(
    config.ICDAR_DIR / 'ic15_train.json',
    config.ICDAR_DIR / 'train_images',
    transform=None
)

print(f"Loaded {len(icdar_dataset)} ICDAR images")
print(f"Will create {len(icdar_dataset) * len(weather_types)} augmented images")


Loaded 167 ICDAR images
Will create 835 augmented images


In [8]:
# Create augmented dataset
augmented_data = []

# Process ALL images for comprehensive dataset
num_samples = len(icdar_dataset)
print(f"Processing {num_samples} images with {len(weather_types)} weather types each...")
print(f"Will create {num_samples * len(weather_types)} total augmented images")

for idx in tqdm(range(num_samples), desc="Augmenting"):
    try:
        img, text = icdar_dataset[idx]
        
        # Use placeholder when text is missing so augmentation still runs
        if not text or len(text.strip()) == 0:
            text = ""
        
        # Convert PIL to numpy
        img_np = np.array(img)
        
        # Apply each weather type
        for weather in weather_types:
            intensity = intensities.get(weather, 0.5)
            augmented_img = apply_weather_effect(img_np.copy(), weather, intensity)
            
            # Save augmented image
            filename = f"icdar_{idx:05d}_{weather}.jpg"
            save_path = output_dir / filename
            cv2.imwrite(str(save_path), cv2.cvtColor(augmented_img, cv2.COLOR_RGB2BGR))
            
            # Store metadata
            augmented_data.append({
                'image_path': str(save_path),
                'original_idx': idx,
                'weather': weather,
                'text': text,
                'intensity': intensity
            })
    except Exception as e:
        print(f"Error processing image {idx}: {e}")
        continue

print(f"\nCreated {len(augmented_data)} augmented images")
print(f"Average {len(augmented_data) / len(weather_types):.0f} images per weather type")


Processing 167 images with 5 weather types each...
Will create 835 total augmented images


Augmenting: 100%|██████████| 167/167 [20:04<00:00,  7.21s/it] 


Created 835 augmented images
Average 167 images per weather type





In [9]:
# Save metadata
metadata_path = output_dir / 'metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(augmented_data, f, indent=2)

print(f"Saved metadata to {metadata_path}")
print(f"\nDataset statistics:")
print(f"  Total images: {len(augmented_data)}")
print(f"  Weather distribution:")
weather_counts = Counter([d['weather'] for d in augmented_data])
for weather, count in weather_counts.items():
    print(f"    {weather}: {count}")


Saved metadata to c:\Users\HP\Downloads\nlp-all\NLP-20251121T193818Z-1-001\NLP\outputs\processed_data\weather_augmented_text\metadata.json

Dataset statistics:
  Total images: 835
  Weather distribution:
    clear: 167
    rain: 167
    fog: 167
    snow: 167
    haze: 167
