# Preprocesamiento III

## 5. Data Augmentation

### 5.1 Aplicación

In [2]:
import os
import json
import cv2
import albumentations as A
from pathlib import Path
from tqdm import tqdm
from copy import deepcopy

In [17]:


def create_subset_data_augmentation(
    image_folder_path,
    coco_json_path,
    output_image_folder_path,
    output_json_path,
    num_augmentations
):
    """
    Build an augmented dataset from a COCO subset and write it to disk.

    Every annotated image listed in the COCO file is transformed
    ``num_augmentations`` times (shift/scale/rotate, brightness/contrast,
    motion blur). Each result is saved as ``<stem>_aug-<i><suffix>`` and
    described in a new COCO JSON that contains only the augmented images,
    their transformed bounding boxes and the original categories.

    Images that have no annotations, that cannot be read, or whose augmented
    copy cannot be written are skipped; the last two cases print a warning.

    Parameters:
        image_folder_path (str): Path to the original image folder.
        coco_json_path (str): Path to the original COCO JSON annotations.
        output_image_folder_path (str): Directory to save augmented images.
        output_json_path (str): Path to save the new COCO JSON file.
        num_augmentations (int): Number of augmentations to apply per image.
    """
    os.makedirs(output_image_folder_path, exist_ok=True)

    with open(coco_json_path, 'r', encoding='utf-8') as f:
        coco = json.load(f)

    transform = A.Compose([
        A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=10, p=0.9),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.5),
        A.MotionBlur(blur_limit=3, p=0.3)
    ], bbox_params=A.BboxParams(format='coco', label_fields=['category_ids']))

    new_images = []
    new_annotations = []
    # New ids continue after the largest id in the source file; default=0
    # keeps an empty "images"/"annotations" list from raising ValueError.
    new_image_id = max((img['id'] for img in coco['images']), default=0) + 1
    new_annotation_id = max((ann['id'] for ann in coco['annotations']), default=0) + 1

    # Group annotations by image so each image is looked up once.
    image_id_to_anns = {}
    for ann in coco['annotations']:
        image_id_to_anns.setdefault(ann['image_id'], []).append(ann)

    for img_info in tqdm(coco['images'], desc="Augmenting images"):
        # Check annotations first: no point decoding an image we will skip.
        anns = image_id_to_anns.get(img_info['id'], [])
        if not anns:
            continue

        img_path = os.path.join(image_folder_path, img_info['file_name'])
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread returns None instead of raising on a missing/corrupt file.
            tqdm.write(f"[WARNING] Could not read image, skipping: {img_path}")
            continue

        bboxes = [ann['bbox'] for ann in anns]
        category_ids = [ann['category_id'] for ann in anns]
        source_name = Path(img_info['file_name'])

        for i in range(1, num_augmentations + 1):
            transformed = transform(image=image, bboxes=bboxes, category_ids=category_ids)
            aug_file_name = f"{source_name.stem}_aug-{i}{source_name.suffix}"
            aug_path = os.path.join(output_image_folder_path, aug_file_name)

            # Only register the image if it was really written, so the JSON
            # never references a file that does not exist.
            if not cv2.imwrite(aug_path, transformed['image']):
                tqdm.write(f"[WARNING] Could not write image, skipping: {aug_path}")
                continue

            new_images.append({
                "id": new_image_id,
                "file_name": aug_file_name,
                "width": img_info['width'],
                "height": img_info['height']
            })

            for bbox, cat_id in zip(transformed['bboxes'], transformed['category_ids']):
                # Coerce to plain floats so the box is JSON-serialisable
                # whatever sequence/number type the transform returned.
                x, y, w, h = (float(v) for v in bbox[:4])
                new_annotations.append({
                    "id": new_annotation_id,
                    "image_id": new_image_id,
                    "category_id": cat_id,
                    "bbox": [x, y, w, h],
                    "iscrowd": 0,
                    "area": w * h
                })
                new_annotation_id += 1

            new_image_id += 1

    augmented_coco = {
        "images": new_images,
        "annotations": new_annotations,
        "categories": coco['categories']
    }

    # Create output folder for JSON if needed; a bare file name has an empty
    # dirname, and os.makedirs('') would raise FileNotFoundError.
    json_dir = os.path.dirname(output_json_path)
    if json_dir:
        os.makedirs(json_dir, exist_ok=True)

    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(augmented_coco, f, indent=2)

    print("Data augmentation complete and saved.")


In [18]:
# Produce four augmented variants of every annotated training image and
# store them, with their COCO annotations, under train_augmented/.
_train_augmentation_args = {
    'image_folder_path': 'datasets/unified_dataset/images',
    'coco_json_path': 'datasets/unified_dataset/subsets/train.json',
    'output_image_folder_path': 'datasets/unified_dataset/train_augmented/images',
    'output_json_path': 'datasets/unified_dataset/train_augmented/train_augmented.json',
    'num_augmentations': 4,
}
create_subset_data_augmentation(**_train_augmentation_args)

Augmenting images: 100%|██████████| 665/665 [06:04<00:00,  1.82it/s]

Data augmentation complete and saved.





### 5.2 Verificación de BBOX

In [12]:
import os
from PIL import Image, ImageDraw
from pycocotools.coco import COCO
import random

In [19]:
def save_coco_bboxes_to_images(image_dir, annotation_path, output_dir='output', max_images=None):
    """
    Save images with bounding boxes drawn from COCO annotations.

    Each selected image is loaded, converted to RGB, every annotation's
    ``bbox`` (``[x, y, width, height]``) is outlined in red, and the result
    is written to ``output_dir`` under the image's original ``file_name``.
    Images listed in the annotation file but missing on disk are reported
    with a warning and skipped.

    Parameters:
        image_dir (str): Directory containing the images.
        annotation_path (str): Full path to the COCO annotation file.
        output_dir (str): Directory to save output images with drawn bounding boxes.
        max_images (int, optional): If provided, randomly selects up to this number of images (without repetition).
    """
    os.makedirs(output_dir, exist_ok=True)
    coco = COCO(annotation_path)
    image_ids = coco.getImgIds()

    # Sample only when the limit is smaller than the dataset; otherwise
    # random.sample would raise for a sample larger than the population.
    if max_images is not None and max_images < len(image_ids):
        image_ids = random.sample(image_ids, max_images)

    for img_id in image_ids:
        img_data = coco.loadImgs(img_id)[0]
        img_path = os.path.join(image_dir, img_data['file_name'])
        output_path = os.path.join(output_dir, img_data['file_name'])

        if not os.path.exists(img_path):
            print(f"[WARNING] Image file not found: {img_path}")
            continue

        # convert() returns a new in-memory image, so the source file can be
        # closed deterministically as soon as the pixels have been loaded.
        with Image.open(img_path) as source:
            image = source.convert("RGB")
        draw = ImageDraw.Draw(image)

        for ann in coco.loadAnns(coco.getAnnIds(imgIds=img_id)):
            x, y, w, h = ann['bbox']
            # COCO boxes are (x, y, width, height); PIL wants two corners.
            draw.rectangle([x, y, x + w, y + h], outline='red', width=3)

        # file_name may contain sub-folders; make sure the target one exists.
        output_parent = os.path.dirname(output_path)
        if output_parent:
            os.makedirs(output_parent, exist_ok=True)
        image.save(output_path)


In [20]:
# Visual sanity check: draw the augmented annotations on a random sample of
# ten augmented images and save them to a separate folder for inspection.
_bbox_check_args = {
    'image_dir': 'datasets/unified_dataset/train_augmented/images',
    'annotation_path': 'datasets/unified_dataset/train_augmented/train_augmented.json',
    'output_dir': 'datasets/unified_dataset/train_augmented/images_bbox_test',
    'max_images': 10,
}
save_coco_bboxes_to_images(**_bbox_check_args)


loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
