# 1. Prueba - Data Augmentation 1 file

In [1]:
import os
import csv

In [3]:


# Set the directory path
#image_dir = 'ANPR2.v1i.yolov8/train_del/images'
image_dir = 'Peru Plate Numbers.v3i.yolov8/train_del/images'

# Supported image extensions
image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')

# Get list of image file names
image_files = [f for f in os.listdir(image_dir) if f.lower().endswith(image_extensions)]

# Write to CSV
with open('image_files.csv', mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for filename in image_files:
        writer.writerow([filename])


# 2. Aplicar format COCO 

## 2.1 Limpiar JSON

In [2]:
import json
import os
from pathlib import Path
from shutil import copyfile

In [3]:
def filter_coco_by_images(coco_path, image_dir, output_path):
    with open(coco_path, 'r') as f:
        data = json.load(f)

    valid_filenames = {f.name for f in Path(image_dir).glob("*")}
    image_id_map = {
        img['id']: img for img in data['images']
        if img['file_name'] in valid_filenames
    }

    data['images'] = list(image_id_map.values())
    valid_ids = set(image_id_map.keys())
    data['annotations'] = [
        ann for ann in data['annotations'] if ann['image_id'] in valid_ids
    ]

    with open(output_path, 'w') as f:
        json.dump(data, f, indent=2)
    print(f"Saved filtered annotations to {output_path}")

def check_duplicate_filenames(*dirs):
    seen = {}
    duplicates = []

    for dir_path in dirs:
        for img_file in Path(dir_path).glob("*"):
            fname = img_file.name
            if fname in seen:
                duplicates.append((fname, seen[fname], dir_path))
            else:
                seen[fname] = dir_path

    if duplicates:
        print("Duplicate image filenames found:")
        for fname, dir1, dir2 in duplicates:
            print(f"{fname} found in:\n - {dir1}\n - {dir2}\n")
    else:
        print("No duplicate image filenames found.")



In [10]:
# Paths
image_dir_1 = Path("Peru Plate Numbers.v3i.yolov8/train_filter_v7/images")
image_dir_2 = Path("ANPR2.v1i.yolov8/train_stage7_options/images")
coco_1_train = Path("Peru Plate Numbers.v3i.coco/train/_annotations.coco.json")
coco_2_train = Path("ANPR2.v1i.coco/train/_annotations.coco.json")

# Step 1 & 2
filter_coco_by_images(coco_1_train, image_dir_1, "PPN_train_coco_annotations.coco.json")
filter_coco_by_images(coco_2_train, image_dir_2, "ANPR2_train_coco_annotations.coco.json")

# Step 3
test_valid_dirs = [
    "Peru Plate Numbers.v3i.coco/test",
    "Peru Plate Numbers.v3i.coco/valid",
    "ANPR2.v1i.coco/test",
    "ANPR2.v1i.coco/valid"
]
check_duplicate_filenames(image_dir_1, image_dir_2, *test_valid_dirs)

Saved filtered annotations to PPN_train_coco_annotations.coco.json
Saved filtered annotations to ANPR2_train_coco_annotations.coco.json
Duplicate image filenames found:
_annotations.coco.json found in:
 - Peru Plate Numbers.v3i.coco/test
 - Peru Plate Numbers.v3i.coco/valid

_annotations.coco.json found in:
 - Peru Plate Numbers.v3i.coco/test
 - ANPR2.v1i.coco/test

_annotations.coco.json found in:
 - Peru Plate Numbers.v3i.coco/test
 - ANPR2.v1i.coco/valid



In [4]:
# Paths
image_dir_3 = Path("ANPR2.v1i.yolov8/test_v2/images")
image_dir_4 = Path("ANPR2.v1i.yolov8/valid_v2/images")
coco_3 = Path("Peru Plate Numbers.v3i.coco/test/_annotations.coco.json")
coco_4 = Path("ANPR2.v1i.coco/valid/_annotations.coco.json")

# Step 1 & 2
filter_coco_by_images(coco_3, image_dir_3, "PPN_test_coco_annotations.coco.json")
filter_coco_by_images(coco_4, image_dir_4, "ANPR2_valid_coco_annotations.coco.json")

Saved filtered annotations to PPN_test_coco_annotations.coco.json
Saved filtered annotations to ANPR2_valid_coco_annotations.coco.json


In [None]:
# Paths
image_dir_3 = Path("ANPR2.v1i.yolov8/test_v2/images")
image_dir_4 = Path("ANPR2.v1i.yolov8/valid_v2/images")
coco_3 = Path("Peru Plate Numbers.v3i.coco/test/_annotations.coco.json")
coco_4 = Path("ANPR2.v1i.coco/valid/_annotations.coco.json")

# Step 1 & 2
filter_coco_by_images(coco_3, image_dir_3, "PPN_test_coco_annotations.coco.json")
filter_coco_by_images(coco_4, image_dir_4, "ANPR2_valid_coco_annotations.coco.json")

## 2.2 Mostrar frecuencias de las categorias

In [9]:
import json
from collections import Counter

def display_category_frequencies_with_ids(annotation_path):
    with open(annotation_path, 'r') as f:
        data = json.load(f)

    category_counts = Counter(ann['category_id'] for ann in data['annotations'])
    category_names = {cat['id']: cat['name'] for cat in data['categories']}

    print(f"Frequencies in {annotation_path}:")
    for cat_id, count in category_counts.items():
        name = category_names.get(cat_id, f"Unknown")
        print(f"  ID {cat_id} - {name}: {count}")
    print()

In [None]:
display_category_frequencies_with_ids("PPN_train_coco_annotations.coco.json")
display_category_frequencies_with_ids("ANPR2_train_coco_annotations.coco.json")

Frequencies in PPN_coco_annotations.coco.json:
  ID 1 - Placa: 774
  ID 2 - placa: 137

Frequencies in ANPR2_coco_annotations.coco.json:
  ID 1 - placa: 679



## 2.3 Uniformizar "category"
{"id": 0, "name": "plate", "supercategory": "none"}

In [11]:
import json

def unify_category(annotation_path, output_path):
    with open(annotation_path, 'r') as f:
        data = json.load(f)

    # Replace categories with a single one
    data['categories'] = [{"id": 0, "name": "plate", "supercategory": "none"}]

    # Set all annotations to category_id 0
    for ann in data['annotations']:
        ann['category_id'] = 0

    with open(output_path, 'w') as f:
        json.dump(data, f, indent=2)
    print(f"Saved unified category annotations to {output_path}")

# Process both files
unify_category("PPN_train_coco_annotations.coco.json", "PPN_fixed_train_coco_annotations.coco.json")
unify_category("ANPR2_train_coco_annotations.coco.json", "ANPR2_fixed_train_coco_annotations.coco.json")


Saved unified category annotations to PPN_fixed_train_coco_annotations.coco.json
Saved unified category annotations to ANPR2_fixed_train_coco_annotations.coco.json


In [None]:
#unify_category("ANPR2.v1i.coco/test/_annotations.coco.json", "ANPR2_fixed_test_coco_annotations.coco.json")
#unify_category("ANPR2.v1i.coco/valid/_annotations.coco.json", "ANPR2_fixed_valid_coco_annotations.coco.json")
#unify_category("Peru Plate Numbers.v3i.coco/test/_annotations.coco.json", "PPN_fixed_test_coco_annotations.coco.json")
#unify_category("Peru Plate Numbers.v3i.coco/valid/_annotations.coco.json", "PPN_fixed_valid_coco_annotations.coco.json")

Saved unified category annotations to ANPR2_fixed_test_coco_annotations.coco.json
Saved unified category annotations to ANPR2_fixed_valid_coco_annotations.coco.json
Saved unified category annotations to PPN_fixed_test_coco_annotations.coco.json
Saved unified category annotations to PPN_fixed_valid_coco_annotations.coco.json


## 2.4 Renombrar nombres de los archivos de imagen

In [7]:
import os
import json
import re
from pathlib import Path

def rename_images_and_update_coco(image_dir, coco_json_path, new_json_name):
    image_dir = Path(image_dir)
    coco_json_path = Path(coco_json_path)
    new_json_path = coco_json_path.parent / new_json_name

    # Load original JSON
    with open(coco_json_path, 'r') as f:
        coco = json.load(f)

    # Build mapping from old name to new name using regex
    name_map = {}
    for img_file in image_dir.glob("*.*"):
        old_name = img_file.name
        base, ext = os.path.splitext(old_name)

        match = re.match(r"(.+?)_?(?:jpg|jpeg|png)?\.rf\.[^.]+", base)
        if not match:
            continue  # skip if pattern doesn't match

        new_stem = match.group(1)
        new_name = f"{new_stem}{ext}"
        name_map[old_name] = new_name

        # Rename file
        img_file.rename(image_dir / new_name)

    # Update JSON
    updated_coco = coco.copy()
    for img in updated_coco['images']:
        old_name = Path(img['file_name']).name
        if old_name in name_map:
            img['file_name'] = name_map[old_name]

    # Save updated JSON
    with open(new_json_path, 'w') as f:
        json.dump(updated_coco, f, indent=2)

    print(f"Renamed {len(name_map)} images and saved updated JSON to {new_json_path}")


In [8]:
rename_images_and_update_coco(
    image_dir="ANPR2.v1i.yolov8/train_stage8_options/images",
    coco_json_path="ANPR2_fixed_train_coco_annotations.coco.json",
    new_json_name="ANPR2_renamed_train_coco_annotations.coco.json"
)

Renamed 437 images and saved updated JSON to ANPR2_renamed_train_coco_annotations.coco.json


In [12]:
rename_images_and_update_coco(
    image_dir="Peru Plate Numbers.v3i.yolov8/train_filter_v8/images",
    coco_json_path="PPN_fixed_train_coco_annotations.coco.json",
    new_json_name="PPN_renamed_train_coco_annotations.coco.json"
)

Renamed 450 images and saved updated JSON to PPN_renamed_train_coco_annotations.coco.json


## 2.5 Ver repetidos en train

In [14]:
import json
from pathlib import Path

def find_repeated_filenames(json_path_1, json_path_2, output_txt_path):
    # Load both JSONs
    with open(json_path_1, 'r') as f1, open(json_path_2, 'r') as f2:
        coco1 = json.load(f1)
        coco2 = json.load(f2)

    # Extract image filenames
    filenames1 = {Path(img['file_name']).name for img in coco1['images']}
    filenames2 = {Path(img['file_name']).name for img in coco2['images']}

    # Find duplicates
    repeated = sorted(filenames1 & filenames2)

    # Write results to output file
    with open(output_txt_path, 'w') as out:
        for name in repeated:
            out.write(name + '\n')

    print(f"Found {len(repeated)} repeated filenames. Saved to {output_txt_path}")


In [15]:
find_repeated_filenames(
    json_path_1="ANPR2_renamed_train_coco_annotations.coco.json",
    json_path_2="PPN_renamed_train_coco_annotations.coco.json",
    output_txt_path="coco_repeated_filenames.txt"
)


Found 223 repeated filenames. Saved to coco_repeated_filenames.txt


## 2.6 Unificar COCO JSON

In [12]:
import json

def load_and_offset(json_path, image_id_offset, annotation_id_offset):
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Offset image and annotation IDs
    images = data['images']
    annotations = data['annotations']

    for img in images:
        img['id'] += image_id_offset

    for ann in annotations:
        ann['image_id'] += image_id_offset
        ann['id'] += annotation_id_offset

    return images, annotations

# List of files to merge
files = [
    "PPN_fixed_train_coco_annotations.coco.json",
    "PPN_fixed_test_coco_annotations.coco.json",
    "PPN_fixed_valid_coco_annotations.coco.json",
    "ANPR2_fixed_train_coco_annotations.coco.json",
    "ANPR2_fixed_test_coco_annotations.coco.json",
    "ANPR2_fixed_valid_coco_annotations.coco.json",
]

unified = {
    "info": {"description": "Unified dataset"},
    "licenses": [],
    "categories": [{"id": 0, "name": "plate", "supercategory": "none"}],
    "images": [],
    "annotations": []
}

image_id_offset = 0
annotation_id_offset = 0

for path in files:
    with open(path, 'r') as f:
        data = json.load(f)

    images, annotations = load_and_offset(path, image_id_offset, annotation_id_offset)
    unified["images"].extend(images)
    unified["annotations"].extend(annotations)

    # Update offsets
    image_id_offset += max(img['id'] for img in images) + 1
    annotation_id_offset += max(ann['id'] for ann in annotations) + 1

# Save unified file
with open("unified_annotations.coco.json", "w") as f:
    json.dump(unified, f, indent=2)

print("Saved as unified_annotations.coco.json")


Saved as unified_annotations.coco.json


In [None]:
import json
from collections import defaultdict

def print_bbox_count_per_image(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)

    bbox_counts = defaultdict(int)
    for ann in data['annotations']:
        bbox_counts[ann['image_id']] += 1

    print(f"BBox counts for each image in {json_path}:")
    for image_id, count in bbox_counts.items():
        print(f"  Image ID {image_id}: {count} bbox")

# Example usage
print_bbox_count_per_image("unified_annotations.coco.json")


In [25]:
import os

def count_images_in_folder(folder_path):
    image_extensions = {'.jpg', '.jpeg', '.png'}
    image_files = [
        f for f in os.listdir(folder_path)
        if os.path.splitext(f)[1].lower() in image_extensions
    ]
    print(f"Total images in '{folder_path}': {len(image_files)}")

# Example usage
count_images_in_folder("unified_dataset")


Total images in 'unified_dataset': 1238
