# 1. Verificar que no se repiten los nombres de los archivos de imágenes

In [1]:
import os
import json
from collections import defaultdict

In [2]:


def encontrar_imagenes_duplicadas_por_nombre(working_directory):
    """Report image file names that are listed more than once in the dataset.

    The directory is expected to be laid out as
    ``<working_directory>/<brand>/<model>/instances_default.json``. Every
    ``file_name`` found under the ``images`` key of those JSON files is
    collected together with the ``<brand>/<model>`` folder that listed it.
    Names that were collected more than once are printed with all of their
    locations; otherwise a single "nothing found" message is printed.

    Parameters:
        working_directory (str): Root folder containing one sub-folder per brand.

    Returns:
        None. The result is only printed.
    """
    ubicaciones = defaultdict(list)

    for marca in os.listdir(working_directory):
        carpeta_marca = os.path.join(working_directory, marca)
        if os.path.isdir(carpeta_marca):
            for modelo in os.listdir(carpeta_marca):
                ruta_json = os.path.join(carpeta_marca, modelo, 'instances_default.json')
                # Model folders without an annotation file are ignored.
                if not os.path.isfile(ruta_json):
                    continue

                with open(ruta_json, 'r', encoding='utf-8') as archivo:
                    contenido = json.load(archivo)
                    for entrada in contenido.get('images', []):
                        nombre = entrada.get('file_name')
                        if not nombre:
                            continue
                        ubicaciones[nombre].append(os.path.join(marca, modelo))

    # Keep only the names that were seen in more than one place.
    repetidas = {
        nombre: rutas
        for nombre, rutas in ubicaciones.items()
        if len(rutas) > 1
    }

    if not repetidas:
        print("No se encontraron imágenes duplicadas por nombre.")
        return

    print("Imágenes duplicadas encontradas en múltiples subcarpetas:\n")
    for nombre, rutas in repetidas.items():
        print(f"{nombre}:")
        for ruta in rutas:
            print(f"  - {ruta}")
        print()




In [4]:
# Check the raw dataset folder for image file names listed by more than one model folder.
encontrar_imagenes_duplicadas_por_nombre("bing-model-images")

No se encontraron imágenes duplicadas por nombre.


# 2. Juntar los COCO JSON de los modelos en uno solo

In [5]:
import os
import json
import pandas as pd

In [11]:
def merge_annotations_with_hierarchy(working_dir, xlsx_path, output_json_path):
    """Merge every per-model COCO file under ``working_dir`` into one COCO JSON.

    The directory is expected to be laid out as
    ``<working_dir>/<brand>/<model>/instances_default.json``. Each model folder
    becomes one category; every annotation found in that folder is re-labelled
    with that category, whatever ``category_id`` it carried originally. Image,
    annotation and category ids are renumbered from 0 so they are unique in
    the merged file.

    Brand and model folders are visited in sorted order so that the ids
    assigned are the same on every run and on every file system.

    Parameters:
        working_dir (str): Root folder containing one sub-folder per brand.
        xlsx_path (str): Excel sheet with ``make``, ``model`` and ``type``
            columns. Models that are not listed get the type ``"Unknown"``.
        output_json_path (str): Destination of the merged COCO file. Its
            parent directory is created when it does not exist.

    Raises:
        KeyError: If an annotation references an ``image_id`` that is not
            declared in the ``images`` list of the same file.
    """
    # Map (make, model) -> vehicle type from the Excel sheet.
    df = pd.read_excel(xlsx_path)
    model_info = {
        (row['make'], row['model']): row['type']
        for _, row in df.iterrows()
    }

    image_id_counter = 0
    annotation_id_counter = 0
    category_id_counter = 0

    all_images = []
    all_annotations = []
    all_categories = []
    category_lookup = {}

    # sorted(): os.listdir returns entries in arbitrary order, which would make
    # the generated ids differ between runs and machines.
    for brand in sorted(os.listdir(working_dir)):
        brand_path = os.path.join(working_dir, brand)
        if not os.path.isdir(brand_path):
            continue

        for model in sorted(os.listdir(brand_path)):
            model_path = os.path.join(brand_path, model)
            json_path = os.path.join(model_path, 'instances_default.json')

            # Model folders without an annotation file contribute nothing.
            if not os.path.isfile(json_path):
                continue

            vehicle_type = model_info.get((brand, model), "Unknown")

            category_key = (model, brand, vehicle_type)
            if category_key not in category_lookup:
                category_lookup[category_key] = category_id_counter
                all_categories.append({
                    "id": category_id_counter,
                    "name": model,
                    "supercategory": brand,
                    "make": brand,
                    "type": vehicle_type
                })
                category_id_counter += 1

            current_category_id = category_lookup[category_key]

            with open(json_path, 'r', encoding='utf-8') as f:
                coco_data = json.load(f)

            # Ids are only unique inside one source file: remember old -> new
            # so this file's annotations can be re-pointed at the new ids.
            image_id_map = {}
            for image in coco_data.get("images", []):
                new_image = image.copy()
                new_image["id"] = image_id_counter
                image_id_map[image["id"]] = image_id_counter
                all_images.append(new_image)
                image_id_counter += 1

            for ann in coco_data.get("annotations", []):
                new_ann = ann.copy()
                new_ann["id"] = annotation_id_counter
                new_ann["image_id"] = image_id_map[ann["image_id"]]
                new_ann["category_id"] = current_category_id
                all_annotations.append(new_ann)
                annotation_id_counter += 1

    merged_coco = {
        "images": all_images,
        "annotations": all_annotations,
        "categories": all_categories
    }

    # Create the output directory if needed. A bare file name has an empty
    # dirname, and os.makedirs('') raises FileNotFoundError, so skip it then.
    output_dir = os.path.dirname(output_json_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_json_path, "w", encoding="utf-8") as f_out:
        json.dump(merged_coco, f_out, ensure_ascii=False, indent=2)



In [12]:
# Merge all per-model annotation files into one COCO file; vehicle types come from the Excel sheet.
merge_annotations_with_hierarchy("bing-model-images", "top-50-type.xlsx", "merged-dataset/merged_annotations.json")

# 3. Copiar imágenes a un nuevo directorio

In [13]:
import os
import json
import shutil

In [14]:
def copy_images_from_subfolders(working_dir, output_images_path):
    """Gather every annotated image of the dataset into one flat folder.

    Walks ``<working_dir>/<brand>/<model>/instances_default.json`` and, for
    each ``file_name`` listed under ``images``, copies the file of that name
    from the model folder into ``output_images_path``. Entries whose file is
    not present in the model folder are reported with a warning and skipped.

    Parameters:
        working_dir (str): Root folder containing one sub-folder per brand.
        output_images_path (str): Destination folder; created if missing.
    """
    os.makedirs(output_images_path, exist_ok=True)

    for brand_name in os.listdir(working_dir):
        brand_dir = os.path.join(working_dir, brand_name)
        if os.path.isdir(brand_dir):
            for model_name in os.listdir(brand_dir):
                model_dir = os.path.join(brand_dir, model_name)
                annotations_file = os.path.join(model_dir, 'instances_default.json')
                if os.path.isfile(annotations_file):
                    with open(annotations_file, 'r', encoding='utf-8') as handle:
                        payload = json.load(handle)

                    listed_names = (
                        entry.get("file_name")
                        for entry in payload.get("images", [])
                    )
                    # filter(None, ...) drops entries with a missing or empty name.
                    for file_name in filter(None, listed_names):
                        source = os.path.join(model_dir, file_name)
                        if not os.path.exists(source):
                            print(f"[WARNING] Image not found: {source}")
                            continue
                        target = os.path.join(output_images_path, file_name)
                        shutil.copy2(source, target)


In [15]:
# Flatten all annotated images of the raw dataset into merged-dataset/images.
copy_images_from_subfolders("bing-model-images", "merged-dataset/images")


# 4. Verificar que los BBOX sean correctos

In [19]:
import os
from PIL import Image, ImageDraw
from pycocotools.coco import COCO
import random

In [20]:
def save_coco_bboxes_to_images(image_dir, annotation_path, output_dir='output', max_images=None):
    """
    Save copies of the images with their COCO bounding boxes drawn in red.

    Each ``bbox`` is read as ``[x, y, width, height]`` and drawn as the
    rectangle from ``(x, y)`` to ``(x + width, y + height)``. Images listed in
    the annotation file but missing from ``image_dir`` are reported with a
    warning and skipped.

    Parameters:
        image_dir (str): Directory containing the images.
        annotation_path (str): Full path to the COCO annotation file.
        output_dir (str): Directory to save output images with drawn bounding boxes.
        max_images (int, optional): If provided and smaller than the number of
            images, a random sample of that many images (without repetition)
            is processed instead of all of them.
    """
    os.makedirs(output_dir, exist_ok=True)
    coco = COCO(annotation_path)
    image_ids = coco.getImgIds()

    if max_images is not None and max_images < len(image_ids):
        image_ids = random.sample(image_ids, max_images)

    for img_id in image_ids:
        img_data = coco.loadImgs(img_id)[0]
        img_path = os.path.join(image_dir, img_data['file_name'])
        output_path = os.path.join(output_dir, img_data['file_name'])

        if not os.path.exists(img_path):
            print(f"[WARNING] Image file not found: {img_path}")
            continue

        # Open inside a context manager so the file handle is released right
        # away; convert() returns an independent in-memory copy to draw on.
        with Image.open(img_path) as source:
            image = source.convert("RGB")
        draw = ImageDraw.Draw(image)

        ann_ids = coco.getAnnIds(imgIds=img_id)
        anns = coco.loadAnns(ann_ids)

        for ann in anns:
            x, y, w, h = ann['bbox']
            draw.rectangle([x, y, x + w, y + h], outline='red', width=3)

        # file_name may contain sub-directories; make sure the target folder exists.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        image.save(output_path)


In [21]:
# Visual spot-check: draw the boxes on up to 10 randomly chosen images of the merged dataset.
save_coco_bboxes_to_images(
    image_dir='merged-dataset/images',
    annotation_path='merged-dataset/merged_annotations.json',
    output_dir='merged-dataset/images_bbox_test',
    max_images=10
)


loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


# 5. Separación del Dataset

In [22]:
import json
import os
import random
from collections import defaultdict
from typing import Optional, Tuple

In [23]:
def split_dataset(
    images_path: str,
    coco_json_path: str,
    output_path: str,
    stratified: bool = True,
    split_ratio: Tuple[float, float, float] = (0.8, 0.1, 0.1),
    seed: Optional[int] = None
):
    """Split a COCO dataset into ``train.json``, ``valid.json`` and ``test.json``.

    Only the annotation file is split; no image files are read or copied.

    With ``stratified=True`` the images are split category by category, so
    every category keeps roughly the requested proportions in each subset.
    Each image is assigned to exactly one subset: an image annotated with
    several categories is split together with the first of its categories
    that gets processed, so it can never appear in two subsets. Images without
    any annotation belong to no category and are left out of all subsets in
    this mode. With ``stratified=False`` all images are shuffled and split as
    one group.

    Parameters:
        images_path: Not used by this function; kept for interface compatibility.
        coco_json_path: Path of the COCO JSON file to split.
        output_path: Directory that receives the three JSON files (created if
            missing).
        stratified: Split per category (True) or over all images at once (False).
        split_ratio: Fractions for train and valid. The sizes are truncated
            with ``int()`` and the test subset receives whatever remains, so
            the third value is not read.
        seed: When given, the shuffle uses its own ``random.Random(seed)`` and
            the split is reproducible. When ``None`` the module-level
            ``random`` state is used.

    Raises:
        KeyError: In stratified mode, if an annotation references an image id
            that is missing from the ``images`` list.
    """
    os.makedirs(output_path, exist_ok=True)

    with open(coco_json_path, 'r', encoding='utf-8') as f:
        coco = json.load(f)

    images = coco['images']
    annotations = coco['annotations']
    categories = coco['categories']

    image_id_to_image = {img['id']: img for img in images}

    # A private generator keeps a seeded run independent of global random state.
    rng = random.Random(seed) if seed is not None else random

    def partition(ids):
        """Shuffle ``ids`` and cut them into (train, valid, test) lists."""
        # Sort first so the shuffle starts from the same order on every run.
        shuffled = sorted(ids)
        rng.shuffle(shuffled)
        n = len(shuffled)
        n_train = int(n * split_ratio[0])
        n_valid = int(n * split_ratio[1])
        return (
            shuffled[:n_train],
            shuffled[n_train:n_train + n_valid],
            shuffled[n_train + n_valid:],
        )

    if stratified:
        category_to_image_ids = defaultdict(set)
        for ann in annotations:
            category_to_image_ids[ann['category_id']].add(ann['image_id'])

        train_ids, valid_ids, test_ids = set(), set(), set()
        assigned = set()
        for img_ids in category_to_image_ids.values():
            # Skip images already placed through another category; otherwise
            # a multi-category image could land in two different subsets.
            pending = img_ids - assigned
            train_part, valid_part, test_part = partition(pending)
            train_ids.update(train_part)
            valid_ids.update(valid_part)
            test_ids.update(test_part)
            assigned.update(pending)
    else:
        train_part, valid_part, test_part = partition(image_id_to_image.keys())
        train_ids = set(train_part)
        valid_ids = set(valid_part)
        test_ids = set(test_part)

    def save_subset(image_ids_set, filename):
        """Write the images and annotations of one subset to ``filename``."""
        # Sorted so the written file does not depend on set iteration order.
        subset_images = [image_id_to_image[i] for i in sorted(image_ids_set)]
        subset_annotations = [ann for ann in annotations if ann['image_id'] in image_ids_set]
        output_json = {
            'images': subset_images,
            'annotations': subset_annotations,
            'categories': categories
        }
        with open(os.path.join(output_path, filename), 'w', encoding='utf-8') as f:
            json.dump(output_json, f, indent=2)

    save_subset(train_ids, 'train.json')
    save_subset(valid_ids, 'valid.json')
    save_subset(test_ids, 'test.json')

    print(f"COCO JSONs saved to: {output_path} | Stratified: {stratified}")


In [24]:
# Split the merged annotations per category into train/valid/test JSON files (default 0.8/0.1/0.1 ratio).
split_dataset(
    images_path='merged-dataset/images',
    coco_json_path='merged-dataset/merged_annotations.json',
    output_path='merged-dataset/subsets',
    stratified=True
)

COCO JSONs saved to: merged-dataset/subsets | Stratified: True


In [25]:
import os
import json
from collections import defaultdict

# 6. Verificación de la separación

In [26]:

def count_images_per_category_in_jsons(input_path):
    """
    Print, for every COCO JSON file in a directory, the number of distinct images per category.

    An image counts towards a category when at least one of its annotations
    carries that category id. Category ids that are not declared in the
    file's ``categories`` list are shown under the name ``Unknown``.

    Parameters:
        input_path (str): Path to the directory containing COCO JSON files.
            Entries whose name does not end in ``.json`` are ignored.
    """
    for entry in os.listdir(input_path):
        if not entry.endswith('.json'):
            continue

        with open(os.path.join(input_path, entry), 'r', encoding='utf-8') as handle:
            dataset = json.load(handle)

        names_by_id = {}
        for category in dataset['categories']:
            names_by_id[category['id']] = category['name']

        # A set per category, so an image with several boxes is counted once.
        images_by_category = defaultdict(set)
        for ann in dataset['annotations']:
            images_by_category[ann['category_id']].add(ann['image_id'])

        print(f"\nFile: {entry}")
        for cat_id, members in images_by_category.items():
            label = names_by_id.get(cat_id, 'Unknown')
            print(f"- {label}: {len(members)} images")


In [27]:
# Print the per-category image counts of every subset file written by the split.
count_images_per_category_in_jsons('merged-dataset/subsets')


File: test.json
- Changan CS35: 4 images
- Changan CS55: 3 images
- Changan New Van: 4 images
- Chevrolet Camaro: 3 images
- Chevrolet Cruze: 4 images
- Chevrolet Onix: 4 images
- Chevrolet Spark: 3 images
- Chevrolet Tracker: 4 images
- Glory 330: 3 images
- Glory 500: 3 images
- Glory 580: 4 images
- Hyundai Accent: 4 images
- Hyundai Creta: 3 images
- Hyundai Elantra: 4 images
- Hyundai i20: 4 images
- Hyundai Santa Fe: 4 images
- Hyundai Sonata: 4 images
- Hyundai Tucson: 4 images
- Hyundai Veloster: 4 images
- JAC JS4: 4 images
- JAC T8: 3 images
- Kia Niro: 4 images
- Kia Rio: 3 images
- Kia Seltos: 4 images
- Kia Sorento: 4 images
- Kia Soul: 3 images
- Kia Sportage: 4 images
- Nissan Kicks: 4 images
- Nissan Sentra: 4 images
- Nissan Versa: 3 images
- Suzuki Jimny: 4 images
- Suzuki Swift: 4 images
- Suzuki Vitara: 4 images
- Toyota 4Runner: 3 images
- Toyota C-HR: 4 images
- Toyota Camry: 4 images
- Toyota Celica: 4 images
- Toyota Corolla: 4 images
- Toyota Corolla Cross: 3 

# 7. Creación de carpetas de los subsets

In [28]:
import os
import json
import shutil

In [29]:

def copy_coco_images_to_output(coco_json_path, input_images_path, output_images_path):
    """
    Copy the images listed in a COCO JSON file from one folder into another.

    Each distinct ``file_name`` in the file's ``images`` list is copied once.
    Names that do not exist in the input folder are reported with a warning
    and skipped.

    Parameters:
        coco_json_path (str): Path to the COCO JSON file.
        input_images_path (str): Path to the folder containing the source images.
        output_images_path (str): Path where the selected images will be copied
            (created if missing).
    """
    os.makedirs(output_images_path, exist_ok=True)

    with open(coco_json_path, 'r', encoding='utf-8') as handle:
        dataset = json.load(handle)

    # Collect the names in a set so repeated entries are copied only once.
    wanted = set()
    for entry in dataset['images']:
        wanted.add(entry['file_name'])

    for name in wanted:
        origin = os.path.join(input_images_path, name)
        if not os.path.exists(origin):
            print(f"Warning: Image not found -> {origin}")
            continue
        shutil.copy2(origin, os.path.join(output_images_path, name))


In [30]:
# Copy the images referenced by the test subset into their own folder.
copy_coco_images_to_output(
    coco_json_path='merged-dataset/subsets/test.json',
    input_images_path='merged-dataset/images',
    output_images_path='merged-dataset/test-images'
)
