In [None]:
# Visualisation of the dataset with small, medium and large objects
import json
import matplotlib.pyplot as plt
from pycocotools.coco import COCO

def load_annotations(annotation_file):
    """Parse the JSON file at ``annotation_file`` and return its content."""
    with open(annotation_file, 'r') as handle:
        return json.load(handle)

def count_class_instances(coco, category_ids):
    """Count annotations per category.

    Returns a dict keyed by every ID in ``category_ids`` (zero when the
    category has no annotation). Annotations of other categories are ignored.
    """
    tally = dict.fromkeys(category_ids, 0)
    for annotation in coco.anns.values():
        label = annotation['category_id']
        if label not in tally:
            continue
        tally[label] += 1
    return tally

def count_images_per_class(coco, category_ids):
    """Count, for each category ID, the distinct images that contain it.

    Returns a dict keyed by every ID in ``category_ids``; an image with
    several annotations of the same category is counted once.
    """
    images_seen = {}
    for wanted_id in category_ids:
        images_seen[wanted_id] = set()

    for annotation in coco.anns.values():
        label, image = annotation['category_id'], annotation['image_id']
        if label in images_seen:
            images_seen[label].add(image)

    # Collapse each set of image IDs to its size
    totals = {}
    for label, image_set in images_seen.items():
        totals[label] = len(image_set)
    return totals

def count_object_sizes(coco):
    """Bucket every annotation by bounding-box area.

    The area is width * height of ``ann['bbox']``. Boxes below 32**2 are
    small, boxes from 32**2 up to (but excluding) 96**2 are medium, and
    everything else is large.

    Returns:
        A ``(small, medium, large)`` tuple of counts.
    """
    small_limit = 32**2
    medium_limit = 96**2
    tally = [0, 0, 0]  # small, medium, large

    for annotation in coco.anns.values():
        _, _, box_w, box_h = annotation['bbox']
        box_area = box_w * box_h

        if box_area < small_limit:
            tally[0] += 1
        elif box_area < medium_limit:
            tally[1] += 1
        else:
            tally[2] += 1

    return tuple(tally)

def plot_class_distribution(class_counts, coco):
    """Show a horizontal bar chart of instance counts per category.

    ``class_counts`` maps category IDs to counts; each ID is looked up in
    ``coco.cats`` to obtain the name used as the bar label.
    """
    labels, values = [], []
    for cat_id, total in class_counts.items():
        labels.append(coco.cats[cat_id]['name'])
        values.append(total)

    plt.figure(figsize=(12, 8))
    plt.barh(labels, values, color='skyblue')
    plt.xlabel('Number of Instances')
    plt.title('Class Distribution')
    plt.show()

def plot_size_distribution(size_counts, dataset_type):
    """Show a bar chart of the small / medium / large object counts.

    ``size_counts`` holds the three counts in that order; ``dataset_type``
    is only used in the chart title.
    """
    bucket_labels = ['Small', 'Medium', 'Large']
    bucket_colours = ['lightcoral', 'gold', 'lightgreen']

    plt.figure(figsize=(8, 6))
    plt.bar(bucket_labels, size_counts, color=bucket_colours)
    plt.xlabel('Object Size')
    plt.ylabel('Number of Objects')
    plt.title(f'Object Size Distribution ({dataset_type} Dataset)')
    plt.show()

# Annotation files to analyse. Alternative dataset variants are kept below,
# commented out, so they can be switched in by hand.
# train_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_train2017-8k-5c.json'
# val_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_val2017-2k-5c.json'

# train_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/non_weather-mini/annotations/mini_train2017_non_weather-2400k6c.json'
# val_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/non_weather-mini/annotations/mini_val2017_non_weather-500k6c.json'

# train_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/weather-mini/annotations/mini_train2017_weather-2400k6c.json'
# val_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/weather-mini/annotations/mini_val2017_weather-500k6c.json'
train_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/ACDC-1/ACDC-1-NEW/annotations/mini_train.json'
val_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/ACDC-1/ACDC-1-NEW/annotations/mini_val.json'


# Load both splits through the COCO API (training first, then validation)
coco_train = COCO(train_annotation_file)
coco_val = COCO(val_annotation_file)

# The training set decides which category IDs are tallied for BOTH splits.
# NOTE(review): validation names are later looked up in coco_val.cats with
# these IDs, which assumes both files declare the same categories.
category_ids = coco_train.getCatIds()

# Training-set statistics: instances per class, images per class,
# object-size buckets and the total number of images
train_class_counts = count_class_instances(coco_train, category_ids)
train_image_counts = count_images_per_class(coco_train, category_ids)
train_size_counts = count_object_sizes(coco_train)
total_train_images = len(coco_train.imgs)

# The same statistics for the validation set
val_class_counts = count_class_instances(coco_val, category_ids)
val_image_counts = count_images_per_class(coco_val, category_ids)
val_size_counts = count_object_sizes(coco_val)
total_val_images = len(coco_val.imgs)

# Charts for the training set
print("Training Set Class Distribution:")
plot_class_distribution(train_class_counts, coco_train)
print("Training Set Object Size Distribution:")
plot_size_distribution(train_size_counts, "Training")

# Charts for the validation set
print("Validation Set Class Distribution:")
plot_class_distribution(val_class_counts, coco_val)
print("Validation Set Object Size Distribution:")
plot_size_distribution(val_size_counts, "Validation")

# Textual summary of the size buckets (index 0 = small, 1 = medium, 2 = large)
print("\nTraining Set Object Size Counts:")
print(f"Small: {train_size_counts[0]}, Medium: {train_size_counts[1]}, Large: {train_size_counts[2]}")

print("\nValidation Set Object Size Counts:")
print(f"Small: {val_size_counts[0]}, Medium: {val_size_counts[1]}, Large: {val_size_counts[2]}")

# Number of distinct images containing each class, one line per class
print("\nTraining Set Images per Class:")
for cat_id, count in train_image_counts.items():
    print(f"{coco_train.cats[cat_id]['name']}: {count}")

print("\nValidation Set Images per Class:")
for cat_id, count in val_image_counts.items():
    print(f"{coco_val.cats[cat_id]['name']}: {count}")

# Overall image totals
print(f"\nTotal Training Images: {total_train_images}")
print(f"Total Validation Images: {total_val_images}")


In [None]:
# Training dataset only: randomly selected subset
import json
import random

# Source annotation file that images are sampled from
original_ann_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/annotations/instances_train2017.json'


# # MacBook Air
# original_ann_file = '/mnt/localssd/coco2017/annotations/instances_train2017.json'

# Subset settings
num_images = 4000  # Upper bound on the number of training images kept
selected_classes = ['bicycle', 'car', 'motorcycle', 'bus', 'truck']  # Category names to keep; an empty list keeps every class


# Read the source annotations
with open(original_ann_file, 'r') as f:
    coco_data = json.load(f)

if not selected_classes:
    # No class filter: every image and annotation is a candidate
    filtered_images = coco_data['images']
    filtered_annotations = coco_data['annotations']
else:
    # IDs of the categories whose name was requested
    category_ids = {cat['id'] for cat in coco_data['categories'] if cat['name'] in selected_classes}

    # Annotations of those categories
    filtered_annotations = [ann for ann in coco_data['annotations'] if ann['category_id'] in category_ids]

    # Images that carry at least one of those annotations
    image_ids = {ann['image_id'] for ann in filtered_annotations}
    filtered_images = [img for img in coco_data['images'] if img['id'] in image_ids]

# Random sample: shuffle in place, then keep at most num_images entries
random.shuffle(filtered_images)
train_images = filtered_images[:num_images]

# Filter annotations for the selected images
def filter_annotations(images, annotations):
    """Return, in order, the annotations whose ``image_id`` matches one of ``images``."""
    wanted_ids = set()
    for image in images:
        wanted_ids.add(image['id'])

    kept = []
    for annotation in annotations:
        if annotation['image_id'] in wanted_ids:
            kept.append(annotation)
    return kept

# Annotations that belong to the sampled training images
train_annotations = filter_annotations(train_images, filtered_annotations)

# Assemble the subset; the category list is copied from the source unchanged
train_data = {}
train_data['images'] = train_images
train_data['annotations'] = train_annotations
train_data['categories'] = coco_data['categories']

# Write the subset to disk
output_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_train2017-4k-5c.json'
with open(output_file, 'w') as f:
    json.dump(train_data, f)

# Report what was produced
print(f"Training images created with {len(train_images)} images.")
print(f"Filtered by classes: {selected_classes}" if selected_classes else "All classes included.")

In [None]:
# NEW: training subset with category IDs remapped to a contiguous 1..K range
import json, random, os

# Build a training subset of COCO restricted to the categories in keep_names,
# with category IDs renumbered 1..K in the order of keep_names.
src = '/nas.dbms/asera/PROJECTS/DATASET/COCO/annotations/instances_train2017.json'
dst = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_train2017-4k-5c.json'

num_images = 4000  # upper bound on the number of images kept
keep_names = ['bicycle', 'car', 'motorcycle', 'bus', 'truck']

# Read through a context manager so the file handle is closed right away
with open(src) as f:
    data = json.load(f)

# 1) keep only selected categories and build a contiguous id map 1..K
keep_cats = [c for c in data['categories'] if c['name'] in keep_names]
keep_cats = sorted(keep_cats, key=lambda c: keep_names.index(c['name']))
old2new = {c['id']: i+1 for i, c in enumerate(keep_cats)}
new_categories = [
    {'id': i+1, 'name': c['name'], 'supercategory': c.get('supercategory','none')}
    for i, c in enumerate(keep_cats)
]

# 2) keep only annotations in those categories, remap category_id
#    (the annotation dicts loaded into `data` are modified in place)
anns = [a for a in data['annotations'] if a['category_id'] in old2new]
for a in anns:
    a['category_id'] = old2new[a['category_id']]

# 3) keep only images that have at least one kept annotation
keep_img_ids = {a['image_id'] for a in anns}
imgs = [im for im in data['images'] if im['id'] in keep_img_ids]

# 4) randomly sample images, then restrict annotations to them
random.shuffle(imgs)
imgs = imgs[:num_images]
img_set = {im['id'] for im in imgs}
anns = [a for a in anns if a['image_id'] in img_set]

# 5) write out clean file
out = {
    'info': {'description': 'COCO-5 train subset', 'version': '1.0'},
    'images': imgs,
    'annotations': anns,
    'categories': new_categories
}
# The context manager guarantees the output is flushed and closed
with open(dst, 'w') as f:
    json.dump(out, f)
print(f'wrote {dst}: {len(imgs)} images, {len(anns)} anns, {len(new_categories)} cats')


In [None]:
# Validation dataset only: randomly selected subset

import json
import random

# Source annotation file that images are sampled from
original_ann_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/annotations/instances_val2017.json'

# # MacBook Air
# original_ann_file = '/mnt/localssd/coco2017/annotations/instances_val2017.json'


# Subset settings
num_images = 2000  # Upper bound on the number of validation images kept
selected_classes = ['bicycle', 'car', 'motorcycle', 'bus', 'truck']  # Category names to keep; an empty list keeps every class

# Read the source annotations
with open(original_ann_file, 'r') as f:
    coco_data = json.load(f)

if not selected_classes:
    # No class filter: every image and annotation is a candidate
    filtered_images = coco_data['images']
    filtered_annotations = coco_data['annotations']
else:
    # IDs of the categories whose name was requested
    category_ids = {cat['id'] for cat in coco_data['categories'] if cat['name'] in selected_classes}

    # Annotations of those categories
    filtered_annotations = [ann for ann in coco_data['annotations'] if ann['category_id'] in category_ids]

    # Images that carry at least one of those annotations
    image_ids = {ann['image_id'] for ann in filtered_annotations}
    filtered_images = [img for img in coco_data['images'] if img['id'] in image_ids]

# Random sample: shuffle in place, then keep at most num_images entries
random.shuffle(filtered_images)
val_images = filtered_images[:num_images]

# Filter annotations for the selected images
def filter_annotations(images, annotations):
    """Return, in order, the annotations whose ``image_id`` matches one of ``images``."""
    wanted_ids = set()
    for image in images:
        wanted_ids.add(image['id'])

    kept = []
    for annotation in annotations:
        if annotation['image_id'] in wanted_ids:
            kept.append(annotation)
    return kept

# Annotations that belong to the sampled validation images
val_annotations = filter_annotations(val_images, filtered_annotations)

# Assemble the subset; the category list is copied from the source unchanged
val_data = {}
val_data['images'] = val_images
val_data['annotations'] = val_annotations
val_data['categories'] = coco_data['categories']

# Write the subset to disk
output_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_val2017-2k-5c.json'
with open(output_file, 'w') as f:
    json.dump(val_data, f)

# Report what was produced
print(f"Validation dataset created with {len(val_images)} images.")
print(f"Filtered by classes: {selected_classes}" if selected_classes else "All classes included.")


In [None]:
# NEW: validation subset with category IDs remapped to a contiguous 1..K range
import json, random, os

# Build a validation subset of COCO restricted to the categories in
# keep_names, with category IDs renumbered 1..K in the order of keep_names.
src = '/nas.dbms/asera/PROJECTS/DATASET/COCO/annotations/instances_val2017.json'
dst = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_val2017-870-5c.json'

num_images = 2000  # upper bound on the number of images kept
keep_names = ['bicycle', 'car', 'motorcycle', 'bus', 'truck']

# Read through a context manager so the file handle is closed right away
with open(src) as f:
    data = json.load(f)

# Selected categories, ordered like keep_names, with a contiguous id map 1..K
keep_cats = [c for c in data['categories'] if c['name'] in keep_names]
keep_cats = sorted(keep_cats, key=lambda c: keep_names.index(c['name']))
old2new = {c['id']: i+1 for i, c in enumerate(keep_cats)}
new_categories = [
    {'id': i+1, 'name': c['name'], 'supercategory': c.get('supercategory','none')}
    for i, c in enumerate(keep_cats)
]

# Keep only annotations of those categories and remap category_id
# (the annotation dicts loaded into `data` are modified in place)
anns = [a for a in data['annotations'] if a['category_id'] in old2new]
for a in anns:
    a['category_id'] = old2new[a['category_id']]

# Keep only images that have at least one kept annotation
keep_img_ids = {a['image_id'] for a in anns}
imgs = [im for im in data['images'] if im['id'] in keep_img_ids]

# Randomly sample images, then restrict annotations to them
random.shuffle(imgs)
imgs = imgs[:num_images]
img_set = {im['id'] for im in imgs}
anns = [a for a in anns if a['image_id'] in img_set]

out = {
    'info': {'description': 'COCO-5 val subset', 'version': '1.0'},
    'images': imgs,
    'annotations': anns,
    'categories': new_categories
}
# The context manager guarantees the output is flushed and closed
with open(dst, 'w') as f:
    json.dump(out, f)
print(f'wrote {dst}: {len(imgs)} images, {len(anns)} anns, {len(new_categories)} cats')


In [None]:
# VERSION 1: script for splitting a single source dataset into training and validation sets

import json
import random

# Annotation file that is split into a training and a validation part
original_ann_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/instances_val2017.json'

# Read the source annotations
with open(original_ann_file, 'r') as f:
    coco_data = json.load(f)

# Randomise the image order in place
all_images = coco_data['images']
random.shuffle(all_images)

# The first 80% of the shuffled images go to training, the rest to validation
split_idx = int(0.8 * len(all_images))
train_images, val_images = all_images[:split_idx], all_images[split_idx:]

# Create a helper function to filter annotations by image IDs
def filter_annotations(images, annotations):
    """Return, in order, the annotations whose ``image_id`` matches one of ``images``."""
    wanted_ids = set()
    for image in images:
        wanted_ids.add(image['id'])

    kept = []
    for annotation in annotations:
        if annotation['image_id'] in wanted_ids:
            kept.append(annotation)
    return kept

# Annotations of each split, selected by image membership
source_annotations = coco_data['annotations']
train_annotations = filter_annotations(train_images, source_annotations)
val_annotations = filter_annotations(val_images, source_annotations)

# Both output files reuse the complete category list of the source
train_data = {}
train_data['images'] = train_images
train_data['annotations'] = train_annotations
train_data['categories'] = coco_data['categories']

val_data = {}
val_data['images'] = val_images
val_data['annotations'] = val_annotations
val_data['categories'] = coco_data['categories']

# Write the training file, then the validation file
with open('/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_train2017.json', 'w') as f:
    json.dump(train_data, f)

with open('/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_val2017.json', 'w') as f:
    json.dump(val_data, f)

# Summary of the split sizes
print(f"Training images: {len(train_images)}, Validation images: {len(val_images)}")


In [None]:
# VERSION 2: script for splitting a single source dataset into training and validation sets

import json
import random
import shutil
import os
from collections import defaultdict

# Build a class-balanced mini dataset: for every selected class the same
# number of annotations is drawn for the training and the validation split,
# the two annotation files are written, and the referenced images are copied.

# Paths for the original COCO annotations and images
original_ann_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/annotations/instances_train2017.json'
original_image_folder = '/nas.dbms/asera/PROJECTS/DATASET/COCO/images/train2017'
mini_image_folder = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-images/val2017-2'

# Ensure the mini dataset image folder exists
os.makedirs(mini_image_folder, exist_ok=True)

# Load the original annotations
with open(original_ann_file, 'r') as f:
    coco_data = json.load(f)

# Parameters for the mini dataset
images_total = 5000  # Total images in mini dataset (optional)
images_per_class = None  # Number of images per class (optional)
selected_classes = None  # Set to None for all classes or specify class IDs, e.g., [1, 3, 6]
# NOTE: train_split is not used in this version; both splits always receive
# the same number of annotations per class.
train_split = 0.8  # Percentage of images for training (e.g., 0.8 for 80% train, 20% val)

# Determine which classes to include
if selected_classes is None:
    selected_classes = [cat['id'] for cat in coco_data['categories']]
else:
    # Validate that selected_classes only includes valid COCO category IDs
    all_class_ids = {cat['id'] for cat in coco_data['categories']}
    selected_classes = [cls for cls in selected_classes if cls in all_class_ids]

# Set copy of the class list: membership is tested once per annotation, so a
# set lookup avoids scanning the list every time.
selected_class_ids = set(selected_classes)

# Organize annotations by class
annotations_by_class = defaultdict(list)
for ann in coco_data['annotations']:
    if ann['category_id'] in selected_class_ids:
        annotations_by_class[ann['category_id']].append(ann)

# Determine images_per_class if not specified
if images_per_class is None:
    # Calculate images_per_class based on images_total and number of classes
    total_classes = len(selected_classes)
    images_per_class = (images_total // (2 * total_classes))  # Divided by 2 for equal train/val split

    if images_per_class == 0:
        images_per_class = 1  # Ensure at least one image per class per split

# Collect images and annotations, ensuring an even split per class.
# NOTE(review): the quota counts annotations, not images, and only the drawn
# annotations are written out, so an image may lose its other objects.
selected_image_ids = set()
train_annotations = []
val_annotations = []

for class_id in selected_classes:
    class_annotations = annotations_by_class[class_id]

    # Shuffle and select annotations for this class
    random.shuffle(class_annotations)
    required_annotations = 2 * images_per_class  # Total needed for both splits

    # Ensure we don't select more annotations than available
    class_annotations = class_annotations[:required_annotations]

    # Split into training and validation
    train_class_annotations = class_annotations[:images_per_class]
    val_class_annotations = class_annotations[images_per_class:2 * images_per_class]

    # Collect image IDs and annotations for this class
    for ann in train_class_annotations:
        selected_image_ids.add(ann['image_id'])
        train_annotations.append(ann)
    for ann in val_class_annotations:
        selected_image_ids.add(ann['image_id'])
        val_annotations.append(ann)

# Because sampling is per annotation, the same image can be drawn for both
# splits (e.g. through two different objects). Keep such images in training
# only so the validation split never contains a training image.
train_image_ids = {ann['image_id'] for ann in train_annotations}
val_annotations = [ann for ann in val_annotations if ann['image_id'] not in train_image_ids]
val_image_ids = {ann['image_id'] for ann in val_annotations}

# Filter images based on selected IDs
selected_images = [img for img in coco_data['images'] if img['id'] in selected_image_ids]

# Separate images into train and validation sets
train_images = [img for img in selected_images if img['id'] in train_image_ids]
val_images = [img for img in selected_images if img['id'] in val_image_ids]

# Create JSON structures for train and validation datasets
filtered_categories = [cat for cat in coco_data['categories'] if cat['id'] in selected_class_ids]

train_data = {
    'images': train_images,
    'annotations': train_annotations,
    'categories': filtered_categories
}

val_data = {
    'images': val_images,
    'annotations': val_annotations,
    'categories': filtered_categories
}

# Save the JSON files for the balanced mini dataset
with open('/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_train2017-2.json', 'w') as f:
    json.dump(train_data, f)

with open('/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_val2017-2.json', 'w') as f:
    json.dump(val_data, f)

# Copy selected images (both splits) to the mini dataset folder
for img_info in selected_images:
    img_filename = img_info['file_name']
    src_path = os.path.join(original_image_folder, img_filename)
    dest_path = os.path.join(mini_image_folder, img_filename)
    shutil.copyfile(src_path, dest_path)

print(f"Balanced mini dataset created with:")
print(f" - Training images: {len(train_images)}")
print(f" - Validation images: {len(val_images)}")
print(f" - Total images: {len(selected_images)}")
print(f" - Classes included: {[cat['name'] for cat in filtered_categories]}")
print(f"Each split has {images_per_class} images per class.")


In [None]:
# VERSION 3: script for splitting a single source dataset into training and validation sets

import json
import random
import shutil
import os
from collections import defaultdict

# Select a class-balanced set of annotations from COCO train2017, then split
# the images they reference into a training and a validation part.

# Paths for the original COCO annotations and images
original_ann_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/annotations/instances_train2017.json'
original_image_folder = '/nas.dbms/asera/PROJECTS/DATASET/COCO/images/train2017'
mini_image_folder = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-images/val2017-25'

# Ensure the mini dataset image folder exists
os.makedirs(mini_image_folder, exist_ok=True)

# Load the original annotations
with open(original_ann_file, 'r') as f:
    coco_data = json.load(f)

# Parameters for the mini dataset
images_total = 25000  # Total images in mini dataset
images_per_class = None  # Specify exact number of images per class, e.g., 100, or leave as None to balance based on images_total
selected_classes = None  # Set to None for all classes or specify class IDs, e.g., [1, 3, 6]
train_split = 0.8  # Percentage of images for training (80%) and validation (20%)

# Determine which classes to include
if selected_classes is None:
    selected_classes = [cat['id'] for cat in coco_data['categories']]
else:
    # Validate that selected_classes only includes valid COCO category IDs
    all_class_ids = {cat['id'] for cat in coco_data['categories']}
    selected_classes = [cls for cls in selected_classes if cls in all_class_ids]

# Calculate how many images per class based on `images_total` if `images_per_class` is not set
if images_per_class is None:
    images_per_class = images_total // len(selected_classes)

# Set copy of the class list: membership is tested once per annotation, so a
# set lookup avoids scanning the list every time.
selected_class_ids = set(selected_classes)

# Organize annotations by class
annotations_by_class = defaultdict(list)
for ann in coco_data['annotations']:
    if ann['category_id'] in selected_class_ids:
        annotations_by_class[ann['category_id']].append(ann)

# Collect images and annotations in a balanced way across classes.
# NOTE(review): images_per_class caps the annotations drawn per class, so the
# number of distinct images can be lower, and only the drawn annotations of
# an image are kept.
selected_image_ids = set()
selected_annotations = []
for class_id in selected_classes:
    class_annotations = annotations_by_class[class_id]
    
    # Shuffle and select the specified number of annotations for this class
    random.shuffle(class_annotations)
    class_annotations = class_annotations[:images_per_class]
    
    # Collect image IDs and annotations for this class
    for ann in class_annotations:
        selected_image_ids.add(ann['image_id'])
        selected_annotations.append(ann)

# Filter images based on selected IDs
selected_images = [img for img in coco_data['images'] if img['id'] in selected_image_ids]

# Shuffle selected images and split into train and validation sets
random.shuffle(selected_images)
split_idx = int(train_split * len(selected_images))
train_images = selected_images[:split_idx]
val_images = selected_images[split_idx:]

# Filter annotations by train and validation image IDs
def filter_annotations(images, annotations):
    """Return, in order, the annotations whose ``image_id`` matches one of ``images``."""
    wanted_ids = set()
    for image in images:
        wanted_ids.add(image['id'])

    kept = []
    for annotation in annotations:
        if annotation['image_id'] in wanted_ids:
            kept.append(annotation)
    return kept

# Annotations of each split, taken from the balanced selection
train_annotations = filter_annotations(train_images, selected_annotations)
val_annotations = filter_annotations(val_images, selected_annotations)

# Both output files reuse the complete category list of the source
train_data = {}
train_data['images'] = train_images
train_data['annotations'] = train_annotations
train_data['categories'] = coco_data['categories']

val_data = {}
val_data['images'] = val_images
val_data['annotations'] = val_annotations
val_data['categories'] = coco_data['categories']

# Write the training file, then the validation file
with open('/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_train2017-25.json', 'w') as f:
    json.dump(train_data, f)

with open('/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_val2017-25.json', 'w') as f:
    json.dump(val_data, f)

# Copy every selected image (both splits) into the mini dataset folder,
# keeping the original file name
for img_info in selected_images:
    img_filename = img_info['file_name']
    src_path, dest_path = (
        os.path.join(original_image_folder, img_filename),
        os.path.join(mini_image_folder, img_filename),
    )
    shutil.copyfile(src_path, dest_path)

print(f"Balanced mini dataset created with {len(train_images)} training images and {len(val_images)} validation images.")


In [None]:
# VERSION 4: script for splitting a single source dataset into training and validation sets

import json
import random
import shutil
import os
from collections import defaultdict

# Build a class-balanced mini dataset: for every selected class a fixed number
# of annotations is drawn for training and for validation (ratio given by
# train_split), the two annotation files are written, and the referenced
# images are copied.

# Paths for the original COCO annotations and images
original_ann_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/annotations/instances_train2017.json'
original_image_folder = '/nas.dbms/asera/PROJECTS/DATASET/COCO/images/train2017'
mini_image_folder = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-images/val2017-2'

# Ensure the mini dataset image folder exists
os.makedirs(mini_image_folder, exist_ok=True)

# Load the original annotations
with open(original_ann_file, 'r') as f:
    coco_data = json.load(f)

# Parameters for the mini dataset
images_total = 5000           # Total images in mini dataset
train_split = 0.8             # Percentage of images for training
selected_classes = None #[1, 2, 3, 4, 6, 8]      # None for all classes, or specify class IDs, e.g., [1, 3, 6]

# Determine which classes to include
if selected_classes is None:
    selected_classes = [cat['id'] for cat in coco_data['categories']]
else:
    # Validate that selected_classes only includes valid COCO category IDs
    all_class_ids = {cat['id'] for cat in coco_data['categories']}
    selected_classes = [cls for cls in selected_classes if cls in all_class_ids]

# Set copy of the class list: membership is tested once per annotation, so a
# set lookup avoids scanning the list every time.
selected_class_ids = set(selected_classes)

# Organize annotations by class
annotations_by_class = defaultdict(list)
for ann in coco_data['annotations']:
    if ann['category_id'] in selected_class_ids:
        annotations_by_class[ann['category_id']].append(ann)

# Calculate the per-class quota for each split
num_classes = len(selected_classes)
total_images_per_class = images_total // num_classes
images_per_class_train = int(total_images_per_class * train_split)
images_per_class_val = total_images_per_class - images_per_class_train

# Collect images and annotations, ensuring equal distribution for each class in both splits.
# NOTE(review): the quotas count annotations, not images, and only the drawn
# annotations are written out, so an image may lose its other objects.
selected_image_ids = set()
train_annotations = []
val_annotations = []

for class_id in selected_classes:
    class_annotations = annotations_by_class[class_id]

    # Shuffle and select the required number of annotations for this class
    random.shuffle(class_annotations)
    required_annotations = images_per_class_train + images_per_class_val

    # Ensure we don't select more annotations than available
    class_annotations = class_annotations[:required_annotations]

    # Split into training and validation
    train_class_annotations = class_annotations[:images_per_class_train]
    val_class_annotations = class_annotations[images_per_class_train:required_annotations]

    # Collect image IDs and annotations for this class
    for ann in train_class_annotations:
        selected_image_ids.add(ann['image_id'])
        train_annotations.append(ann)
    for ann in val_class_annotations:
        selected_image_ids.add(ann['image_id'])
        val_annotations.append(ann)

# Because sampling is per annotation, the same image can be drawn for both
# splits (e.g. through two different objects). Keep such images in training
# only so the validation split never contains a training image.
train_image_ids = {ann['image_id'] for ann in train_annotations}
val_annotations = [ann for ann in val_annotations if ann['image_id'] not in train_image_ids]
val_image_ids = {ann['image_id'] for ann in val_annotations}

# Filter images based on selected IDs
selected_images = [img for img in coco_data['images'] if img['id'] in selected_image_ids]

# Separate images into train and validation sets
train_images = [img for img in selected_images if img['id'] in train_image_ids]
val_images = [img for img in selected_images if img['id'] in val_image_ids]

# Create JSON structures for train and validation datasets
filtered_categories = [cat for cat in coco_data['categories'] if cat['id'] in selected_class_ids]

train_data = {
    'images': train_images,
    'annotations': train_annotations,
    'categories': filtered_categories
}

val_data = {
    'images': val_images,
    'annotations': val_annotations,
    'categories': filtered_categories
}

# Save the JSON files for the balanced mini dataset
with open('/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_train2017-2.json', 'w') as f:
    json.dump(train_data, f)

with open('/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_val2017-2.json', 'w') as f:
    json.dump(val_data, f)

# Copy selected images (both splits) to the mini dataset folder
for img_info in selected_images:
    img_filename = img_info['file_name']
    src_path = os.path.join(original_image_folder, img_filename)
    dest_path = os.path.join(mini_image_folder, img_filename)
    shutil.copyfile(src_path, dest_path)

print(f"Balanced mini dataset created with:")
print(f" - Training images: {len(train_images)}")
print(f" - Validation images: {len(val_images)}")
print(f" - Total images: {len(selected_images)}")
print(f" - Classes included: {[cat['name'] for cat in filtered_categories]}")
print(f"Each class has {images_per_class_train} images in training and {images_per_class_val} images in validation.")
