In [None]:
# Visualisation of the dataset with small, medium and large objects
import json
import matplotlib.pyplot as plt
from pycocotools.coco import COCO

def load_annotations(annotation_file):
    """Load a JSON annotation file and return its parsed content.

    Args:
        annotation_file: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    # JSON text is UTF-8; being explicit keeps the result independent of
    # the platform's default locale encoding.
    with open(annotation_file, 'r', encoding='utf-8') as file:
        return json.load(file)

def count_class_instances(coco, category_ids):
    """Count how many annotations belong to each requested category.

    Args:
        coco: Object whose ``anns`` mapping holds annotation dicts, each
            with a ``'category_id'`` entry.
        category_ids: Iterable of category IDs to report on.

    Returns:
        Dict mapping every ID in ``category_ids`` to its annotation count
        (0 when no annotation uses it). Annotations of other categories
        are ignored.
    """
    tally = dict.fromkeys(category_ids, 0)
    for annotation in coco.anns.values():
        label = annotation['category_id']
        if label not in tally:
            # Not one of the requested categories
            continue
        tally[label] = tally[label] + 1
    return tally

def count_images_per_class(coco, category_ids):
    """Count the distinct images in which each requested category appears.

    Args:
        coco: Object whose ``anns`` mapping holds annotation dicts, each
            with ``'category_id'`` and ``'image_id'`` entries.
        category_ids: Iterable of category IDs to report on.

    Returns:
        Dict mapping every ID in ``category_ids`` to the number of unique
        image IDs that carry at least one annotation of that category.
    """
    images_by_class = {}
    for wanted_id in category_ids:
        images_by_class[wanted_id] = set()

    for annotation in coco.anns.values():
        label, picture = annotation['category_id'], annotation['image_id']
        bucket = images_by_class.get(label)
        if bucket is not None:
            # A set keeps each image only once per category
            bucket.add(picture)

    # Collapse each set of image IDs to its size
    return {label: len(bucket) for label, bucket in images_by_class.items()}

def count_object_sizes(coco, small_max_area=32**2, medium_max_area=96**2):
    """Count annotations whose bounding box is small, medium or large.

    The size of an object is the area of its bounding box
    (``width * height`` taken from the last two entries of ``'bbox'``).

    Args:
        coco: Object whose ``anns`` mapping holds annotation dicts, each
            with a four-element ``'bbox'`` entry.
        small_max_area: Areas strictly below this value count as small.
            Defaults to ``32**2``.
        medium_max_area: Areas from ``small_max_area`` up to, but not
            including, this value count as medium; anything else counts
            as large. Defaults to ``96**2``.

    Returns:
        Tuple ``(small, medium, large)`` of annotation counts.
    """
    small, medium, large = 0, 0, 0
    for ann in coco.anns.values():
        _, _, width, height = ann['bbox']
        area = width * height

        if area < small_max_area:
            small += 1
        elif area < medium_max_area:
            # The lower bound is already guaranteed by the branch above
            medium += 1
        else:
            large += 1

    return small, medium, large

def plot_class_distribution(class_counts, coco):
    """Show a horizontal bar chart of the instance count per category.

    Args:
        class_counts: Dict mapping category ID to number of instances.
        coco: Object whose ``cats`` mapping gives, per category ID, a dict
            with a ``'name'`` entry used as the bar label.
    """
    # Convert category IDs to names for plotting. The counts may be keyed
    # by IDs that this dataset does not define (e.g. validation counts
    # keyed by the training category IDs), so fall back to the ID itself
    # instead of failing with a KeyError.
    category_names = [
        coco.cats.get(cat_id, {}).get('name', str(cat_id))
        for cat_id in class_counts
    ]
    counts = list(class_counts.values())

    plt.figure(figsize=(12, 8))
    plt.barh(category_names, counts, color='skyblue')
    plt.xlabel('Number of Instances')
    plt.title('Class Distribution')
    plt.show()

def plot_size_distribution(size_counts, dataset_type):
    """Show a bar chart of the small / medium / large object counts.

    Args:
        size_counts: Three counts, in small, medium, large order.
        dataset_type: Label inserted into the chart title.
    """
    bucket_labels = ['Small', 'Medium', 'Large']
    bucket_colors = ['lightcoral', 'gold', 'lightgreen']

    plt.figure(figsize=(8, 6))
    plt.bar(bucket_labels, size_counts, color=bucket_colors)
    plt.xlabel('Object Size')
    plt.ylabel('Number of Objects')
    plt.title(f'Object Size Distribution ({dataset_type} Dataset)')
    plt.show()

# Annotation files to inspect. The commented-out pairs below are alternative
# datasets; swap which pair is active to analyse a different one.
# train_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_train2017-8k-5c.json'
# val_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_val2017-2k-5c.json'

# train_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/non_weather-mini/annotations/mini_train2017_non_weather-2400k6c.json'
# val_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/non_weather-mini/annotations/mini_val2017_non_weather-500k6c.json'

# train_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/weather-mini/annotations/mini_train2017_weather-2400k6c.json'
# val_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/weather-mini/annotations/mini_val2017_weather-500k6c.json'
train_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/ACDC-1/ACDC-1-NEW/annotations/mini_train.json'
val_annotation_file = '/nas.dbms/asera/PROJECTS/DATASET/ACDC-1/ACDC-1-NEW/annotations/mini_val.json'


# Load both splits through the COCO API
coco_train = COCO(train_annotation_file)
coco_val = COCO(val_annotation_file)

# The training split decides which category IDs are counted in both splits
category_ids = coco_train.getCatIds()

# Statistics of the training split
train_class_counts = count_class_instances(coco_train, category_ids)
train_image_counts = count_images_per_class(coco_train, category_ids)
train_size_counts = count_object_sizes(coco_train)
total_train_images = len(coco_train.imgs)

# Statistics of the validation split
val_class_counts = count_class_instances(coco_val, category_ids)
val_image_counts = count_images_per_class(coco_val, category_ids)
val_size_counts = count_object_sizes(coco_val)
total_val_images = len(coco_val.imgs)

# Class and size charts, training split first, then validation
for class_header, class_counts, coco_split, size_header, size_counts, split_label in (
    (
        "Training Set Class Distribution:", train_class_counts, coco_train,
        "Training Set Object Size Distribution:", train_size_counts, "Training",
    ),
    (
        "Validation Set Class Distribution:", val_class_counts, coco_val,
        "Validation Set Object Size Distribution:", val_size_counts, "Validation",
    ),
):
    print(class_header)
    plot_class_distribution(class_counts, coco_split)
    print(size_header)
    plot_size_distribution(size_counts, split_label)

# Object size counts as text
print("\nTraining Set Object Size Counts:")
print(f"Small: {train_size_counts[0]}, Medium: {train_size_counts[1]}, Large: {train_size_counts[2]}")

print("\nValidation Set Object Size Counts:")
print(f"Small: {val_size_counts[0]}, Medium: {val_size_counts[1]}, Large: {val_size_counts[2]}")

# Number of images containing each class, per split
for images_header, image_counts, coco_split in (
    ("\nTraining Set Images per Class:", train_image_counts, coco_train),
    ("\nValidation Set Images per Class:", val_image_counts, coco_val),
):
    print(images_header)
    for cat_id, count in image_counts.items():
        print(f"{coco_split.cats[cat_id]['name']}: {count}")

# Overall image totals
print(f"\nTotal Training Images: {total_train_images}")
print(f"Total Validation Images: {total_val_images}")


In [None]:
# Training dataset only: build a randomly selected subset of the images
import json
import random

# Path to the original COCO annotations
original_ann_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/annotations/instances_train2017.json'


# # MacBook Air
# original_ann_file = '/mnt/localssd/coco2017/annotations/instances_train2017.json'

# Parameters for customization
num_images = 4000  # Maximum number of images to keep for training (e.g., 100 images)
selected_classes = ['bicycle', 'car', 'motorcycle', 'bus', 'truck']  # Specify category names to include, leave empty for all classes (e.g., ['person', 'car'])


# Load the original annotations (explicit UTF-8 so the locale default is not used)
with open(original_ann_file, 'r', encoding='utf-8') as f:
    coco_data = json.load(f)

# Filter images and annotations by selected classes if specified
if selected_classes:
    # A misspelled class name would otherwise be dropped silently and the
    # subset would be written without that class.
    known_class_names = {cat['name'] for cat in coco_data['categories']}
    unknown_classes = [name for name in selected_classes if name not in known_class_names]
    if unknown_classes:
        raise ValueError(f"Unknown category names in selected_classes: {unknown_classes}")

    # Get category IDs for the selected classes
    category_ids = {cat['id'] for cat in coco_data['categories'] if cat['name'] in selected_classes}

    # Filter annotations by category IDs
    filtered_annotations = [ann for ann in coco_data['annotations'] if ann['category_id'] in category_ids]

    # Get image IDs for the filtered annotations
    image_ids = {ann['image_id'] for ann in filtered_annotations}

    # Filter images based on these image IDs
    filtered_images = [img for img in coco_data['images'] if img['id'] in image_ids]
else:
    # Use all images and annotations if no classes are specified
    filtered_images = coco_data['images']
    filtered_annotations = coco_data['annotations']

# Draw a random subset of at most num_images images. random.sample returns
# a new list, so coco_data['images'] is not reordered in place when no class
# filter is active.
sample_size = min(num_images, len(filtered_images))
if sample_size < num_images:
    print(f"Warning: only {sample_size} images match, fewer than the {num_images} requested.")
train_images = random.sample(filtered_images, sample_size)

# Filter annotations for the selected images
def filter_annotations(images, annotations):
    """Return the annotations whose 'image_id' belongs to one of `images`."""
    image_ids = {img['id'] for img in images}
    return [ann for ann in annotations if ann['image_id'] in image_ids]

train_annotations = filter_annotations(train_images, filtered_annotations)

# Create the new JSON structure for training
# NOTE(review): all original categories are written even when
# selected_classes narrows the annotations — confirm this is intended.
train_data = {
    'images': train_images,
    'annotations': train_annotations,
    'categories': coco_data['categories']
}

# Save the new JSON file
output_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_train2017-4k-5c.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(train_data, f)

print(f"Training images created with {len(train_images)} images.")
if selected_classes:
    print(f"Filtered by classes: {selected_classes}")
else:
    print("All classes included.")

In [None]:
# Validation dataset only: build a randomly selected subset of the images

import json
import random

# Path to the original COCO annotations
original_ann_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/annotations/instances_val2017.json'

# # MacBook Air
# original_ann_file = '/mnt/localssd/coco2017/annotations/instances_val2017.json'


# Parameters for customization
num_images = 2000  # Maximum number of images to keep for validation (e.g., 100 images)
selected_classes = ['bicycle', 'car', 'motorcycle', 'bus', 'truck']  # Specify category names to include, leave empty for all classes (e.g., ['person', 'car'])

# Load the original annotations (explicit UTF-8 so the locale default is not used)
with open(original_ann_file, 'r', encoding='utf-8') as f:
    coco_data = json.load(f)

# Filter images and annotations by selected classes if specified
if selected_classes:
    # A misspelled class name would otherwise be dropped silently and the
    # subset would be written without that class.
    known_class_names = {cat['name'] for cat in coco_data['categories']}
    unknown_classes = [name for name in selected_classes if name not in known_class_names]
    if unknown_classes:
        raise ValueError(f"Unknown category names in selected_classes: {unknown_classes}")

    # Get category IDs for the selected classes
    category_ids = {cat['id'] for cat in coco_data['categories'] if cat['name'] in selected_classes}

    # Filter annotations by category IDs
    filtered_annotations = [ann for ann in coco_data['annotations'] if ann['category_id'] in category_ids]

    # Get image IDs for the filtered annotations
    image_ids = {ann['image_id'] for ann in filtered_annotations}

    # Filter images based on these image IDs
    filtered_images = [img for img in coco_data['images'] if img['id'] in image_ids]
else:
    # Use all images and annotations if no classes are specified
    filtered_images = coco_data['images']
    filtered_annotations = coco_data['annotations']

# Draw a random subset of at most num_images images. random.sample returns
# a new list, so coco_data['images'] is not reordered in place when no class
# filter is active.
sample_size = min(num_images, len(filtered_images))
if sample_size < num_images:
    print(f"Warning: only {sample_size} images match, fewer than the {num_images} requested.")
val_images = random.sample(filtered_images, sample_size)

# Filter annotations for the selected images
def filter_annotations(images, annotations):
    """Return the annotations whose 'image_id' belongs to one of `images`."""
    image_ids = {img['id'] for img in images}
    return [ann for ann in annotations if ann['image_id'] in image_ids]

val_annotations = filter_annotations(val_images, filtered_annotations)

# Create the new JSON structure for validation
# NOTE(review): all original categories are written even when
# selected_classes narrows the annotations — confirm this is intended.
val_data = {
    'images': val_images,
    'annotations': val_annotations,
    'categories': coco_data['categories']
}

# Save the new JSON file
output_file = '/nas.dbms/asera/PROJECTS/DATASET/COCO/mini-annotations/mini_val2017-2k-5c.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(val_data, f)

print(f"Validation dataset created with {len(val_images)} images.")
if selected_classes:
    print(f"Filtered by classes: {selected_classes}")
else:
    print("All classes included.")
