In [2]:
import os
import json
import random
import shutil
from pathlib import Path
import argparse

# ---------------------------------------------------------------------------
# Split a COCO dataset into training and validation image lists.
#
# Settings:
#   annotations_file  path to the COCO annotations JSON file
#   images_dir        directory containing the images
#   output_dir        directory that receives the split datasets
#   train_ratio       share of images that go to the training split
#   seed              seed for the `random` module (repeatable shuffle)
# ---------------------------------------------------------------------------
annotations_file = "/scratch/anazeri/kitti_coco_format/kitti_val/annotations/annotations_testset.json"
images_dir = "/scratch/anazeri/kitti_coco_format/kitti_val/testset_original"
output_dir = "/scratch/anazeri/kitti_coco_format/kitti_val"
train_ratio = 0.8
seed = 42

# Output layout: one image folder per split underneath output_dir.
# NOTE: output_dir is rebound from str to Path here; the next cell relies on
# the Path form.
output_dir = Path(output_dir)
train_img_dir = output_dir / "train2017"
val_img_dir = output_dir / "val2017"
train_img_dir.mkdir(parents=True, exist_ok=True)
val_img_dir.mkdir(parents=True, exist_ok=True)

# Read the whole annotations document into memory.
print(f"Loading annotations from {annotations_file}")
annotations = json.loads(Path(annotations_file).read_text())

images = annotations['images']
print(f"Found {len(images)} images in the dataset")

# Seed right before shuffling so the resulting order is reproducible.
# The shuffle is in place, so annotations['images'] is reordered as well.
random.seed(seed)
random.shuffle(images)

# Everything before the cut index is training data, the remainder validation.
split_idx = int(len(images) * train_ratio)
train_images, val_images = images[:split_idx], images[split_idx:]

Loading annotations from /scratch/anazeri/kitti_coco_format/kitti_val/annotations/annotations_testset.json
Found 7481 images in the dataset


In [4]:
# Writes the train/val COCO annotation files and copies each image into its
# split folder. Relies on names defined by the preceding cell: annotations,
# train_images, val_images, output_dir (Path), train_img_dir, val_img_dir and
# images_dir.
print(f"Split: {len(train_images)} images for training, {len(val_images)} images for validation")

# Image-ID sets give O(1) membership tests while filtering annotations.
train_image_ids = {img['id'] for img in train_images}
val_image_ids = {img['id'] for img in val_images}

# Keep only the annotations whose image landed in the respective split.
train_annotations = [ann for ann in annotations['annotations'] if ann['image_id'] in train_image_ids]
val_annotations = [ann for ann in annotations['annotations'] if ann['image_id'] in val_image_ids]

print(f"Annotations split: {len(train_annotations)} for training, {len(val_annotations)} for validation")

# Sections that are identical in both output files. Key order is kept as
# info, licenses, categories, images, annotations so the JSON layout matches
# what was written before.
shared_sections = {
    'info': annotations.get('info', {}),
    'licenses': annotations.get('licenses', []),
    'categories': annotations['categories'],
}
train_coco = {**shared_sections, 'images': train_images, 'annotations': train_annotations}
val_coco = {**shared_sections, 'images': val_images, 'annotations': val_annotations}

# The annotations folder is not created anywhere else; without this mkdir the
# writes below raise FileNotFoundError whenever output_dir does not already
# contain an "annotations" sub-directory.
annotations_out_dir = output_dir / "annotations"
annotations_out_dir.mkdir(parents=True, exist_ok=True)

for ann_filename, coco_split in (
    ("instances_train2017.json", train_coco),
    ("instances_val2017.json", val_coco),
):
    with open(annotations_out_dir / ann_filename, 'w') as f:
        json.dump(coco_split, f)

# Copy images to respective directories. Path() is idempotent, so re-running
# this cell after images_dir has already been converted is safe.
images_dir = Path(images_dir)

for split_label, split_images, split_img_dir in (
    ("training", train_images, train_img_dir),
    ("validation", val_images, val_img_dir),
):
    print(f"Copying {split_label} images...")
    for img in split_images:
        src_path = images_dir / img['file_name']
        dst_path = split_img_dir / img['file_name']
        # file_name may carry a relative sub-directory; make sure it exists
        # on the destination side before copying.
        dst_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dst_path)

print("Dataset split complete!")
print(f"Training set: {len(train_images)} images, {len(train_annotations)} annotations")
print(f"Validation set: {len(val_images)} images, {len(val_annotations)} annotations")
    

Split: 5984 images for training, 1497 images for validation
Annotations split: 41705 for training, 10160 for validation
Copying training images...
Copying validation images...
Dataset split complete!
Training set: 5984 images, 41705 annotations
Validation set: 1497 images, 10160 annotations
