In [26]:
from collections import Counter

# Tally how many annotated objects each class has across all label files
label_dir = '/kaggle/input/annotations/labelTxt'
class_counts = Counter()

txt_names = (name for name in os.listdir(label_dir) if name.endswith('.txt'))
for txt_name in txt_names:
    with open(os.path.join(label_dir, txt_name), 'r') as handle:
        rows = (raw.strip().split() for raw in handle)
        # Rows with fewer than 9 whitespace-separated fields are skipped;
        # field index 8 is counted as the class name.
        class_counts.update(fields[8] for fields in rows if len(fields) >= 9)

# Report classes from most to least frequent
print("Class distribution:")
for cls, count in class_counts.most_common():
    print(f"{cls}: {count}")

Class distribution:
ship: 28068
small-vehicle: 26126
large-vehicle: 16969
plane: 8055
harbor: 5983
storage-tank: 5029
tennis-court: 2367
bridge: 2047
swimming-pool: 1736
helicopter: 630
basketball-court: 515
baseball-diamond: 415
roundabout: 399
soccer-ball-field: 326
ground-track-field: 325


**PREPROCESSING**

A small fraction of the images are grayscale. Model frameworks such as YOLOv8 handle this automatically when loading images, so explicit conversion was skipped to save preprocessing time.

In [38]:
# Input/output locations for building the train/val dataset
base_out_dir = '/kaggle/working/yolo_dataset'  # root for images/<split> and labels/<split>
label_dir = '/kaggle/working/yolo_labels'  # per-image .txt label files are looked up here
image_dir = '/kaggle/input/dota-data/DOTA/train/images'  # source .png images

In [39]:
# Make sure images/<split> and labels/<split> exist under the output root
for split in ('train', 'val'):
    for kind in ('images', 'labels'):
        target_dir = os.path.join(base_out_dir, kind, split)
        os.makedirs(target_dir, exist_ok=True)

In [40]:
# Collect every .png directly inside the image directory, in sorted order
png_pattern = os.path.join(image_dir, '*.png')
all_images = glob(png_pattern)
all_images.sort()
print(f"Total images found: {len(all_images)}")

Total images found: 1411


In [41]:
# Shuffle and split 80% train / 20% val.
# The shuffle runs on a freshly sorted copy instead of mutating `all_images`
# in place, so re-running this cell always reproduces the same split
# (an in-place shuffle would reshuffle the already-shuffled list on each re-run).
SPLIT_SEED = 42        # fixed seed so the split is reproducible
TRAIN_FRACTION = 0.8   # share of images assigned to the training split

random.seed(SPLIT_SEED)
shuffled_images = sorted(all_images)
random.shuffle(shuffled_images)

split_idx = int(TRAIN_FRACTION * len(shuffled_images))
train_images = shuffled_images[:split_idx]
val_images = shuffled_images[split_idx:]

# Sanity check: the two splits together account for every image.
assert len(train_images) + len(val_images) == len(all_images)

In [42]:
# Helper to copy files
def copy_image_and_label(image_list, split, label_root=None, out_root=None):
    """Copy images and their matching label files into one dataset split.

    Each image is copied to ``<out_root>/images/<split>/``. If a label file
    with the same base name and a ``.txt`` extension exists in ``label_root``,
    it is copied to ``<out_root>/labels/<split>/``; images without a label
    file are still copied, just without a label.

    Args:
        image_list: Iterable of image file paths to copy.
        split: Name of the split sub-folder (e.g. 'train' or 'val').
        label_root: Directory searched for label files. Defaults to the
            notebook-level ``label_dir``.
        out_root: Root of the output dataset. Defaults to the notebook-level
            ``base_out_dir``.

    Returns:
        None.
    """
    label_root = label_dir if label_root is None else label_root
    out_root = base_out_dir if out_root is None else out_root

    image_out = os.path.join(out_root, 'images', split)
    label_out = os.path.join(out_root, 'labels', split)
    # Create the targets here so the helper does not depend on another cell
    # having been run first.
    os.makedirs(image_out, exist_ok=True)
    os.makedirs(label_out, exist_ok=True)

    for img_path in image_list:
        fname = os.path.basename(img_path)
        # Swap only the extension; str.replace('.png', '.txt') would also
        # rewrite a '.png' that appears in the middle of the file name.
        label_name = os.path.splitext(fname)[0] + '.txt'
        label_path = os.path.join(label_root, label_name)

        # Copy image
        shutil.copy(img_path, os.path.join(image_out, fname))

        # Copy label if it exists
        if os.path.exists(label_path):
            shutil.copy(label_path, os.path.join(label_out, label_name))

In [43]:
# Copy to train and val
copy_image_and_label(train_images, 'train')
copy_image_and_label(val_images, 'val')

In [44]:
# Summary
print(f"Train images: {len(train_images)}")
print(f"Val images: {len(val_images)}")

Train images: 1128
Val images: 283
