In [1]:
# standard library imports
import os

# third-party imports
import boto3
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from botocore import UNSIGNED
from botocore.config import Config
from sklearn.model_selection import train_test_split
print("All imports successful")

All imports successful


In [15]:
# initialise the S3 client
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

# local folder to save the data
dataset_folder = '/Users/renatoboemer/code/developer/aot-object-tracking-aws/raw_data/airborne_data'
images_folder = os.path.join('/Users/renatoboemer/code/developer/aot-object-tracking-aws/raw_data/airborne_images')

# list the bucket contents
bucket_name = 'airborne-obj-detection-challenge-training'
prefix = 'part1/'
result = s3.list_objects_v2(Bucket=bucket_name)

In [3]:
gt_data = pd.read_csv(os.path.join(dataset_folder, 'groundtruth.csv'), index_col=0)

In [4]:
label_mapping = {
    'Airplane1': 'aircraft', 'Airplane2': 'aircraft', 'Airplane3': 'aircraft', 'Airplane4': 'aircraft',
    'Airplane5': 'aircraft', 'Airplane6': 'aircraft', 'Airplane7': 'aircraft', 'Airplane8': 'aircraft',
    'Airplane9': 'aircraft', 'Airplane10': 'aircraft',
    'Helicopter1': 'helicopter', 'Helicopter2': 'helicopter', 'Helicopter3': 'helicopter',

    'Bird1': 'bird', 'Bird2': 'bird', 'Bird3': 'bird', 'Bird4': 'bird', 'Bird5': 'bird',
    'Bird6': 'bird', 'Bird7': 'bird', 'Bird8': 'bird', 'Bird9': 'bird', 'Bird10': 'bird',
    'Bird11': 'bird', 'Bird12': 'bird', 'Bird13': 'bird', 'Bird14': 'bird', 'Bird15': 'bird',
    'Bird16': 'bird', 'Bird17': 'bird', 'Bird18': 'bird', 'Bird19': 'bird', 'Bird20': 'bird',
    'Bird21': 'bird', 'Bird22': 'bird', 'Bird23': 'bird', 'Bird24': 'bird', 'Bird25': 'bird',

    'Airborne1': 'airborne', 'Airborne2': 'airborne', 'Airborne3': 'airborne', 'Airborne4': 'airborne',
    'Airborne5': 'airborne', 'Airborne6': 'airborne', 'Airborne7': 'airborne', 'Airborne8': 'airborne',
    'Airborne9': 'airborne', 'Airborne10': 'airborne', 'Airborne11': 'airborne', 'Airborne12': 'airborne',
    'Airborne13': 'airborne', 'Airborne14': 'airborne', 'Airborne15': 'airborne', 'Airborne16': 'airborne',
    'Airborne17': 'airborne', 'Airborne18': 'airborne', 'Airborne19': 'airborne',

    'Drone1': 'drone',

    'Flock1': 'flock', 'Flock2': 'flock', 'Flock3': 'flock'
}


In [5]:
gt_df = pd.read_csv(os.path.join(dataset_folder, 'groundtruth.csv'), index_col=0)

gt_df['label'] = gt_df['id'].map(label_mapping).fillna('np.nan')

class_id_mapping = {label: i for i, label in enumerate(gt_df['label'].unique())}

In [6]:
gt_df['label'].unique()

array(['np.nan', 'aircraft', 'helicopter', 'bird', 'airborne', 'drone',
       'flock'], dtype=object)

In [7]:
gt_yolo_df = gt_df[gt_df['flight_id'] == '0001ba865c8e410e88609541b8f55ffc']

gt_yolo_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1199 entries, 684441 to 685639
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              1199 non-null   int64  
 1   flight_id         1199 non-null   object 
 2   img_name          1199 non-null   object 
 3   frame             1199 non-null   int64  
 4   id                324 non-null    object 
 5   range_distance_m  324 non-null    float64
 6   is_above_horizon  324 non-null    float64
 7   size_width        1199 non-null   int64  
 8   size_height       1199 non-null   int64  
 9   gt_left           324 non-null    float64
 10  gt_top            324 non-null    float64
 11  gt_right          324 non-null    float64
 12  gt_bottom         324 non-null    float64
 13  label             1199 non-null   object 
dtypes: float64(6), int64(4), object(4)
memory usage: 140.5+ KB


In [8]:
gt_yolo_df['label'].unique()

array(['helicopter', 'np.nan'], dtype=object)

In [12]:
gt_yolo_df.head()

Unnamed: 0,time,flight_id,img_name,frame,id,range_distance_m,is_above_horizon,size_width,size_height,gt_left,gt_top,gt_right,gt_bottom,label
684441,1554977349382218834,0001ba865c8e410e88609541b8f55ffc,15549773493822188340001ba865c8e410e88609541b8f...,2,Helicopter1,1937.340022,-1.0,2448,2048,1453.744186,1067.465116,1459.744186,1073.465116,helicopter
684442,1554977349483260352,0001ba865c8e410e88609541b8f55ffc,15549773494832603520001ba865c8e410e88609541b8f...,3,Helicopter1,1931.863012,-1.0,2448,2048,1455.604651,1063.976744,1461.604651,1069.976744,helicopter
684443,1554977349588710305,0001ba865c8e410e88609541b8f55ffc,15549773495887103050001ba865c8e410e88609541b8f...,4,Helicopter1,1926.277939,-1.0,2448,2048,1459.325581,1062.116279,1465.325581,1068.116279,helicopter
684444,1554977349721609076,0001ba865c8e410e88609541b8f55ffc,15549773497216090760001ba865c8e410e88609541b8f...,5,Helicopter1,1920.795038,-1.0,2448,2048,1464.674419,1066.534884,1470.674419,1072.534884,helicopter
684445,1554977349781159117,0001ba865c8e410e88609541b8f55ffc,15549773497811591170001ba865c8e410e88609541b8f...,6,Helicopter1,1915.321955,-1.0,2448,2048,1468.395349,1059.55814,1474.395349,1065.55814,helicopter


In [34]:
# load the dataframe
df = gt_yolo_df.copy()  # using the dataframe provided

# create a dictionary to map class labels to class IDs
class_mapping = {'helicopter': 0}

# directory to save YOLO annotations
yolo_annotations_dir = 'annotations'
os.makedirs(yolo_annotations_dir, exist_ok=True)

# loop through each row in the dataframe and create YOLO annotation files
for row in df.itertuples(index=False):
    # extract image information
    img_name = row.img_name
    img_width = row.size_width
    img_height = row.size_height

    # extract bounding box coordinates
    gt_left = row.gt_left
    gt_top = row.gt_top
    gt_right = row.gt_right
    gt_bottom = row.gt_bottom

    # calculate YOLO format values
    x_center = (gt_left + gt_right) / 2.0 / img_width
    y_center = (gt_top + gt_bottom) / 2.0 / img_height
    bbox_width = (gt_right - gt_left) / img_width
    bbox_height = (gt_bottom - gt_top) / img_height

    # get the class ID, handle missing labels
    label = row.label
    if pd.isna(label) or label not in class_mapping:
        continue
    class_id = class_mapping[label]

    # create YOLO annotation line
    yolo_annotation = f"{class_id} {x_center:.6f} {y_center:.6f} {bbox_width:.6f} {bbox_height:.6f}\n"

    # save to a text file (one text file per image)
    img_name_base = os.path.splitext(img_name)[0]
    annotation_file = os.path.join(yolo_annotations_dir, f"{img_name_base}.txt")
    with open(annotation_file, 'a') as f:  # using 'a' to append instead of 'w' for multiple objects per image
        f.write(yolo_annotation)

print("YOLO annotations saved successfully!")


YOLO annotations saved successfully!


In [40]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split

# Paths
root_dir = os.path.dirname(os.getcwd())
image_dir = os.path.join(root_dir, 'images')
annotation_dir = os.path.join(root_dir, 'annotations')
output_dir= os.path.join(root_dir, 'dataset_split')


In [41]:
# Get list of annotated images (helicopter-present images)
annotated_images = {f.replace('.txt', '.png') for f in os.listdir(annotation_dir) if f.endswith('.txt')}
all_images = {f for f in os.listdir(image_dir) if f.endswith('.png')}

# Determine images without annotations (helicopter-absent images)
non_annotated_images = list(all_images - annotated_images)
annotated_images = list(annotated_images)  # Convert to list for easier handling

# Split Parameters
train_ratio, val_ratio, test_ratio = 0.7, 0.2, 0.1

# Total number of positive and negative samples
total_positives = len(annotated_images)
total_negatives = len(non_annotated_images)

# Desired number of positive and negative samples for each split
train_positives = int(train_ratio * total_positives)
val_positives = int(val_ratio * total_positives)
test_positives = total_positives - train_positives - val_positives

train_negatives = int(train_ratio * total_negatives)
val_negatives = int(val_ratio * total_negatives)
test_negatives = total_negatives - train_negatives - val_negatives

# Split annotated images (helicopter present)
train_anno, remaining_anno = train_test_split(annotated_images, train_size=train_positives, random_state=42)
val_anno, test_anno = train_test_split(remaining_anno, train_size=val_positives, random_state=42)

# Split non-annotated images (helicopter absent)
train_non_anno, remaining_non_anno = train_test_split(non_annotated_images, train_size=train_negatives, random_state=42)
val_non_anno, test_non_anno = train_test_split(remaining_non_anno, train_size=val_negatives, random_state=42)

# Combine for each set while maintaining the 27% positive, 73% negative ratio
train_images = train_anno + train_non_anno
val_images = val_anno + val_non_anno
test_images = test_anno + test_non_anno

# Shuffle each set to ensure randomness
random.shuffle(train_images)
random.shuffle(val_images)
random.shuffle(test_images)

# Output Directory Structure
output_dirs = {
    'train': 'split/train/images/',
    'val': 'split/val/images/',
    'test': 'split/test/images/',
    'train_anno': 'split/train/labels/',
    'val_anno': 'split/val/labels/',
    'test_anno': 'split/test/labels/'
}

# Create directories if they don't exist
for dir_path in output_dirs.values():
    os.makedirs(dir_path, exist_ok=True)

# Function to copy images and annotations
def copy_files(image_list, dest_img_dir, dest_anno_dir):
    missing_annotations = []
    for img_file in image_list:
        # Copy image file
        shutil.copy(os.path.join(image_dir, img_file), dest_img_dir)

        # Copy annotation if it exists
        anno_file = img_file.replace('.png', '.txt')
        anno_path = os.path.join(annotation_dir, anno_file)
        if os.path.exists(anno_path):
            shutil.copy(anno_path, dest_anno_dir)
        else:
            # Log missing annotations for inspection (these are negative samples)
            missing_annotations.append(img_file)

    if missing_annotations:
        print(f"Warning: {len(missing_annotations)} images were expected to have annotations but did not (likely no helicopter).")
    else:
        print("All expected annotations were found and copied successfully.")

# Copy files to respective folders
copy_files(train_images, output_dirs['train'], output_dirs['train_anno'])
copy_files(val_images, output_dirs['val'], output_dirs['val_anno'])
copy_files(test_images, output_dirs['test'], output_dirs['test_anno'])

print("Data split and copied successfully into 'split' directory.")


Data split and copied successfully into 'split' directory.
