# 🗃️ Data Processing

In [None]:
# Recording names reserved for the test split. An image is assigned to the
# test split when utils.files.file_contains_name matches its path against
# one of these entries.
test_files = [
    "2019_09_02_GOPR5871_1058_solo",
    "dji_matrice_210_off_focus",
    "dji_mavick_hillside_off_focus",
    "GOPR5843_002",
    "GOPR5843_005",
    "GOPR5847_003",
    "GOPR5847_004",
    "gopro_000",
    "gopro_001",
    "gopro_002",
    "gopro_003",
    "off_focus_parrot_birds",
    "two_uavs_plus_airplane",
]

In [None]:
# Recording names reserved for the validation split. After the test images
# have been removed, an image is assigned to validation when
# utils.files.file_contains_name matches its path against one of these entries.
val_files = [
    "00_09_30_to_00_10_09",
    "00_10_09_to_00_10_40",
    "2019_08_19_GOPR5869_1530_phantom",
    "2019_08_19_GP015869_1520_inspire",
    "dji_mavick_mountain_cruise",
    "GOPR5844_002",
    "GOPR5844_004",
    "GOPR5846_002",
    "GOPR5846_005",
    "dji_mavick_distant_hillside",
    "parrot_disco_zoomin_zoomout",
    "distant_parrot_with_birds",
]

In [None]:
# Gather every image file that utils.image_data.get_image_files reports for
# the interim 'drone_vs_bird_data' directory.
image_list = utils.image_data.get_image_files(
    os.path.join(utils.INTERIM_DATA_DIR, 'drone_vs_bird_data')
)

# Keep the starting count so the filtering step can report how many
# images it removed.
length_initial = len(image_list)
print(f"Total number of images: {length_initial}")

In [None]:
# Load the black-image records from the metadata CSV; the 'file_name' column
# is what the filtering step compares image paths against.
black_images_df = pd.read_csv(
    os.path.join(utils.METADATA_DIR, 'dvb_black_images.csv')
)

In [None]:
# Remove every image whose path is listed in the black-image CSV.
# The resulting total is expected to be 17 lower than the previous one;
# compare the "Total removed" and "Total identified" lines printed below.

# Build the lookup set once. Testing `img in <ndarray>` rescans the entire
# 'file_name' column for every image, whereas a set lookup is constant time.
black_image_names = set(black_images_df['file_name'])
image_list = [img for img in image_list if img not in black_image_names]

length_remove_blanks = len(image_list)

print(f"Total number of images after removing blank images: {length_remove_blanks}")
print(f"Total removed: {length_initial - length_remove_blanks}")
print(f"Total identified blank images: {len(black_images_df)}")

In [None]:
# Images whose path matches one of the test recording names (as decided by
# utils.files.file_contains_name) form the test split.
test_images = [img for img in image_list if utils.files.file_contains_name(img, test_files)]

# Everything else stays in the training pool. Membership is checked against a
# set because `img not in <list>` inside a comprehension is quadratic in the
# number of images; the resulting list and its order are unchanged.
test_image_set = set(test_images)
training_images = [img for img in image_list if img not in test_image_set]

In [None]:
# Split the remaining training pool again: images whose path matches one of
# the validation recording names (as decided by
# utils.files.file_contains_name) form the validation split.
val_images = [img for img in training_images if utils.files.file_contains_name(img, val_files)]

# Keep only the non-validation images for training. Membership is checked
# against a set because `img not in <list>` inside a comprehension is
# quadratic in the number of images; the resulting list and its order are
# unchanged.
val_image_set = set(val_images)
training_images = [img for img in training_images if img not in val_image_set]

In [None]:
# Report how many images ended up in each of the three splits
print(
    f"Total number of test images: {len(test_images)}",
    f"Total number of validation images: {len(val_images)}",
    f"Total number of training images: {len(training_images)}",
    sep="\n",
)

In [None]:
# Derive the annotation path for every image: same path with the extension
# replaced by '.txt'. Each annotation list is built in the same order as its
# image list, so index i refers to the same sample in both. The image lists
# themselves are left untouched.
test_annotations = [
    stem + '.txt' for stem, _ext in map(os.path.splitext, test_images)
]
val_annotations = [
    stem + '.txt' for stem, _ext in map(os.path.splitext, val_images)
]
training_annotations = [
    stem + '.txt' for stem, _ext in map(os.path.splitext, training_images)
]

In [None]:
# Seeded generator so the same sample indices are drawn on every run
rng = np.random.default_rng(42)

# Draw one index per split, in the order test -> validation -> training.
# The upper bound is exclusive, so each index is valid for its list.
random_test_index = rng.integers(0, len(test_images))
random_validation_index = rng.integers(0, len(val_images))
random_training_index = rng.integers(0, len(training_images))

# Print the image/annotation pair at each sampled index so it can be checked
# by eye that the two lists of every split are still aligned.
print(
    f"Test image: {test_images[random_test_index]}",
    f"Test annotation: {test_annotations[random_test_index]}",
    sep="\n",
)
print(
    f"Validation image: {val_images[random_validation_index]}",
    f"Validation annotation: {val_annotations[random_validation_index]}",
    sep="\n",
)
print(
    f"Training image: {training_images[random_training_index]}",
    f"Training annotation: {training_annotations[random_training_index]}",
    sep="\n",
)
