# 🗃️ Data Processing

In [None]:
import os
import shutil

import numpy as np
import pandas as pd

from src import utils

## 1. Remove Blank Images

Blank images provide no meaningful benefit to training or testing, so they need to be removed.

In [2]:
# Load the metadata CSV that lists every blank image
blank_csv_path = os.path.join(utils.METADATA_DIR, 'dvb_black_images.csv')
black_images_df = pd.read_csv(blank_csv_path)

In [4]:
# Move every blank image listed in the dataframe, together with its annotation
# file, out of the dataset and into a junk folder.

# Built with os.path.join so the path is identical on Windows but also valid
# on platforms where "\\" is not the path separator.
blank_img_dir = os.path.join(
    "data", "interim", "junk_data", "blank images and with their annotations"
)
# The junk folder may not exist yet; moving into a missing folder would fail
os.makedirs(blank_img_dir, exist_ok=True)

for img_file in black_images_df['file_name']:
    # Swap only the extension; str.replace would also rewrite a '.png'
    # appearing anywhere else in the path
    ann_file = os.path.splitext(img_file)[0] + '.txt'
    # Move the image file, then its annotation file
    for src_file in (img_file, ann_file):
        # Skip files that are no longer at the source so the cell can be re-run
        if not os.path.exists(src_file):
            print(f"Skipping {src_file}: not found (already moved?)")
            continue
        shutil.move(src_file, os.path.join(blank_img_dir, os.path.basename(src_file)))

## 2. Fix Lagging Annotations

The annotations for file `2019_10_16_C0003_3633_inspire` lag behind the actual object, an issue that was discovered during the data exploration. The annotation set contains an additional 24 frames (approximately 1 second of video) that are absent from the actual video file. Shifting the annotations by 24 frames should make them line up correctly.

In [9]:
# List the image files found in the interim drone_vs_bird_data folder
dvb_image_dir = os.path.join(utils.INTERIM_DATA_DIR, 'drone_vs_bird_data')
inspire_img_files = utils.files.get_image_files(dvb_image_dir)

In [10]:
# Number of image files returned for the whole dataset folder (before any filtering)
len(inspire_img_files)

106443

In [11]:
# Keep only the image paths that mention the 2019_10_16_C0003_3633_inspire video
inspire_video_name = '2019_10_16_C0003_3633_inspire'
inspire_img_files = [path for path in inspire_img_files if inspire_video_name in path]

In [13]:
# Number of image frames available for the inspire video
num_imgs = len(inspire_img_files)
num_imgs

1402

In [5]:
# List the annotation files found in the interim drone_vs_bird_data folder
dvb_annotation_dir = os.path.join(utils.INTERIM_DATA_DIR, 'drone_vs_bird_data')
inspire_ann_files = utils.files.get_annotation_files(dvb_annotation_dir)

In [6]:
# Number of annotation files returned for the whole dataset folder (before any filtering)
len(inspire_ann_files)

106444

In [7]:
# Keep only the annotation paths that mention the 2019_10_16_C0003_3633_inspire video
inspire_video_name = '2019_10_16_C0003_3633_inspire'
inspire_ann_files = [path for path in inspire_ann_files if inspire_video_name in path]

In [14]:
# Number of annotation files available for the inspire video
num_anns = len(inspire_ann_files)
num_anns

1402

In [15]:
# Show the first annotation path to check the file-naming pattern (name ends in a frame number)
inspire_ann_files[0]

'data\\interim\\drone_vs_bird_data\\2019_10_16_C0003_3633_inspire_frame_0000.txt'

In [None]:
# Create a new list of annotation file names, shifted by the difference between
# the number of annotation files and the number of image files.

# Surplus of annotation files over image files. Names are only shifted when
# this is > 0; otherwise new_ann_files stays empty and nothing gets renamed.
difference = num_anns - num_imgs
# Shifted annotation file names, index-aligned with inspire_ann_files
new_ann_files = []

if difference > 0:
    print(f"Number of annotation files is greater than number of image files by {difference}. Adjusting annotation file names.")
    # Loop through original list of file names
    for ann_file in inspire_ann_files:
        # Remove the extension from the file name
        file_name = os.path.splitext(ann_file)[0]
        # The frame number is whatever follows the last underscore; splitting
        # there avoids assuming it is exactly 4 characters wide
        prefix, _, frame_text = file_name.rpartition('_')
        frame = int(frame_text) - difference
        # "04d" is sign-aware zero padding: -24 -> "-024". The previous "0>4"
        # fill produced "0-24", which int() cannot parse when the negative
        # frames are picked out again later.
        new_ann_files.append(f"{prefix}_{frame:04d}.txt")

In [None]:
# Rename the files from the original list to the new list.
# Pairs are processed in ascending order of the old name: frame numbers are
# shifted down, so every target name has already been vacated by an earlier
# rename by the time it is needed.
for old_name, new_name in sorted(zip(inspire_ann_files, new_ann_files)):
    # os.rename silently replaces an existing file on POSIX; refuse instead so
    # an unexpected collision (e.g. re-running this cell) cannot destroy data
    if os.path.exists(new_name):
        raise FileExistsError(f"Refusing to overwrite existing file {new_name}")
    print(f"Renaming {old_name} to {new_name}")
    os.rename(old_name, new_name)

In [None]:
# Move the renamed annotation files with a negative frame number to a new directory.

# Built with os.path.join so the path is identical on Windows but also valid
# on platforms where "\\" is not the path separator.
negative_frame_dir = os.path.join(
    "data", "interim", "junk_data", "2019_10_16_C0003_3633_inspire excess annotations"
)
# The junk folder may not exist yet; moving into a missing folder would fail
os.makedirs(negative_frame_dir, exist_ok=True)

for ann_file in new_ann_files:
    basename = os.path.basename(ann_file)
    # The frame number is whatever follows the last underscore of the stem
    frame_text = os.path.splitext(basename)[0].rpartition('_')[2]
    # Dropping leading zeros lets int() read both "-024" and the older
    # "0-24" spelling of a negative frame; "0000" collapses to "" -> "0"
    frame = int(frame_text.lstrip('0') or '0')
    if frame < 0:
        # Already moved on a previous run of this cell
        if not os.path.exists(ann_file):
            continue
        print(f"Moving {ann_file} to {negative_frame_dir}")
        shutil.move(ann_file, os.path.join(negative_frame_dir, basename))

In [None]:
# Video names whose frames are assigned to the test split.
# Each entry is matched against image file paths when the splits are built.
test_files = [
    '2019_09_02_GOPR5871_1058_solo',
    'dji_matrice_210_off_focus',
    'dji_mavick_hillside_off_focus',
    'GOPR5843_002',
    'GOPR5843_005',
    'GOPR5847_003',
    'GOPR5847_004',
    'gopro_000',
    'gopro_001',
    'gopro_002',
    'gopro_003',
    'off_focus_parrot_birds',
    'two_uavs_plus_airplane'
]

In [None]:
# Video names whose frames are assigned to the validation split.
# Each entry is matched against the image file paths left over after the test split is taken.
val_files = [
    '00_09_30_to_00_10_09',
    '00_10_09_to_00_10_40',
    '2019_08_19_GOPR5869_1530_phantom',
    '2019_08_19_GP015869_1520_inspire',
    'dji_mavick_mountain_cruise',
    'GOPR5844_002',
    'GOPR5844_004',
    'GOPR5846_002',
    'GOPR5846_005',
    'dji_mavick_distant_hillside',
    'parrot_disco_zoomin_zoomout',
    'distant_parrot_with_birds'
]

In [None]:
# Gather the image files of the interim drone_vs_bird_data folder
# NOTE(review): the lag-fix section calls utils.files.get_image_files while this
# cell calls utils.image_data.get_image_files — confirm both exist and return
# the same list of paths.
dvb_split_dir = os.path.join(utils.INTERIM_DATA_DIR, 'drone_vs_bird_data')
image_list = utils.image_data.get_image_files(dvb_split_dir)

# Remember the starting size so the next steps can report how many images were dropped
length_initial = len(image_list)
print(f"Total number of images: {length_initial}")

In [None]:
# Load the blank-image metadata CSV again so this section does not rely on the earlier cell
blank_csv_path = os.path.join(utils.METADATA_DIR, 'dvb_black_images.csv')
black_images_df = pd.read_csv(blank_csv_path)

In [None]:
# Remove all blank images this should be 17 less than the previous total
# NOTE(review): if the blank images were already moved out of the dataset folder
# by the first section, fewer (possibly zero) images will be removed here.

# Build the lookup once; testing against the dataframe column inside the
# comprehension repeated the column access and an array scan for every image
blank_image_paths = set(black_images_df['file_name'])
image_list = [img for img in image_list if img not in blank_image_paths]

length_remove_blanks = len(image_list)

print(f"Total number of images after removing blank images: {length_remove_blanks}")
print(f"Total removed: {length_initial - length_remove_blanks}")
print(f"Total identified blank images: {len(black_images_df)}")

In [None]:
# Images whose path matches one of the test file names form the test split;
# everything else stays in the training pool.
test_images = [img for img in image_list if utils.files.file_contains_name(img, test_files)]
# Set membership is O(1); scanning the test list for every image made this step quadratic
test_image_set = set(test_images)
training_images = [img for img in image_list if img not in test_image_set]

In [None]:
# Do the same for the remaining pool: images matching a validation file name
# form the validation split, the rest remain training images.
val_images = [img for img in training_images if utils.files.file_contains_name(img, val_files)]
# Set membership is O(1); scanning the validation list for every image made this step quadratic
val_image_set = set(val_images)
training_images = [img for img in training_images if img not in val_image_set]

In [None]:
# Report how many images ended up in each split
split_sizes = (
    ("test", len(test_images)),
    ("validation", len(val_images)),
    ("training", len(training_images)),
)
for split_label, image_count in split_sizes:
    print(f"Total number of {split_label} images: {image_count}")

In [None]:
# Derive each split's annotation list from its image list:
# same path, with the extension swapped for .txt
def _as_annotation_path(image_path):
    """Return image_path with its file extension replaced by '.txt'."""
    stem, _ = os.path.splitext(image_path)
    return stem + '.txt'

test_annotations = list(map(_as_annotation_path, test_images))
val_annotations = list(map(_as_annotation_path, val_images))
training_annotations = list(map(_as_annotation_path, training_images))

In [None]:
# Spot-check that each split's image and annotation lists are index-aligned
# by looking at one randomly chosen index per split.
splits = {
    "Test": (test_images, test_annotations),
    "Validation": (val_images, val_annotations),
    "Training": (training_images, training_annotations),
}

# Fail with a clear message up front: rng.integers(0, 0) would otherwise raise
# an opaque ValueError for an empty split
for split_name, (images, annotations) in splits.items():
    assert images, f"{split_name} split is empty"
    assert len(images) == len(annotations), f"{split_name} image/annotation counts differ"

# Set random seed for reproducibility
rng = np.random.default_rng(42)

# Draw order (test, validation, training) is fixed so the seed always yields the same indices
random_test_index = rng.integers(0, len(test_images))
random_validation_index = rng.integers(0, len(val_images))
random_training_index = rng.integers(0, len(training_images))

chosen_indices = {
    "Test": random_test_index,
    "Validation": random_validation_index,
    "Training": random_training_index,
}

for split_name, index in chosen_indices.items():
    images, annotations = splits[split_name]
    image_path, annotation_path = images[index], annotations[index]
    print(f"{split_name} image: {image_path}")
    print(f"{split_name} annotation: {annotation_path}")
    # Actually enforce the match instead of relying on reading the printout
    assert os.path.splitext(image_path)[0] == os.path.splitext(annotation_path)[0], (
        f"{split_name} image and annotation at index {index} do not match"
    )
