# 🔎 Data Exploration

Some data exploration is necessary to gain a good understanding of the underlying dataset and to prepare the train/validation/test splits.

In [12]:
import os
import pandas as pd
import numpy as np
import shutil

from src import utils

## 1. Data Overview

In [2]:
# Build a per-video overview (file name, empty frame count, total frame count)
# from the original annotation files
dvb_annotation_dir = os.path.join(utils.RAW_DATA_DIR, 'drone_vs_bird_competition')
file_names, empty_frames, frames = utils.label_data.frame_overview(dvb_annotation_dir)

# Work with arrays so the empty-frame ratio is computed element-wise
empty_frames, frames = np.array(empty_frames), np.array(frames)
empty_ratio = empty_frames / frames

# Assemble one row per annotation file
overview_df = pd.DataFrame(dict(
    file_name=file_names,
    empty_frames=empty_frames,
    total_frames=frames,
    empty_ratio=empty_ratio,
))

# Persist the overview to the metadata folder so it can be reloaded later
overview_csv_path = os.path.join(utils.METADATA_DIR, 'dvb_video_overview.csv')
overview_df.to_csv(overview_csv_path, index=False)

In [2]:
# Load the saved overview from the metadata folder (so the previous step can be skipped in the future)
overview_csv_path = os.path.join(utils.METADATA_DIR, 'dvb_video_overview.csv')
overview_df = pd.read_csv(overview_csv_path)

In [3]:
# Preview the first five rows of the overview dataframe
overview_df.head(n=5)

Unnamed: 0,file_name,empty_frames,total_frames,empty_ratio
0,00_01_52_to_00_01_58.txt,53,175,0.302857
1,00_02_45_to_00_03_10_cut.txt,1,400,0.0025
2,00_06_10_to_00_06_27.txt,272,499,0.54509
3,00_09_30_to_00_10_09.txt,80,1165,0.06867
4,00_10_09_to_00_10_40.txt,31,925,0.033514


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the overview dataframe
overview_df.describe()

Unnamed: 0,empty_frames,total_frames,empty_ratio
count,77.0,77.0,77.0
mean,118.0,1382.922078,0.086012
std,203.845788,1062.066144,0.122985
min,0.0,175.0,0.0
25%,3.0,526.0,0.003506
50%,40.0,925.0,0.046698
75%,132.0,1576.0,0.101498
max,1103.0,4612.0,0.735333


In [5]:
# Dataset-wide totals summed over all videos
total_frame_count = overview_df['total_frames'].sum()
empty_frame_count = overview_df['empty_frames'].sum()

print(f"Total number of frames: {total_frame_count}")
print(f"Total number of empty frames: {empty_frame_count}")

Total number of frames: 106485
Total number of empty frames: 9086


From the descriptive statistics we can see that we have a total of 106,485 images, so a 10% test set would ideally contain around 10,648 images. All frames from a given video must be kept together in a single split (either all in test or all in training) so there is no data leakage between the training and testing sets.

We can also see that, on average, about 8.6% of a video's frames are empty (frames with no annotated objects), which corresponds to roughly 8.5% of all frames. It would be useful to match this proportion in the testing set, although a higher percentage there may be acceptable because it gives a better measure of false-positive results in the final test. Having some empty or sparsely populated images in the training set is helpful as well.

In [3]:
# Scan the extracted images and keep the ones flagged as blank (all black)
extracted_image_dir = os.path.join(utils.INTERIM_DATA_DIR, 'drone_vs_bird_data')
image_list = utils.image_data.get_image_files(extracted_image_dir)

# One blank-check per image; only the images that test as blank are collected
black_images = [path for path in image_list if utils.image_data.is_blank_image(path)]

In [4]:
# Wrap the blank image paths in a single-column dataframe
black_images_df = pd.DataFrame({'file_name': black_images})

# Persist the list to the metadata folder
black_images_csv_path = os.path.join(utils.METADATA_DIR, 'dvb_black_images.csv')
black_images_df.to_csv(black_images_csv_path, index=False)

In [2]:
# Load the saved list of blank images back from the metadata folder
black_images_csv_path = os.path.join(utils.METADATA_DIR, 'dvb_black_images.csv')
black_images_df = pd.read_csv(black_images_csv_path)

In [3]:
# Display the complete list of blank (all-black) images
black_images_df

Unnamed: 0,file_name
0,data\interim\drone_vs_bird_data\distant_parrot...
1,data\interim\drone_vs_bird_data\distant_parrot...
2,data\interim\drone_vs_bird_data\dji_mavick_hil...
3,data\interim\drone_vs_bird_data\dji_mavick_mou...
4,data\interim\drone_vs_bird_data\dji_phantom_4_...
5,data\interim\drone_vs_bird_data\dji_phantom_4_...
6,data\interim\drone_vs_bird_data\dji_phantom_4_...
7,data\interim\drone_vs_bird_data\dji_phantom_mo...
8,data\interim\drone_vs_bird_data\fixed_wing_ove...
9,data\interim\drone_vs_bird_data\fixed_wing_ove...


These completely blank images won't provide any specific benefit to the model, so they should be excluded.

## 2. Video Investigation

In [4]:
# Names of the videos held out for the test set. In the split step further down,
# every image selected by utils.files.file_contains_name(img, test_files) goes to
# the test split and all remaining images go to training.
test_files = [
    '2019_09_02_GOPR5871_1058_solo',
    'dji_matrice_210_off_focus',
    'dji_mavick_hillside_off_focus',
    'GOPR5843_002',
    'GOPR5843_005',
    'GOPR5847_003',
    'GOPR5847_004',
    'gopro_000',
    'gopro_001',
    'gopro_002',
    'gopro_003',
    'off_focus_parrot_birds',
    'two_uavs_plus_airplane'
]

## 3. Saving Resulting Data

In [5]:
# Gather every extracted image path and record the starting count
extracted_image_dir = os.path.join(utils.INTERIM_DATA_DIR, 'drone_vs_bird_data')
image_list = utils.image_data.get_image_files(extracted_image_dir)
length_initial = len(image_list)

print(f"Total number of images: {length_initial}")

Total number of images: 106460


In [6]:
# Remove all blank images. The number removed should equal the number of
# identified blank images (compare the last two lines printed below).
# A set gives constant-time membership checks instead of scanning the
# file_name column once for every image.
blank_image_paths = set(black_images_df['file_name'])

image_list = [img for img in image_list if img not in blank_image_paths]

length_remove_blanks = len(image_list)

print(f"Total number of images after removing blank images: {length_remove_blanks}")
print(f"Total removed: {length_initial - length_remove_blanks}")
print(f"Total identified blank images: {len(black_images_df)}")

Total number of images after removing blank images: 106443
Total removed: 17
Total identified blank images: 17


In [7]:
# Images selected by file_contains_name for one of the held-out test videos form the test split
test_images = [img for img in image_list if utils.files.file_contains_name(img, test_files)]

# Everything else is training data. Membership is checked against a set: testing
# `img not in test_images` on the list would rescan the whole test list for every
# image (quadratic in the dataset size).
test_image_set = set(test_images)
training_images = [img for img in image_list if img not in test_image_set]

# Sanity check: the two splits partition the image list
assert len(test_images) + len(training_images) == len(image_list)

In [8]:
# Report how many images ended up in each split
test_image_count = len(test_images)
training_image_count = len(training_images)

print(f"Total number of test images: {test_image_count}")
print(f"Total number of training images: {training_image_count}")

Total number of test images: 10725
Total number of training images: 95718


In [10]:
# Build the annotation file lists in the same order as the image lists:
# each annotation path is the image path with its extension replaced by .txt
test_annotations = [f"{os.path.splitext(img)[0]}.txt" for img in test_images]
training_annotations = [f"{os.path.splitext(img)[0]}.txt" for img in training_images]

In [11]:
# Spot-check that images and annotations line up by index in both splits.
# A fixed seed keeps the sampled indices reproducible.
rng = np.random.default_rng(42)

# Draw one index per split (test first, then training)
random_test_index = rng.integers(len(test_images))
random_training_index = rng.integers(len(training_images))

# The test image and annotation printed here should refer to the same frame
print(f"Test image: {test_images[random_test_index]}")
print(f"Test annotation: {test_annotations[random_test_index]}")

# Likewise for the training image and annotation
print(f"Training image: {training_images[random_training_index]}")
print(f"Training annotation: {training_annotations[random_training_index]}")


Test image: data\interim\drone_vs_bird_data\dji_matrice_210_off_focus_frame_0232.png
Test annotation: data\interim\drone_vs_bird_data\dji_matrice_210_off_focus_frame_0232.txt
Training image: data\interim\drone_vs_bird_data\parrot_disco_distant_cross_frame_1991.png
Training annotation: data\interim\drone_vs_bird_data\parrot_disco_distant_cross_frame_1991.txt


In [13]:
# Test folder
testing_folder = os.path.join(utils.PROCESSED_DATA_DIR, "test")

# shutil.copy does not create missing directories, so make sure the destination
# exists before copying (no-op when the folder is already there)
os.makedirs(testing_folder, exist_ok=True)

# Copy each test image together with its annotation file into the testing folder
for img, ann in zip(test_images, test_annotations):
    shutil.copy(img, os.path.join(testing_folder, os.path.basename(img)))
    shutil.copy(ann, os.path.join(testing_folder, os.path.basename(ann)))


In [14]:
# Training folder
training_folder = os.path.join(utils.PROCESSED_DATA_DIR, "train")

# shutil.copy does not create missing directories, so make sure the destination
# exists before copying (no-op when the folder is already there)
os.makedirs(training_folder, exist_ok=True)

# Copy each training image together with its annotation file into the training folder
for img, ann in zip(training_images, training_annotations):
    shutil.copy(img, os.path.join(training_folder, os.path.basename(img)))
    shutil.copy(ann, os.path.join(training_folder, os.path.basename(ann)))
