# 🔎 Data Exploration

To gain a good understanding of the underlying dataset, and to prepare for the train/validation/test splits, some data exploration is necessary.

In [2]:
import os
import pandas as pd
import numpy as np

from src import utils

## 1. Data Overview

In [2]:
# Collect frame statistics for the videos from the original annotation files
annotation_dir = os.path.join(utils.RAW_DATA_DIR, 'drone_vs_bird_competition')
file_names, empty_frames, frames = utils.label_data.frame_overview(annotation_dir)

# Convert the counts to arrays so the ratio is computed element-wise
empty_frames = np.array(empty_frames)
frames = np.array(frames)
empty_ratio = empty_frames / frames

# Gather the columns and assemble them into a single DataFrame
overview_columns = {
    'file_name': file_names,
    'empty_frames': empty_frames,
    'total_frames': frames,
    'empty_ratio': empty_ratio,
}
overview_df = pd.DataFrame(overview_columns)

# Persist the overview in the metadata folder so it can be reloaded later
overview_csv_path = os.path.join(utils.METADATA_DIR, 'dvb_video_overview.csv')
overview_df.to_csv(overview_csv_path, index=False)

In [3]:
# Load the saved overview from disk, so the cell that generates it can be skipped on later runs
overview_csv_path = os.path.join(utils.METADATA_DIR, 'dvb_video_overview.csv')
overview_df = pd.read_csv(overview_csv_path)

In [4]:
# Preview the first rows of the overview dataframe
overview_df.head()

Unnamed: 0,file_name,empty_frames,total_frames,empty_ratio
0,00_01_52_to_00_01_58.txt,53,175,0.302857
1,00_02_45_to_00_03_10_cut.txt,1,400,0.0025
2,00_06_10_to_00_06_27.txt,272,499,0.54509
3,00_09_30_to_00_10_09.txt,80,1165,0.06867
4,00_10_09_to_00_10_40.txt,31,925,0.033514


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric overview columns
overview_df.describe()

Unnamed: 0,empty_frames,total_frames,empty_ratio
count,77.0,77.0,77.0
mean,118.0,1382.922078,0.086012
std,203.845788,1062.066144,0.122985
min,0.0,175.0,0.0
25%,3.0,526.0,0.003506
50%,40.0,925.0,0.046698
75%,132.0,1576.0,0.101498
max,1103.0,4612.0,0.735333


In [3]:
# Scan the extracted images and collect every one that is blank (all black)
image_dir = os.path.join(utils.INTERIM_DATA_DIR, 'drone_vs_bird_data')
image_list = utils.image_data.get_image_files(image_dir)

# Keep only the images flagged as blank, in the order they were listed
black_images = [image for image in image_list if utils.image_data.is_blank_image(image)]

In [4]:
# Wrap the collected black images in a single-column dataframe
black_images_df = pd.DataFrame(data=black_images, columns=['file_name'])

# Write it to the metadata folder for later reuse
black_images_csv_path = os.path.join(utils.METADATA_DIR, 'dvb_black_images.csv')
black_images_df.to_csv(black_images_csv_path, index=False)

In [5]:
# Load the saved list of black images back from the metadata folder
black_images_csv_path = os.path.join(utils.METADATA_DIR, 'dvb_black_images.csv')
black_images_df = pd.read_csv(black_images_csv_path)

In [6]:
# Show the whole dataframe of images flagged as black
black_images_df

Unnamed: 0,file_name
0,data\interim\drone_vs_bird_data\distant_parrot...
1,data\interim\drone_vs_bird_data\distant_parrot...
2,data\interim\drone_vs_bird_data\dji_mavick_hil...
3,data\interim\drone_vs_bird_data\dji_mavick_mou...
4,data\interim\drone_vs_bird_data\dji_phantom_4_...
5,data\interim\drone_vs_bird_data\dji_phantom_4_...
6,data\interim\drone_vs_bird_data\dji_phantom_4_...
7,data\interim\drone_vs_bird_data\dji_phantom_mo...
8,data\interim\drone_vs_bird_data\fixed_wing_ove...
9,data\interim\drone_vs_bird_data\fixed_wing_ove...
