# General Dataset Overview

This notebook prepares a dataframe to get a general overview of the dataset in terms of the annotation types, bounding box areas and bounding box aspect ratios per image

In [89]:
import os
import json
from PIL import Image
import pandas as pd
from tqdm import tqdm
import numpy as np

In [90]:
# Function to calculate bounding box area
def calculate_bbox_area(bbox):
    return (abs(bbox[0][0] - bbox[1][0])*abs(bbox[0][1] - bbox[1][1]))

In [91]:
# Function to calculate bounding box aspect ratio
def calculate_bbox_aspect_ratio(bbox):
    return (abs(bbox[0][0] - bbox[1][0]) / abs(bbox[0][1] - bbox[1][1]))

In [92]:
# Initialize data for the dataframe
data = {'Index': [], 'unR_Count': [], 'unR_Area_Mean': [], 'unR_Area_Std': [], 'unR_Aspect_Ratio_Mean': [], 'unR_Aspect_Ratio_Std': [],
        'dR_Count': [], 'dR_Area_Mean': [], 'dR_Area_Std': [], 'dR_Aspect_Ratio_Mean': [], 'dR_Aspect_Ratio_Std': [],
        'unC_Count': [], 'unC_Area_Mean': [], 'unC_Area_Std': [], 'unC_Aspect_Ratio_Mean': [], 'unC_Aspect_Ratio_Std': [],
        'dC_Count': [], 'dC_Area_Mean': [], 'dC_Area_Std': [], 'dC_Aspect_Ratio_Mean': [], 'dC_Aspect_Ratio_Std': []}

In [93]:
# !tar -xvf training_data_3.tar

In [94]:
# Path to your directory containing JPG and JSON files
directory_path = './training_data_3'

In [95]:
# Iterate through files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith("json"):
        # Extract index from the filename
        index = int(filename.split("_")[2][:4])

        # Read JSON file
        json_path = os.path.join(directory_path, filename)
        if os.path.exists(json_path):
            with open(json_path) as json_file:
                data_json = json.load(json_file)

            # Initialize counters and accumulators for each class
            class_counts = [0, 0, 0, 0]
            class_areas = [[], [], [], []]
            class_aspect_ratios = [[], [], [], []]
            class_index_dict = {"undamagedresidentialbuilding": [0,"unR"], 
                            "damagedresidentialbuilding": [1, "dR"],
                           "undamagedcommercialbuilding": [2, "unC"], 
                            "damagedcommercialbuilding": [3, "dC"]}

            # Process annotations in the JSON file
            for annotation in data_json["shapes"]:
                class_label = annotation["label"]
                class_index = class_index_dict[class_label][0]
                class_counts[class_index] += 1

                bbox = annotation["points"]
                bbox_area = calculate_bbox_area(bbox)
                class_areas[class_index].append(bbox_area)

                aspect_ratio = calculate_bbox_aspect_ratio(bbox)
                class_aspect_ratios[class_index].append(aspect_ratio)

            # Calculate means and std deviations for each class
            for label in class_index_dict:
                class_index = class_index_dict[label][0]
                class_name = class_index_dict[label][1]
                data[f'{class_name}_Count'].append(class_counts[class_index])
                data[f'{class_name}_Area_Mean'].append(np.average(class_areas[class_index]) if class_counts[class_index] > 0 else np.NaN)
                data[f'{class_name}_Area_Std'].append(np.std(class_areas[class_index]) if class_counts[class_index] > 0 else np.NaN)
                data[f'{class_name}_Aspect_Ratio_Mean'].append(np.average(class_aspect_ratios[class_index]) if class_counts[class_index] > 0 else np.NaN)
                data[f'{class_name}_Aspect_Ratio_Std'].append(np.std(class_aspect_ratios[class_index]) if class_counts[class_index] > 0 else np.NaN)

            data['Index'].append(index)

In [96]:
# Create pandas DataFrame
df = pd.DataFrame(data)

In [97]:
df.head()

Unnamed: 0,Index,unR_Count,unR_Area_Mean,unR_Area_Std,unR_Aspect_Ratio_Mean,unR_Aspect_Ratio_Std,dR_Count,dR_Area_Mean,dR_Area_Std,dR_Aspect_Ratio_Mean,...,unC_Count,unC_Area_Mean,unC_Area_Std,unC_Aspect_Ratio_Mean,unC_Aspect_Ratio_Std,dC_Count,dC_Area_Mean,dC_Area_Std,dC_Aspect_Ratio_Mean,dC_Aspect_Ratio_Std
0,43,7,2604.017857,424.03894,1.09302,0.444314,1,5295.833333,0.0,0.504065,...,0,,,,,0,,,,
1,23,34,2417.087044,927.939689,0.998166,0.243002,5,3243.333333,534.323553,0.841729,...,0,,,,,0,,,,
2,38,0,,,,,0,,,,...,3,29846.821426,5560.950945,2.005627,1.363291,0,,,,
3,8,5,4213.888889,2141.167843,0.948864,0.069079,0,,,,...,4,13662.705411,8475.486953,1.042112,0.312107,0,,,,
4,12,0,,,,,0,,,,...,3,36352.408812,2456.722301,0.755648,0.311375,1,19150.694444,0.0,2.321101,0.0


In [98]:
df.to_csv('./training_data_3_stats_general.csv', index=False)