Here, we augment images as per the previous transforms but this time, we only augment selected images so as to make sure we can have a more balanced class representation in our final dataset.

In [189]:
# !tar -xvf /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_10.tar

In [192]:
# !mv ./training_data_10 /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_10

In [194]:
# !rm /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_10/.*

In [324]:
!ls /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_10/*json | wc -l

299


In [325]:
!rm -rf /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_11

In [326]:
# !pip install -U albumentations

In [327]:
# !pip install labelme

In [328]:
# import the required libraries
import albumentations as A
import cv2
from matplotlib import pyplot as plt
import json
import random
from PIL import Image
import base64
import labelme
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
import shutil
import random
%matplotlib inline

We first generate an overview analytics on our dataset of interest.

In [329]:
# !ls /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_10/.*

In [378]:
# Function to calculate bounding box area
def calculate_bbox_area(bbox):
    return (abs(bbox[0][0] - bbox[1][0])*abs(bbox[0][1] - bbox[1][1]))

In [379]:
# Function to calculate bounding box aspect ratio
def calculate_bbox_aspect_ratio(bbox):
    return (abs(bbox[0][0] - bbox[1][0]) / abs(bbox[0][1] - bbox[1][1]))

In [380]:
# Initialize data for the dataframe
data = {'Index': [], 'unR_Count': [], 'dR_Count': [], 'unC_Count': [], 'dC_Count': []}

In [381]:
# Path to your directory containing JPG and JSON files
directory_path = '/content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_11/'

In [382]:
# Iterate through files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith("json"):
        # Extract index from the filename
        indexstring = filename.split("_")[1]
        index = int(indexstring.split('.')[0])

        # Read JSON file
        json_path = os.path.join(directory_path, filename)
        if os.path.exists(json_path):
            with open(json_path) as json_file:
                data_json = json.load(json_file)

            # Initialize counters and accumulators for each class
            class_counts = [0, 0, 0, 0]
            class_areas = [[], [], [], []]
            class_aspect_ratios = [[], [], [], []]
            class_index_dict = {"undamagedresidentialbuilding": [0,"unR"],
                            "damagedresidentialbuilding": [1, "dR"],
                           "undamagedcommercialbuilding": [2, "unC"],
                            "damagedcommercialbuilding": [3, "dC"]}

            # Process annotations in the JSON file
            for annotation in data_json["shapes"]:
                class_label = annotation["label"]
                class_index = class_index_dict[class_label][0]
                class_counts[class_index] += 1

                bbox = annotation["points"]
                bbox_area = calculate_bbox_area(bbox)
                class_areas[class_index].append(bbox_area)

                aspect_ratio = calculate_bbox_aspect_ratio(bbox)
                class_aspect_ratios[class_index].append(aspect_ratio)

            # Calculate means and std deviations for each class
            for label in class_index_dict:
                class_index = class_index_dict[label][0]
                class_name = class_index_dict[label][1]
                data[f'{class_name}_Count'].append(class_counts[class_index])

            data['Index'].append(index)

In [383]:
# Create pandas DataFrame
df = pd.DataFrame(data)

In [384]:
# count of undamaged residential buildings
sum(df['unR_Count'])

4444

In [385]:
# count of undamaged commercial buildings
sum(df['unC_Count'])

4934

In [386]:
# count of damaged residential buildings
sum(df['dR_Count'])

1093

In [387]:
# count of damaged commercial buildings
sum(df['dC_Count'])

839

Based on the above analysis, we see that we need to augment images on a selective basis due to the inherent class imbalance in the training data. To augment selectively, we first sort our images in terms of priority. We do the sorting in as follows:

In [341]:
df_sorted = df.sort_values(by=['unR_Count', 'dC_Count', 'dR_Count', 'unC_Count'], ascending=[True, False, False, False])

In [342]:
df_sorted.head()

Unnamed: 0,Index,unR_Count,dR_Count,unC_Count,dC_Count
146,65,0,0,8,5
76,228,0,0,4,5
22,150,0,0,7,3
239,2,0,1,2,2
114,177,0,0,7,2


For the next step, one simple way is to divide the above set into 4 halves. For the first half, which is of highest priority, we apply (factor-1 - 1) number of augmentations. For the next half, we apply (factor-2 - 1) number of augmentations. For the next half, we apply (factor-3 - 1) number of augmentations. Finally, we get rid of the final_half altogether. Before applying the augmentation, let's check whether this procedure will indeed give us a class balanced dataset or not. Note that this is just an upper bound of the class counts we will get as all augmentations do not preserve class balance (see: crops for exampke).

In [343]:
# Splitting the DataFrame into four halves
half_size = len(df_sorted) // 4
df_1 = df_sorted.iloc[:half_size]
df_2 = df_sorted.iloc[half_size:2*half_size]
df_3 = df_sorted.iloc[2*half_size:3*half_size]
df_4 = df_sorted.iloc[3*half_size:]

In [344]:
print(df_1.shape, df_2.shape, df_3.shape, df_4.shape)


(74, 5) (74, 5) (74, 5) (77, 5)


In [345]:
# Applying multiplication factors to each half
factor_1 = 7 # 6 augmentations
factor_2 = 5 # 4 augmentations
factor_3 = 2 # 1 augmentations
factor_4 = 1

In [346]:
df_1[['unR_Count', 'unC_Count', 'dC_Count', 'dR_Count']] *= factor_1
df_2[['unR_Count', 'unC_Count', 'dC_Count', 'dR_Count']] *= factor_2
df_3[['unR_Count', 'unC_Count', 'dC_Count', 'dR_Count']] *= factor_3
df_4[['unR_Count', 'unC_Count', 'dC_Count', 'dR_Count']] *= factor_4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1[['unR_Count', 'unC_Count', 'dC_Count', 'dR_Count']] *= factor_1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2[['unR_Count', 'unC_Count', 'dC_Count', 'dR_Count']] *= factor_2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3[['unR_Count', 'unC_Count', 'dC_Count', 'dR_Count']] *= factor_

In [347]:
# Merging the modified DataFrames back together, including the 'Index' column
result_df = pd.concat([df_1, df_2, df_3], axis=0)
result_df.shape

(222, 5)

In [348]:
# comparing the expected class representation before and after augmentation
print("Before Augmentation:", sum(df['unR_Count']), sum(df['unC_Count']), sum(df['dR_Count']), sum(df['dC_Count']))
print("After Augmentation:", sum(result_df['unR_Count']), sum(result_df['unC_Count']), sum(result_df['dR_Count']), sum(result_df['dC_Count']))

Before Augmentation: 4975 1410 620 183
After Augmentation: 4444 4934 1093 839


In [349]:
df_1['Index'].tolist()[0]

65

In [350]:
df_1.head()

Unnamed: 0,Index,unR_Count,dR_Count,unC_Count,dC_Count
146,65,0,0,56,35
76,228,0,0,28,35
22,150,0,0,49,21
239,2,0,7,14,14
114,177,0,0,49,14


In [351]:
# augmentations that will be included
# small rotations
# crops by mike
# random contrast and brightness
# random gamma
# sharpen

In [352]:
# small rotations

transform1 = A.Compose([
    A.SafeRotate(limit=5, p=1, border_mode=cv2.BORDER_CONSTANT, value=0),
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['category_ids']))

In [353]:
# random contrast and brightness

transform2 = A.Compose([
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=1),
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['category_ids']))


In [354]:
# random gamma

transform3 = A.Compose([
    A.RandomGamma(gamma_limit=(120,150), p=1),
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['category_ids']))

In [356]:
# sharpen

transform4 = A.Compose([
    A.Sharpen(alpha=(0.2, 0.5), lightness=(0.5, 1.0), p=1),
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['category_ids']))

In [357]:
# crops by mike

transform5 = A.Compose([
        A.RandomCrop(width=256, height=256),
        A.PadIfNeeded(min_height=512, min_width=512, border_mode=cv2.BORDER_CONSTANT, value=(0, 0, 0))],
        bbox_params=A.BboxParams(format='pascal_voc', label_fields=['category_ids'])
    )

In [358]:
transformations_df1 = [transform1, transform2, transform2, transform3, transform3, transform4]
transformations_df2 = [transform1, transform2, transform3, transform4]
transformations_df3 = [transform1, transform2, transform3, transform4] # we will randomly choose one for this purpose

In [359]:
category_id_dict = {
    "undamagedresidentialbuilding": 0,
    "damagedresidentialbuilding": 1,
    "undamagedcommercialbuilding": 2,
    "damagedcommercialbuilding": 3
}
category_id_to_name = {value: key for key, value in category_id_dict.items()}

In [360]:
def shapes_element_maker(new_bboxes, new_labels, category_id_to_name):
    shapes = []
    for bbox, category_id in zip(new_bboxes, new_labels):
        subdict = {
            "label": category_id_to_name[category_id],
            "points": [[bbox[0], bbox[1]],[bbox[2], bbox[3]]],
            "group_id": None,
            "description": "",
            "shape_type": "rectangle",
            "flags": {},
            "mask": None
        }
        shapes.append(subdict)
    return shapes

In [361]:
def process_json_file(filename):
  # takes as a json file as input and returns two lists
  # list 1 has the coordinates of all bounding boxes
  # list 2 has the category id for each bounding box in list 1
  with open(filename) as json_file:
    data_json = json.load(json_file)
  bboxes = []
  category_ids = []
  for annotation in data_json["shapes"]:
    class_label = annotation["label"]
    category_ids.append(category_id_dict[class_label])

    bbox = annotation["points"]
    x_min, x_max = min([bbox[0][0], bbox[1][0]]), max([bbox[0][0], bbox[1][0]])
    y_min, y_max = min([bbox[0][1], bbox[1][1]]), max([bbox[0][1], bbox[1][1]])
    bboxes.append([x_min, y_min, x_max, y_max])

  return bboxes, category_ids

In [362]:
def transform_annotations(original_json_file, transformed_json_file, new_bboxes, new_labels, new_img_filename, new_img_data):
    # Step 1: Read the original JSON file
    with open(original_json_file, 'r') as f:
        data = json.load(f)

    data['imagePath'] = new_img_filename
    data['imageData'] = new_img_data

    # Step 2: Modify the necessary fields (labels and bounding box coordinates)
    #transformation for when the number of bounding boxes is conserved
    if len(data['shapes']) == len(new_bboxes):
      for idx, annotation in enumerate(data['shapes']):
          # Assuming annotation format is like {'label': 'person', 'bbox': [x, y, width, height]}
          label = annotation['label']
          bbox = annotation['points']

          # Apply your transformation function to get the new bbox coordinates
          new_bbox = [[new_bboxes[idx][0], new_bboxes[idx][1]],[new_bboxes[idx][2], new_bboxes[idx][3]]]
          new_label = category_id_to_name[new_labels[idx]]

          # Update the annotation with new values
          annotation['label'] = new_label # Implement get_new_label function
          annotation['points'] = new_bbox

      # Step 3: Save the modified data into a new JSON file
      with open(transformed_json_file, 'w') as f:
          json.dump(data, f, indent=2)

    else:
      #transformation for when the number of bounding boxes is not conserved
      data['shapes'] = shapes_element_maker(new_bboxes, new_labels, category_id_to_name)
      with open(transformed_json_file, 'w') as f:
        json.dump(data, f, indent=2)

In [363]:
def process_json_indices(source_dir, outdir, indices, prefix, counter, transformations_df, random_pick):
  for i in tqdm(indices):
    source_json = f"{prefix}_{i}.json"
    source_img = f"{prefix}_{i}.jpg"
    if os.path.exists(os.path.join(source_dir, source_json)):
      out_img_file = prefix + "_" + str(counter) + ".jpg"
      out_json_file = prefix + "_" + str(counter) + ".json"
      shutil.copy2(os.path.join(source_dir, source_img), os.path.join(outdir, out_img_file))
      shutil.copy2(os.path.join(source_dir, source_json), os.path.join(outdir, out_json_file))
      counter += 1
      in_image_data = labelme.LabelFile.load_image_file(os.path.join(source_dir, source_img))
      in_image_data = base64.b64encode(in_image_data).decode('utf-8')
      bboxes, category_ids = process_json_file(os.path.join(source_dir, source_json))
      if random_pick:
        transformations = [random.choice(transformations_df)]
      else:
        transformations = transformations_df
      for idx, transform in enumerate(transformations):
        in_image = cv2.imread(os.path.join(source_dir, source_img))
        in_image = cv2.cvtColor(in_image, cv2.COLOR_BGR2RGB)
        transformed = transform(image=in_image, bboxes=bboxes, category_ids=category_ids)
        out_image = Image.fromarray(transformed['image'])
        out_img_file = prefix + "_" + str(counter) + ".jpg"
        out_image.save(os.path.join(outdir, out_img_file))
        out_image_data = labelme.LabelFile.load_image_file(os.path.join(outdir, out_img_file))
        out_image_data = base64.b64encode(out_image_data).decode('utf-8')
        out_json_file = prefix + "_" + str(counter) + ".json"
        transform_annotations(
            os.path.join(source_dir, source_json),
            os.path.join(outdir, out_json_file),
            transformed['bboxes'],
            transformed['category_ids'],
            os.path.join(source_dir, out_img_file),
            out_image_data
           )
        counter += 1
  print("Augmentation complete.", counter-1, " images were finally generated.")

In [364]:
BOX_COLOR = (255, 0, 0) # Red
TEXT_COLOR = (255, 255, 255) # White


def visualize_bbox(img, bbox, class_name, color=BOX_COLOR, thickness=2):
    """Visualizes a single bounding box on the image"""
    x_min, y_min, x_max, y_max = bbox
    x_min, x_max, y_min, y_max = int(x_min), int(x_max), int(y_min), int(y_max)

    cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color=color, thickness=thickness)

    ((text_width, text_height), _) = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.35, 1)
    cv2.rectangle(img, (x_min, y_min - int(1.3 * text_height)), (x_min + text_width, y_min), BOX_COLOR, -1)
    cv2.putText(
        img,
        text=class_name,
        org=(x_min, y_min - int(0.3 * text_height)),
        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        fontScale=0.35,
        color=TEXT_COLOR,
        lineType=cv2.LINE_AA,
    )
    return img


def visualize(image, bboxes, category_ids, category_id_to_name):
    img = image.copy()
    for bbox, category_id in zip(bboxes, category_ids):
        class_name = category_id_to_name[category_id]
        img = visualize_bbox(img, bbox, class_name)
    plt.figure(figsize=(12, 12))
    plt.axis('off')
    plt.imshow(img)

In [365]:
# image = cv2.imread(os.path.join(source_dir, 'Image_1.jpg'))
# image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# bboxes, category_ids = process_json_file(os.path.join(source_dir, 'Image_1.json'))

In [366]:
source_dir = '/content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_10'
output_dir = '/content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_11'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [367]:
# augmenting df1
process_json_indices(source_dir,
                     output_dir,
                     df_1['Index'].tolist(),
                     "Image",
                     1,
                     transformations_df1,
                     False)

100%|██████████| 74/74 [00:26<00:00,  2.79it/s]

Augmentation complete. 518  images were finally generated.





In [368]:
# !ls /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_10/*json | wc -l

In [370]:
# !ls /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_11/*json | wc -l

In [371]:
# augmenting df2
process_json_indices(source_dir,
                     output_dir,
                     df_2['Index'].tolist(),
                     "Image",
                     519,
                     transformations_df2,
                     False)

100%|██████████| 74/74 [00:18<00:00,  3.96it/s]

Augmentation complete. 888  images were finally generated.





In [372]:
# !ls /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_10/*json | wc -l

In [374]:
# !ls /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_11/*json | wc -l

In [375]:
# augmenting df3
process_json_indices(source_dir,
                     output_dir,
                     df_3['Index'].tolist(),
                     "Image",
                     889,
                     transformations_df3,
                     True)

100%|██████████| 74/74 [00:08<00:00,  8.86it/s]

Augmentation complete. 1036  images were finally generated.





In [377]:
# !tar -cvf /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_11.tar /content/drive/MyDrive/EYOpenScienceDataChallenge/code/data_augmentation/training_data_11