In [0]:
import json
import shutil
import os
from pyspark.sql.types import StringType, ArrayType, FloatType, IntegerType
from pyspark.sql.functions import udf, col, explode, size, lit, split, collect_list
import cv2
from tqdm import tqdm
import csv

Load the COCO annotation JSON files for the giraffe, hyena, and leopard datasets, and convert their bounding boxes to YOLOv5 format.

In [0]:
# Load the new zebra and giraffe annotations dataset.
# The file is read once and both tables are derived from that single read, instead of
# calling spark.read.json (and its schema inference) twice on the same path.
zebra_giraffe_raw = spark.read.json('/FileStore/tables/updated_giraffe_annotations.json')

# One row per entry of the 'annotations' array, with the annotation fields as columns
zebra_giraffe_bbox = (
    zebra_giraffe_raw
    .select(explode('annotations').alias('annotation'))
    .select('annotation.*')
)

# One row per entry of the 'images' array, with the image fields as columns
zebra_giraffe_img_coords = (
    zebra_giraffe_raw
    .select(explode('images').alias('image'))
    .select('image.*')
)

In [0]:
# Preview the per-image rows (file name, height, width, capture metadata) extracted from the 'images' array
display(zebra_giraffe_img_coords)

coco_url,date_captured,file_name,flickr_url,gps_lat_captured,gps_lon_captured,height,id,license,photographer,uuid,width
,2015-03-01 14:53:46,000000000001.jpg,,-1.351341,36.800374,2000,1,3,"NNP GZC Car '10WHITE', Person 'A', Image 0005",826fb775-2f99-a8cf-7120-cba60562e82f,3000
,2015-03-01 14:53:46,000000000002.jpg,,-1.351341,36.800374,2000,2,3,"NNP GZC Car '10WHITE', Person 'A', Image 0006",26a32203-4923-723c-d1fb-d0e781010ee9,3000
,2015-03-01 14:53:52,000000000003.jpg,,-1.351341,36.800374,2000,3,3,"NNP GZC Car '10WHITE', Person 'A', Image 0007",a35e6b20-1cc5-c958-8e11-d46065e9062d,3000
,2015-03-01 14:53:58,000000000004.jpg,,-1.351341,36.800374,2000,4,3,"NNP GZC Car '10WHITE', Person 'A', Image 0008",5f369ecf-b1ac-eae3-4b23-e386786672b9,3000
,2015-03-01 15:02:32,000000000005.jpg,,-1.367088,36.781978,2000,5,3,"NNP GZC Car '10WHITE', Person 'A', Image 0010",981d8673-6440-3c95-2769-d1207033ffd0,3000
,2015-03-01 15:02:51,000000000006.jpg,,-1.367071,36.781993,2000,6,3,"NNP GZC Car '10WHITE', Person 'A', Image 0011",18cebee0-2802-e282-56be-ca9a6ee1e5ae,3000
,2015-03-01 15:03:02,000000000007.jpg,,-1.367071,36.781993,2000,7,3,"NNP GZC Car '10WHITE', Person 'A', Image 0012",f74791df-4fad-b1bb-f722-98925046db7e,3000
,2015-03-01 15:20:14,000000000008.jpg,,-1.373086,36.794396,2000,8,3,"NNP GZC Car '10WHITE', Person 'A', Image 0036",c669e14e-fe7e-b08e-3b79-d5f55fd7710a,3000
,2015-03-01 15:52:38,000000000009.jpg,,-1.378479,36.814727,2000,9,3,"NNP GZC Car '10WHITE', Person 'A', Image 0061",8f4f59d2-4874-f18f-7ff1-38dc39b54f66,3000
,2015-03-01 15:53:05,000000000010.jpg,,-1.378479,36.814727,2000,10,3,"NNP GZC Car '10WHITE', Person 'A', Image 0064",488aa8ac-4518-d502-d014-aa6c29e9ad06,3000


In [0]:
# Pair every annotation with the metadata of the image it belongs to (image_id == id)
zg_annotations = zebra_giraffe_bbox.select('image_id', 'category_id', 'bbox')
zg_images = zebra_giraffe_img_coords.select('id', 'file_name', 'height', 'width')
img_df = zg_annotations.join(
    zg_images,
    on=zebra_giraffe_bbox.image_id == zebra_giraffe_img_coords.id,
    how='inner',
)

# Category 0 marks giraffe annotations in this dataset; keep only those rows
giraffe_df = img_df.where(col('category_id') == 0)

In [0]:
# Inspect the giraffe-only rows (category_id == 0) to verify that the join and filter
# produced the expected columns: annotation fields followed by the image metadata
display(giraffe_df)

image_id,category_id,bbox,id,file_name,height,width
5,0,"List(1568.5, 942.5, 450.0, 462.5)",5,000000000005.jpg,2000,3000
6,0,"List(1924.0, 737.0, 548.5, 475.5)",6,000000000006.jpg,2000,3000
7,0,"List(1242.5, 942.5, 462.5, 394.0)",7,000000000007.jpg,2000,3000
44,0,"List(1310.9375, 470.3125, 946.875, 976.5625)",44,000000000044.jpg,2000,3000
45,0,"List(1018.75, 300.0, 1015.625, 1035.9375)",45,000000000045.jpg,2000,3000
46,0,"List(971.875, 295.3125, 971.875, 1070.3125)",46,000000000046.jpg,2000,3000
47,0,"List(1532.8125, 450.0, 1448.4375, 1542.1875)",47,000000000047.jpg,2000,3000
48,0,"List(1490.625, 585.9375, 1490.625, 1414.0625)",48,000000000048.jpg,2000,3000
49,0,"List(1117.1875, 217.1875, 1485.9375, 1717.1875)",49,000000000049.jpg,2000,3000
61,0,"List(740.625, 543.75, 1315.625, 1250.0)",61,000000000061.jpg,2000,3000


In [0]:
# Load the hyena annotations. The file is read once and both the annotation table and the
# image table are derived from that single read (the bounding boxes are converted to
# Yolov format in later cells, not here).
hyena_raw = spark.read.json('/FileStore/tables/hyena_annotations.json')
hyena_bbox = (
    hyena_raw
    .select(explode('annotations').alias('annotation'))
    .select('annotation.*')
)
hyena_img_coords = (
    hyena_raw
    .select(explode('images').alias('image'))
    .select('image.*')
)

# Join each annotation to its image metadata and overwrite category_id with 1,
# the class id used for hyenas in this notebook
hyena_df = (
    hyena_bbox.select('image_id', 'category_id', 'bbox')
    .join(
        hyena_img_coords.select('id', 'file_name', 'height', 'width'),
        on=hyena_bbox.image_id == hyena_img_coords.id,
        how='inner',
    )
    .withColumn('category_id', lit(1))
)

# Check if the rows were successfully processed
hyena_df.show()

In [0]:
# Load the leopard annotations. The file is read once and both the annotation table and the
# image table are derived from that single read (the bounding boxes are converted to
# Yolov format in later cells, not here).
leopard_raw = spark.read.json('/FileStore/tables/leopard_annotations.json')
leopard_bbox = (
    leopard_raw
    .select(explode('annotations').alias('annotation'))
    .select('annotation.*')
)
leopard_img_coords = (
    leopard_raw
    .select(explode('images').alias('image'))
    .select('image.*')
)

# Join each annotation to its image metadata and overwrite category_id with 2,
# the class id used for leopards in this notebook
leopard_df = (
    leopard_bbox.select('image_id', 'category_id', 'bbox')
    .join(
        leopard_img_coords.select('id', 'file_name', 'height', 'width'),
        on=leopard_bbox.image_id == leopard_img_coords.id,
        how='inner',
    )
    .withColumn('category_id', lit(2))
)

# Check if the rows were successfully processed
leopard_df.show()

In [0]:
# Stack the giraffe, hyena, and leopard rows into a single dataframe.
# union() matches columns by position, not by name; the three frames are built with the same
# select order (image_id, category_id, bbox, id, file_name, height, width), so they line up.
# The train/validation/test assignment is not done here; it happens later in yolo_fields.
union_df = giraffe_df.union(hyena_df).union(leopard_df)
union_df.show()

In [0]:
def resize_coco_bx(bbox, height, width, target_size=640):
    """Rescale a COCO bounding box to an image resized to target_size x target_size.

    The x offset and box width are multiplied by ``target_size / width``; the y offset
    and box height are multiplied by ``target_size / height``.

    Parameters
    ----------
    bbox : sequence of 4 numbers
        Bounding box as [x, y, w, h], in pixels of the original image.
    height : number
        Height of the original image in pixels.
    width : number
        Width of the original image in pixels.
    target_size : number, optional
        Side length of the resized square image. Defaults to 640, the size used
        for Yolov5 in this notebook.

    Returns
    -------
    list[float]
        The rescaled box as [x, y, w, h], in pixels of the resized image.

    Raises
    ------
    ZeroDivisionError
        If ``height`` or ``width`` is 0.
    """
    # Horizontal quantities share one scale factor, vertical quantities the other
    x_scale = target_size / width
    y_scale = target_size / height

    x0, y0, w0, h0 = bbox

    return [x0 * x_scale, y0 * y_scale, w0 * x_scale, h0 * y_scale]

# Create a UDF to scale the bounding boxes to the image size used to train Yolov.
# The return type is declared explicitly: udf() defaults to StringType, in which case the
# list cast below would only produce floats when Spark happens to evaluate the two Python
# UDFs back to back, without materializing the intermediate string column.
coco_yolov_resize_udf = udf(resize_coco_bx, ArrayType(FloatType()))

# Apply the UDF to the unioned dataset
union_df = union_df.withColumn('resized_coco_bx', coco_yolov_resize_udf(union_df.bbox, union_df.height, union_df.width))

# Create a custom UDF to cast lists as arrays of floats.
# It is still applied to resized_coco_bx so the resulting column is unchanged from before.
udf_array = udf(lambda row: list(row), ArrayType(FloatType()))
union_df = union_df.withColumn('resized_coco_bx', udf_array(union_df.resized_coco_bx))

In [0]:
# Inspect the unioned rows together with the rescaled bounding boxes.
# NOTE(review): the saved output of this cell also lists yolov_bb and yolovbbformat, which are
# only added by the following cell, so this cell appears to have last been run after it.
display(union_df)

image_id,category_id,bbox,id,file_name,height,width,resized_coco_bx,yolov_bb,yolovbbformat
5,0,"List(1568.5, 942.5, 450.0, 462.5)",5,000000000005.jpg,2000,3000,"List(334.61334, 301.6, 96.0, 148.0)","[0.5978333333333333, 0.586875, 0.15000000000000002, 0.23125]","List(0.59783334, 0.586875, 0.15, 0.23125)"
6,0,"List(1924.0, 737.0, 548.5, 475.5)",6,000000000006.jpg,2000,3000,"List(410.45334, 235.84, 117.013336, 152.16)","[0.7327500000000001, 0.48737500000000006, 0.18283333333333335, 0.23775000000000002]","List(0.73275, 0.487375, 0.18283333, 0.23775)"
7,0,"List(1242.5, 942.5, 462.5, 394.0)",7,000000000007.jpg,2000,3000,"List(265.06668, 301.6, 98.666664, 126.08)","[0.49124999999999996, 0.5697500000000001, 0.15416666666666667, 0.197]","List(0.49125, 0.56975, 0.15416667, 0.197)"
44,0,"List(1310.9375, 470.3125, 946.875, 976.5625)",44,000000000044.jpg,2000,3000,"List(279.66666, 150.5, 202.0, 312.5)","[0.5947916666666667, 0.479296875, 0.31562500000000004, 0.48828125]","List(0.59479165, 0.47929686, 0.315625, 0.48828125)"
45,0,"List(1018.75, 300.0, 1015.625, 1035.9375)",45,000000000045.jpg,2000,3000,"List(217.33333, 96.0, 216.66667, 331.5)","[0.5088541666666667, 0.408984375, 0.33854166666666674, 0.51796875]","List(0.50885415, 0.40898436, 0.33854166, 0.5179688)"
46,0,"List(971.875, 295.3125, 971.875, 1070.3125)",46,000000000046.jpg,2000,3000,"List(207.33333, 94.5, 207.33333, 342.5)","[0.4859375, 0.41523437500000004, 0.32395833333333335, 0.53515625]","List(0.4859375, 0.4152344, 0.32395834, 0.53515625)"
47,0,"List(1532.8125, 450.0, 1448.4375, 1542.1875)",47,000000000047.jpg,2000,3000,"List(327.0, 144.0, 309.0, 493.5)","[0.7523437500000001, 0.610546875, 0.48281250000000003, 0.77109375]","List(0.7523438, 0.6105469, 0.4828125, 0.7710937)"
48,0,"List(1490.625, 585.9375, 1490.625, 1414.0625)",48,000000000048.jpg,2000,3000,"List(318.0, 187.5, 318.0, 452.5)","[0.7453125, 0.646484375, 0.496875, 0.70703125]","List(0.7453125, 0.6464844, 0.496875, 0.70703125)"
49,0,"List(1117.1875, 217.1875, 1485.9375, 1717.1875)",49,000000000049.jpg,2000,3000,"List(238.33333, 69.5, 317.0, 549.5)","[0.6200520833333334, 0.5378906250000001, 0.49531250000000004, 0.85859375]","List(0.6200521, 0.5378906, 0.4953125, 0.85859376)"
61,0,"List(740.625, 543.75, 1315.625, 1250.0)",61,000000000061.jpg,2000,3000,"List(158.0, 174.0, 280.66666, 400.0)","[0.4661458333333334, 0.584375, 0.4385416666666667, 0.625]","List(0.46614584, 0.584375, 0.43854168, 0.625)"


In [0]:
# Adapt code from https://haobin-tan.netlify.app/ai/computer-vision/object-detection/coco-json-to-yolo-txt/ to develop a field which represents the bounding box coordinates in Yolov format
def convert_bbox_coco2yolo(bbox, img_width=640, img_height=640):
    """
    Convert bounding box from COCO format to YOLO format

    Parameters
    ----------
    bbox : list[float]
        bounding box annotation in COCO format:
        [top left x position, top left y position, width, height]
    img_width : int, optional
        width of the image the box coordinates refer to (default 640)
    img_height : int, optional
        height of the image the box coordinates refer to (default 640)

    Returns
    -------
    list[float]
        bounding box annotation in YOLO format:
        [x_center_rel, y_center_rel, width_rel, height_rel]

    Raises
    ------
    ZeroDivisionError
        if img_width or img_height is 0
    """

    # YOLO bounding box format: [x_center, y_center, width, height]
    # (float values relative to width and height of image)
    x1, y1, w1, h1 = bbox

    # Normalization factors for the horizontal and vertical axes
    dw = 1.0 / img_width
    dh = 1.0 / img_height

    # COCO stores the top-left corner; YOLO expects the box center
    x_center = x1 + w1 / 2.0
    y_center = y1 + h1 / 2.0

    return [x_center * dw, y_center * dh, w1 * dw, h1 * dh]

# Wrap the COCO -> Yolov conversion as a Spark UDF.
# NOTE(review): no return type is declared here, so yolov_bb is a string column (see the
# displayed output); the float array used downstream is yolovbbformat.
convertbb_udf = udf(convert_bbox_coco2yolo)

# Derive both Yolov columns in one chain:
#   yolov_bb      - conversion of the boxes already rescaled to 640x640 in the previous step
#   yolovbbformat - the same values cast to an array of floats
union_df = (
    union_df
    .withColumn('yolov_bb', convertbb_udf(col('resized_coco_bx')))
    .withColumn('yolovbbformat', udf_array(col('yolov_bb')))
)

In [0]:
# One row per (category, image, file): gather every Yolov box of that image into a single list column
reduced_df = (
    union_df
    .groupBy('category_id', 'image_id', 'file_name')
    .agg(collect_list('yolovbbformat').alias('yolov_bxs'))
)

In [0]:
# Inspect the collapsed rows: yolov_bxs holds the list of Yolov boxes collected for each image
display(reduced_df)

category_id,image_id,file_name,yolov_bxs
0,5,000000000005.jpg,"List(List(0.59783334, 0.586875, 0.15, 0.23125))"
0,6,000000000006.jpg,"List(List(0.73275, 0.487375, 0.18283333, 0.23775))"
0,7,000000000007.jpg,"List(List(0.49125, 0.56975, 0.15416667, 0.197))"
0,44,000000000044.jpg,"List(List(0.59479165, 0.47929686, 0.315625, 0.48828125))"
0,45,000000000045.jpg,"List(List(0.50885415, 0.40898436, 0.33854166, 0.5179688))"
0,46,000000000046.jpg,"List(List(0.4859375, 0.4152344, 0.32395834, 0.53515625))"
0,47,000000000047.jpg,"List(List(0.7523438, 0.6105469, 0.4828125, 0.7710937))"
0,48,000000000048.jpg,"List(List(0.7453125, 0.6464844, 0.496875, 0.70703125))"
0,49,000000000049.jpg,"List(List(0.6200521, 0.5378906, 0.4953125, 0.85859376))"
0,61,000000000061.jpg,"List(List(0.46614584, 0.584375, 0.43854168, 0.625))"


In [0]:
# List the file names found in each species' train / val / test folder.
# These listings are consulted later to decide which split an image belongs to.
hyena_train_imgs, hyena_val_imgs, hyena_test_imgs = map(os.listdir, (
    '/dbfs/mnt/w210_capstone/hyena.coco/images/output/train/train2022/',
    '/dbfs/mnt/w210_capstone/hyena.coco/images/output/val/train2022/',
    '/dbfs/mnt/w210_capstone/hyena.coco/images/output/test/train2022/',
))

leopard_train_imgs, leopard_val_imgs, leopard_test_imgs = map(os.listdir, (
    '/dbfs/mnt/w210_capstone/leopard.coco/images/output/train/train2022/',
    '/dbfs/mnt/w210_capstone/leopard.coco/images/output/val/train2022/',
    '/dbfs/mnt/w210_capstone/leopard.coco/images/output/test/train2022/',
))

giraffe_train_imgs, giraffe_val_imgs, giraffe_test_imgs = map(os.listdir, (
    '/dbfs/mnt/w210_capstone/great_zebra_giraffe/train/',
    '/dbfs/mnt/w210_capstone/great_zebra_giraffe/val/',
    '/dbfs/mnt/w210_capstone/great_zebra_giraffe/test/',
))

In [0]:
# Create the Yolov5 images/ and labels/ folders for the training, validation, and test splits.
# os.makedirs also creates the missing parent folders (customdata, images, labels), and
# exist_ok=True lets this cell be re-run without failing once the folders already exist.
items = ['train', 'val', 'test']
for f in items:
    os.makedirs(os.path.join('/dbfs/mnt/w210_capstone/yolov5/customdata/images', f), exist_ok=True)
    os.makedirs(os.path.join('/dbfs/mnt/w210_capstone/yolov5/customdata/labels', f), exist_ok=True)

In [0]:
def _yolo_split_for(item, val_imgs, test_imgs):
    """Return 'val' or 'test' if ``item`` is in that listing, otherwise 'train'."""
    if item in val_imgs:
        return 'val'
    if item in test_imgs:
        return 'test'
    return 'train'


def yolo_fields(image_id, file_name, category_id, yolov_bxs):
    """Write the resized image and the Yolov label file for one image.

    The image is read from its species-specific source folder, resized to 640x640 and
    written under ``customdata/images/<split>/``; the matching label file, with one
    ``category_id x y w h`` line per bounding box, is written under
    ``customdata/labels/<split>/``. The split is 'val' or 'test' when
    ``str(image_id) + '.jpg'`` appears in the species' val/test listing, else 'train'.

    Parameters
    ----------
    image_id : int
        Image identifier; used to build the output file names.
    file_name : str
        Source file name, used to locate hyena and leopard images.
    category_id : int
        0 selects the giraffe paths, 1 the hyena paths; any other value is
        treated as leopard. The value itself is written as the class id.
    yolov_bxs : list[list[float]]
        Yolov-format boxes ([x_center, y_center, width, height]) of this image.

    Returns
    -------
    str
        The constant status string "Converted to Yolo".

    Raises
    ------
    OSError
        If the source image cannot be read by OpenCV.
    """
    # Name used for the split lookup and for the copied image.
    # NOTE(review): this is '<image_id>.jpg', while the file_name values shown earlier in the
    # notebook are zero-padded (e.g. 000000000005.jpg) - confirm the split folders use this form.
    item = str(image_id) + '.jpg'

    if category_id == 0:
        prefix = 'giraffe_'
        split = _yolo_split_for(item, giraffe_val_imgs, giraffe_test_imgs)
        # Giraffe images are read from the folder of their own split, by '<image_id>.jpg'
        old_path = os.path.join('/dbfs/mnt/w210_capstone/great_zebra_giraffe/' + split, item)
    elif category_id == 1:
        prefix = 'hyena_'
        split = _yolo_split_for(item, hyena_val_imgs, hyena_test_imgs)
        # Hyena images are always read from the single train2022 folder, by file_name
        old_path = os.path.join('/dbfs/mnt/w210_capstone/hyena.coco/images/train2022', file_name)
    else:
        prefix = 'leopard_'
        split = _yolo_split_for(item, leopard_val_imgs, leopard_test_imgs)
        # Leopard images are always read from the single train2022 folder, by file_name
        old_path = os.path.join('/dbfs/mnt/w210_capstone/leopard.coco/images/train2022', file_name)

    # Destination of the label file and of the resized image
    text_path = '/dbfs/mnt/w210_capstone/yolov5/customdata/labels/' + split + '/' + prefix + str(image_id) + '.txt'
    new_path = '/dbfs/mnt/w210_capstone/yolov5/customdata/images/' + split + '/' + prefix + item

    # One label row per box: the class id followed by the coordinates rounded to 6 decimals
    row_val = [[category_id] + [round(y, 6) for y in line] for line in yolov_bxs]

    # Load the source image, resize it to the Yolov input size and save it
    im = cv2.imread(old_path)
    if im is None:
        # cv2.imread returns None instead of raising when the file is missing or unreadable
        raise OSError('Could not read image: ' + old_path)
    cv2.imwrite(new_path, cv2.resize(im, (640, 640)))

    # Write the label file as space-delimited rows
    with open(text_path, 'w', newline="") as x:
        csv.writer(x, delimiter=" ").writerows(row_val)

    return "Converted to Yolo"

In [0]:
# Wrap yolo_fields as a Spark UDF and record its status string for every image row.
# The UDF's work (image copy + label file) only happens when an action evaluates this column.
yolov_udf = udf(yolo_fields)
reduced_df = reduced_df.withColumn(
    'yolov_converted',
    yolov_udf(col('image_id'), col('file_name'), col('category_id'), col('yolov_bxs')),
)

In [0]:
# Trigger the conversion for every image.
# reduced_df.show() only evaluates as many rows as it needs to print the first 20, so the
# side-effecting yolo_fields UDF would not run for the rest of the dataset. Grouping on the
# UDF's result forces it to be computed for every row and prints a count per status string.
reduced_df.groupBy('yolov_converted').count().show()