In [None]:
# Databricks notebook source
import os
from pyspark.sql.functions import *
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql.types import IntegerType,FloatType
from pyspark.sql.functions import udf, col

In [None]:
# Split the hyena images 80/10/10 into the output folder, using a fixed seed
splitfolders.ratio(
    "/dbfs/mnt/w210_capstone/hyena.coco/images",
    output="/dbfs/mnt/w210_capstone/hyena.coco/images/output",
    seed=1337,
    ratio=(0.8, 0.1, 0.1),
)

In [None]:
# Split the leopard images 80/10/10 into the output folder, using a fixed seed
splitfolders.ratio(
    "/dbfs/mnt/w210_capstone/leopard.coco/images",
    output="/dbfs/mnt/w210_capstone/leopard.coco/images/output",
    seed=1337,
    ratio=(0.8, 0.1, 0.1),
)

In [None]:
def hyena_leopard_rdds():
    """Build one RDD of ``.jpg`` file paths per species and split.

    Each split folder is walked recursively and the collected paths are
    parallelized with the notebook's ``sc``.

    Returns:
        Six RDDs, in the order hyena train, hyena val, hyena test,
        leopard train, leopard val, leopard test.
    """

    def jpg_paths(folder):
        # Every file below `folder` whose name ends in ".jpg", as a full path
        return [
            os.path.join(root, file_name)
            for root, _subdirs, file_names in os.walk(folder)
            for file_name in file_names
            if file_name.endswith(".jpg")
        ]

    # Order matters: it defines the order of the returned RDDs
    split_folders = (
        "/dbfs/mnt/w210_capstone/hyena.coco/images/output/train/train2022",
        "/dbfs/mnt/w210_capstone/hyena.coco/images/output/val/train2022",
        "/dbfs/mnt/w210_capstone/hyena.coco/images/output/test/train2022",
        "/dbfs/mnt/w210_capstone/leopard.coco/images/output/train/train2022",
        "/dbfs/mnt/w210_capstone/leopard.coco/images/output/val/train2022",
        "/dbfs/mnt/w210_capstone/leopard.coco/images/output/test/train2022",
    )

    return tuple(sc.parallelize(jpg_paths(folder)) for folder in split_folders)

In [None]:
# One RDD of image paths per species and split
(
    hyena_train,
    hyena_val,
    hyena_test,
    leopard_train,
    leopard_val,
    leopard_test,
) = hyena_leopard_rdds()

In [None]:
# Preview a few of the collected hyena training image paths
# (the previous cell content referenced `hyena_img`, which is never defined)
hyena_train.take(5)

In [None]:
def remove_img_leadingzero(image):
    """Rename an image file so its name no longer starts with zeros.

    Used on the hyena and leopard train, validation, and test sets.

    Args:
        image: Path to the image file.

    Returns:
        The path of the file after the call. This equals ``image`` when the
        name has no leading zeros and nothing was renamed.

    Raises:
        FileExistsError: If another file already has the target name;
            ``os.rename`` would otherwise silently replace it.
    """
    directory, file_name = os.path.split(image)
    stem, extension = os.path.splitext(file_name)

    # Keep one digit for an all-zero name ("000.jpg" -> "0.jpg", not ".jpg")
    stripped_stem = stem.lstrip('0') or stem[-1:]
    new_file_name = stripped_stem + extension

    # Nothing to strip: leave the file untouched
    if new_file_name == file_name:
        return image

    # os.path.join keeps a bare filename relative instead of prefixing "/"
    new_name = os.path.join(directory, new_file_name)
    if os.path.exists(new_name):
        raise FileExistsError(
            "Cannot rename {} to {}: target already exists".format(image, new_name)
        )

    os.rename(image, new_name)
    return new_name

In [None]:
# Lazily map the rename over each hyena split; nothing runs until an action is called
hyena_train_img, hyena_val_img, hyena_test_img = (
    split_rdd.map(remove_img_leadingzero)
    for split_rdd in (hyena_train, hyena_val, hyena_test)
)

In [None]:
# Lazily map the rename over each leopard split; nothing runs until an action is called
leopard_train_img, leopard_val_img, leopard_test_img = (
    split_rdd.map(remove_img_leadingzero)
    for split_rdd in (leopard_train, leopard_val, leopard_test)
)

In [None]:
# collect() is the action that actually executes the renames for each split
for label, renamed_rdd in (
    ('Renamed hyena train', hyena_train_img),
    ('Renamed hyena val', hyena_val_img),
    ('Renamed hyena test', hyena_test_img),
    ('Renamed leopard train', leopard_train_img),
    ('Renamed leopard val', leopard_val_img),
    ('Renamed leopard test', leopard_test_img),
):
    print(label, renamed_rdd.collect())


### Crop the hyena training, validation, and test images by their bounding boxes and save to a new folder, processed. 

`processed` contains three subfolders: `train`, `val`, and `test`. Each of these subfolders contains one subfolder per unique animal, holding the cropped images of that animal from the corresponding set (train/validation/test). In the example path `/dbfs/mnt/w210_capstone/hyena.coco/processed/train/34365bcc_5b2d_4f80_b721_ef7039e64fe8/67.jpg`, the image shows animal `34365bcc_5b2d_4f80_b721_ef7039e64fe8`, cropped from image 67 of the training dataset.

In [None]:
# Read the hyena annotation file and flatten it to one row per annotation
hyena_annotations_df = (
    spark.read.json('/FileStore/tables/hyena_annotations.json')
    .select(explode('annotations').alias('annotation'))
    .select('annotation.*')
)

In [None]:
# Split the hyena annotations into training, validation, and test based upon the image names in their respective folders
# File names minus their last 4 characters (the ".jpg" suffix) are the image ids
hyena_train_imgs = [x[:-4] for x in os.listdir("/dbfs/mnt/w210_capstone/hyena.coco/images/output/train/train2022")]
# Cast by type name: StringType is never imported explicitly in this notebook,
# so StringType() only works if the star import happens to expose it
hyena_annotations_df = hyena_annotations_df.withColumn('image_id', col('image_id').cast('string'))
hyena_train = hyena_annotations_df.filter((hyena_annotations_df.image_id).isin(hyena_train_imgs))

In [None]:
# Hyena annotations whose image sits in the validation folder
hyena_val_imgs = [
    file_name[:-4]
    for file_name in os.listdir("/dbfs/mnt/w210_capstone/hyena.coco/images/output/val/train2022")
]
hyena_val = hyena_annotations_df.filter(col('image_id').isin(hyena_val_imgs))

In [None]:
# Hyena annotations whose image sits in the test folder
hyena_test_imgs = [
    file_name[:-4]
    for file_name in os.listdir("/dbfs/mnt/w210_capstone/hyena.coco/images/output/test/train2022")
]
hyena_test = hyena_annotations_df.filter(col('image_id').isin(hyena_test_imgs))

In [None]:
# Print the first 20 rows of the hyena training annotations as a sanity check
hyena_train.show(n=20)

In [None]:
# UDF that takes 3 columns and returns a status string telling whether the crop was saved.
def crop_trainBB(image_id, bbox, name):
    """Crop one hyena training image to a bounding box and save the crop.

    Args:
        image_id: Image file name without the ``.jpg`` extension.
        bbox: Sequence ``[x, y, width, height]``; values are truncated to int.
        name: Animal identifier; hyphens are replaced with underscores to
            form the output folder name.

    Returns:
        "SAVED" when the crop was written, otherwise "MISSING_IMAGE",
        "EMPTY_CROP" or "WRITE_FAILED".
    """
    source_dir = '/dbfs/mnt/w210_capstone/hyena.coco/images/output/train/train2022'
    target_root = '/dbfs/mnt/w210_capstone/hyena.coco/processed/train'

    # cv2.imread returns None instead of raising when the file cannot be read
    im = cv2.imread('{}/{}.jpg'.format(source_dir, image_id))
    if im is None:
        return "MISSING_IMAGE"

    # Assign the bounding box values to their parameters and convert to int
    x0, y0, width, height = bbox
    x, y, w, h = int(x0), int(y0), int(width), int(height)

    # Clamp the box to the image; a negative start would wrap around via negative indexing
    img_h, img_w = im.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, img_w), min(y + h, img_h)
    if right <= left or bottom <= top:
        return "EMPTY_CROP"
    cropped = im[top:bottom, left:right]

    # One folder per unique animal; exist_ok avoids a race between parallel tasks
    animal_name = name.replace("-", "_")
    path = '{}/{}'.format(target_root, animal_name)
    os.makedirs(path, exist_ok=True)

    # Write the cropped image to blob storage and report a failed write
    if not cv2.imwrite('{}/{}.jpg'.format(path, image_id), cropped):
        return "WRITE_FAILED"

    return "SAVED"

In [None]:
# Wrap the training crop function as a Spark UDF that returns a string status
crop_train_udf = udf(crop_trainBB, returnType="string")

In [None]:
# Add a "processed" column holding the crop status of each training annotation
hyena_out_train = hyena_train.withColumn(
    "processed",
    crop_train_udf(col("image_id"), col("bbox"), col("name")),
)

In [None]:
# Check if the rows were successfully processed.
# show() on the frame itself only evaluates enough rows to print 20, so the crop
# UDF would be skipped for the rest; aggregating on its output runs it for every row.
hyena_out_train.groupBy("processed").count().show()

In [None]:
# UDF that takes 3 columns and returns a status string telling whether the crop was saved.
def crop_valBB(image_id, bbox, name):
    """Crop one hyena validation image to a bounding box and save the crop.

    Args:
        image_id: Image file name without the ``.jpg`` extension.
        bbox: Sequence ``[x, y, width, height]``; values are truncated to int.
        name: Animal identifier; hyphens are replaced with underscores to
            form the output folder name.

    Returns:
        "SAVED" when the crop was written, otherwise "MISSING_IMAGE",
        "EMPTY_CROP" or "WRITE_FAILED".
    """
    source_dir = '/dbfs/mnt/w210_capstone/hyena.coco/images/output/val/train2022'
    target_root = '/dbfs/mnt/w210_capstone/hyena.coco/processed/val'

    # cv2.imread returns None instead of raising when the file cannot be read
    im = cv2.imread('{}/{}.jpg'.format(source_dir, image_id))
    if im is None:
        return "MISSING_IMAGE"

    # Assign the bounding box values to their parameters and convert to int
    x0, y0, width, height = bbox
    x, y, w, h = int(x0), int(y0), int(width), int(height)

    # Clamp the box to the image; a negative start would wrap around via negative indexing
    img_h, img_w = im.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, img_w), min(y + h, img_h)
    if right <= left or bottom <= top:
        return "EMPTY_CROP"
    cropped = im[top:bottom, left:right]

    # One folder per unique animal; exist_ok avoids a race between parallel tasks
    animal_name = name.replace("-", "_")
    path = '{}/{}'.format(target_root, animal_name)
    os.makedirs(path, exist_ok=True)

    # Write the cropped image to blob storage and report a failed write
    if not cv2.imwrite('{}/{}.jpg'.format(path, image_id), cropped):
        return "WRITE_FAILED"

    return "SAVED"

In [None]:
# Wrap the validation crop function as a Spark UDF that returns a string status
crop_val_udf = udf(crop_valBB, returnType="string")

In [None]:
# Add a "processed" column holding the crop status of each validation annotation
hyena_out_val = hyena_val.withColumn(
    "processed",
    crop_val_udf(col("image_id"), col("bbox"), col("name")),
)

In [None]:
# Check if the rows were successfully processed.
# show() on the frame itself only evaluates enough rows to print 20, so the crop
# UDF would be skipped for the rest; aggregating on its output runs it for every row.
hyena_out_val.groupBy("processed").count().show()

In [None]:
# UDF that takes 3 columns and returns a status string telling whether the crop was saved.
def crop_testBB(image_id, bbox, name):
    """Crop one hyena test image to a bounding box and save the crop.

    Args:
        image_id: Image file name without the ``.jpg`` extension.
        bbox: Sequence ``[x, y, width, height]``; values are truncated to int.
        name: Animal identifier; hyphens are replaced with underscores to
            form the output folder name.

    Returns:
        "SAVED" when the crop was written, otherwise "MISSING_IMAGE",
        "EMPTY_CROP" or "WRITE_FAILED".
    """
    source_dir = '/dbfs/mnt/w210_capstone/hyena.coco/images/output/test/train2022'
    target_root = '/dbfs/mnt/w210_capstone/hyena.coco/processed/test'

    # cv2.imread returns None instead of raising when the file cannot be read
    im = cv2.imread('{}/{}.jpg'.format(source_dir, image_id))
    if im is None:
        return "MISSING_IMAGE"

    # Assign the bounding box values to their parameters and convert to int
    x0, y0, width, height = bbox
    x, y, w, h = int(x0), int(y0), int(width), int(height)

    # Clamp the box to the image; a negative start would wrap around via negative indexing
    img_h, img_w = im.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, img_w), min(y + h, img_h)
    if right <= left or bottom <= top:
        return "EMPTY_CROP"
    cropped = im[top:bottom, left:right]

    # One folder per unique animal; exist_ok avoids a race between parallel tasks
    animal_name = name.replace("-", "_")
    path = '{}/{}'.format(target_root, animal_name)
    os.makedirs(path, exist_ok=True)

    # Write the cropped image to blob storage and report a failed write
    if not cv2.imwrite('{}/{}.jpg'.format(path, image_id), cropped):
        return "WRITE_FAILED"

    return "SAVED"

In [None]:
# Wrap the test crop function as a Spark UDF that returns a string status
crop_test_udf = udf(crop_testBB, returnType="string")

In [None]:
# Add a "processed" column holding the crop status of each test annotation
hyena_out_test = hyena_test.withColumn(
    "processed",
    crop_test_udf(col("image_id"), col("bbox"), col("name")),
)

In [None]:
# Check if the rows were successfully processed.
# show() on the frame itself only evaluates enough rows to print 20, so the crop
# UDF would be skipped for the rest; aggregating on its output runs it for every row.
hyena_out_test.groupBy("processed").count().show()

### Crop the leopard training, validation, and test images by their bounding boxes and save to a new folder, processed. 

`processed` contains three subfolders: `train`, `val`, and `test`. Each of these subfolders contains one subfolder per unique animal, holding the cropped images of that animal from the corresponding set (train/validation/test). In the example path `/dbfs/mnt/w210_capstone/leopard.coco/processed/train/34365bcc_5b2d_4f80_b721_ef7039e64fe8/67.jpg`, the image shows animal `34365bcc_5b2d_4f80_b721_ef7039e64fe8`, cropped from image 67 of the training dataset.

In [None]:
# Read the leopard annotation file and flatten it to one row per annotation
leopard_annotations_df = (
    spark.read.json('/FileStore/tables/leopard_annotations.json')
    .select(explode('annotations').alias('annotation'))
    .select('annotation.*')
)

In [None]:
# Split the leopard annotations into training, validation, and test based upon the image names in their respective folders
# File names minus their last 4 characters (the ".jpg" suffix) are the image ids
leopard_train_imgs = [x[:-4] for x in os.listdir("/dbfs/mnt/w210_capstone/leopard.coco/images/output/train/train2022")]
# Cast by type name: StringType is never imported explicitly in this notebook,
# so StringType() only works if the star import happens to expose it
leopard_annotations_df = leopard_annotations_df.withColumn('image_id', col('image_id').cast('string'))
leopard_train = leopard_annotations_df.filter((leopard_annotations_df.image_id).isin(leopard_train_imgs))

In [None]:
# Leopard annotations whose image sits in the validation folder
leopard_val_imgs = [
    file_name[:-4]
    for file_name in os.listdir("/dbfs/mnt/w210_capstone/leopard.coco/images/output/val/train2022")
]
leopard_val = leopard_annotations_df.filter(col('image_id').isin(leopard_val_imgs))

In [None]:
# Leopard annotations whose image sits in the test folder
leopard_test_imgs = [
    file_name[:-4]
    for file_name in os.listdir("/dbfs/mnt/w210_capstone/leopard.coco/images/output/test/train2022")
]
leopard_test = leopard_annotations_df.filter(col('image_id').isin(leopard_test_imgs))

In [None]:
# UDF that takes 3 columns and returns a status string telling whether the crop was saved.
def leopard_crop_trainBB(image_id, bbox, name):
    """Crop one leopard training image to a bounding box and save the crop.

    Args:
        image_id: Image file name without the ``.jpg`` extension.
        bbox: Sequence ``[x, y, width, height]``; values are truncated to int.
        name: Animal identifier; hyphens are replaced with underscores to
            form the output folder name.

    Returns:
        "SAVED" when the crop was written, otherwise "MISSING_IMAGE",
        "EMPTY_CROP" or "WRITE_FAILED".
    """
    source_dir = '/dbfs/mnt/w210_capstone/leopard.coco/images/output/train/train2022'
    target_root = '/dbfs/mnt/w210_capstone/leopard.coco/processed/train'

    # cv2.imread returns None instead of raising when the file cannot be read
    im = cv2.imread('{}/{}.jpg'.format(source_dir, image_id))
    if im is None:
        return "MISSING_IMAGE"

    # Assign the bounding box values to their parameters and convert to int
    x0, y0, width, height = bbox
    x, y, w, h = int(x0), int(y0), int(width), int(height)

    # Clamp the box to the image; a negative start would wrap around via negative indexing
    img_h, img_w = im.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, img_w), min(y + h, img_h)
    if right <= left or bottom <= top:
        return "EMPTY_CROP"
    cropped = im[top:bottom, left:right]

    # One folder per unique animal; exist_ok avoids a race between parallel tasks
    animal_name = name.replace("-", "_")
    path = '{}/{}'.format(target_root, animal_name)
    os.makedirs(path, exist_ok=True)

    # Write the cropped image to blob storage and report a failed write
    if not cv2.imwrite('{}/{}.jpg'.format(path, image_id), cropped):
        return "WRITE_FAILED"

    return "SAVED"

In [None]:
# Wrap the leopard training crop function as a Spark UDF that returns a string status
leopard_crop_train_udf = udf(leopard_crop_trainBB, returnType="string")

In [None]:
# Add a "processed" column holding the crop status of each leopard training annotation
leopard_out_train = leopard_train.withColumn(
    "processed",
    leopard_crop_train_udf(col("image_id"), col("bbox"), col("name")),
)

In [None]:
# Check if the rows were successfully processed.
# show() on the frame itself only evaluates enough rows to print 20, so the crop
# UDF would be skipped for the rest; aggregating on its output runs it for every row.
leopard_out_train.groupBy("processed").count().show()

In [None]:
# UDF that takes 3 columns and returns a status string telling whether the crop was saved.
def leopard_crop_valBB(image_id, bbox, name):
    """Crop one leopard validation image to a bounding box and save the crop.

    Args:
        image_id: Image file name without the ``.jpg`` extension.
        bbox: Sequence ``[x, y, width, height]``; values are truncated to int.
        name: Animal identifier; hyphens are replaced with underscores to
            form the output folder name.

    Returns:
        "SAVED" when the crop was written, otherwise "MISSING_IMAGE",
        "EMPTY_CROP" or "WRITE_FAILED".
    """
    source_dir = '/dbfs/mnt/w210_capstone/leopard.coco/images/output/val/train2022'
    target_root = '/dbfs/mnt/w210_capstone/leopard.coco/processed/val'

    # cv2.imread returns None instead of raising when the file cannot be read
    im = cv2.imread('{}/{}.jpg'.format(source_dir, image_id))
    if im is None:
        return "MISSING_IMAGE"

    # Assign the bounding box values to their parameters and convert to int
    x0, y0, width, height = bbox
    x, y, w, h = int(x0), int(y0), int(width), int(height)

    # Clamp the box to the image; a negative start would wrap around via negative indexing
    img_h, img_w = im.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, img_w), min(y + h, img_h)
    if right <= left or bottom <= top:
        return "EMPTY_CROP"
    cropped = im[top:bottom, left:right]

    # One folder per unique animal; exist_ok avoids a race between parallel tasks
    animal_name = name.replace("-", "_")
    path = '{}/{}'.format(target_root, animal_name)
    os.makedirs(path, exist_ok=True)

    # Write the cropped image to blob storage and report a failed write
    if not cv2.imwrite('{}/{}.jpg'.format(path, image_id), cropped):
        return "WRITE_FAILED"

    return "SAVED"

In [None]:
# Wrap the leopard validation crop function as a Spark UDF that returns a string status
leopard_crop_val_udf = udf(leopard_crop_valBB, returnType="string")

In [None]:
# Add a "processed" column holding the crop status of each leopard validation annotation
leopard_out_val = leopard_val.withColumn(
    "processed",
    leopard_crop_val_udf(col("image_id"), col("bbox"), col("name")),
)

In [None]:
# Check if the rows were successfully processed.
# show() on the frame itself only evaluates enough rows to print 20, so the crop
# UDF would be skipped for the rest; aggregating on its output runs it for every row.
leopard_out_val.groupBy("processed").count().show()

In [None]:
# UDF that takes 3 columns and returns a status string telling whether the crop was saved.
def leopard_crop_testBB(image_id, bbox, name):
    """Crop one leopard test image to a bounding box and save the crop.

    Args:
        image_id: Image file name without the ``.jpg`` extension.
        bbox: Sequence ``[x, y, width, height]``; values are truncated to int.
        name: Animal identifier; hyphens are replaced with underscores to
            form the output folder name.

    Returns:
        "SAVED" when the crop was written, otherwise "MISSING_IMAGE",
        "EMPTY_CROP" or "WRITE_FAILED".
    """
    source_dir = '/dbfs/mnt/w210_capstone/leopard.coco/images/output/test/train2022'
    target_root = '/dbfs/mnt/w210_capstone/leopard.coco/processed/test'

    # cv2.imread returns None instead of raising when the file cannot be read
    im = cv2.imread('{}/{}.jpg'.format(source_dir, image_id))
    if im is None:
        return "MISSING_IMAGE"

    # Assign the bounding box values to their parameters and convert to int
    x0, y0, width, height = bbox
    x, y, w, h = int(x0), int(y0), int(width), int(height)

    # Clamp the box to the image; a negative start would wrap around via negative indexing
    img_h, img_w = im.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, img_w), min(y + h, img_h)
    if right <= left or bottom <= top:
        return "EMPTY_CROP"
    cropped = im[top:bottom, left:right]

    # One folder per unique animal; exist_ok avoids a race between parallel tasks
    animal_name = name.replace("-", "_")
    path = '{}/{}'.format(target_root, animal_name)
    os.makedirs(path, exist_ok=True)

    # Write the cropped image to blob storage and report a failed write
    if not cv2.imwrite('{}/{}.jpg'.format(path, image_id), cropped):
        return "WRITE_FAILED"

    return "SAVED"

In [None]:
# Wrap the leopard test crop function as a Spark UDF that returns a string status
leopard_crop_test_udf = udf(leopard_crop_testBB, returnType="string")

In [None]:
# Add a "processed" column holding the crop status of each leopard test annotation
leopard_out_test = leopard_test.withColumn(
    "processed",
    leopard_crop_test_udf(col("image_id"), col("bbox"), col("name")),
)

In [None]:
# Check if the rows were successfully processed.
# show() on the frame itself only evaluates enough rows to print 20, so the crop
# UDF would be skipped for the rest; aggregating on its output runs it for every row.
leopard_out_test.groupBy("processed").count().show()