Databricks notebook source
Write the newly uploaded training annotations for the Great Zebra and Giraffe ID dataset to the file store so that they can be loaded into a Spark DataFrame. As with the hyena and leopard datasets, the annotations are in COCO format and the provided validation and test annotation files contain no records, meaning that those splits will need to be created.

In [None]:
import json
import shutil
import os
from pyspark.sql.types import StringType, ArrayType, FloatType, IntegerType
from pyspark.sql.functions import udf, col, explode, size
import cv2

In [None]:
# Re-serialise the training annotations into the file store; json.dumps without
# an indent emits the whole document on a single line
with open('/dbfs/mnt/w210_capstone/great_zebra_giraffe/annotations/instances_train2020.json') as f:
    annotations_json = json.dumps(json.load(f))
    dbutils.fs.put("/FileStore/tables/updated_giraffe_annotations.json", annotations_json, overwrite=True)

In [None]:
# Read the zebra and giraffe annotations file and flatten it to one row per annotation
zebra_giraffe_annotations_df = (
    spark.read.json('/FileStore/tables/updated_giraffe_annotations.json')
    .select(explode('annotations').alias('annotation'))
    .select('annotation.*')
)

In [None]:
# Preview the flattened annotations dataframe (one row per annotation)
display(zebra_giraffe_annotations_df)

In [None]:
# Keep only the annotations whose name starts with the giraffe prefix
only_giraffes = zebra_giraffe_annotations_df.where(col('name').startswith('NNP_GIRM_'))

In [None]:
# Append the length of the individual ids field and of the bbox field to every row.
# Rows whose individual ids field has a length of 1 should go into the training
# dataset (they represent the only time that the animal was seen)
only_giraffes = (
    only_giraffes
    .withColumn('ct_appearances', size('individual_ids'))
    .withColumn('bbox_cord_ct', size('bbox'))
)
one_appearance = only_giraffes.where(col('ct_appearances') == 1)

In [None]:
# List the distinct bbox coordinate counts to verify that every annotation has a
# bounding box - all animals have 4 coordinates
only_giraffes.select('bbox_cord_ct').distinct().collect()

In [None]:
# Check that there are no images without an animal identifier
from pyspark.sql.functions import col, isnan, when, count

In [None]:
# Count the rows whose uuid is NaN or null
only_giraffes.select(
    count(when(isnan('uuid') | col('uuid').isNull(), 'uuid')).alias('uuid')
).show()

In [None]:
# Count the annotations of animals that are only seen once
# (there were 42 such instances when this notebook was run)
one_appearance.count()

In [None]:
# Keep the giraffe annotations whose image id does not occur in the one-appearance
# dataframe. one_appearance is derived from only_giraffes, so both sides are aliased
# to keep the self-join condition unambiguous; a left_anti join returns only the
# left-hand columns, in their original order.
more_than1 = only_giraffes.alias('all_rows').join(
    one_appearance.select('image_id').alias('single'),
    col('all_rows.image_id') == col('single.image_id'),
    how='left_anti',
)

In [None]:
# Preview the annotations that remain after removing the one-appearance images
display(more_than1)

In [None]:
# Collect the unique image ids of the one-appearance dataset to the driver
# (collect() returns a list of Row objects, each with an image_id field)
only_one_imgid = one_appearance.select('image_id').distinct().collect()

In [None]:
# Collect the unique image ids where the animal was seen more than once
# (collect() returns a list of Row objects, each with an image_id field)
more_than1_imgid = more_than1.select('image_id').distinct().collect()

In [None]:
# Hold out 10% of the image ids where the animal is captured more than once for
# testing; the other 90% is kept as the combined training/validation pool.
# The split is made per image id, and random_state fixes the shuffle so it is repeatable.
from sklearn.model_selection import train_test_split
giraffe_train_val, giraffe_test = train_test_split(more_than1_imgid, test_size=0.1, train_size=0.9, shuffle=True, random_state=8)

In [None]:
# Split the training/validation pool 90/10 into the giraffe training and
# validation image ids (random_state fixes the shuffle so it is repeatable)
giraffe_train, giraffe_val = train_test_split(giraffe_train_val, test_size=0.1, train_size=0.9, shuffle=True, random_state=42)

In [None]:
# Append the image ids of the animals only seen once to the giraffe training list
giraffe_train = [*giraffe_train, *only_one_imgid]

In [None]:
# Replace each list of Row objects with the plain image_id values it holds
giraffe_train = [row.image_id for row in giraffe_train]
giraffe_val = [row.image_id for row in giraffe_val]
giraffe_test = [row.image_id for row in giraffe_test]

In this section, we strip the leading zeros from the file names of the giraffe images and copy each image into the training, validation, or test folder under its stripped name. This allows us to match the giraffe images to the image ids in the annotations file.

In [None]:
def cp_giraffe_imgs():
    """Copy every giraffe image into the folder of the split it belongs to.

    Walks the source image directory, turns each file name into its integer
    image id (extension dropped, leading zeros ignored) and copies the file to
    ``<split>/<image_id>.jpg`` when the id is listed in the module-level
    ``giraffe_train``, ``giraffe_val`` or ``giraffe_test`` collections, checked
    in that order. Files whose name is not a plain integer, and images that
    belong to no split, are skipped.
    """
    source_root = "/dbfs/mnt/w210_capstone/great_zebra_giraffe/images"
    dest_root = "/dbfs/mnt/w210_capstone/great_zebra_giraffe"

    # Sets make each membership test O(1); the tuple order keeps the
    # train > val > test precedence if an id appears in more than one split
    splits = (
        ("train", set(giraffe_train)),
        ("val", set(giraffe_val)),
        ("test", set(giraffe_test)),
    )

    # shutil.copy does not create missing destination folders
    for split_name, _ in splits:
        os.makedirs(os.path.join(dest_root, split_name), exist_ok=True)

    for dir_path, _dir_names, files in os.walk(source_root):
        for file_name in files:
            stem = os.path.splitext(file_name)[0]
            try:
                # int() ignores the leading zeros in the file name
                image_id = int(stem)
            except ValueError:
                # Not an image named after its id (e.g. a hidden/system file)
                continue

            for split_name, split_ids in splits:
                if image_id in split_ids:
                    shutil.copy(
                        os.path.join(dir_path, file_name),
                        os.path.join(dest_root, split_name, str(image_id) + '.jpg'),
                    )
                    break

    print('Files have finished copying')

In [None]:
# Copy the giraffe training, validation, and testing images to their respective folders
cp_giraffe_imgs()

In [None]:
# Select the giraffe annotations that belong to the image ids of each split
giraffe_train_df = only_giraffes.where(only_giraffes['image_id'].isin(giraffe_train))
giraffe_val_df = only_giraffes.where(only_giraffes['image_id'].isin(giraffe_val))
giraffe_test_df = only_giraffes.where(only_giraffes['image_id'].isin(giraffe_test))

In [None]:
# Write the annotations of each split out to Parquet.
# NOTE(review): these paths have no leading "/" or "dbfs:" scheme, unlike the other
# paths in this notebook, so they are relative paths - confirm that the files land
# in the mounted annotations folder as intended.
# NOTE(review): no write mode is given, so re-running this cell presumably fails
# once the output already exists - confirm whether mode('overwrite') is wanted.
giraffe_train_df.write.parquet('dbfs/mnt/w210_capstone/great_zebra_giraffe/annotations/giraffe_train.parquet')
giraffe_val_df.write.parquet('dbfs/mnt/w210_capstone/great_zebra_giraffe/annotations/giraffe_val.parquet')
giraffe_test_df.write.parquet('dbfs/mnt/w210_capstone/great_zebra_giraffe/annotations/giraffe_test.parquet')

In [None]:
# List the giraffe image names (file name minus its last four characters, i.e. the
# extension) found in the training, validation, and test folders
giraffe_train_imgs = [file_name[:-4] for file_name in os.listdir("/dbfs/mnt/w210_capstone/great_zebra_giraffe/train")]
giraffe_val_imgs = [file_name[:-4] for file_name in os.listdir("/dbfs/mnt/w210_capstone/great_zebra_giraffe/val")]
giraffe_test_imgs = [file_name[:-4] for file_name in os.listdir("/dbfs/mnt/w210_capstone/great_zebra_giraffe/test")]

In [None]:
# UDF that takes 3 columns and return if the file is saved successfully.
def giraffe_crop_trainBB(image_id, bbox, name):
    """Crop one annotated animal out of a training image and save the crop.

    Meant to be wrapped as a Spark UDF and applied to one annotation row at a time.

    Args:
        image_id: Id of the image; the source file is ``train/<image_id>.jpg``.
        bbox: Four numbers ``(x, y, width, height)`` giving the top-left corner
            and the extent of the box in pixels.
        name: Animal identifier; hyphens are replaced with underscores to form
            the name of the output folder.

    Returns:
        ``"SAVED"`` when the crop was written, otherwise ``"Unable to crop"``
        (unreadable image, malformed or empty bounding box, or a failed write).
    """
    failure = "Unable to crop"

    # Get the path to the image in the mounted storage
    image = '/dbfs/mnt/w210_capstone/great_zebra_giraffe/train/' + str(image_id) + '.jpg'

    # cv2.imread reports a missing or unreadable file by returning None, not by raising
    im = cv2.imread(image)
    if im is None or name is None:
        return failure

    # Convert the bounding box to integer pixel bounds
    try:
        x0, y0, width, height = bbox
        left, top = int(x0), int(y0)
        right, bottom = left + int(width), top + int(height)
    except (TypeError, ValueError):
        return failure

    # A negative bound would be read as an offset from the far edge of the image,
    # so clamp to 0; bounds beyond the image size are clipped by the slice itself
    left, top = max(left, 0), max(top, 0)
    right, bottom = max(right, 0), max(bottom, 0)

    # Crop the image to the bounding box bounds
    cropped = im[top:bottom, left:right]
    if cropped.size == 0:
        return failure

    # Obtain the unique animal identifier for that bounding box
    animal_name = name.replace("-", "_")
    path = '/dbfs/mnt/w210_capstone/great_zebra_giraffe/individual_recognition/train/{}'.format(animal_name)

    # Write the cropped image to blob storage
    try:
        # exist_ok avoids a failure when another task creates the folder first
        os.makedirs(path, exist_ok=True)
        written = cv2.imwrite('{}/{}.jpg'.format(path, str(image_id)), cropped)
    except (OSError, cv2.error):
        return failure

    # cv2.imwrite returns False when the file could not be written
    return "SAVED" if written else failure

In [None]:
# Wrap the training crop function as a Spark UDF returning its status string
giraffe_crop_train_udf = udf(giraffe_crop_trainBB, returnType=StringType())

In [None]:
# Add a "processed" column holding the UDF's status for each row of the dataframe
giraffe_out_train = giraffe_train_df.withColumn(
    "processed",
    giraffe_crop_train_udf(col('image_id'), col('bbox'), col('name')),
)

In [None]:
# Spark evaluates lazily and show() only computes the first rows, so cache the
# result and aggregate over the status column to run the UDF on every row once
giraffe_out_train.cache()
giraffe_out_train.groupBy('processed').count().show()

# Check if the rows were successfully processed
giraffe_out_train.show()

In [None]:
# Show the rows whose crop failed - only 2 bounding boxes were unable to be cropped from the images
display(giraffe_out_train.where(col('processed') == 'Unable to crop'))

In [None]:
# UDF that takes 3 columns and return if the file is saved successfully.
def giraffe_crop_valBB(image_id, bbox, name):
    """Crop one annotated animal out of a validation image and save the crop.

    Meant to be wrapped as a Spark UDF and applied to one annotation row at a time.

    Args:
        image_id: Id of the image; the source file is ``val/<image_id>.jpg``.
        bbox: Four numbers ``(x, y, width, height)`` giving the top-left corner
            and the extent of the box in pixels.
        name: Animal identifier; hyphens are replaced with underscores to form
            the name of the output folder.

    Returns:
        ``"SAVED"`` when the crop was written, otherwise ``"Unable to crop"``
        (unreadable image, malformed or empty bounding box, or a failed write).
    """
    failure = "Unable to crop"

    # Get the path to the image in the mounted storage
    image = '/dbfs/mnt/w210_capstone/great_zebra_giraffe/val/' + str(image_id) + '.jpg'

    # cv2.imread reports a missing or unreadable file by returning None, not by raising
    im = cv2.imread(image)
    if im is None or name is None:
        return failure

    # Convert the bounding box to integer pixel bounds
    try:
        x0, y0, width, height = bbox
        left, top = int(x0), int(y0)
        right, bottom = left + int(width), top + int(height)
    except (TypeError, ValueError):
        return failure

    # A negative bound would be read as an offset from the far edge of the image,
    # so clamp to 0; bounds beyond the image size are clipped by the slice itself
    left, top = max(left, 0), max(top, 0)
    right, bottom = max(right, 0), max(bottom, 0)

    # Crop the image to the bounding box bounds
    cropped = im[top:bottom, left:right]
    if cropped.size == 0:
        return failure

    # Obtain the unique animal identifier for that bounding box
    animal_name = name.replace("-", "_")
    path = '/dbfs/mnt/w210_capstone/great_zebra_giraffe/individual_recognition/val/{}'.format(animal_name)

    # Write the cropped image to blob storage
    try:
        # exist_ok avoids a failure when another task creates the folder first
        os.makedirs(path, exist_ok=True)
        written = cv2.imwrite('{}/{}.jpg'.format(path, image_id), cropped)
    except (OSError, cv2.error):
        return failure

    # cv2.imwrite returns False when the file could not be written
    return "SAVED" if written else failure

In [None]:
# Wrap the validation crop function as a Spark UDF returning its status string
giraffe_crop_val_udf = udf(giraffe_crop_valBB, returnType=StringType())

In [None]:
# Add a "processed" column holding the UDF's status for each row of the dataframe
giraffe_out_val = giraffe_val_df.withColumn(
    "processed",
    giraffe_crop_val_udf(col('image_id'), col('bbox'), col('name')),
)

In [None]:
# Spark evaluates lazily and show() only computes the first rows, so cache the
# result and aggregate over the status column to run the UDF on every row once
giraffe_out_val.cache()
giraffe_out_val.groupBy('processed').count().show()

# Check if the rows were successfully processed
giraffe_out_val.show()

In [None]:
# UDF that takes 3 columns and return if the file is saved successfully.
def giraffe_crop_testBB(image_id, bbox, name):
    """Crop one annotated animal out of a test image and save the crop.

    Meant to be wrapped as a Spark UDF and applied to one annotation row at a time.

    Args:
        image_id: Id of the image; the source file is ``test/<image_id>.jpg``.
        bbox: Four numbers ``(x, y, width, height)`` giving the top-left corner
            and the extent of the box in pixels.
        name: Animal identifier; hyphens are replaced with underscores to form
            the name of the output folder.

    Returns:
        ``"SAVED"`` when the crop was written, otherwise ``"Unable to crop"``
        (unreadable image, malformed or empty bounding box, or a failed write).
    """
    failure = "Unable to crop"

    # Get the path to the image in the mounted storage
    image = '/dbfs/mnt/w210_capstone/great_zebra_giraffe/test/' + str(image_id) + '.jpg'

    # cv2.imread reports a missing or unreadable file by returning None, not by raising
    im = cv2.imread(image)
    if im is None or name is None:
        return failure

    # Convert the bounding box to integer pixel bounds
    try:
        x0, y0, width, height = bbox
        left, top = int(x0), int(y0)
        right, bottom = left + int(width), top + int(height)
    except (TypeError, ValueError):
        return failure

    # A negative bound would be read as an offset from the far edge of the image,
    # so clamp to 0; bounds beyond the image size are clipped by the slice itself
    left, top = max(left, 0), max(top, 0)
    right, bottom = max(right, 0), max(bottom, 0)

    # Crop the image to the bounding box bounds
    cropped = im[top:bottom, left:right]
    if cropped.size == 0:
        return failure

    # Obtain the unique animal identifier for that bounding box
    animal_name = name.replace("-", "_")
    path = '/dbfs/mnt/w210_capstone/great_zebra_giraffe/individual_recognition/test/{}'.format(animal_name)

    # Write the cropped image to blob storage
    try:
        # exist_ok avoids a failure when another task creates the folder first
        os.makedirs(path, exist_ok=True)
        written = cv2.imwrite('{}/{}.jpg'.format(path, image_id), cropped)
    except (OSError, cv2.error):
        return failure

    # cv2.imwrite returns False when the file could not be written
    return "SAVED" if written else failure

In [None]:
# Wrap the test crop function as a Spark UDF returning its status string
giraffe_crop_test_udf = udf(giraffe_crop_testBB, returnType=StringType())

In [None]:
# Add a "processed" column holding the UDF's status for each row of the dataframe
giraffe_out_test = giraffe_test_df.withColumn(
    "processed",
    giraffe_crop_test_udf(col('image_id'), col('bbox'), col('name')),
)

In [None]:
# Spark evaluates lazily and show() only computes the first rows, so cache the
# result and aggregate over the status column to run the UDF on every row once
giraffe_out_test.cache()
giraffe_out_test.groupBy('processed').count().show()

# Check if the rows were successfully processed
giraffe_out_test.show()