Combination of the object detection and segmentation. Would output the masks and the 'cutouts'

* Original image as a thumbnail -- (480 on the longest side as jpegs) **[DONE]**
* Output the individual cutouts as png (include, label in filename: image_{resource_id}_{object_name}_{instance__#} )
    * Normal the sizes of the individual cutouts (max_size= 480, on the longest side )
* Thumbnails of the binary masks: (include, label in filename: mask_{resource_id}_{object_name}_{instance__#} )
* Output the bounding box information -- JSON output (include, label in filename: image_{resource_id}_{object_name}_{instance__#}.JSON )
    * Including the four coordinates (Normalized 0-1)

Note: the outputs will be directed into the UI folders.


I. Imports

In [1]:
# General utility libraries
import os
import matplotlib.pyplot as plt
import numpy as np
import regex as re
import numpy as np
import json

# Importing Pytorch ML Libraries
import torch
import torchvision
from torchvision.transforms import ToTensor

# Importing the Models and their respective weights
from torchvision.models.detection import (
    # Faster R-CNN
    fasterrcnn_resnet50_fpn_v2,
    FasterRCNN_ResNet50_FPN_V2_Weights,
)

# Utility functions that help visualize the models and describe the model outputs.
from torchvision.io.image import read_image
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import to_pil_image
from PIL import ImageFont, ImageDraw, Image
from IPython.display import display
from torchvision.utils import make_grid

# Libraries Mask manipulation and generation
import cv2
from scipy.ndimage import binary_dilation, binary_erosion, binary_closing
from scipy.ndimage import binary_fill_holes
from workflow_helpers import *

# from utilities import show_anns_ours, run_ours_box_or_points
# ** For now, I have opted to not include these imports. I will define the 
# functions manually and later add to a `helper/utility.py` script.

II. Creation of a Model Dictionary

In [2]:
# How many Items do you want to output? Refer to the Notebook 1 value to output the same amount.

number_of_instances = 10

In [3]:
model_dictionary = {}
model_dictionary['items'] = []

for picture in os.listdir('image-collection-output/')[:number_of_instances]:
    item_dictionary = {}
    resource_id = extract_number(picture)
    item_dictionary['resource_id'] = resource_id
    model_dictionary['items'].append(item_dictionary)
    print(picture,resource_id)

# model_dictionary

image_2023632670.jpg 2023632670
image_89709659.jpg 89709659
image_2011631485.jpg 2011631485
image_2020742358.jpg 2020742358
image_2010651699.jpg 2010651699
image_2006681388.jpg 2006681388
image_2002705621.jpg 2002705621
image_97518968.jpg 97518968
image_2016826637.jpg 2016826637
image_00652544.jpg 00652544


III. Create Resource Thumbnail.
* Creates resource thumbnail using main dictionary.
* Sends Thumbnail to UI.
* Adds Thumbnail path to dictionary.

In [4]:
def create_main_thumbnail(image_path, output_path, item):
    # thumbnail_name
    # resource = os.path.basename(image_path)
    base_name = os.path.basename(image_path).split('.')[0]

    # Create Resource Thumbname
    thumbnail_image = Image.open(image_path)
    original_size = thumbnail_image.size
    max_size = (480,480)
    thumbnail_image.thumbnail(max_size)

    # Create Output directory if it doesn't exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    thumbnail_name = f'{base_name}_thumbnail' + '.jpg'
    output_filename =  os.path.join(output_path,thumbnail_name)  
    thumbnail_image.save(output_filename)
    print(f'Saved {thumbnail_name}')

    item['original_format'] = original_size
    item['thumbnail'] = thumbnail_name



In [5]:
for item in model_dictionary['items']:
    print(item)

{'resource_id': '2023632670'}
{'resource_id': '89709659'}
{'resource_id': '2011631485'}
{'resource_id': '2020742358'}
{'resource_id': '2010651699'}
{'resource_id': '2006681388'}
{'resource_id': '2002705621'}
{'resource_id': '97518968'}
{'resource_id': '2016826637'}
{'resource_id': '00652544'}


In [6]:
for item in model_dictionary['items']:
    id = item['resource_id']
    # workflow/image-collection-output/image_2016631670.jpg

    image = f'../workflow/image-collection-output/image_{id}.jpg'
    create_main_thumbnail(image,'../ui/dummy-data', item)


Saved image_2023632670_thumbnail.jpg


Saved image_89709659_thumbnail.jpg
Saved image_2011631485_thumbnail.jpg
Saved image_2020742358_thumbnail.jpg
Saved image_2010651699_thumbnail.jpg
Saved image_2006681388_thumbnail.jpg
Saved image_2002705621_thumbnail.jpg
Saved image_97518968_thumbnail.jpg
Saved image_2016826637_thumbnail.jpg
Saved image_00652544_thumbnail.jpg


IV. Load the Faster-RCNN Model and Weights

In [7]:
weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.9)
model.eval()
# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

V. Load the EfficientSAM Model from it's directory

In [8]:
# Importing the EfficientSAM Model and setting the correct directoy
parent_dir = os.getcwd()
os.chdir("../early_work/efficient_sam/EfficientSAM")

from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
import zipfile

efficient_sam_vitt_model = build_efficient_sam_vitt()
efficient_sam_vitt_model.eval()

# Since EfficientSAM-S checkpoint file is >100MB, we store the zip file.
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", 'r') as zip_ref:
    zip_ref.extractall("weights")
efficient_sam_vits_model = build_efficient_sam_vits()
efficient_sam_vits_model.eval()

os.chdir(parent_dir)


In [9]:
def process_image(image_path, output_path, item, structuring_value=25,threshold =0.9):
    # Read the image
    img = read_image(image_path)

    batch = [preprocess(img)]
    # Get prediction from the model
    prediction = model(batch)[0]
    
    if len(prediction['labels']) == 0:
        print(f'No Object Detection predictions within the Scope of MS COCO dataset: {os.path.basename(image_path)}')

    else:

        # Extracting the len of Index of the scores that meet the threshold value:
        score_len = (prediction["scores"] >= threshold).sum().item()
        # Limits the scores at the threshold to just the top 5
        if score_len >= 3:
            score_len = 3
        else:
            pass


        resource = os.path.basename(image_path)
        base_name = os.path.basename(image_path).split('.')[0]
        resource_id = item['resource_id']
        item['segments'] = []

        for i in range(score_len):                
            segment = {}
            segment['bounding_box'] = prediction['boxes'].tolist()[i]
            # Extract bounding box coordinates

            bbox = segment['bounding_box']
            x1 = bbox[0]
            y1 = bbox[1]
            x2 = bbox[2]
            y2 = bbox[3]
            w = x2 - x1
            h = y2 - y1

            class_index = prediction['labels'][i].item()
            class_label = weights.meta["categories"][class_index]
            # print(class_label)

            
            # fig, ax = plt.subplots(1, 3, figsize=(30, 30))
            input_point = np.array([[x1, y1], [x2, y2]])
            input_label = np.array([2, 3])
            

            mask_efficient_sam_vitt = run_ours_box_or_points(image_path, input_point, input_label, efficient_sam_vitt_model)
            # show_anns_ours(mask_efficient_sam_vitt, ax[1])
            binary_mask = mask_efficient_sam_vitt
            structuring_element = np.ones((structuring_value,structuring_value), dtype=bool)
            dilated_mask = binary_dilation(binary_mask, structure=structuring_element)
            eroded_mask = binary_erosion(dilated_mask, structure=structuring_element)

            closed_mask_uint8 = (eroded_mask * 255).astype(np.uint8)

            mask_name = f'mask_{resource_id}_{class_label}_{i}' + '.png'
            mask_path = os.path.join(output_path, f'masks/{mask_name}')
            cv2.imwrite(mask_path, closed_mask_uint8)
            img_val = cv2.imread(image_path) 
            mask = cv2.imread(mask_path)

            img_foreground = np.array((mask/255)*(img_val/255)) * img_val
            na = img_foreground
            

            '''
            Import to note that part of the following code is from substack
            '''
            # Make a True/False mask of pixels whose BGR values sum to more than zero
            alpha = np.sum(na, axis=-1) > 0

            # Convert True/False to 0/255 and change type to "uint8" to match "na"
            alpha = np.uint8(alpha * 255)

            # Stack new alpha layer with existing image to go from BGR to BGRA, i.e. 3 channels to 4 channels
            res = np.dstack((na, alpha))
            img = Image.fromarray(res, mode='RGBa')

            # Save result
            cutout_name =  f'cutout_{resource_id}_{class_label}_{i}' + '.png'
            cutout_path = os.path.join(output_path, f'cutouts/{cutout_name}')
            cv2.imwrite(cutout_path, res)
            
            resize_to_thumbnail(cutout_path)
            resize_to_thumbnail(mask_path)

            segment['confidence'] = prediction["scores"][i].item()
            segment['label'] =  class_label
            segment['cutout'] = cutout_name
            segment['mask'] =  mask_name
            item['segments'].append(segment)


In [10]:
for item in model_dictionary['items']:
    id = item['resource_id']
    image = f'../workflow/image-collection-output/image_{id}.jpg'
    process_image(image,'../ui/dummy-data', item)

No Object Detection predictions within the Scope of MS COCO dataset: image_2011631485.jpg


In [11]:
with open(f"model_results.json", 'w') as f:
        json.dump(model_dictionary, f, indent=4)