Combines object detection and segmentation. Outputs the masks and the 'cutouts'.

* Original image as a thumbnail -- (480 on the longest side as jpegs) **[DONE]**
* Output the individual cutouts as png (include, label in filename: image_{resource_id}_{object_name}_{instance__#} )
    * Normalize the sizes of the individual cutouts (max_size=480 on the longest side)
* Thumbnails of the binary masks: (include, label in filename: mask_{resource_id}_{object_name}_{instance__#} )
* Output the bounding box information -- JSON output (include, label in filename: image_{resource_id}_{object_name}_{instance__#}.JSON )
    * Including the four coordinates (Normalized 0-1)

Note: the outputs will be directed into the UI folders.


I. Imports

In [258]:
# General utility libraries
import os
import matplotlib.pyplot as plt
import numpy as np
import regex as re
import numpy as np


# Importing Pytorch ML Libraries
import torch
import torchvision
from torchvision.transforms import ToTensor

# Importing the Models and their respective weights
from torchvision.models.detection import (
    # Faster R-CNN
    fasterrcnn_resnet50_fpn_v2,
    FasterRCNN_ResNet50_FPN_V2_Weights,
)

# Utility functions that help visualize the models and describe the model outputs.
from torchvision.io.image import read_image
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import to_pil_image
from PIL import Image
from PIL import ImageFont, ImageDraw
from IPython.display import display
from torchvision.utils import make_grid

# Libraries Mask manipulation and generation
import cv2
from scipy.ndimage import binary_dilation, binary_erosion, binary_closing
from scipy.ndimage import binary_fill_holes

# from utilities import show_anns_ours, run_ours_box_or_points
# ** For now, I have opted to not include these imports. I will define the 
# functions manually and later add to a `helper/utility.py` script.

In [259]:
def show_anns_ours(mask, ax):
    """Draw a translucent green overlay on `ax` everywhere `mask` is falsy.

    Args:
        mask: Array whose first two dimensions give the overlay height and
            width (assumed 2-D). Truthy pixels stay fully transparent.
        ax: Axes-like object providing `set_autoscale_on` and `imshow`.
    """
    ax.set_autoscale_on(False)

    rows, cols = mask.shape[0], mask.shape[1]

    # White RGBA canvas with the alpha channel zeroed, i.e. invisible.
    overlay = np.ones((rows, cols, 4))
    overlay[..., 3] = 0

    # Only the pixels *outside* the mask receive the green tint.
    outside = np.logical_not(mask)
    overlay[outside] = [0, 1, 0, 0.7]

    ax.imshow(overlay)


def run_ours_box_or_points(img_path, pts_sampled, pts_labels, model):
    """Segment one image with a prompt-driven model and return its best mask.

    Args:
        img_path: Path of the image to segment.
        pts_sampled: Prompt coordinates; reshaped to (1, 1, num_points, 2).
        pts_labels: One label per prompt point; reshaped to (1, 1, num_points).
        model: Callable invoked as `model(image_batch, points, labels)` that
            returns `(predicted_logits, predicted_iou)`.

    Returns:
        A boolean NumPy array that is True where the logits of the candidate
        with the highest predicted IoU are >= 0.
    """
    # Close the file handle deterministically. Convert to RGB so grayscale,
    # palette or RGBA inputs still yield a 3-channel tensor
    # (assumes the model expects RGB input -- TODO confirm).
    with Image.open(img_path) as image:
        image_np = np.array(image.convert("RGB"))

    img_tensor = ToTensor()(image_np)
    pts_sampled = torch.reshape(torch.tensor(pts_sampled), [1, 1, -1, 2])
    pts_labels = torch.reshape(torch.tensor(pts_labels), [1, 1, -1])

    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        predicted_logits, predicted_iou = model(
            img_tensor[None, ...],
            pts_sampled,
            pts_labels,
        )

    # Reorder the candidate masks so the highest predicted IoU comes first.
    sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True)
    predicted_logits = torch.take_along_dim(
        predicted_logits, sorted_ids[..., None, None], dim=2
    )

    return torch.ge(predicted_logits[0, 0, 0, :, :], 0).cpu().detach().numpy()

def resize_to_thumbnail(path, max_size=480):
    """Shrink the image at `path` in place so neither side exceeds `max_size`.

    Args:
        path: Image file to overwrite with its thumbnail.
        max_size: Upper bound, in pixels, applied to both width and height.
            Defaults to 480, the value previously hard-coded.
    """
    # The context manager releases the source file handle even if saving fails.
    with Image.open(path) as img:
        img.thumbnail((max_size, max_size))
        img.save(path)

def crop_image(path, x1, y1, x2, y2, padding=(20, 20, 20, 30)):
    """Crop the image at `path` in place to a padded bounding box.

    Args:
        path: Image file to overwrite with the cropped region.
        x1, y1: Top-left corner of the box, in pixels.
        x2, y2: Bottom-right corner of the box, in pixels.
        padding: Extra pixels added on the (left, top, right, bottom) sides.
            The default reproduces the previously hard-coded margins,
            including the larger bottom margin.

    Note:
        The padded box is not clamped to the image bounds.
        NOTE(review): Pillow appears to pad out-of-bounds regions rather than
        raise -- confirm this is acceptable for boxes near the border.
    """
    pad_left, pad_top, pad_right, pad_bottom = padding
    box = (x1 - pad_left, y1 - pad_top, x2 + pad_right, y2 + pad_bottom)

    # The context manager releases the source file handle even if saving fails.
    with Image.open(path) as img:
        img.crop(box).save(path)

II. Creation of a Dummy Dictionary

In [260]:
def extract_number(directory_name):
    """Return the first run of decimal digits in `directory_name` as an int.

    Args:
        directory_name: File or directory name expected to contain an id,
            e.g. ``image_2017454465.jpg``.

    Returns:
        The first digit run converted with `int`. Leading zeros are therefore
        dropped (``image_00650949.jpg`` yields ``650949``).

    Raises:
        ValueError: If `directory_name` contains no digits.
    """
    match = re.search(r"\d+", directory_name)
    if match is None:
        # Fail with a message naming the offending entry instead of an
        # opaque IndexError from indexing an empty match list.
        raise ValueError(f"No numeric id found in {directory_name!r}")
    return int(match.group())

# Seed the manifest: one entry per picture, keyed by file name, holding the
# numeric resource id parsed from that name. Only the first directory entry
# is used while prototyping.
dummy_dictionary = {}
for picture in os.listdir('../early_work/images')[:1]:
    dummy_dictionary[picture] = {'resource_id': extract_number(picture)}


    

III. Create Resource Thumbnail.
* Creates resource thumbnail using main dictionary.
* Sends Thumbnail to UI.
* Adds Thumbnail path to dictionary.

In [261]:
def create_main_thumbnail(image_path, output_path, resource_dictionary):
    """Write a JPEG thumbnail of an image and record it in the manifest.

    The thumbnail fits inside 480x480 and is saved as
    ``<base_name>_thumbnail.jpg`` inside `output_path`.

    Args:
        image_path: Path of the source image.
        output_path: Directory that receives the thumbnail; created if missing.
        resource_dictionary: Manifest keyed by the image's file name. The
            entry for this image must already exist; it gains the keys
            ``'original_format'`` (the source image's `.size`) and
            ``'thumbnail'`` (the thumbnail's file name, without directory).

    Raises:
        KeyError: If the image's file name is not a key of
            `resource_dictionary`.
    """
    resource = os.path.basename(image_path)
    # splitext strips only the final extension, so dotted names such as
    # "a.b.jpg" keep their full stem instead of being truncated to "a".
    base_name = os.path.splitext(resource)[0]
    thumbnail_name = f'{base_name}_thumbnail' + '.jpg'

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_path, exist_ok=True)

    # Create the resource thumbnail; the context manager closes the source file.
    with Image.open(image_path) as source_image:
        original_size = source_image.size
        source_image.thumbnail((480, 480))

        # JPEG cannot store alpha or palette data, so convert such images to
        # RGB rather than failing on save.
        if source_image.mode in ('1', 'L', 'RGB', 'CMYK', 'YCbCr'):
            thumbnail_image = source_image
        else:
            thumbnail_image = source_image.convert('RGB')

        thumbnail_image.save(os.path.join(output_path, thumbnail_name))
    print(f'Saved {thumbnail_name}')

    resource_dictionary[resource]['original_format'] = original_size
    resource_dictionary[resource]['thumbnail'] = thumbnail_name



In [262]:
# Generate a thumbnail for every picture listed in the manifest, then echo
# the updated manifest as the cell result.
for image_name in dummy_dictionary:
    create_main_thumbnail(
        f'../early_work/images/{image_name}',
        '../workflow_ui_sample/',
        dummy_dictionary,
    )

dummy_dictionary

Saved image_2017454465_thumbnail.jpg
Saved image_2007684280_thumbnail.jpg


Saved image_2011632159_thumbnail.jpg
Saved image_2017878880_thumbnail.jpg
Saved image_2003654393_thumbnail.jpg
Saved image_88694120_thumbnail.jpg
Saved image_00650949_thumbnail.jpg
Saved image_2016869441_thumbnail.jpg
Saved image_2010641712_thumbnail.jpg
Saved image_2016866957_thumbnail.jpg
Saved image_2016873397_thumbnail.jpg
Saved image_00650962_thumbnail.jpg


{'image_2017454465.jpg': {'resource_id': 2017454465,
  'original_format': (4019, 6252),
  'thumbnail': 'image_2017454465_thumbnail.jpg'},
 'image_2007684280.jpg': {'resource_id': 2007684280,
  'original_format': (2604, 2151),
  'thumbnail': 'image_2007684280_thumbnail.jpg'},
 'image_2011632159.jpg': {'resource_id': 2011632159,
  'original_format': (1213, 1690),
  'thumbnail': 'image_2011632159_thumbnail.jpg'},
 'image_2017878880.jpg': {'resource_id': 2017878880,
  'original_format': (808, 1024),
  'thumbnail': 'image_2017878880_thumbnail.jpg'},
 'image_2003654393.jpg': {'resource_id': 2003654393,
  'original_format': (1024, 705),
  'thumbnail': 'image_2003654393_thumbnail.jpg'},
 'image_88694120.jpg': {'resource_id': 88694120,
  'original_format': (1205, 994),
  'thumbnail': 'image_88694120_thumbnail.jpg'},
 'image_00650949.jpg': {'resource_id': 650949,
  'original_format': (1024, 628),
  'thumbnail': 'image_00650949_thumbnail.jpg'},
 'image_2016869441.jpg': {'resource_id': 2016869441,

IV. Load the Faster-RCNN Model and Weights

In [263]:
# Step 1: Pick the default pretrained weights for Faster R-CNN.
weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
# Step 2: Initialize the inference transforms that ship with those weights.
preprocess = weights.transforms()
# Step 3: Build the detector with a 0.9 box score threshold and switch it to
# evaluation mode.
model = fasterrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.9).eval()

V. Load the EfficientSAM Model from its directory

In [264]:
# Importing the EfficientSAM model. The package and its weight files are
# addressed relative to the EfficientSAM checkout, so the working directory is
# switched there temporarily.
import zipfile

parent_dir = os.getcwd()
os.chdir("../early_work/efficient_sam/EfficientSAM")
try:
    # NOTE(review): this import presumably resolves because the current
    # working directory is on sys.path in the notebook kernel -- confirm.
    from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits

    efficient_sam_vitt_model = build_efficient_sam_vitt()
    efficient_sam_vitt_model.eval()

    # Since EfficientSAM-S checkpoint file is >100MB, we store the zip file.
    with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", 'r') as zip_ref:
        zip_ref.extractall("weights")
    efficient_sam_vits_model = build_efficient_sam_vits()
    efficient_sam_vits_model.eval()
finally:
    # Always restore the original directory. Otherwise a failure above leaves
    # the kernel in the EfficientSAM folder and breaks every later relative
    # path, including a re-run of this cell.
    os.chdir(parent_dir)


In [265]:
def _close_mask(binary_mask, structuring_value):
    """Dilate then erode `binary_mask` with a square element to fill small gaps."""
    structuring_element = np.ones((structuring_value, structuring_value), dtype=bool)
    dilated_mask = binary_dilation(binary_mask, structure=structuring_element)
    return binary_erosion(dilated_mask, structure=structuring_element)


def _make_cutout(image_bgr, mask):
    """Return a BGRA uint8 image: source pixels inside `mask`, transparent elsewhere."""
    keep = np.asarray(mask, dtype=bool)
    bgr = np.where(keep[..., None], image_bgr, 0).astype(np.uint8)
    alpha = np.where(keep, 255, 0).astype(np.uint8)
    return np.dstack((bgr, alpha))


def process_image(image_path, output_path, resource_dictionary, structuring_value=25, threshold=0.9, max_segments=3):
    """Detect objects in an image and export a mask and a cutout for each.

    Runs the module-level detector (`model`, `preprocess`, `weights`) on the
    image, then prompts `efficient_sam_vitt_model` with each retained bounding
    box. For every retained detection it writes
    ``masks/mask_{resource_id}_{label}_{i}.png`` and
    ``cutouts/cutout_{resource_id}_{label}_{i}.png`` under `output_path`,
    each cropped to the padded box and shrunk to thumbnail size.

    Args:
        image_path: Path of the source image.
        output_path: Directory that receives the ``masks/`` and ``cutouts/``
            sub-folders; they are created if missing.
        resource_dictionary: Manifest keyed by the image's file name. The
            entry must already hold ``'resource_id'``. When the detector
            returns at least one prediction, the entry's ``'segments'`` key is
            reset to a list of dicts with the keys ``'bounding_box'``
            (absolute pixel coordinates), ``'label'``, ``'cutout'`` and
            ``'mask'``. When it returns none, the entry is left untouched.
        structuring_value: Side length of the square structuring element used
            to close gaps in the predicted mask.
        threshold: Minimum detection score for a box to be processed.
        max_segments: Maximum number of detections exported per image.
            Defaults to 3, the cap previously hard-coded.

    Raises:
        KeyError: If the image's file name is not in `resource_dictionary`.
    """
    # Read the image and get the prediction from the detector. This is pure
    # inference, so autograd bookkeeping is disabled.
    img = read_image(image_path)
    batch = [preprocess(img)]
    with torch.no_grad():
        prediction = model(batch)[0]

    if len(prediction['labels']) == 0:
        print(f'No Object Detection predictions within the Scope of MS COCO dataset: {os.path.basename(image_path)}')
        return

    # Number of detections meeting the threshold, capped at `max_segments`.
    # The first `score_len` entries are then used.
    # NOTE(review): assumes the detector returns boxes ordered by descending
    # score -- confirm.
    score_len = min((prediction["scores"] >= threshold).sum().item(), max_segments)

    resource = os.path.basename(image_path)
    resource_id = resource_dictionary[resource]['resource_id']
    resource_dictionary[resource]['segments'] = []

    # cv2.imwrite fails silently when the target folder is missing, so make
    # sure both output folders exist up front.
    os.makedirs(os.path.join(output_path, 'masks'), exist_ok=True)
    os.makedirs(os.path.join(output_path, 'cutouts'), exist_ok=True)

    # Loop-invariant work: convert the tensors once and read the image once.
    boxes = prediction['boxes'].tolist()[:score_len]
    class_indices = prediction['labels'].tolist()[:score_len]
    img_val = cv2.imread(image_path)

    for i, (bbox, class_index) in enumerate(zip(boxes, class_indices)):
        segment = {}
        segment['bounding_box'] = bbox
        x1, y1, x2, y2 = bbox

        class_label = weights.meta["categories"][class_index]

        # The two box corners are passed as prompt points.
        # NOTE(review): labels 2 and 3 presumably mark the top-left and
        # bottom-right corners of a box prompt for EfficientSAM -- confirm.
        input_point = np.array([[x1, y1], [x2, y2]])
        input_label = np.array([2, 3])

        mask_efficient_sam_vitt = run_ours_box_or_points(image_path, input_point, input_label, efficient_sam_vitt_model)
        closed_mask = _close_mask(mask_efficient_sam_vitt, structuring_value)
        closed_mask_uint8 = (closed_mask * 255).astype(np.uint8)

        mask_path = os.path.join(output_path, f'masks/mask_{resource_id}_{class_label}_{i}' + '.png')
        cv2.imwrite(mask_path, closed_mask_uint8)

        # Build the BGRA cutout straight from the mask. The earlier version
        # (noted as partly taken from an external source, "substack")
        # multiplied the image by itself, which darkened every pixel to
        # value**2 / 255. It also derived alpha from the pixel sum, which made
        # pure-black object pixels transparent.
        res = _make_cutout(img_val, closed_mask)

        # Save result
        cutout_path = os.path.join(output_path, f'cutouts/cutout_{resource_id}_{class_label}_{i}' + '.png')
        cv2.imwrite(cutout_path, res)

        crop_image(cutout_path, x1, y1, x2, y2)
        crop_image(mask_path, x1, y1, x2, y2)
        resize_to_thumbnail(cutout_path)
        resize_to_thumbnail(mask_path)

        segment['label'] = class_label
        segment['cutout'] = cutout_path
        segment['mask'] = mask_path

        resource_dictionary[resource]['segments'].append(segment)


In [266]:
# Run detection + segmentation for every picture listed in the manifest.
for image_name in dummy_dictionary:
    process_image(
        f'../early_work/images/{image_name}',
        '../workflow_ui_sample/',
        dummy_dictionary,
    )


No Object Detection predictions within the Scope of MS COCO dataset: image_2011632159.jpg
No Object Detection predictions within the Scope of MS COCO dataset: image_88694120.jpg


In [267]:
import json

# Persist the manifest next to the notebook as JSON.
with open("dummy_manifest.json", "w") as manifest_file:
    json.dump(obj=dummy_dictionary, fp=manifest_file)