In [None]:
import numpy as np
import cv2
import torch
import torch.nn.functional as F
import torchvision
import math
import time
import os
import io
import requests
import zipfile
from google.colab.patches import cv2_imshow

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
def download_zipfile(path,url):
    """Download a zip archive and unpack it into the current directory.

    Nothing is fetched when `path` already exists, so repeated runs reuse the
    result of an earlier download.

    Args:
        path: File or directory expected to exist once the archive has been
            unpacked; only used for the "already there" check.
        url: Location of the zip archive.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if os.path.exists(path):
        return
    print("downloading",url)
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of printing "downloaded" with nothing extracted.
    response.raise_for_status()
    # Unpack straight from memory; the context manager closes the archive afterwards.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(".")
    print("downloaded")

def download_glee():
    """Fetch and unpack the GLEE checkpoint archive unless the checkpoint file already exists."""
    checkpoint_file = 'GLEEmodel_swin_complete.pth'
    archive_url = 'http://www.agentspace.org/download/GLEEmodel_swin_complete.zip'
    download_zipfile(checkpoint_file, archive_url)

# Fetch the checkpoint as soon as this cell runs.
download_glee()

In [None]:
# The checkpoint stores a complete pickled model object, so it needs the full
# unpickler (weights_only=False); newer PyTorch releases default to weights-only loading.
# NOTE(review): unpickling can execute arbitrary code and the archive is fetched
# over plain HTTP - only load it from a source you trust.
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only runtime.
model = torch.load('GLEEmodel_swin_complete.pth', map_location=device, weights_only=False).to(device)

In [None]:
# Download the sample image (COCO val2017, id 000000039769) and save it as twocats.jpg.
!wget -O "twocats.jpg" "http://images.cocodataset.org/val2017/000000039769.jpg"

In [None]:
# Path of the image that is read and passed through the model.
imgpath = "twocats.jpg"

In [None]:
img = cv2.imread(imgpath)
# cv2.imread reports failure by returning None rather than raising, so check it here
# instead of letting a later cell fail with an unrelated error.
if img is None:
    raise FileNotFoundError(f"could not read image: {imgpath}")
cv2_imshow(img)

In [None]:
# Show the loaded image; the Colab display helper imported by this notebook is
# `cv2_imshow` (there is no `cv_imshow`).
cv2_imshow(img)

In [None]:
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to corner form.

    Args:
        x: Tensor whose last dimension holds the four values cx, cy, w, h.

    Returns:
        Tensor of the same shape whose last dimension holds
        (x_min, y_min, x_max, y_max).
    """
    center_x, center_y, width, height = x.unbind(-1)
    half_w = 0.5 * width
    half_h = 0.5 * height
    corners = (center_x - half_w, center_y - half_h,
               center_x + half_w, center_y + half_h)
    return torch.stack(corners, dim=-1)

def LSJ_box_postprocess(out_bbox, padding_size, crop_size, img_h, img_w):
    """Map predicted boxes from the padded model input back to image pixels.

    The boxes are converted to corner form, multiplied by the padded size,
    divided by the crop size, clamped to [0, 1] and finally multiplied by the
    image width/height.

    Args:
        out_bbox: Boxes in (cx, cy, w, h) form - presumably normalized to the
            padded input; TODO confirm against the model.
        padding_size: (height, width) of the padded model input.
        crop_size: (height, width) of the resized image before padding.
        img_h: Height of the original image.
        img_w: Width of the original image.

    Returns:
        Boxes in (x_min, y_min, x_max, y_max) form, with the dtype and device
        of `out_bbox`.
    """
    def _xyxy_scale(width, height):
        # Per-coordinate factor for an (x, y, x, y) box, matched to out_bbox's dtype/device.
        return torch.tensor([width, height, width, height]).to(out_bbox)

    boxes = box_cxcywh_to_xyxy(out_bbox)
    boxes = boxes * _xyxy_scale(padding_size[1], padding_size[0])
    boxes = boxes / _xyxy_scale(crop_size[1], crop_size[0])
    # Anything reaching into the padding is cut back to the image border.
    boxes = torch.clamp(boxes, 0, 1)
    return boxes * _xyxy_scale(img_w, img_h)

In [None]:
# Task to run and the referring expression describing the object to find.
task = "grounding"
input_expressions = ["the first sleeping cat from the right side"]
# The prompts are stored under the task name.
prompt_list = {task: input_expressions}

In [None]:
# preprocessing
# Convert the BGR image read by OpenCV to RGB for the rest of the pipeline.
copyed_img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
print('input shape',copyed_img.shape)
# Per-channel mean and std, shaped (3, 1, 1) so they broadcast over a CHW image.
pixel_mean = torch.tensor([123.675, 116.28, 103.53]).to(device).view(3, 1, 1)
pixel_std = torch.tensor([58.395, 57.12, 57.375]).to(device).view(3, 1, 1)

def normalizer(x):
    """Standardize an image tensor with the per-channel mean and std."""
    return (x - pixel_mean) / pixel_std

inference_size = 800
# Resize with a single int scales the shorter image side to that length.
resizer = torchvision.transforms.Resize(inference_size,antialias=True)
size_divisibility = 32
# HWC array -> CHW tensor -> normalized tensor with a leading batch dimension.
ori_image = torch.as_tensor(np.ascontiguousarray( copyed_img.transpose(2, 0, 1)))
ori_image = normalizer(ori_image.to(device))[None,]
_,_, ori_height, ori_width = ori_image.shape
resize_image = resizer(ori_image)
image_size = torch.as_tensor((resize_image.shape[-2],resize_image.shape[-1]))
re_size = resize_image.shape[-2:]
if size_divisibility > 1:
    stride = size_divisibility
    # Round height and width up to the next multiple of the stride.
    padding_size = ((image_size + (stride - 1)).div(stride, rounding_mode="floor") * stride).tolist()
    # Zero-pad at the bottom/right; the resized image sits in the top-left corner.
    infer_image = torch.zeros(1,3,padding_size[0],padding_size[1]).to(resize_image)
    infer_image[0,:,:image_size[0],:image_size[1]] = resize_image
    infer_image = infer_image.to(device)
else:
    # No padding requested: feed the resized image as is, so the names used by
    # the following cells are defined in this case too.
    padding_size = image_size.tolist()
    infer_image = resize_image.to(device)

In [None]:
# Warm-up: one forward pass on random input of the same shape as the real
# input, timed separately from the actual inference run.
t0 = time.time()
dummy = torch.rand(infer_image.shape).to(device)
with torch.no_grad():
    (outputs,_) = model(dummy, prompt_list, task=task, batch_name_list=[], is_train=False)
# CUDA work is queued asynchronously; wait for it so the measured time covers the whole pass.
if torch.cuda.is_available():
    torch.cuda.synchronize()
t1 = time.time()
print(f'model loaded in {t1-t0}s')

In [None]:
# Result kinds of interest.
# NOTE(review): not referenced by any other cell visible in this notebook.
results_select = [
    'box',
    'mask',
    'name',
    'score',
    'expression',
]

In [None]:
# run model
t0 = time.time()

with torch.no_grad():
    # Use the shared `task` setting, as the warm-up call does, instead of a second literal.
    (outputs,_) = model(infer_image, prompt_list, task=task, batch_name_list=[], is_train=False)

# First batch element of each output, moved to the CPU for postprocessing.
mask_pred = outputs['pred_masks'][0].to('cpu')
mask_cls = outputs['pred_logits'][0].to('cpu')
boxes_pred = outputs['pred_boxes'][0].to('cpu')

t1 = time.time()
print('elapsed',t1-t0,'s') # 2.96 s including model loading, 0.65 s without loading on CUDA, 7.86 s on CPU

In [None]:
# postprocessing
# Score of each prediction = its highest sigmoid class probability.
scores = mask_cls.sigmoid().max(-1)[0]
topK_instance = 1
scores_per_image, topk_indices = scores.topk(topK_instance, sorted=True)

pred_class = mask_cls[topk_indices].max(-1)[1].tolist()
pred_boxes = boxes_pred[topk_indices]

boxes = LSJ_box_postprocess(pred_boxes,padding_size,re_size, ori_height,ori_width)
# Keep the selection under its own name: overwriting mask_pred would make a
# re-run of this cell index the already-reduced tensor.
topk_masks = mask_pred[topk_indices]
# Upsample to the padded input size, cut the padding away, then resize to the original image.
pred_masks = F.interpolate( topk_masks[None,], size=(padding_size[0], padding_size[1]), mode="bilinear", align_corners=False  )
pred_masks = pred_masks[:,:,:re_size[0],:re_size[1]]
pred_masks = F.interpolate( pred_masks, size=(ori_height,ori_width), mode="bilinear", align_corners=False  )
# Positive values count as mask pixels; drop the batch dimension.
pred_masks = (pred_masks>0).detach().cpu().numpy()[0]

In [None]:
# visualization
# Palette used to tint the masks; it is cycled when there are more masks than colors.
COLORS = [
    [0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
    [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933],
    [0.494, 0.000, 0.556], [0.494, 0.000, 0.000], [0.000, 0.745, 0.000],
    [0.700, 0.300, 0.600], [0.000, 0.447, 0.741], [0.850, 0.325, 0.098]
]

# Accumulate one tinted layer per mask; the color components are written in reversed order.
zero_mask = np.zeros_like(copyed_img)
for nn, mask in enumerate(pred_masks):
    mask = mask.reshape(mask.shape[0], mask.shape[1], 1)
    color = COLORS[nn % len(COLORS)]
    lar = np.concatenate((mask*color[2], mask*color[1], mask*color[0]), axis = 2)
    zero_mask = zero_mask+ lar

# Per-channel flag: True where some mask contributed a non-zero color value.
lar_valid = zero_mask>0
masked_image = lar_valid*copyed_img
mask_image_mix_ration = 0.65
# Blend the image with the tint inside the masks.
img_n = masked_image*mask_image_mix_ration + np.clip(zero_mask,0,1)*255*(1-mask_image_mix_ration)
max_p = img_n.max()
# With no mask pixels img_n is all zeros; skip the rescale to avoid 0/0 -> NaN.
if max_p > 0:
    img_n = 255*img_n/max_p
# Outside the masks keep the dimmed original image.
ret = ((~lar_valid)*copyed_img)*mask_image_mix_ration + img_n
ret = ret.astype('uint8')
retimg = cv2.cvtColor(ret,cv2.COLOR_RGB2BGR)

In [None]:
# Show the mask overlay; the Colab display helper imported by this notebook is
# `cv2_imshow` (there is no `cv_imshow`).
cv2_imshow(retimg)

In [None]:
# Turn the first predicted mask into an 8-bit image: every non-zero entry becomes 255.
mask = np.squeeze(pred_masks[0].astype(np.uint8)) * 255

In [None]:
# Show the binary mask; the Colab display helper imported by this notebook is
# `cv2_imshow` (there is no `cv_imshow`).
cv2_imshow(mask)