<div align="center">
<img src="https://blog.ucsp.edu.pe/hs-fs/hubfs/logo-de-UCSP-16.png?width=250&height=133&name=logo-de-UCSP-16.png" width="300" >
<br>
<a href="#">
  <img src="https://img.shields.io/badge/Artificial-Intelligence-orange" alt="Artificial Intelligence badge" />
  <img src="https://img.shields.io/badge/CCOMP-UCSP-brightgreen" alt="CCOMP UCSP badge" />
</a>
</div>
This notebook uses DETR, a Transformer-based detection network, to analyse objects from a live camera feed in real time, and a panoptic DETR model for classification and scene-context understanding.

## Integrantes
* [Chillitupa Quispihuanca, Alfred Addison](projects/)
* [Muñoz Curi, Rayver Aimar](projects/)
* [Gomez del Carpio, Alexander](projects/)
* [Quispe Salcedo, Josep](projects/)

# Research DETR

## Import Dependencies

### Install Dependencies

In [None]:
#!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117

In [None]:
#!git clone https://github.com/ultralytics/yolov5

In [None]:
#!pip3 install -r yolov5/requirements.txt

In [None]:
#! pip install git+https://github.com/cocodataset/panopticapi.git

### Python Dependencies

In [None]:
import numpy as np
import math
import requests
import io

from matplotlib import pyplot as plt
from PIL import Image

### Computer Vision Dependencies

In [None]:
import torch
import cv2
from torch import nn
from torchvision.models import resnet50

%config InlineBackend.figure_format = 'retina'
import ipywidgets as widgets
from IPython.display import display, clear_output

### Transformers Dependencies

In [None]:
import torchvision.transforms as T
torch.set_grad_enabled(False);

### Transformers Panoptic Dependencies

In [None]:
import panopticapi
import itertools
import seaborn as sns

from panopticapi.utils import id2rgb, rgb2id

## Model DETR

### Setup Coco Classes

In [None]:
# COCO class names; plot_results and transformer_mapsss look a label up by
# the index of the highest-scoring class.
# NOTE(review): the 'N/A' entries look like placeholders that keep list
# positions aligned with the model's class ids -- confirm against the COCO
# category table.
CLASSES = [
    'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
    'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
    'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush'
]

# Six RGB triples (components in the 0-1 range) that plot_results repeats
# to colour successive bounding boxes.
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]

### Input Image Normalization

In [None]:
# Preprocessing applied to every PIL image before it is fed to the model:
# resize, convert to a tensor, then normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet channel
# statistics -- confirm they match what the pretrained weights expect.
transform = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

### Bounding box and post-processing

In [None]:
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to corner form.

    Args:
        x: Tensor whose last dimension has size 4, holding
            (center_x, center_y, width, height). Any number of leading
            dimensions is accepted, e.g. (4,), (N, 4) or (B, N, 4).

    Returns:
        Tensor of the same shape holding (x_min, y_min, x_max, y_max).
    """
    # Operate on the last dimension so the conversion does not depend on
    # how many leading (batch) dimensions the caller provides.
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)

def rescale_bboxes(out_bbox, size):
    """Convert center-format boxes to corner format and scale them by an image size.

    Args:
        out_bbox: Tensor of boxes in (center_x, center_y, width, height) form.
            # NOTE(review): callers pass the model's 'pred_boxes', presumably
            # normalised to [0, 1] -- confirm.
        size: (width, height) pair; x coordinates are multiplied by the
            width and y coordinates by the height.

    Returns:
        Tensor of (x_min, y_min, x_max, y_max) boxes multiplied by the image size.
    """
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    # Build the scale factors on the same device as the boxes so the
    # multiplication also works when the model runs on an accelerator.
    scale = torch.tensor([img_w, img_h, img_w, img_h],
                         dtype=torch.float32, device=b.device)
    return b * scale

def plot_results(pil_img, prob, boxes):
    """Show an image with one labelled rectangle per detection.

    Args:
        pil_img: Image handed to ``plt.imshow``.
        prob: Per-detection class scores; the arg-max of each row selects
            the label from ``CLASSES`` and the score printed next to it.
        boxes: Tensor of (x_min, y_min, x_max, y_max) boxes, one per row
            of ``prob``.

    The figure is created and drawn on, but ``plt.show()`` is not called here.
    """
    plt.figure(figsize=(16,10))
    plt.imshow(pil_img)
    axes = plt.gca()
    # Repeat the six base colours so consecutive boxes get distinct outlines.
    palette = COLORS * 100
    for scores, corners, colour in zip(prob, boxes.tolist(), palette):
        left, top, right, bottom = corners
        outline = plt.Rectangle((left, top), right - left, bottom - top,
                                fill=False, color=colour, linewidth=3)
        axes.add_patch(outline)
        best = scores.argmax()
        label = f'{CLASSES[best]}: {scores[best]:0.2f}'
        axes.text(left, top, label, fontsize=15,
                  bbox=dict(facecolor='yellow', alpha=0.5))
    plt.axis('off')

### Load Model

In [None]:
# Load the pretrained DETR detector (ResNet-50 variant) through torch.hub.
model = torch.hub.load('facebookresearch/detr', 'detr_resnet50', pretrained=True)
# Put the network in inference mode so layers such as dropout are disabled
# and repeated runs on the same frame give the same predictions.
model.eval();

### Transformer Object Detection

In [None]:
def transformer_detect( _frame, _accuracy = 0.5 ):
    """Run the detector on one frame and plot the detections above a threshold.

    Args:
        _frame: Image array in OpenCV's BGR channel order; it is flipped
            along axis 1 and converted to RGB before inference.
        _accuracy: Minimum top-class probability a query must exceed to be kept.

    Returns:
        A list ``[image, probabilities, keep_mask, boxes, outputs]``: the
        PIL image that was analysed, the per-query class probabilities, the
        boolean mask of kept queries, the kept boxes scaled to the image
        size, and the raw model output.
    """
    mirrored_rgb = cv2.cvtColor(np.flip(_frame, 1), cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(mirrored_rgb)
    batch = transform(pil_image).unsqueeze(0)

    outputs = model(batch)
    # The last logit column is dropped before thresholding
    # (presumably the "no object" class -- confirm).
    class_probs = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    confident = class_probs.max(-1).values > _accuracy

    kept_boxes = outputs['pred_boxes'][0, confident]
    scaled_boxes = rescale_bboxes(kept_boxes, pil_image.size)
    plot_results(pil_image, class_probs[confident], scaled_boxes)

    return [pil_image, class_probs, confident, scaled_boxes, outputs]

### Map of Queries

In [None]:
def transformer_mapsss( data ):
    """Plot the decoder attention map of every kept query next to its box.

    Args:
        data: The list returned by ``transformer_detect``:
            ``[image, probabilities, keep_mask, boxes, outputs]``.

    Returns:
        ``[conv_features, enc_attn_weights, dec_attn_weights]`` captured
        from the backbone, the last encoder layer and the last decoder
        layer during one forward pass.
    """
    _img = transform(data[0]).unsqueeze(0)
    conv_features, enc_attn_weights, dec_attn_weights = [], [], []

    # Forward hooks record intermediate outputs while the model runs.
    hooks = [
        model.backbone[-2].register_forward_hook(
            lambda module, inputs, output: conv_features.append(output)),
        model.transformer.encoder.layers[-1].self_attn.register_forward_hook(
            lambda module, inputs, output: enc_attn_weights.append(output[1])),
        model.transformer.decoder.layers[-1].multihead_attn.register_forward_hook(
            lambda module, inputs, output: dec_attn_weights.append(output[1])),
    ]

    try:
        # propagate through the model
        model(_img)
    finally:
        # Always detach the hooks, even if the forward pass fails, so they
        # do not pile up on the shared model across calls.
        for hook in hooks:
            hook.remove()

    conv_features = conv_features[0]
    enc_attn_weights = enc_attn_weights[0]
    dec_attn_weights = dec_attn_weights[0]

    h, w = conv_features['0'].tensors.shape[-2:]

    boxes = [tuple(float(v) for v in box) for box in data[3]]
    if not boxes:
        # Nothing was detected: there is nothing to plot, and
        # plt.subplots cannot build a grid with zero columns.
        return [conv_features, enc_attn_weights, dec_attn_weights]

    # squeeze=False keeps axs two-dimensional even for a single detection,
    # so each column can always be indexed as (attention, image).
    fig, axs = plt.subplots(ncols=len(boxes), nrows=2, figsize=(22, 7), squeeze=False)
    for idx, ax_i, (xmin, ymin, xmax, ymax) in zip(data[2].nonzero(), axs.T, boxes):
        ax = ax_i[0]
        ax.imshow(dec_attn_weights[0, idx].view(h, w))
        ax.axis('off')
        ax.set_title(f'query id: {idx.item()}')
        ax = ax_i[1]
        ax.imshow(data[0])
        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False, color='blue', linewidth=3))
        ax.axis('off')
        ax.set_title(CLASSES[data[1][idx].argmax()])
    fig.tight_layout()

    return [conv_features, enc_attn_weights, dec_attn_weights]
    

### Sampling Self-Attention

In [None]:
def self_attention(data, conv_data, idxs=None, fact=32):
    """Visualise encoder self-attention for a few reference points.

    Args:
        data: The list returned by ``transformer_detect``; only ``data[0]``
            (the PIL image) is used.
        conv_data: The list returned by ``transformer_mapsss``:
            ``[conv_features, enc_attn_weights, dec_attn_weights]``.
        idxs: Iterable of (row, column) pixel positions in the resized
            model input. Defaults to four fixed points. Points that fall
            outside the feature map are skipped, and at most four points
            are shown (one per side panel).
        fact: Number of input pixels per feature-map cell, used to map
            pixel positions to feature-map cells.

    Shows a figure with one attention map per reference point around the
    original image, on which each point is marked with a red circle.
    """
    if idxs is None:
        idxs = [(200, 200), (450, 520), (540, 900), (540, 800)]

    _img = transform(data[0]).unsqueeze(0)
    f_map = conv_data[0]['0']
    shape = f_map.tensors.shape[-2:]

    # View the attention weights as (rows, cols, rows, cols) so one
    # feature-map cell can be selected with a pair of indices.
    sattn = conv_data[1][0].reshape(shape + shape)

    fig = plt.figure(constrained_layout=True, figsize=(25 * 0.7, 8.5 * 0.7))

    gs = fig.add_gridspec(2, 4)
    axs = [
        fig.add_subplot(gs[0, 0]),
        fig.add_subplot(gs[1, 0]),
        fig.add_subplot(gs[0, -1]),
        fig.add_subplot(gs[1, -1]),
    ]

    # Keep only points whose cell exists in the feature map; small or
    # portrait frames can be narrower than the default positions assume.
    rows, cols = shape
    points = [(r, c) for (r, c) in idxs
              if 0 <= r // fact < rows and 0 <= c // fact < cols][:len(axs)]

    # Hide every side panel up front so panels left without a point stay blank.
    for ax in axs:
        ax.axis('off')

    for idx_o, ax in zip(points, axs):
        idx = (idx_o[0] // fact, idx_o[1] // fact)
        ax.imshow(sattn[..., idx[0], idx[1]], cmap='cividis', interpolation='nearest')
        ax.axis('off')
        ax.set_title(f'self-attention{idx_o}')

    fcenter_ax = fig.add_subplot(gs[:, 1:-1])
    fcenter_ax.imshow(data[0])
    # Ratio between the original image height and the resized model input,
    # used to place the markers on the original image.
    scale = data[0].height / _img.shape[-2]
    for (y, x) in points:
        # Snap each point to the centre of its feature-map cell.
        x = ((x // fact) + 0.5) * fact
        y = ((y // fact) + 0.5) * fact
        fcenter_ax.add_patch(plt.Circle((x * scale, y * scale), fact // 2, color='r'))
    fcenter_ax.axis('off')
    plt.axis('off')
    plt.show()

### Example

In [None]:
#url = 'https://www.teknofilo.com/wp-content/uploads/2019/12/41512060690_57ed31344c_h1-1280x754.jpg'
#im = Image.open(requests.get(url, stream=True).raw)
#im

In [None]:
#img = transform(im).unsqueeze(0)

#outputs = model(img)

#probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
#keep = probas.max(-1).values > 0.5

#bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)

In [None]:
#plot_results(im, probas[keep], bboxes_scaled)

## Model DETR Object Tracking and Analysis

In [None]:
# Show the live camera feed and run the DETR analysis on one frame per cycle.
cap = cv2.VideoCapture(0)
# Frame counter that wraps at 250; the analysis runs when it reaches 150.
i = 1

try:
    while cap.isOpened():
        ret, frame = cap.read()

        # ret is False when no frame could be grabbed; stop instead of
        # passing an invalid frame to the model or to imshow.
        if not ret:
            break

        if i == 150:
            _im = transformer_detect(frame)
            _conv_data = transformer_mapsss( _im )
            self_attention( _im, _conv_data )

        cv2.imshow('YOLO', np.flip(frame, 1))

        i = (i + 1) % 250

        # Mask to the low byte so platform-specific high bits in the key
        # code do not defeat the comparison.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window even if the analysis raised.
    cap.release()
    cv2.destroyAllWindows()

## Model Panoptic DETR

### Load Model

In [None]:
# Load the pretrained panoptic DETR (ResNet-101 variant) together with its post-processor.
# NOTE(review): this rebinds the global `model` that transformer_detect and
# transformer_mapsss read, so re-running the detection cells after this one
# would use the panoptic model instead of the detector.
model, postprocessor = torch.hub.load('facebookresearch/detr', 'detr_resnet101_panoptic', pretrained=True, return_postprocessor=True, num_classes=250)
# Put the network in inference mode so layers such as dropout are disabled.
model.eval();

### Computing Mask

In [None]:
# Preprocessing for the panoptic model: resize, convert to a tensor, then
# normalise each channel. This repeats the pipeline defined for the
# detection model earlier in the notebook with the same values.
transform = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [None]:
def computing_Mask( _frame, _accuracy = 0.85 ):
    """Run the panoptic model on one frame and plot its masks and segmentation.

    Args:
        _frame: Image array in OpenCV's BGR channel order; it is flipped
            along axis 1 and converted to RGB before inference.
        _accuracy: Minimum top-class probability a query must exceed for
            its mask to be plotted.

    Returns:
        The raw model output.
    """
    _imageRGB = cv2.cvtColor(np.flip(_frame, 1), cv2.COLOR_BGR2RGB)
    _im = Image.fromarray(_imageRGB)
    _img = transform(_im).unsqueeze(0)
    out = model(_img)

    # Best class probability per query, with the last logit column dropped
    # (presumably the "no object" class -- confirm).
    scores = out["pred_logits"].softmax(-1)[..., :-1].max(-1)[0]

    keep = scores > _accuracy

    ncols = 5
    # Always request at least one row: plt.subplots cannot build a grid
    # with zero rows when no query passes the threshold.
    nrows = max(1, math.ceil(keep.sum().item() / ncols))

    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, squeeze=False)

    # Hide every panel first so unused grid cells stay blank.
    for a in axs.flat:
        a.axis('off')

    for i, mask in enumerate(out["pred_masks"][keep]):
        ax = axs[i // ncols, i % ncols]
        ax.imshow(mask, cmap="cividis")
        ax.axis('off')

    fig.tight_layout()

    result = postprocessor(out, torch.as_tensor(_img.shape[-2:]).unsqueeze(0))[0]
    palette = itertools.cycle(sns.color_palette())

    # The post-processor returns the segmentation as PNG bytes; decode it
    # and turn the RGB encoding into one integer id per pixel.
    panoptic_seg = Image.open(io.BytesIO(result['png_string']))
    panoptic_seg = np.array(panoptic_seg, dtype=np.uint8).copy()

    panoptic_seg_id = rgb2id(panoptic_seg)

    # Repaint every segment id with the next colour of the palette.
    panoptic_seg[:, :, :] = 0
    for segment_id in range(panoptic_seg_id.max() + 1):
        panoptic_seg[panoptic_seg_id == segment_id] = np.asarray(next(palette)) * 255
    plt.figure(figsize=(15,15))
    plt.imshow(panoptic_seg)
    plt.axis('off')
    plt.show()

    return out

## Model DETR Panoptic Object Tracking and Analysis

In [None]:
# Show the live camera feed and run the panoptic analysis on one frame per cycle.
cap = cv2.VideoCapture(0)
# Frame counter that wraps at 250; the analysis runs when it reaches 150.
i = 1

try:
    while cap.isOpened():
        ret, frame = cap.read()

        # ret is False when no frame could be grabbed; stop instead of
        # passing an invalid frame to the model or to imshow.
        if not ret:
            break

        if i == 150:
            _out = computing_Mask(frame)
            #color_Panoptical(_out, frame)

        cv2.imshow('YOLO', np.flip(frame, 1))

        i = (i + 1) % 250

        # Mask to the low byte so platform-specific high bits in the key
        # code do not defeat the comparison.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window even if the analysis raised.
    cap.release()
    cv2.destroyAllWindows()