# End to End Object Detection with Transformers in ART

Demo for applying the DEtection TRansformer (DETR) estimator in ART for object detection and attacking the Detection Transformer using the Robust DPatch method.

### Define imports, constants and helper functions

In [1]:
import sys 
!{sys.executable} -m pip install --user adversarial-robustness-toolbox
!{sys.executable} -m pip install --user torch
!{sys.executable} -m pip install --user pillow
!{sys.executable} -m pip install --user fiftyone



from art.estimators.object_detection.pytorch_detection_transformer import PyTorchDetectionTransformer
from art.attacks.evasion.adversarial_patch.adversarial_patch_pytorch import AdversarialPatchPyTorch
from torchvision.transforms import transforms
import PIL.Image
import numpy as np
import torch
import cv2
import matplotlib.pyplot as plt
import requests
import pandas as pd

from sklearn.metrics import accuracy_score

COCO_CLASSES = [
    'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
    'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
    'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush'
]

def extract_predictions(predictions_, conf_thresh):
    predictions_class = [COCO_CLASSES[i] for i in list(predictions_["labels"])]
    if len(predictions_class) < 1:
        return [], [], []
        
    predictions_boxes = [[(i[0], i[1]), (i[2], i[3])] for i in list(predictions_["boxes"])]
    predictions_score = list(predictions_["scores"])

    threshold = conf_thresh
    predictions_t = [predictions_score.index(x) for x in predictions_score if x > threshold]
    if len(predictions_t) > 0:
        predictions_t = predictions_t
    else:
        return [], [], []
        
    predictions_boxes = [predictions_boxes[i] for i in predictions_t]
    predictions_class = [predictions_class[i] for i in predictions_t]
    predictions_scores = [predictions_score[i] for i in predictions_t]
    return predictions_class, predictions_boxes, predictions_scores

def plot_image_with_boxes(img, boxes, pred_cls, title):
    text_size = 2
    text_th = 2
    rect_th = 2

    for i in range(len(boxes)):
        cv2.rectangle(img, (int(boxes[i][0][0]), int(boxes[i][0][1])), (int(boxes[i][1][0]), int(boxes[i][1][1])),
                      color=(0, 255, 0), thickness=rect_th)
        cv2.putText(img, pred_cls[i], (int(boxes[i][0][0]), int(boxes[i][0][1])), cv2.FONT_HERSHEY_SIMPLEX, text_size,
                    (0, 255, 0), thickness=text_th)
    plt.figure()
    plt.axis("off")
    plt.title(title)
    plt.imshow(img)

def filter_boxes(predictions, conf_thresh):
    dictionary = {}

    boxes_list = []
    scores_list = []
    labels_list = []

    for i in range(len(predictions[0]["boxes"])):
        score = predictions[0]["scores"][i]
        if score >= conf_thresh:
            boxes_list.append(predictions[0]["boxes"][i])
            scores_list.append(predictions[0]["scores"][[i]])
            labels_list.append(predictions[0]["labels"][[i]])

    dictionary["boxes"] = np.vstack(boxes_list)
    dictionary["scores"] = np.hstack(scores_list)
    dictionary["labels"] = np.hstack(labels_list)

    y = [dictionary]

    return y

MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]
NUMBER_CHANNELS = 3
INPUT_SHAPE = (NUMBER_CHANNELS, 800, 800)

transform = transforms.Compose([
        transforms.Resize([INPUT_SHAPE[1], INPUT_SHAPE[2]], interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor()
    ])



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.10 install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.10 install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.10 install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.10 install --

In [2]:
PATCH_MAX_ITER = 100
WHITEBOX_MAX_ITER = 1

### Load COCO images and resize

In [3]:
import fiftyone as fo
import fiftyone.zoo as foz


dataset = foz.load_zoo_dataset(
    "coco-2017",
    split="validation",
    max_samples=3,
    shuffle=True,
    seed=51,
    label_types=["detections", "segmentations"],
    classes=["person", "car"],
)
try:
    dataset.rename_sample_field("detections", "ground_truth")
except AttributeError:
    pass

# You can now work with the PIL image
coco_images = []

for sample in dataset:
    # Get the file path to the image
    image_path = sample.filepath
    
    # Open the image using PIL
    pil_image = PIL.Image.open(image_path)
    
    # Add the PIL image to the list
    im=transform(pil_image).numpy()
    coco_images.append(im)
    
coco_images = np.array(coco_images)

Downloading split 'validation' to '/users/antilaan/fiftyone/coco-2017/validation' if necessary
Found annotations at '/users/antilaan/fiftyone/coco-2017/raw/instances_val2017.json'
Sufficient images already downloaded
Existing download of split 'validation' is sufficient
Loading 'coco-2017' split 'validation'
 100% |█████████████████████| 3/3 [200.7ms elapsed, 0s remaining, 14.9 samples/s]     
Dataset 'coco-2017-validation-3' created


### Create the detector

https://github.com/Trusted-AI/adversarial-robustness-toolbox/blob/435d4b87c42685ea9ba40dba2eebd94df2ce7ff9/art/estimators/object_detection/pytorch_detection_transformer.py#L42 Tää sanois että nää tulee samalla tavalla ulos tuotla niinkun tässä esimerkissä:
https://docs.voxel51.com/recipes/adding_detections.html
Joten implementoinnin vois tehä suoraan

In [4]:
detector = PyTorchDetectionTransformer(channels_first=True, preprocessing=(MEAN, STD), input_shape=INPUT_SHAPE, clip_values=(0,1))

Using cache found in /users/antilaan/.cache/torch/hub/facebookresearch_detr_main


### Test detector on COCO images

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
classes = dataset.default_classes
predictions_view = dataset.take(3)
print("Testing detector on COCO images")
print(f"detecting {len(coco_images)}")
results = detector.predict(coco_images)
for i in range(len(results)):

    preds = extract_predictions(results[i], 0.8) #0.8 on thresholdi jota voi vaihtaa, 0.8 on ihan hyvä tho
    im = (torch.from_numpy(coco_images)[i].numpy().transpose(1,2,0)*255).astype(np.uint8)
    c, h, w = im.shape
    print(c,h,w)

    image = PIL.Image.open(sample.filepath)
    image = func.to_tensor(image).to(device)
    
    c, h, w = image.shape
    print(c,h,w)
    
    #plot_image_with_boxes(img=im.copy(), boxes=preds[1], pred_cls=preds[0], title="Predictions on image without patch")
    detections = []

    boxes = preds[1]
    labels = preds[0]
    scores = preds[2]
    
    for label, score, box in zip(labels, scores, boxes):
            # Convert to [top-left-x, top-left-y, width, height]
            # in relative coordinates in [0, 1] x [0, 1]
            #x1, y1, x2, y2 = box
            x1, y1 = box[0]
            x2, y2 = box[1]
            print(x1, y1)
            print(x2,y2)
            rel_box = [x1 / w, y1 / h, (x2 - x1) / w, (y2 - y1) / h]
            print(rel_box)

            detections.append(
                fo.Detection(
                    label=label,
                    bounding_box=rel_box,
                    confidence=score
                )
            )

        # Save predictions to dataset
    sample["predictions"] = fo.Detections(detections=detections)
    sample.save()
    i+=1

# Tää pitäs olla kai niin, että yksi predictio on yksi looppi mutta miten???
#for i in range(len(results)):
    #preds = extract_predictions(results[i], 0.8)
    #im = (torch.from_numpy(coco_images)[i].numpy().transpose(1,2,0)*255).astype(np.uint8)
    #plot_image_with_boxes(img=im.copy(), boxes=preds[1], pred_cls=preds[0], title="Predictions on image without patch")
    #boxes = preds[1]
    #locations, sizes = make_into_fo_format(boxes)
 #   pass

predictions = []
#print(f"classes: {preds[0]}")
#print(f"score?: {preds[2]}")

#print(f"len {len(sizes)}")
#print(f"len {len(locations)}")
#print(f"detections: {detections}")
#print(f"sample: {sample}")

predictions = []
single_prediction = {}

Testing detector on COCO images
detecting 3
292.42258 483.70367
311.01352 510.30832
[97.4741923014323, 0.6046295928955078, 6.196980794270833, 0.03325580596923828]
290.45187 595.61755
307.8292 628.79346
[96.81729125976562, 0.7445219421386718, 5.792439778645833, 0.041469879150390625]
239.31216 584.85986
253.77473 616.9072
[79.77072143554688, 0.7310748291015625, 4.8208567301432295, 0.0400592041015625]
21.91446 622.0995
48.291992 674.7508
[7.304819742838542, 0.7776243591308594, 8.792510986328125, 0.06581413269042968]
739.0851 548.5187
799.9551 705.8572
[246.3616943359375, 0.6856483459472656, 20.289998372395832, 0.19667312622070313]
306.42508 625.06964
349.49677 654.15967
[102.14169311523438, 0.7813370513916016, 14.357228597005209, 0.036362533569335935]
299.29602 623.2366
339.17648 651.14124
[99.76534016927083, 0.7790457153320313, 13.293487548828125, 0.03488082885742187]
0.035987794 623.9548
37.256287 672.3843
[0.011995931466420492, 0.7799434661865234, 12.406766255696615, 0.0605368804931640

from torchvision.transforms import functional as func
predictions_view = dataset.take(50, seed=51)
i=0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
classes = dataset.default_classes

with fo.ProgressBar() as pb:
    for sample in pb(predictions_view):
        # Load image
        image = PIL.Image.open(sample.filepath)
        image = func.to_tensor(image).to(device)
        c, h, w = image.shape
        # Perform inference
        #results = detector.predict(coco_images[i])
        #results = detector.predict(coco_images[i])

        preds = extract_predictions(results[i], 0.8)
        labels = preds[0]
        scores = preds[2]
        boxes = preds[1]

        # Convert detections to FiftyOne format
        detections = []
        for label, score, box in zip(labels, scores, boxes):
            # Convert to [top-left-x, top-left-y, width, height]
            # in relative coordinates in [0, 1] x [0, 1]
            #x1, y1, x2, y2 = box
            x1, y1 = box[0]
            x2, y2 = box[1]
            rel_box = [x1 / w, y1 / h, (x2 - x1) / w, (y2 - y1) / h]

            detections.append(
                fo.Detection(
                    label=label,
                    bounding_box=rel_box,
                    confidence=score
                )
            )

        # Save predictions to dataset
        sample["predictions"] = fo.Detections(detections=detections)
        sample.save()
    i+=1
    
print("Finished adding predictions")

# Do evaluation for basic model

In [11]:
from fiftyone import ViewField as F
# Evaluate the predictions in the `faster_rcnn` field of our `high_conf_view`
# with respect to the objects in the `ground_truth` field
high_conf_view = predictions_view.filter_labels("predictions", F("confidence") > 0.8, only_matches=False) #lukua vaihdettu
#pitää kattoa onko "detections" oikeesti se oikea fieldi...
results = high_conf_view.evaluate_detections(
    "predictions",
    gt_field="ground_truth",
    eval_key="asd",
    method="coco",
    compute_mAP=True,
)
# original
'''results = high_conf_view.evaluate_detections(
    "faster_rcnn",
    gt_field="ground_truth",
    eval_key="eval",
    compute_mAP=True,
)'''


Evaluating detections...
 100% |█████████████████████| 3/3 [92.9ms elapsed, 0s remaining, 32.3 samples/s] 
Performing IoU sweep...
 100% |█████████████████████| 3/3 [42.9ms elapsed, 0s remaining, 70.0 samples/s] 


'results = high_conf_view.evaluate_detections(\n    "faster_rcnn",\n    gt_field="ground_truth",\n    eval_key="eval",\n    compute_mAP=True,\n)'

# Show results for basic evaluation

In [13]:
# Get the 10 most common classes in the dataset
counts = dataset.count_values("ground_truth.detections.label")
classes_top10 = sorted(counts, key=counts.get, reverse=True)[:10]
# Print a classification report for the top-10 classes
results.print_report()

print(f"map: {results.mAP()}")

               precision    recall  f1-score   support

          car       0.00      0.00      0.00       9.0
        chair       0.00      0.00      0.00       9.0
          dog       0.00      0.00      0.00       1.0
 fire hydrant       0.00      0.00      0.00       1.0
      handbag       0.00      0.00      0.00       1.0
       orange       0.00      0.00      0.00       1.0
       person       0.00      0.00      0.00       7.0
    stop sign       0.00      0.00      0.00       0.0
traffic light       0.00      0.00      0.00       8.0
     umbrella       0.00      0.00      0.00       3.0

    micro avg       0.00      0.00      0.00      40.0
    macro avg       0.00      0.00      0.00      40.0
 weighted avg       0.00      0.00      0.00      40.0

map: 0.0


# Adversarial patch attack

DPATCH self done https://github.com/Trusted-AI/adversarial-robustness-toolbox/blob/435d4b87c42685ea9ba40dba2eebd94df2ce7ff9/examples/get_started_fasterrcnn.py#L30

In [None]:
from art.attacks.evasion import RobustDPatch
print("Starting robust dpatch")
dets = detector.predict(coco_images)
filtered_dets = [filter_boxes([t], 0.8)[0] for t in dets]

x = coco_images[:-1]
targets = [filtered_dets[-1] for i in range(len(x))]

brightness_range = [1.0, 1.0]
rotation_weights = [1,0,0,0]
sample_size = 1
crop_range=[0,0]
rotation_max=0.0
scale_min=0.5
scale_max=1
distortion_scale_max=0.0
learning_rate=1.0 
max_iter=100 #Tää on esimerkissä 5000
batch_size=1
patch_shape=(3, 80, 80)
patch_location=(300,300)

ap = RobustDPatch(
        detector,
        patch_shape=patch_shape,
        patch_location=patch_location,
        crop_range=crop_range,
        brightness_range=brightness_range,
        rotation_weights=rotation_weights,
        sample_size=sample_size,
        learning_rate=learning_rate,
        max_iter=max_iter,
        batch_size=batch_size,
    )

patch = ap.generate(x=x[[0]])


In [None]:
count =0
accuracy_per_image = []
for image in coco_images:
    
    patched_images = ap.apply_patch(coco_images[[count]])
    dets = detector.predict(patched_images)
    count +=1
    if count == 5:
        break

    for i in range(len(dets)):
        preds_orig = extract_predictions(dets[i], 0.8)
        plot_image_with_boxes(img=patched_images[i].transpose(1,2,0).copy(), boxes=preds_orig[1], pred_cls=preds_orig[0],
                               title="Predictions on image with patch")
        accuracy = accuracy_score(y_true, preds_orig[2])

In [None]:
from art.attacks.evasion import DPatch
print("Starting dpatch")
dets = detector.predict(coco_images)
filtered_dets = [filter_boxes([t], 0.8)[0] for t in dets]

x = coco_images[:-1]
targets = [filtered_dets[-1] for i in range(len(x))]

brightness_range = [1.0, 1.0]
rotation_weights = [1,0,0,0]
sample_size = 1
crop_range=[0,0]
rotation_max=0.0
scale_min=0.5
scale_max=1
distortion_scale_max=0.0
learning_rate=1.0 
max_iter=100 #Tää on esimerkissä 5000
batch_size=1
patch_shape=(3, 80, 80)
patch_location=(300,300)

ap = DPatch(
        detector,
        patch_shape=patch_shape,
        learning_rate=learning_rate,
        max_iter=max_iter,
        batch_size=batch_size,
    )

patch = ap.generate(x=x[[0]])


In [None]:
count =0
for image in coco_images:
    
    patched_images = ap.apply_patch(coco_images[[count]])
    dets = detector.predict(patched_images)
    count +=1
    if count == 5:
        break

    for i in range(len(dets)):
        preds_orig = extract_predictions(dets[i], 0.8)
        plot_image_with_boxes(img=patched_images[i].transpose(1,2,0).copy(), boxes=preds_orig[1], pred_cls=preds_orig[0],
                               title="Predictions on image with patch")

##### Targeted attack

In [None]:
print("Starting patch")
dets = detector.predict(coco_images)
filtered_dets = [filter_boxes([t], 0.8)[0] for t in dets]

x = coco_images[:-1]
targets = [filtered_dets[-1] for i in range(len(x))]

rotation_max=0.0
scale_min=0.5
scale_max=1
distortion_scale_max=0.0
learning_rate=1.99 
max_iter=5000 #muokattu
batch_size=16
patch_shape=(3, 300, 300)
patch_location=(100,100) # Tämän vois tehdä niin, että paikka on random/pseudorandom
patch_type="circle"
optimizer="adam"

ap = AdversarialPatchPyTorch(estimator=detector, rotation_max=rotation_max, 
                      scale_min=scale_min, scale_max=scale_max, distortion_scale_max=distortion_scale_max,
                      learning_rate=learning_rate, max_iter=max_iter, batch_size=batch_size, patch_location=patch_location,
                      patch_shape=patch_shape, patch_type=patch_type, verbose=True, targeted=True)

patch, patch_mask = ap.generate(x=x[[0]], y=targets[:1])

plt.axis("off")
plt.imshow(((patch) * patch_mask).transpose(1,2,0))
plt.show()

In [None]:
# Do it for all 10 images
count =0
for image in coco_images:
    
    patched_images = ap.apply_patch(coco_images[[count]], scale=0.1)
    dets = detector.predict(patched_images)
    count +=1

    for i in range(len(dets)):
        preds_orig = extract_predictions(dets[i], 0.8)
        plot_image_with_boxes(img=patched_images[i].transpose(1,2,0).copy(), boxes=preds_orig[1], pred_cls=preds_orig[0],
                               title="Predictions on image with patch")
print("Patch done")

##### Untargeted attack

In [None]:
dets = detector.predict(coco_images)
print("Starting untargeted path")
filtered_dets = [filter_boxes([t], 0.8)[0] for t in dets]

x = coco_images[:-1]
targets = [filtered_dets[-1] for i in range(len(x))]

rotation_max=0.0
scale_min=0.5
scale_max=1
distortion_scale_max=0.0
learning_rate=1.99
max_iter=1
batch_size=16
patch_shape=(3, 300, 300)
patch_location=(100,100)
patch_type="circle"
optimizer="adam"

ap = AdversarialPatchPyTorch(estimator=detector, rotation_max=rotation_max, 
                      scale_min=scale_min, scale_max=scale_max, distortion_scale_max=distortion_scale_max,
                      learning_rate=learning_rate, max_iter=max_iter, batch_size=batch_size, patch_location=patch_location,
                      patch_shape=patch_shape, patch_type=patch_type, verbose=True, targeted=False)

patch, patch_mask = ap.generate(x=x[[0]], y=filtered_dets[:1])

plt.axis("off")
plt.imshow(((patch) * patch_mask).transpose(1,2,0))
plt.show()

In [None]:
# Apply patch to 10 images
count =0
for image in coco_images:
    
    patched_images = ap.apply_patch(coco_images[[count]], scale=0.3)
    dets = detector.predict(patched_images)
    count +=1
for i in range(len(dets)):
    preds_orig = extract_predictions(dets[i], 0.8)
    plot_image_with_boxes(img=patched_images[i].transpose(1,2,0).copy(), boxes=preds_orig[1], pred_cls=preds_orig[0],
                           title="Predictions on image with patch")
    print(f'Patched prediction classes for image {i+1}:', preds_orig[0])
print("untargeted patch done")

##### Untargeted attack plotting the loss components over epochs

ALHAALLA OLEVA KOHTA LAITETTU MARKDOWNIKSI KOSKA SE ON TURHA

from tqdm import tqdm

dets = detector.predict(coco_images)
y = [filter_boxes([t], 0.8)[0] for t in dets]

x = coco_images[:-1]
target = [y[-1] for i in range(len(coco_images[:-1]))]

rotation_max=0.0
scale_min=0.2
scale_max=0.4
distortion_scale_max=0.0
learning_rate=1.99
max_iter=PATCH_MAX_ITER
batch_size=16
patch_shape=(3, 80, 80)
patch_location=(10,10)
patch_type="square"
optimizer="adam"

attack = AdversarialPatchPyTorch(estimator=detector, rotation_max=rotation_max, 
                      scale_min=scale_min, scale_max=scale_max, distortion_scale_max=distortion_scale_max,
                      learning_rate=learning_rate, max_iter=max_iter, batch_size=batch_size, patch_location=patch_location,
                      patch_shape=patch_shape, patch_type=patch_type, verbose=False, targeted=False)

loss_history = []
for i in tqdm(range(50)):
    patch = attack.generate(x[[0]], y[:1])
    patched_images = attack.apply_patch(x[[3]], scale=0.4)

    loss_components = detector.compute_losses(patched_images, y[:1])
    losses = {}
    for loss in loss_components.keys():
        if 'loss' in loss:
            losses[loss] = loss_components[loss].item()
    loss_history.append(losses)

    if i%20==0:
        _y = detector.predict(patched_images)
        preds = extract_predictions(_y[0], 0.5)
        im = (patched_images[0].transpose(1,2,0))
        plot_image_with_boxes(img=im.copy(), boxes=preds[1], pred_cls=preds[0], title=f"Detection [with patch, iter: {i}]")

_y = detector.predict(patched_images)
preds = extract_predictions(_y[0], 0.5)
im = (patched_images[0].transpose(1,2,0))
plot_image_with_boxes(img=im.copy(), boxes=preds[1], pred_cls=preds[0], title=f"Detection [with patch,iter: {i}]")

losses = pd.DataFrame(loss_history)
fig, axes = plt.subplots(nrows=1,ncols=3,figsize=(12,3))
losses.loss_ce.plot(ylabel='loss', ax = axes[0], subplots=True, color='g')
losses.loss_bbox.plot(xlabel='epoch', ax = axes[1],subplots=True, color='royalblue')
losses.loss_giou.plot( ax = axes[2],subplots=True, color='orange')


axes[0].set_title('Classification Loss')
axes[1].set_title('BBox L1 Regression Loss')
axes[2].set_title('BBox GIoU Loss')
fig.tight_layout()