# Foundation models for zero-shot detection and segmentation

Based on the [Ollama](https://github.com/ollama/ollama) project.

In [None]:
# Download the Ollama Linux binary into the working directory and make it
# executable. `-f` makes curl fail on an HTTP error instead of silently saving
# the server's error page as the `ollama` "binary".
!curl -fL https://ollama.com/download/ollama-linux-amd64 -o ollama
!chmod +x ollama

In [None]:
import subprocess
import time
import urllib.error
import urllib.request

# Launch the Ollama server in the background. The handle is kept so the
# process can be inspected or terminated later from the notebook.
ollama_server = subprocess.Popen(["./ollama", "serve"])

# Wait until the HTTP API actually answers instead of sleeping a fixed amount:
# a fixed delay is either too short on a slow machine (the following cells then
# fail) or needlessly long on a fast one.
_ollama_deadline = time.monotonic() + 60
while True:
    try:
        with urllib.request.urlopen(
            "http://localhost:11434/api/version", timeout=2
        ):
            break
    except urllib.error.HTTPError:
        # Any HTTP status means the server is up and answering requests.
        break
    except OSError:
        # Connection refused / timed out: the server is not listening yet.
        pass
    if ollama_server.poll() is not None:
        raise RuntimeError(
            f"ollama serve exited early with code {ollama_server.returncode}"
        )
    if time.monotonic() > _ollama_deadline:
        raise RuntimeError("ollama serve did not become ready within 60 seconds")
    time.sleep(0.5)

In [None]:
# Ask the local Ollama binary to pull the `llava` model, which is the model
# named in the first request body built later in this notebook.
!./ollama pull llava

In [None]:
# Fetch the sample frame and store it as image.jpg in the working directory.
# The URL is quoted because it contains `?`, which the shell would otherwise
# treat as a glob character.
!wget -q -O image.jpg "https://github.com/ant-nik/neural_network_course/blob/main/practice_2_data/video_1_fixed/image_001.jpg?raw=true"

In [None]:
%%writefile prompt.txt
Find objects on the image.
Split answer in two sections a LIST and an EXPLANATION.
Put only detected object names as single nouns to the LIST section.
Put an explanation of the answer into the EXPLANATION section

In [None]:
import base64
import json

# Build the /api/generate request body for LLaVA with a real JSON encoder.
# Splicing file contents into a JSON string via shell substitution performs no
# escaping (a quote or backslash in the prompt yields invalid JSON) and is
# subject to word-splitting and glob expansion.
with open("prompt.txt", encoding="utf-8") as prompt_file:
    llava_prompt = prompt_file.read()

# Read the same relative image.jpg that the download cell wrote and that the
# detection cells open, rather than a hard-coded absolute path.
with open("image.jpg", "rb") as image_file:
    image_base64 = base64.b64encode(image_file.read()).decode("ascii")

with open("body.json", "w", encoding="utf-8") as body_file:
    json.dump(
        {
            "model": "llava",
            "prompt": llava_prompt,
            "images": [image_base64],
            "stream": False,
        },
        body_file,
    )

In [None]:
# POST body.json to the local server's /api/generate endpoint and save the
# reply to llava-reply.json, which a later cell parses as JSON.
!curl -o llava-reply.json http://localhost:11434/api/generate --data-binary "@body.json"

In [None]:
%%writefile llama-prompt.txt
Extract text between LIST and EXPLANATION sections and consider it as TEXT in the instruction below.
Split answer in two parts: OUTPUT and INFO.
Remove any enumeration symbols in the TEXT and place only one list entity per line to the OUTPUT section between START and END markers.
Put any explanation of the answer to INFO section.


In [None]:
import json

# Assemble the second-stage prompt: the fixed instructions from
# llama-prompt.txt, followed by the "response" field of the LLaVA reply, and
# finally an "OUTPUT:" cue. The result is saved to llama_prompt.txt
# (underscore), a different file from the instructions file (hyphen).
with open("llama-prompt.txt") as instructions_file:
    instructions = instructions_file.read()

with open("llava-reply.json", "r") as reply_file:
    llava_answer = json.load(reply_file)["response"]

llama_prompt = "".join([instructions, llava_answer, "\n\nOUTPUT:\n\n"])

with open("llama_prompt.txt", "w") as combined_file:
    combined_file.write(llama_prompt)

In [None]:
# Print the assembled second-stage prompt for a visual check.
!cat llama_prompt.txt

In [None]:
import json

# Build the /api/generate request body for llama3.1 with a real JSON encoder.
# The prompt embeds free-form model output: spliced in through the shell, any
# double quote produced invalid JSON, a standalone `*` bullet was glob-expanded
# into file names, and every newline was collapsed into a space.
with open("llama_prompt.txt") as prompt_file:
    llama_request = {
        "model": "llama3.1",
        "prompt": prompt_file.read(),
        "stream": False,
    }

with open("llama_request_body.json", "w", encoding="utf-8") as body_file:
    json.dump(llama_request, body_file)

In [None]:
# Print the request body that will be sent to llama3.1 for a visual check.
!cat llama_request_body.json

In [None]:
# Ask the local Ollama binary to pull `llama3.1`, the model named in
# llama_request_body.json.
!./ollama pull llama3.1

In [None]:
# POST llama_request_body.json to the local server's /api/generate endpoint and
# save the reply to llama_reply.json.
!curl --data-binary "@llama_request_body.json" -o llama_reply.json http://localhost:11434/api/generate

In [None]:
# Print the raw reply file for a visual check.
!cat llama_reply.json

In [None]:
import json

# Parse the saved llama3.1 reply and show only its "response" text.
with open("llama_reply.json", "r") as reply_file:
    step2_response = json.load(reply_file)

print(step2_response["response"])

In [None]:
def _parse_object_list(reply_text):
    """Return the entries listed between the START and END markers.

    The text after the first START and before the following END is split into
    lines. Each line is stripped of surrounding whitespace, and lines that are
    empty after stripping are dropped, so padded or whitespace-only lines do
    not end up as object names.

    Raises:
        ValueError: if the reply contains no START marker.
    """
    if "START" not in reply_text:
        raise ValueError("model reply does not contain a START marker")
    section = reply_text.split("START")[1].split("END")[0]
    stripped_lines = (line.strip() for line in section.splitlines())
    return [line for line in stripped_lines if line]


objects = _parse_object_list(step2_response["response"])
objects

In [None]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# Checkpoint used for zero-shot detection.
model_id = "IDEA-Research/grounding-dino-base"

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the processor and the detection model for the checkpoint, then move the
# model to the selected device.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
model = model.to(device)

# The frame to run detection on.
image = Image.open("image.jpg")

In [None]:
# VERY important: the text query must be lowercase and end with a dot.
# Variant 1: every object name is prefixed with "all ", names are separated
# by " . ".
prefixed_names = ["all " + format(item) for item in objects]
text = "{}.".format(" . ".join(prefixed_names).lower())
print(text)

In [None]:
# VERY important: the text query must be lowercase and end with a dot.
# Variant 2: plain object names separated by " . ". Running this cell
# overwrites any `text` assigned earlier.
plain_names = [format(item) for item in objects]
text = "{}.".format(" . ".join(plain_names).lower())
print(text)

In [None]:
# Encode the image together with the text query and move the batch to the
# selected device.
inputs = processor(images=image, text=text, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

# PIL reports size as (width, height); reversing it yields the (height, width)
# order passed as the target size.
postprocess_options = {
    "box_threshold": 0.2,
    "text_threshold": 0.2,
    "target_sizes": [image.size[::-1]],
}
results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    **postprocess_options,
)
results

In [None]:
!pip install supervision

In [None]:
# Map every detected label to an integer class id.
labels = results[0]["labels"]

# dict.fromkeys de-duplicates while keeping first-seen order, so class ids
# stay the same from run to run (set() ordering of strings is not stable
# between interpreter sessions).
unique_classes = list(dict.fromkeys(labels))

# enumerate builds the mapping in one pass instead of calling list.index once
# per key.
class_to_index_map = {name: index for index, name in enumerate(unique_classes)}
classes = [class_to_index_map[name] for name in labels]

In [None]:
# Display the label assigned to each detection.
labels

In [None]:
import cv2
import supervision
import numpy


# Annotators that draw the bounding boxes and their text labels.
box_annotator = supervision.BoxAnnotator()
label_annotator = supervision.LabelAnnotator()

# Convert the detection output into a supervision.Detections object. The boxes
# are moved to host memory first: calling .numpy() directly on a CUDA tensor
# raises, which broke this cell whenever the model ran on a GPU.
image_boxes = supervision.Detections(
    xyxy=results[0]["boxes"].detach().cpu().numpy(),
    class_id=numpy.array(classes, dtype=int)
)

# Draw on a copy so the original image stays untouched, boxes first and then
# the label for each box.
annotated_frame = box_annotator.annotate(
    scene=image.copy(),
    detections=image_boxes
)
annotated_frame = label_annotator.annotate(
    scene=annotated_frame,
    detections=image_boxes,
    labels=labels
)


In [None]:
%matplotlib inline
supervision.plot_image(annotated_frame, (16, 16))