# Foundation models for zero-shot detection and segmentation

Based on the [Ollama](https://github.com/ollama/ollama) project.

In [None]:
# Download the Ollama Linux (amd64) binary into the working directory as
# `ollama`; -L makes curl follow redirects.
!curl -L https://ollama.com/download/ollama-linux-amd64 -o ollama
# Mark the downloaded file executable so it can be launched as ./ollama.
!chmod +x ollama

In [None]:
import subprocess
import time

# Launch `./ollama serve` as a background child process, then pause for three
# seconds before the following cells start talking to it.
server_command = ["./ollama", "serve"]
subprocess.Popen(server_command)
time.sleep(3)

In [None]:
# Pull the `llava` model (queried below with an image attached) using the
# local Ollama binary.
!./ollama pull llava

In [None]:
# Download the sample frame and save it as image.jpg in the working directory
# (-q: quiet, -O: output file name).
!wget -q -O image.jpg https://github.com/ant-nik/neural_network_course/blob/main/practice_2_data/video_1_fixed/image_001.jpg?raw=true

In [None]:
%%writefile prompt.txt
Describe entities on the image as detailed as possible.

In [None]:
!echo '{ "model": "llava", "prompt": "'`cat prompt.txt`'", "images": ["'`base64 -w 0 /content/image.jpg`'"], "stream": false}' > llava-request.json
!cat llava-request.json

In [None]:
# Send llava-request.json as the request body to the local Ollama
# /api/generate endpoint and save the reply to llava-reply.json.
!curl -o llava-reply.json http://localhost:11434/api/generate --data-binary "@llava-request.json"
# Show the raw reply for inspection.
!cat llava-reply.json

In [None]:
%%writefile llama-prompt-prefix.txt
Extract all nouns from the TEXT section that are physical objects or living beings.
Split answer in two parts: OUTPUT and INFO.
In OUTPUT section place extracted nouns without enumerations symbols and one entity per line.
Put detailed explanation of the answer to INFO section.

TEXT:

In [None]:
import json

with open("llama-prompt-prefix.txt") as prompt_file:
    llama_prompt = prompt_file.read()
with open("llava-reply.json", "r") as llava_file:
    llava_answer = json.loads(llava_file.read())["response"].replace('"', '\\"')
    llama_prompt += llava_answer
llama_prompt += "\n\nOUTPUT:\n\n"
with open("llama-prompt.txt", "w") as llama_prompt_file:
    llama_prompt_file.write(llama_prompt)

In [None]:
!cat llama-prompt.txt

In [None]:
!echo '{ "model": "llama3.1", "prompt": "'`cat llama-prompt.txt`'", "stream": false}' > llama_request_body.json
!cat llama_request_body.json

In [None]:
# Pull the `llama3.1` model named in llama_request_body.json.
!./ollama pull llama3.1

In [None]:
# Send llama_request_body.json as the request body to the local Ollama
# /api/generate endpoint and save the reply to llama_reply.json.
!curl --data-binary "@llama_request_body.json" -o llama_reply.json http://localhost:11434/api/generate
# Show the raw reply for inspection.
!cat llama_reply.json

In [None]:
import json
from typing import Any, Union


def load_answer(filename: str) -> Union[str, dict[str, Any]]:
    """Load a saved JSON reply and return its generated text.

    Args:
        filename: Path to a JSON file holding the reply body.

    Returns:
        The value stored under the ``"response"`` key when that key is
        present; otherwise the whole decoded JSON document, unchanged.
    """
    # JSON is UTF-8 by specification, so decode explicitly instead of relying
    # on the process locale.
    with open(filename, "r", encoding="utf-8") as file:
        reply = json.load(file)
    return reply["response"] if "response" in reply else reply

In [None]:
# Read the llama3.1 reply saved in llama_reply.json and print what
# load_answer extracted from it.
step2_response = load_answer("llama_reply.json")
print(step2_response)

In [None]:
def _parse_objects(answer: str) -> list[str]:
    """Return the entity names listed in the OUTPUT section of *answer*.

    Asterisks and colons are removed first. The section runs from the first
    "OUTPUT" marker to the next "OUTPUT" or "INFO" marker, one entity per line.
    """
    cleaned = answer.replace("*", "").replace(":", "")
    parts = cleaned.split("OUTPUT")
    # The request prompt already ends with "OUTPUT:", so the model may reply
    # with the list directly; fall back to the whole text rather than raising
    # IndexError when the marker is absent.
    section = parts[1] if len(parts) > 1 else parts[0]
    section = section.split("INFO")[0]

    entities = []
    for line in section.split("\n"):
        # Drop surrounding whitespace and a leading "-" / "•" bullet so that
        # padded or bulleted lines do not leak into the detection queries.
        entity = line.strip().lstrip("-•").strip()
        if entity:
            entities.append(entity)
    return entities


objects = _parse_objects(step2_response)
objects

In [None]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# Checkpoint loaded for zero-shot object detection.
model_id = "IDEA-Research/grounding-dino-base"

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
model = model.to(device)

# The frame downloaded earlier in the notebook.
image = Image.open("image.jpg")

In [None]:
# VERY important: text queries need to be lowercased + end with a dot
# Variant that prefixes every entity with "all"; phrases are joined by " . ".
prefixed_phrases = (f"all {item}" for item in objects)
text = " . ".join(prefixed_phrases).lower() + "."
print(text)

In [None]:
# VERY important: text queries need to be lowercased + end with a dot
# Plain variant: the entity names themselves, joined by " . ". Running this
# cell rebinds `text`, replacing any value assigned before it.
text = " . ".join(map(str, objects)).lower() + "."
print(text)

In [None]:
# Prepare the model inputs from the image and the text query, then move them
# to the device the model lives on.
inputs = processor(images=image, text=text, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass only: gradients are not needed here.
with torch.no_grad():
    outputs = model(**inputs)

# PIL reports size as (width, height); the post-processor is given the
# reversed (height, width) pair.
image_height_width = (image.height, image.width)
results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.17,
    text_threshold=0.17,
    target_sizes=[image_height_width],
)
results

In [None]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Report "UTF-8" as the preferred encoding regardless of the real locale.

    Accepts the same optional ``do_setlocale`` argument as the standard
    ``locale.getpreferredencoding`` so that callers passing it (for example
    ``locale.getpreferredencoding(False)``) do not fail with TypeError.
    """
    return "UTF-8"


# Replace the standard function for the rest of the session.
# NOTE(review): presumably a workaround so later `!` shell cells keep working
# in this runtime — confirm it is still needed.
locale.getpreferredencoding = _utf8_preferred_encoding

In [None]:
# Install the `supervision` package, imported below for drawing boxes and labels.
!pip install supervision

In [None]:
# Label reported for each detection of the first (only) image.
labels = results[0]["labels"]

# Distinct labels in first-seen order. dict.fromkeys keeps insertion order, so
# the label -> id assignment is the same on every run; iterating a set of
# strings would follow per-process hash randomisation instead.
unique_classes = list(dict.fromkeys(labels))

# Integer id for each distinct label, built in one pass with enumerate rather
# than a list.index lookup per label.
class_to_index_map = {item: index for index, item in enumerate(unique_classes)}

# Class id per detection, aligned with `labels`.
classes = [class_to_index_map[item] for item in labels]
unique_classes

In [None]:
import cv2
import supervision
import numpy


box_annotator = supervision.BoxAnnotator()
label_annotator = supervision.LabelAnnotator()

# Wrap the detector output for supervision: the predicted boxes as a NumPy
# array plus the integer class id computed for each detection.
image_boxes = supervision.Detections(
    xyxy=results[0]["boxes"].cpu().numpy(),
    class_id=numpy.array(classes, dtype=int),
)

# Draw the boxes on a copy so the original `image` stays untouched, then
# overlay the text label of each detection on the same frame.
annotated_frame = box_annotator.annotate(
    scene=image.copy(),
    detections=image_boxes,
)
annotated_frame = label_annotator.annotate(
    scene=annotated_frame,
    detections=image_boxes,
    labels=labels,
)


In [None]:
%matplotlib inline
# Show the annotated frame in the notebook; (16, 16) is passed as the plot size.
supervision.plot_image(annotated_frame, (16, 16))

# Object counts by confidence score threshold

In [None]:
# Fresh forward pass whose detections feed the threshold analysis below.
with torch.no_grad():
    all_outputs = model(**inputs)

# Post-process the outputs of THIS pass (`all_outputs`), so the cell no longer
# depends on the `outputs` variable left behind by the earlier detection cell.
# NOTE(review): both thresholds are 0.17, so presumably no detection scoring
# below 0.17 reaches the 0.01..1 sweep that follows — lower them if the full
# range is meant to be analysed.
all_results = processor.post_process_grounded_object_detection(
    all_outputs,
    inputs.input_ids,
    box_threshold=0.17,
    text_threshold=0.17,
    target_sizes=[image.size[::-1]]
)

In [None]:
# Display the post-processed detections.
all_results

In [None]:
# Candidate score thresholds: 100 evenly spaced values from 0.01 to 1.
x = numpy.linspace(0.01, 1, 100)

detection_scores = all_results[0]["scores"]
# For each threshold, count the detections scoring strictly above it.
counts_above = [
    sum(1 for score in detection_scores if score > threshold)
    for threshold in x
]
# Change in that count between consecutive thresholds (one entry fewer than x).
y = numpy.diff(counts_above)

In [None]:
import plotly.express


# Plot the count change per step; numpy.diff yields one value fewer than x,
# so each value is paired with the upper threshold of its step (x[1:]).
plotly.express.line(x=x[1:], y=y)

In [None]:
# Histogram of the per-step count changes in y.
plotly.express.histogram(y)

In [None]:
# Lower-tail quantiles (1%, 5%, 10%, 15%, 20%) of the per-step count changes.
numpy.quantile(y, [0.01, 0.05, 0.1, 0.15, 0.2])

In [None]:
# NOTE(review): scratch arithmetic; presumably 5 out of the 99 entries of `y`
# expressed as a fraction — confirm the intent or remove.
5/99