# Object detection and segmentation with the Gemini API


**References:**
* [Conversational image segmentation with Gemini 2.5](https://developers.googleblog.com/en/conversational-image-segmentation-gemini-2-5/)
* [Use Gemini 2.5 for Zero-Shot Object Detection & Segmentation](https://blog.roboflow.com/gemini-2-5-object-detection-segmentation/)

---

## Setup

In [None]:
# Install
#!pip install google-genai supervision python-dotenv

# Standard library
import base64
import json
import os
import time
from io import BytesIO
from pathlib import Path

# Third-party libraries
from dotenv import load_dotenv
from google import genai
from google.genai import errors as genai_errors
from google.genai import types
import numpy as np
import pandas as pd
from PIL import Image
import requests
import supervision as sv

In [None]:
# Populate the process environment: first from the default .env lookup,
# then from the dedicated "gemini_api_key.env" file.
load_dotenv()
load_dotenv("gemini_api_key.env")

# Create the Gemini client with the key stored under GOOGLE_API_KEY
# (None is passed through when the variable is not set).
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))

## Object detection

### Model

Spatial understanding works well with the **Gemini 2.0 Flash** model.  
It performs even better with **Gemini 2.5 models** (like `gemini-2.5-pro`), which are more capable "thinking models" — though slightly slower.  
Some advanced features, such as **image segmentation**, are only supported by **Gemini 2.5 models**.

**Available model options:**
- `gemini-2.0-flash`
- `gemini-2.5-flash-lite`
- `gemini-2.5-flash-lite-preview-09-2025`
- `gemini-2.5-flash`
- `gemini-2.5-flash-preview-09-2025`
- `gemini-2.5-pro`

**Temperature (creativity control):**
- Controls randomness/creativity of model output.  
- `0.0` → very deterministic, consistent output (best for precise tasks).  
- `0.5` → moderately creative (good balance for tasks like bounding boxes).  
- `1.0` → more creative, potentially less consistent output.

In [None]:
# Model settings
max_retries = 10                        # upper bound on API attempts per prompt
safety_settings = []                    # no custom safety settings are sent
temperature = 0.2                       # low value -> more deterministic output
model_gemini_2_0 = "gemini-2.0-flash"   # model name passed to the API

# Paths: input images, the annotated coordinates sheet, and the folder
# that receives the annotated images.
root_path = Path(r"C:\Users\amand\Amanda\GitHub\virtual_audit_ai\street_view_images")
excel_file = Path(r"C:\Users\amand\Amanda\GitHub\virtual_audit_ai\coordinates_annotated.xlsx")
output_folder = Path(r"C:\Users\amand\Amanda\GitHub\virtual_audit_ai\street_view_outputs")

# Create the output folder (and any missing parents) up front
output_folder.mkdir(exist_ok=True, parents=True)

In [None]:
# Input: which city / ID / point folder to process
city_name = "Belo Horizonte"
id_name = "562002001"
point_name = "point_1"

# Folder layout is <root_path>/<city>/<id>/<point>
selected_city = root_path / city_name
selected_id = selected_city / id_name
selected_point = selected_id / point_name

# Verifications: check from the outermost folder inwards so the error
# message names the first level that is actually missing.
if not selected_city.exists():
    raise ValueError(f" City '{city_name}' not found.")

if not selected_id.exists():
    raise ValueError(f" ID '{id_name}' not found inside {city_name}.")

if not selected_point.exists():
    raise ValueError(f" Point folder '{point_name}' not found inside {id_name}.")

# Load images only for this specific point folder.
# The suffix is compared case-insensitively against an explicit set: the
# previous "*.[jp][pn]g" pattern skipped ".jpeg" files and also matched
# unintended suffixes such as ".jng" / ".ppg". Sorting makes the processing
# order (and therefore the progress output) reproducible between runs.
image_suffixes = {".jpg", ".jpeg", ".png"}
test_images = sorted(
    path
    for path in selected_point.glob("*")
    if path.is_file() and path.suffix.lower() in image_suffixes
)

print(f" City: {selected_city.name}")
print(f" ID: {selected_id.name}")
print(f" Point: {selected_point.name}")
print(f" Total images found to process: {len(test_images)}")

In [None]:
# Read the coordinates sheet into a DataFrame
df = pd.read_excel(excel_file)

# One detection prompt per target item. Each key doubles as the name of the
# Excel column that stores the result for that item, and the insertion order
# of the keys is kept as written here.
prompts_dict = {
    "buildings": (
        "Detect ONLY buildings in this Street View image. "
        "Include houses, apartment buildings, commercial buildings, or other permanent built structures. "
        "Do NOT label trees, poles, vehicles, walls, fences, or temporary constructions. "
        "Output strictly a JSON list of bounding boxes with 'box_2d' and 'label':'building'."
    ),
    "trees": (
        "Detect ONLY real trees in this Street View image. "
        "Only label plants with a visible leafy canopy. "
        "Do NOT label shrubs, grass, poles, trunks, artificial trees, or fences. "
        "Output strictly a JSON list of bounding boxes with 'box_2d' and 'label':'tree'."
    ),
    "waste_containers": (
        "Detect ONLY public waste containers in this Street View image. "
        "Include fixed dumpsters, metal or plastic public trash bins, and street-side waste baskets. "
        "Do NOT label loose trash on the ground, cardboard boxes, improvised containers, recycling trucks, or small household bins. "
        "Focus on containers that are clearly designed for public trash collection, such as the standard Brazilian street waste bins (cylindrical, rectangular, or square). "
        "Output strictly a JSON list of bounding boxes with 'box_2d' coordinates and 'label':'waste_container'."
    ),
}

# Add a "." placeholder column for every item the sheet does not have yet;
# columns that already exist are left as they are.
for item_key in prompts_dict:
    if item_key in df.columns:
        continue
    df[item_key] = "."

print(f" Ready to process: {list(prompts_dict)}")

In [None]:
# Start processing images
start_time = time.time()

# Rows are matched to images through the "id" and "points" columns. Check for
# them once up front instead of failing on every image after an API call has
# already been made.
missing_columns = {"id", "points"} - set(df.columns)
if missing_columns:
    raise KeyError(f"Excel file is missing required columns: {sorted(missing_columns)}")

# String views of the matching columns; they do not change during the run.
id_as_text = df["id"].astype(str)
points_as_text = df["points"].astype(str)


def _parse_detections(raw_text):
    """Return the list of detections contained in a model reply.

    Markdown code fences are stripped and the outermost ``[...]`` span is
    decoded as JSON. An empty/missing reply, a reply without a complete
    bracketed list, or a reply that decodes to something other than a list
    yields ``[]``.

    Raises:
        json.JSONDecodeError: If the bracketed span is not valid JSON.
    """
    if not raw_text:
        return []
    cleaned = raw_text.replace("```json", "").replace("```", "").strip()
    json_start = cleaned.find("[")
    json_end = cleaned.rfind("]")
    # Both brackets must be present and in order; comparing "rfind(...) + 1"
    # with -1 can never detect a missing closing bracket.
    if json_start == -1 or json_end < json_start:
        return []
    parsed = json.loads(cleaned[json_start:json_end + 1])
    return parsed if isinstance(parsed, list) else []


def _request_detections(image, prompt):
    """Query the model for one prompt, retrying with exponential backoff.

    Returns:
        The parsed list of detections (possibly empty), or ``None`` when all
        ``max_retries`` attempts raised an error.
    """
    wait_time = 1
    for attempt in range(1, max_retries + 1):
        try:
            response = client.models.generate_content(
                model=model_gemini_2_0,
                contents=[image, prompt],
                config=types.GenerateContentConfig(
                    temperature=temperature,
                    safety_settings=safety_settings,
                    response_mime_type="application/json",
                ),
            )
            return _parse_detections(response.text)
        except Exception as exc:
            # Best-effort boundary: API, network and JSON errors are all
            # retried, but reported so failures are visible in the output.
            print(f"    Attempt {attempt}/{max_retries} failed: {exc}")
            if attempt < max_retries:  # no point sleeping after the last try
                time.sleep(wait_time)
                wait_time *= 2
    return None


def _to_pixel_boxes(detections_json, resolution_wh, default_label):
    """Convert 'box_2d' entries to pixel ``[x1, y1, x2, y2]`` boxes.

    Each 'box_2d' is read as ``(ymin, xmin, ymax, xmax)`` on a 0-1000 scale
    and rescaled to ``resolution_wh`` (width, height). Entries that are not
    dicts, or whose 'box_2d' is not four numbers, are skipped.

    Returns:
        A ``(xyxy, labels)`` pair of equally long lists.
    """
    img_width, img_height = resolution_wh
    xyxy = []
    labels = []
    for item in detections_json:
        box = item.get("box_2d") if isinstance(item, dict) else None
        if not isinstance(box, (list, tuple)) or len(box) != 4:
            continue
        try:
            ymin, xmin, ymax, xmax = (float(value) for value in box)
        except (TypeError, ValueError):
            continue
        xyxy.append([
            (xmin / 1000) * img_width,
            (ymin / 1000) * img_height,
            (xmax / 1000) * img_width,
            (ymax / 1000) * img_height,
        ])
        labels.append(str(item.get("label", default_label)))
    return xyxy, labels


for idx, image_path in enumerate(test_images, start=1):
    try:
        # Extract metadata: the stem is split on "_"; the first part is the
        # ID and the second is the point number with leading "p" removed.
        parts = image_path.stem.split("_")
        current_id = parts[0]
        current_point_num = parts[1].lstrip("p")

        # Excel rows belonging to this image. Without a matching row nothing
        # could be recorded or saved, so skip before spending any API calls.
        mask = (id_as_text == str(current_id)) & (points_as_text == str(current_point_num))
        if not mask.any():
            print(f"[{idx}/{len(test_images)}] {image_path.name} skipped: no matching row in Excel")
            continue

        # Load and resize image to a width of 1024, keeping the aspect ratio.
        # The file is closed as soon as the resized copy exists; converting to
        # RGB keeps PNGs with alpha/palette compatible with the JPEG output.
        with Image.open(image_path) as image:
            width, height = image.size
            target_height = max(1, int(1024 * height / width))
            resized_image = image.convert("RGB").resize(
                (1024, target_height), Image.Resampling.LANCZOS
            )
        resolution_wh = resized_image.size

        # Initialize image for annotation
        annotated = resized_image.copy()

        # Visual configuration
        thickness = sv.calculate_optimal_line_thickness(resolution_wh)
        text_scale = sv.calculate_optimal_text_scale(resolution_wh)
        box_annotator = sv.BoxAnnotator(thickness=thickness)
        label_annotator = sv.LabelAnnotator(
            smart_position=True, text_color=sv.Color.BLACK,
            text_scale=text_scale, text_position=sv.Position.CENTER
        )

        save_image = False

        # Loop through prompts; the prompt key is also the Excel column name
        # and its position is used as the class id of the drawn boxes.
        for prompt_index, (target_item, active_prompt) in enumerate(prompts_dict.items()):
            detections_json = _request_detections(resized_image, active_prompt)

            if detections_json is None:
                # A failed request is not evidence of absence: leave the cell
                # as it is rather than recording a false "No".
                print(f"    {target_item}: all {max_retries} attempts failed; Excel value left unchanged")
                continue

            if not detections_json:
                df.loc[mask, target_item] = "No"
                continue

            df.loc[mask, target_item] = "Yes"
            save_image = True

            # Create bounding boxes
            xyxy, labels = _to_pixel_boxes(detections_json, resolution_wh, target_item)
            if xyxy:
                detections = sv.Detections(
                    xyxy=np.array(xyxy),
                    class_id=np.full(len(xyxy), prompt_index),
                )
                annotated = box_annotator.annotate(scene=annotated, detections=detections)
                annotated = label_annotator.annotate(scene=annotated, detections=detections, labels=labels)

        # Save annotated image if any detection
        if save_image:
            final_output_path = output_folder / city_name / id_name / point_name
            final_output_path.mkdir(parents=True, exist_ok=True)
            file_save_path = final_output_path / f"{image_path.stem}_annotated.jpg"
            annotated.save(file_save_path)

        # Print progress
        print(f"[{idx}/{len(test_images)}] {image_path.name} processed")

    except Exception as e:
        # Per-image boundary: report the problem and move on to the next image
        print(f"[{idx}/{len(test_images)}] {image_path.name} Error: {e}")

# Save Excel (overwrites the input file with the updated columns)
df.to_excel(excel_file, index=False)

# Execution time
end_time = time.time()
total_seconds = end_time - start_time
minutes = int(total_seconds // 60)
seconds = total_seconds % 60
print(f"\nProcessing complete in {minutes} minutes and {seconds:.2f} seconds.")