# Install packages

In [None]:
# Colab setup cell: install the dependencies, import them, and mount Google Drive.
# NOTE(review): the first command installs torch without a version and the second
# one then pins torch/torchvision to the CUDA 12.1 builds; the remaining packages
# are unpinned, so a fresh runtime may resolve different versions over time.
!pip install transformers torch pillow einops
!pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 --index-url https://download.pytorch.org/whl/cu121
!pip install pyvips-binary pyvips
!pip install accelerate

# NOTE(review): AutoTokenizer is imported but not used by any cell visible here.
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import shutil
from google.colab import files
import os
from google.colab import drive

# Mount Google Drive so the dataset paths under /content/drive are reachable.
drive.mount('/content/drive')


Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting pyvips-binary
  Using cached pyvips_binary-8.16.1-cp37-abi3-manylinux_2_28_x86_64.whl.metadata (2.3 kB)
Collecting pyvips
  Using cached pyvips-3.0.0.tar.gz (56 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading pyvips_binary-8.16.1-cp37-abi3-manylinux_2_28_x86_64.whl (7.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyvips
  Building wheel for pyvips (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyvips: filename=pyvips-3.0.0-py3-none-any.whl size=54256 sha256=4699f17d35a080e6df929021c6add80528a2c70baa4531e36633d87ecac9a7e2
  Stored in directory: /root/.cache/pip/wheels/8d/87/bb/ce9a0c257881486852c02c8c50a021684807b40d9579ec4568
Successfully built pyvi

# Moondream2 model loader

In [None]:
# Load the Moondream2 vision-language model that the detection cells below use.
model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    # Pin the model repository revision so repeated runs load the same weights/code.
    revision="2025-03-27",
    # The repository ships its own model code, which is downloaded and executed locally.
    trust_remote_code=True,
    # Place the whole model on the CUDA device; this cell needs a GPU runtime.
    device_map={"": "cuda"}
)

# Generating YOLO annotations from images

In [None]:
def run_moondream_inference(images, objects_to_detect, model):
    """Detect the requested objects in every image and return YOLO-style boxes.

    Args:
        images: Iterable of images accepted by ``model.encode_image``.
        objects_to_detect: Iterable of dicts with an ``"id"`` (stored as the
            box ``class_id``) and a ``"name"`` (text prompt passed to
            ``model.detect``).
        model: Object exposing ``encode_image(image)`` and
            ``detect(encoded_image, name)``. ``detect`` must return a mapping
            whose ``"objects"`` entry lists boxes with ``x_min``, ``y_min``,
            ``x_max`` and ``y_max`` keys.

    Returns:
        A list with one entry per image, in input order. Each entry is a list
        of dicts with the keys ``class_id``, ``cx``, ``cy``, ``w`` and ``h``.

    Raises:
        KeyError: If an object spec or a returned box lacks an expected key.
    """
    results = []

    for image in images:
        # Encode once per image and reuse the encoding for every prompt.
        encoded_image = model.encode_image(image)

        all_boxes = []

        for obj in objects_to_detect:
            object_id = obj["id"]
            object_name = obj["name"]

            detection = model.detect(encoded_image, object_name)

            for bbox in detection['objects']:
                x_min = bbox['x_min']
                y_min = bbox['y_min']
                x_max = bbox['x_max']
                y_max = bbox['y_max']

                # Convert corner format to YOLO centre format. No pixel scaling
                # is applied: the corners are assumed to be already normalized
                # to [0, 1] -- TODO confirm for the model revision in use.
                all_boxes.append({
                    "class_id": object_id,
                    "cx": (x_min + x_max) / 2,
                    "cy": (y_min + y_max) / 2,
                    "w": x_max - x_min,
                    "h": y_max - y_min
                })

        results.append(all_boxes)

    return results


# Saving the annotations to a folder, one file per image

In [None]:
def save_Moondream_predictions_as_yolo(results, filenames, output_folder):
    """Write one YOLO-format ``.txt`` label file per image.

    Args:
        results: Per-image lists of box dicts with the keys ``class_id``,
            ``cx``, ``cy``, ``w`` and ``h``.
        filenames: Image file names, paired positionally with ``results``;
            the extension is replaced by ``.txt``.
        output_folder: Destination directory, created if it does not exist.

    Each box becomes one line ``class_id cx cy w h`` with six decimals for
    the four coordinates. An image without boxes yields an empty file.
    """
    os.makedirs(output_folder, exist_ok=True)

    for boxes, fname in zip(results, filenames):
        name, _extension = os.path.splitext(fname)

        # Format every line before touching the file so a malformed box
        # cannot leave a truncated label file behind.
        lines = []
        for box in boxes:
            lines.append(
                f"{box['class_id']} {box['cx']:.6f} {box['cy']:.6f} {box['w']:.6f} {box['h']:.6f}\n"
            )

        txt_path = os.path.join(output_folder, f"{name}.txt")
        with open(txt_path, "w") as handle:
            handle.write("".join(lines))


# computing IoU between 2 boxes




In [None]:
def compute_iou(box1, box2):
    """Return the intersection-over-union of two centre-format boxes.

    Args:
        box1: Four numbers ``(cx, cy, w, h)``.
        box2: Four numbers ``(cx, cy, w, h)``.

    Returns:
        The IoU rounded to four decimals, or ``0.0`` when the union area
        is zero.
    """
    cx_a, cy_a, w_a, h_a = box1
    cx_b, cy_b, w_b, h_b = box2

    def edges(center, extent):
        # Centre/extent -> (low edge, high edge) along one axis.
        half = extent / 2
        return center - half, center + half

    left_a, right_a = edges(cx_a, w_a)
    top_a, bottom_a = edges(cy_a, h_a)
    left_b, right_b = edges(cx_b, w_b)
    top_b, bottom_b = edges(cy_b, h_b)

    # Overlap extent per axis, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(right_a, right_b) - max(left_a, left_b))
    overlap_h = max(0, min(bottom_a, bottom_b) - max(top_a, top_b))
    overlap = overlap_w * overlap_h

    union = w_a * h_a + w_b * h_b - overlap
    if union == 0:
        return 0.0

    return round(overlap / union, 4)

# Calculate average IoU for each file and globally

In [None]:
def evaluate_model(gt_folder, predicted_folder):
    """Print the mean best-match IoU per label file and over all files.

    Every ``.txt`` file in ``gt_folder`` is visited in sorted order. If a
    file with the same name exists in ``predicted_folder``, each ground-truth
    box is scored with the highest IoU it reaches against any predicted box
    in that file (never below 0.0). Class ids are parsed but not compared.
    Files without a prediction counterpart are reported and skipped.

    Args:
        gt_folder: Directory holding the ground-truth YOLO label files.
        predicted_folder: Directory holding the predicted YOLO label files.

    Returns:
        None; all results are printed.
    """
    def load_boxes(path):
        # One row per non-blank line: class cx cy w h, all parsed as floats.
        with open(path, 'r') as handle:
            return [list(map(float, row.strip().split())) for row in handle if row.strip()]

    global_ious = []
    label_files = [entry for entry in sorted(os.listdir(gt_folder)) if entry.endswith(".txt")]

    for filename in label_files:
        pred_path = os.path.join(predicted_folder, filename)
        if not os.path.exists(pred_path):
            print(f"⚠️ Predizione mancante per {filename}")
            continue

        gt_boxes = load_boxes(os.path.join(gt_folder, filename))
        pred_boxes = load_boxes(pred_path)

        # Best IoU per ground-truth box; the leading 0.0 is the floor used
        # when there are no predictions at all.
        file_ious = [
            max([0.0] + [compute_iou(gt[1:], pred[1:]) for pred in pred_boxes])
            for gt in gt_boxes
        ]
        global_ious.extend(file_ious)

        # Per-file average
        if not file_ious:
            print(f"📁 {filename} — ⚠️ nessun box da valutare.")
        else:
            avg_file_iou = sum(file_ious) / len(file_ious)
            print(f"📁 {filename} — media IoU = {avg_file_iou:.4f}")

    # Global average
    if not global_ious:
        print("\n❌ Nessun confronto valido effettuato.")
    else:
        avg_iou = sum(global_ious) / len(global_ious)
        print(f"\n✅ Media IoU globale su {len(global_ious)} box: {avg_iou:.4f}")


# prompt definition and execution

In [None]:
# Folders on the mounted Drive: source images (dataset), predicted labels (pred)
# and ground-truth labels (gt).
dataset_folder = "/content/drive/MyDrive/Colab Notebooks/dataset"
pred_folder = "/content/drive/MyDrive/Colab Notebooks/dataset/MoonDream2Annotations"
gt_folder = "/content/drive/MyDrive/Colab Notebooks/dataset/annotations"

# Objects to detect: "id" becomes the YOLO class id, "name" is the text prompt.
# NOTE(review): the prompt starts with a space -- confirm that this is intended.
objects = [
    {"id": 0, "name": " damaged car"},
]

# Load every image stored directly in the dataset folder. Sorting keeps
# `filenames` and `images` aligned and makes the run order deterministic.
image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
filenames = sorted(
    entry for entry in os.listdir(dataset_folder)
    if entry.lower().endswith(image_extensions)
)
images = []
for entry in filenames:
    # convert() returns a fully loaded copy, so the file handle can be closed.
    with Image.open(os.path.join(dataset_folder, entry)) as img:
        images.append(img.convert("RGB"))

# Run object detection on all images.
moondream_results = run_moondream_inference(images, objects, model)

# Save the results in YOLO format in the predictions folder.
save_Moondream_predictions_as_yolo(moondream_results, filenames, pred_folder)

# Evaluate the predictions against the ground truth using IoU.
evaluate_model(gt_folder, pred_folder)

# Saving the dataset folder (predictions included) as a zip file (untested)

In [None]:
# UNTESTED
# Location of the ZIP archive on the Colab VM.
zip_path = '/content/dataset.zip'

# Archive the whole dataset folder. make_archive expects the target path
# without the format extension and appends it itself.
shutil.make_archive(
    base_name=os.path.splitext(zip_path)[0],
    format='zip',
    root_dir=dataset_folder,
)

# Start the download of the ZIP file.
files.download(zip_path)
