In [None]:
# Core dependencies: Hugging Face transformers plus torch, Pillow and einops
!pip install transformers torch pillow einops
# Pin torch/torchvision to the CUDA 12.1 wheels served from the PyTorch wheel index
# NOTE(review): the line above already installs an unpinned torch; this pinned install runs after it — confirm the ordering is intended
!pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 --index-url https://download.pytorch.org/whl/cu121
# NOTE(review): pyvips is presumably required by the model's remote code for image handling — confirm
!pip install pyvips-binary pyvips
# NOTE(review): accelerate is presumably needed for the device_map argument used when loading the model — confirm
!pip install accelerate

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import os

# Load the Moondream2 checkpoint from the Hugging Face Hub. The resulting
# `model` object is used as a module-level global by the detection cell.
model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    revision="2025-03-27",  # pin the repo revision so the downloaded code/weights do not change between runs
    trust_remote_code=True,  # the model class is defined by code in the Hub repo, which is executed locally
    device_map={"": "cuda"}  # "" is the root module: place the whole model on the CUDA device
)

In [3]:
# Function to process all images in a folder
def process_images_in_folder(folder_path, objects_to_detect):
    """Run object detection on every image in a folder and save the boxes.

    For each ``.jpg`` / ``.jpeg`` / ``.png`` file directly inside
    ``folder_path`` (extension matched case-insensitively), every entry of
    ``objects_to_detect`` is passed to the module-level ``model.detect`` and
    the returned boxes are written to ``<folder_path>/annotations/<stem>.txt``,
    one line per box in the form ``id, x_min, y_min, x_max, y_max``.

    Args:
        folder_path: Directory containing the images. The ``annotations``
            sub-directory is created inside it if missing.
        objects_to_detect: Iterable of dicts with an ``"id"`` key (written
            as the first field of each line) and a ``"name"`` key (the text
            passed to ``model.detect``).

    Returns:
        None. Results are written to disk.

    Notes:
        * Box coordinates returned by the model are multiplied by the image
          width/height and truncated to ``int`` to obtain pixel values.
        * An annotation file is written for every image, even when nothing
          is detected (the file is then empty).
        * Two images sharing a stem (``a.jpg`` and ``a.png``) map to the same
          ``a.txt``; files are processed in sorted order, so the later one
          overwrites the earlier one.
    """
    annotations_path = os.path.join(folder_path, "annotations")

    # Create the "annotations" folder if it doesn't exist
    os.makedirs(annotations_path, exist_ok=True)

    image_extensions = ('.jpg', '.jpeg', '.png')

    # Sorted so that runs are reproducible regardless of filesystem order
    for filename in sorted(os.listdir(folder_path)):
        # Case-insensitive match so that e.g. "IMG_001.JPG" is not skipped
        if not filename.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(folder_path, filename)

        # Ignore directories that merely happen to be named like an image
        if not os.path.isfile(image_path):
            continue

        results = []

        # The context manager closes the underlying file handle once all
        # detections for this image are done
        with Image.open(image_path) as image:
            width, height = image.size

            for obj in objects_to_detect:
                object_id = obj["id"]  # Object's numerical ID
                object_name = obj["name"]  # Object's name

                # Perform bounding box detection
                result = model.detect(image, object_name)

                # Scale each box by the image size to get pixel coordinates
                for bbox in result['objects']:
                    x_min = int(bbox['x_min'] * width)
                    y_min = int(bbox['y_min'] * height)
                    x_max = int(bbox['x_max'] * width)
                    y_max = int(bbox['y_max'] * height)

                    results.append(f"{object_id}, {x_min}, {y_min}, {x_max}, {y_max}\n")

        # Write all the bounding boxes to "<image stem>.txt"
        txt_filename = os.path.splitext(filename)[0] + ".txt"
        txt_file_path = os.path.join(annotations_path, txt_filename)
        with open(txt_file_path, 'w') as f:
            f.writelines(results)


# Folder that holds the images to annotate
dataset_folder = "/content/dataset"

# Classes to detect: each name gets a numeric id, starting at 1, in the
# order listed here (car=1, person=2, bike=3, van=4)
objects = [
    {"id": class_id, "name": class_name}
    for class_id, class_name in enumerate(("car", "person", "bike", "van"), start=1)
]

# Annotate every image found in the dataset folder
process_images_in_folder(dataset_folder, objects)


In [2]:
import shutil
from google.colab import files

# Folder to archive (images plus the generated "annotations" sub-folder)
dataset_folder = "/content/dataset"

# Base name of the archive WITHOUT extension: make_archive appends ".zip"
# itself, so no string surgery on a ".zip" path is needed
archive_base = '/content/dataset'

# Compress the folder into a .zip file; make_archive returns the path of the
# archive it actually wrote
zip_path = shutil.make_archive(archive_base, 'zip', dataset_folder)

# Download the zip file
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>