In [1]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.92-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading nv

In [2]:
import cv2
import torch
import numpy as np
from PIL import Image
from torchvision import transforms
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Example libraries/models (replace with your favorite ones)
# ----------------------------------------------------------
#   - ultralytics/yolov5 for object detection
#   - huggingface transformers for image captioning or VQA
# ----------------------------------------------------------

# 1. Load your models
# -------------------
#   a. YOLO (for object detection): You can install via "pip install ultralytics"
#   b. A Vision-Language model (like BLIP, BLIP2, or another captioning model)

# Object Detection Model: YOLO
# ----------------------------------------------------------
try:
    from ultralytics import YOLO
    yolomodel = YOLO("yolov5s.pt")  # or your custom-trained weights
except ImportError:
    yolomodel = None
    print("YOLO model not found. Please install ultralytics for YOLO support.")

# Image Captioning Model (e.g., BLIP from Hugging Face)
# -----------------------------------------------------
from transformers import BlipProcessor, BlipForConditionalGeneration

# processor = BlipProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
# caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
caption_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)


def analyze_image(image_path: str) -> dict:
    """
    Analyze a single image with:
    1. Object detection
    2. Image caption generation
    3. (Optional) Additional logic to guess the action or context
    Returns a dictionary of results.
    """

    # 2. Load image
    # -------------
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not read image at path: {image_path}")

    # 3. Object Detection (if YOLO is available)
    # ------------------------------------------
    objects = []
    if yolomodel is not None:
        # YOLO expects images in RGB as a NumPy array or path
        results = yolomodel.predict(source=image, conf=0.25, show=False)
        # results is typically a list of 'Boxes' with xyxy coords, labels, confidence
        if len(results) > 0:
            # Take the first result if there's only one batch
            detection = results[0].boxes
            for box in detection:
                # Convert box to dictionary
                x1, y1, x2, y2 = box.xyxy[0].tolist()
                label_id = int(box.cls[0])
                confidence = float(box.conf[0])
                # YOLO model has built-in class names
                label_name = yolomodel.names[label_id] if hasattr(yolomodel, 'names') else str(label_id)
                objects.append({
                    "label": label_name,
                    "confidence": confidence,
                    "bbox": [x1, y1, x2, y2]
                })

    # 4. Generate a caption using a Vision-Language model
    # ---------------------------------------------------
    # Convert OpenCV's BGR to RGB, because PIL & BLIP expect RGB
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(image_rgb)
    inputs = processor(images=pil_image, return_tensors="pt")
    captions = caption_model.generate(**inputs)
    caption_text = processor.decode(captions[0], skip_special_tokens=True)

    # 5. (Optional) Additional “Action Context” or “Scene Reasoning”
    # --------------------------------------------------------------
    # For more advanced “action recognition,” you might use a specific model
    # that classifies what's happening (e.g., someone crawling, handing over an object).
    # Here we’ll keep it simple with a straightforward guess:
    # If objects contain "person" and the caption references "running" or "race",
    # you might guess it's a running competition, etc.

    inferred_action = "Not determined"
    # if any(obj['label'] in ["person"] for obj in objects):
    #     # Simple heuristic:
    #     if "running" in caption_text or "race" in caption_text:
    #         inferred_action = "Likely a runner in a race"
    #     elif "crawling" in caption_text or "falling" in caption_text:
    #         inferred_action = "A person crawling or falling"
    #     # You could improve this logic with an actual action-recognition model

    # 6. Return the combined analysis
    # -------------------------------
    return {
        "caption": caption_text,
        "objects_detected": objects,
        "inferred_action": inferred_action
    }

# 7. Main pipeline to process multiple images
# -------------------------------------------
def main():
    image_paths = [
        "car1.jpg",
        "car2.jpg",
        "car3.jpg",
    ]

    overall_description = []

    for img_path in image_paths:
        analysis_result = analyze_image(img_path)
        overall_description.append(analysis_result)
        print(f"Analysis for {img_path}: {analysis_result}")

    # 8. Summarize across all images
    # ------------------------------
    # This step might involve combining the captions/object detections from each image,
    # then passing them to a language model to get a single summary.
    # We’ll keep it simple and just print them.
    for idx, result in enumerate(overall_description):
        print(f"Image {idx+1} Analysis:")
        print("  - Caption: ", result["caption"])
        print("  - Objects Detected: ", result["objects_detected"])
        print("  - Inferred Action: ", result["inferred_action"])
        print()

    # (Optional) Additional logic to guess the overall event or story
    # For example, if multiple images mention "runner," "crawling," "exhaustion,"
    # you might guess "An exhausted runner is crawling to the finish line in a relay race."

if __name__ == "__main__":
    main()


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
PRO TIP 💡 Replace 'model=yolov5s.pt' with new 'model=yolov5su.pt'.
YOLOv5 'u' models are trained with https://github.com/ultralytics/ultralytics and feature improved performance vs standard YOLOv5 models trained with https://github.com/ultralytics/yolov5.

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov5su.pt to 'yolov5su.pt'...


100%|██████████| 17.7M/17.7M [00:00<00:00, 414MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

ValueError: Could not read image at path: car1.jpg