In [None]:
!pip install datasets --upgrade google-generativeai

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [None]:
import os
import random
import google.generativeai as genai
import PIL.Image
import matplotlib.pyplot as plt
from datasets import load_dataset
from PIL import Image
from io import BytesIO
import json
from tqdm import tqdm

In [None]:
# Configure the Gemini client.
# The key is taken from the GEMINI_API_KEY environment variable; when it is
# not set, prompt for it without echoing so the secret is never written into
# (and saved with) the notebook source.
if not os.environ.get("GEMINI_API_KEY"):
    from getpass import getpass

    os.environ["GEMINI_API_KEY"] = getpass("Enter your Gemini API key: ")
genai.configure(api_key=os.environ["GEMINI_API_KEY"])


In [None]:
# Fetch the train split of the DHPR (Driving-Hazard-Prediction-and-Reasoning)
# dataset from the Hugging Face Hub and report how many samples it contains.
dataset = load_dataset(
    "DHPR/Driving-Hazard-Prediction-and-Reasoning",
    split="train",
)
print(f"Loaded {len(dataset)} training samples.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

val-00000-of-00001.parquet:   0%|          | 0.00/112M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/115M [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/391M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/395M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/373M [00:00<?, ?B/s]

Generating val split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/12975 [00:00<?, ? examples/s]

Loaded 12975 training samples.


In [None]:
# Ask user for the segment number to process (segments are numbered from 1)
segment_number = int(input("Enter the segment number to process (e.g., 1, 2, 3, ...): "))

# Define segment size (number of images per segment)
segment_size = 1000

# Reject segment numbers below 1: they would yield a negative start index and
# therefore indices that do not belong to any real segment.
if segment_number < 1:
    raise ValueError(f"Segment number must be 1 or greater, got {segment_number}.")

# Calculate start and end indices for the segment
start_idx = (segment_number - 1) * segment_size
end_idx = min(start_idx + segment_size, len(dataset))

# Raise instead of calling exit(): exit() ends the interpreter session, whereas
# an exception stops this cell (and "Run all") with a readable message.
if start_idx >= len(dataset):
    raise ValueError(f"Segment {segment_number} is out of range. There are only {len(dataset)} images.")

print(f"Processing images from index {start_idx} to {end_idx - 1}.")

# Select images sequentially for the specified segment
selected_indices = list(range(start_idx, end_idx))
num_images = len(selected_indices)

# Create output folder for images (e.g., "images1" for segment 1)
output_folder = f"images{segment_number}"
os.makedirs(output_folder, exist_ok=True)


Enter the segment number to process (e.g., 1, 2, 3, ...): 2
Processing images from index 1000 to 1999.


In [None]:
# Function to process Gemini output and clean JSON
def clean_gemini_response(response_text):
    """Convert Gemini's raw reply into a caption record.

    Args:
        response_text: The text returned by the model. It is expected to be a
            JSON object, optionally wrapped in a Markdown code fence
            (``` or ```json).

    Returns:
        A dict with the keys ``"description"`` and ``"bounding_boxes"``.
        When the reply is a JSON object containing both keys, that object is
        returned with any ``"filename"`` key removed, every box's
        ``"coordinates"`` converted to ints, and malformed boxes dropped.
        Otherwise the (un-fenced, stripped) reply text is used as the
        description and the bounding box list is empty.
    """
    # Remove the Markdown fence as whole tokens. str.strip("```json") would
    # treat its argument as a *set of characters* and also eat trailing
    # letters such as 'n' or 's' from plain-text replies.
    text = response_text.strip()
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    text = text.strip()

    fallback = {"description": text, "bounding_boxes": []}

    try:
        parsed_output = json.loads(text)
    except json.JSONDecodeError:
        return fallback

    # Validate structure
    has_expected_shape = (
        isinstance(parsed_output, dict)
        and "description" in parsed_output
        and "bounding_boxes" in parsed_output
    )
    if not has_expected_shape:
        return fallback

    parsed_output.pop("filename", None)  # Remove filename if included

    raw_boxes = parsed_output["bounding_boxes"]
    if not isinstance(raw_boxes, list):
        raw_boxes = []

    # Ensure bounding box coordinates are integers. A box that is not a dict,
    # lacks "coordinates", or has non-numeric coordinates is skipped so that
    # one bad box does not discard the whole caption.
    cleaned_boxes = []
    for box in raw_boxes:
        try:
            coordinates = [int(coord) for coord in box["coordinates"]]
        except (KeyError, TypeError, ValueError, OverflowError):
            continue
        box["coordinates"] = coordinates
        cleaned_boxes.append(box)

    parsed_output["bounding_boxes"] = cleaned_boxes
    return parsed_output


In [None]:
import time
from io import BytesIO

def generate_caption(image, retries=3, wait_time=5):
    """Ask Gemini for a one-sentence scene description plus bounding boxes.

    Args:
        image: The image to describe; it must provide ``mode``, ``convert``
            and ``save`` (the caller passes the dataset's image objects).
        retries: Maximum number of requests to attempt when the API reports a
            rate limit (an error whose message contains "429").
        wait_time: Seconds to wait before the first retry; doubled after each
            rate-limited attempt.

    Returns:
        A dict with ``"description"`` and ``"bounding_boxes"`` keys. On
        success this is the output of ``clean_gemini_response``; on failure
        the description holds an error/fallback message and the box list is
        empty. The function does not raise for API or encoding errors.
    """
    prompt = (
        "Analyze this driving scene and produce a consistent, one-sentence description focused on factors that may affect driving decisions. "
        "Include details on environmental conditions, unusual objects, and vehicle behavior. "
        "Clearly categorize vehicles as follows: vehicles at a distance, vehicles directly in front, incoming vehicles, and side-approaching vehicles; "
        "if there are only a few vehicles, refer to them as a group. Highlight only the most relevant objects for situational awareness (limit to a maximum of 3 per image). "
        "Return your result strictly as a JSON object using the exact format: "
        "{\"filename\": \"<image_filename>\", \"description\": \"<concise and consistent scene description>\", "
        "\"bounding_boxes\": [{\"label\": \"Object_Type\", \"coordinates\": [x1, y1, x2, y2]}]}. "
        "Do not include any additional text; only valid JSON should be returned."
    )

    # Build the model handle and the JPEG payload once; neither changes
    # between retries, so there is no reason to redo the work per attempt.
    try:
        model = genai.GenerativeModel("gemini-2.0-flash")

        # JPEG cannot store alpha/palette modes, so normalise those to RGB
        # before encoding instead of failing on such images.
        if image.mode not in ("RGB", "L", "CMYK"):
            image = image.convert("RGB")

        img_byte_arr = BytesIO()
        image.save(img_byte_arr, format="JPEG")
        image_input = {"mime_type": "image/jpeg", "data": img_byte_arr.getvalue()}
    except Exception as e:
        return {"description": f"Error: {e}", "bounding_boxes": []}

    for attempt in range(retries):
        try:
            response = model.generate_content([image_input, prompt])

            if response.text:
                return clean_gemini_response(response.text)
            return {"description": "No caption generated.", "bounding_boxes": []}

        except Exception as e:
            error_message = str(e)

            # Anything other than a rate-limit error is reported immediately.
            if "429" not in error_message:
                return {"description": f"Error: {error_message}", "bounding_boxes": []}

            # No further attempt will follow, so do not wait for nothing.
            if attempt == retries - 1:
                break

            print(f"⚠️ Rate limit hit! Retrying in {wait_time} seconds... (Attempt {attempt + 1}/{retries})")
            time.sleep(wait_time)
            wait_time *= 2  # Exponential backoff (5s → 10s → 20s with the defaults)

    return {"description": "Failed after multiple retries.", "bounding_boxes": []}


In [None]:
# Process selected images: generate captions, save images, and collect caption data.
caption_data = []  # List to store objects with filename, description, and bounding boxes

# Decide on the output file up front so it is available even if the loop fails.
json_filename = f"captions{segment_number}.json" if 'segment_number' in globals() else "captions.json"

print("\nProcessing images...\n")
try:
    for i, idx in tqdm(enumerate(selected_indices), total=num_images, desc="Progress"):
        # Retrieve image from dataset (already a PIL.Image)
        image = dataset[idx]["image"]
        filename = f"file{i+1}.jpg"
        save_path = os.path.join(output_folder, filename)
        image.save(save_path, format="JPEG")

        # Generate caption and bounding box info using Gemini API
        gemini_output = generate_caption(image)
        gemini_output["filename"] = filename  # Ensure filename is included
        caption_data.append(gemini_output)
finally:
    # The loop makes one API call per image and can run for a long time, so
    # write whatever has been collected even when it is interrupted or an
    # image fails; otherwise all finished captions would be lost.
    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(caption_data, f, indent=4)

    if len(caption_data) == num_images:
        print(f"\n✅ Captions and bounding boxes saved to {json_filename}")
    else:
        print(f"\n⚠️ Stopped early: saved {len(caption_data)} of {num_images} captions to {json_filename}")


Processing images...



Progress: 100%|██████████| 1000/1000 [52:09<00:00,  3.13s/it]


✅ Captions and bounding boxes saved to captions2.json





In [None]:
from google.colab import drive
import shutil

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Define paths
drive_folder = "/content/drive/MyDrive/Gemini_Captions"
os.makedirs(drive_folder, exist_ok=True)

# Step 3: Copy the images folder and JSON file of the segment that was just
# processed. Using `output_folder` / `json_filename` (set by the earlier cells)
# instead of hard-coded names keeps the copy and the message below in step
# with whichever segment number was entered.
shutil.copytree(output_folder, os.path.join(drive_folder, output_folder), dirs_exist_ok=True)
shutil.copy(json_filename, os.path.join(drive_folder, json_filename))

print(f"\n✅ Successfully saved {output_folder} and {json_filename} to Google Drive at {drive_folder}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

✅ Successfully saved images1 and captions1.json to Google Drive at /content/drive/MyDrive/Gemini_Captions
