In [None]:
import getpass
import json
import os
import random
import shutil
from io import BytesIO

import google.generativeai as genai
import matplotlib.pyplot as plt
from datasets import load_dataset
from google.colab import drive
from PIL import Image
from tqdm import tqdm

In [None]:
# Folder locations on the mounted Google Drive
input_folder = "/content/drive/MyDrive/images_a"  # Source images; change this to your folder path
output_folder = "/content/drive/MyDrive/Gemini_Captions/images10"  # Destination for results

# Mount Google Drive, then make sure the destination folder exists
drive.mount('/content/drive')
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Configure the Gemini client without embedding the API key in the notebook.
# A key already present in the environment is reused; otherwise it is requested
# through a hidden prompt so it never ends up in the saved notebook source.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [None]:
# List the image files in the input folder (names only, not full paths).
# os.listdir gives no ordering guarantee, so sort the names to keep the
# processing order reproducible between runs.
image_files = sorted(
    f for f in os.listdir(input_folder)
    if f.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))
)
print(f"Loaded {len(image_files)} images from {input_folder}.")


In [None]:
# Function to process Gemini output and clean JSON
def clean_gemini_response(response_text):
    """Normalise a raw Gemini reply into a caption dictionary.

    Args:
        response_text: Text returned by the model, optionally wrapped in a
            Markdown code fence (``` or ```json).

    Returns:
        A dict with the keys "description" and "bounding_boxes".
        If the reply is a JSON object containing both keys, that object is
        returned with any "filename" key removed and each box's "coordinates"
        converted to a list of ints; boxes whose coordinates are missing or
        not convertible are dropped. Otherwise the cleaned reply text is used
        as the description and "bounding_boxes" is an empty list.
    """
    text = response_text.strip()

    # Remove the code fence as whole tokens. str.strip("```json") treats its
    # argument as a set of characters (`, j, s, o, n) and would eat those
    # characters from both ends of a plain-text reply ("no cars" -> "car").
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    text = text.strip()

    fallback = {"description": text, "bounding_boxes": []}

    try:
        parsed_output = json.loads(text)
    except json.JSONDecodeError:
        return fallback

    has_expected_shape = (
        isinstance(parsed_output, dict)
        and "description" in parsed_output
        and "bounding_boxes" in parsed_output
    )
    if not has_expected_shape:
        return fallback

    # The caller assigns the real filename, so discard whatever the model echoed.
    parsed_output.pop("filename", None)

    raw_boxes = parsed_output["bounding_boxes"]
    if not isinstance(raw_boxes, list):
        raw_boxes = []

    valid_boxes = []
    for box in raw_boxes:
        try:
            box["coordinates"] = [int(coord) for coord in box["coordinates"]]
        except (KeyError, TypeError, ValueError, OverflowError):
            # A malformed box should not discard an otherwise valid description.
            continue
        valid_boxes.append(box)

    parsed_output["bounding_boxes"] = valid_boxes
    return parsed_output

In [None]:
import time
from io import BytesIO

def generate_caption(image):
    """Ask Gemini for a driving-scene description and bounding boxes for one image.

    Args:
        image: A PIL image. It is re-encoded as JPEG in memory before being sent.

    Returns:
        A dict with the keys "description" and "bounding_boxes". On success this
        is the result of clean_gemini_response on the model's reply. If the reply
        has no text, the description is "No caption generated."; if anything
        raises, the description is "Error: <message>". In both of those cases
        "bounding_boxes" is an empty list.
    """
    try:
        model = genai.GenerativeModel("gemini-2.0-flash")

        # JPEG cannot store an alpha channel or a palette, so images in modes
        # such as RGBA or P (possible for PNG/WebP inputs) must be converted
        # first; otherwise the save below raises and the image is never captioned.
        if image.mode not in ("RGB", "L"):
            image = image.convert("RGB")

        img_byte_arr = BytesIO()
        image.save(img_byte_arr, format="JPEG")
        image_bytes = img_byte_arr.getvalue()
        image_input = {"mime_type": "image/jpeg", "data": image_bytes}

        prompt = (
            "Analyze this driving scene and produce a consistent, one-sentence description focused on factors that may affect driving decisions. "
            "Include details on environmental conditions, unusual objects, and vehicle behavior. "
            "Clearly categorize vehicles as follows: vehicles at a distance, vehicles directly in front, incoming vehicles, and side-approaching vehicles; "
            "if there are only a few vehicles, refer to them as a group. Highlight only the most relevant objects for situational awareness (limit to a maximum of 3 per image). "
            "Return your result strictly as a JSON object using the exact format: "
            "{\"filename\": \"<image_filename>\", \"description\": \"<concise and consistent scene description>\", "
            "\"bounding_boxes\": [{\"label\": \"Object_Type\", \"coordinates\": [x1, y1, x2, y2]}]}. "
            "Do not include any additional text; only valid JSON should be returned."
        )

        response = model.generate_content([image_input, prompt])

        if response.text:
            return clean_gemini_response(response.text)
        else:
            return {"description": "No caption generated.", "bounding_boxes": []}

    except Exception as e:
        # Best effort: report the failure in the caption record instead of
        # aborting the caller's batch loop.
        return {"description": f"Error: {str(e)}", "bounding_boxes": []}


In [None]:
import time
import os
import json
from tqdm import tqdm

# Process every image listed in `image_files`: save a JPEG copy to the output
# folder, generate a caption with bounding boxes, and collect the results.
# The images are read from `input_folder`; the previous version of this cell
# relied on `dataset`, `selected_indices` and `num_images`, none of which is
# defined in this notebook, so it failed on a fresh kernel.
caption_data = []  # One dict per image: filename, description, bounding_boxes
sleep_interval = 3  # Seconds to wait between requests; adjust to the API rate limits

# Sorted so the file<N>.jpg numbering is reproducible between runs.
source_files = sorted(image_files)

print("\nProcessing images...\n")
for i, source_name in tqdm(enumerate(source_files), total=len(source_files), desc="Progress"):
    # Convert to RGB on load: JPEG cannot store the alpha/palette modes that
    # PNG and WebP inputs may use.
    try:
        with Image.open(os.path.join(input_folder, source_name)) as opened:
            image = opened.convert("RGB")
    except OSError as exc:
        # Report and continue so one unreadable file does not discard the whole run.
        print(f"Skipping {source_name}: {exc}")
        continue

    filename = f"file{i+1}.jpg"
    save_path = os.path.join(output_folder, filename)
    image.save(save_path, format="JPEG")

    # Generate caption and bounding box info using Gemini API
    gemini_output = generate_caption(image)
    gemini_output["filename"] = filename  # Ensure filename is included
    caption_data.append(gemini_output)

    time.sleep(sleep_interval)  # Prevent hitting API rate limits

# Save structured captions to a JSON file.
# NOTE(review): the path is relative to the current working directory, not
# `output_folder` - confirm that this is where the captions should live.
json_filename = f"captions{segment_number}.json" if 'segment_number' in globals() else "captions.json"
with open(json_filename, "w", encoding="utf-8") as f:
    json.dump(caption_data, f, indent=4)

print(f"\n✅ Captions and bounding boxes saved to {json_filename}")
