In [None]:
# Mount Google Drive at /content/drive so the notebook can read images and
# annotations from it and write result files back.
# NOTE(review): later cells use relative "drive/MyDrive/..." paths, which only
# resolve while the working directory is /content - confirm before changing cwd.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
# Report the GPU only when CUDA is actually present: get_device_name() and
# get_device_capability() raise on a CPU-only runtime instead of printing.
cuda_ok = torch.cuda.is_available()
print(cuda_ok)  # Should print True if A100 is detected
if cuda_ok:
    print(torch.cuda.get_device_name(0))  # Should print "NVIDIA A100..."
    print(torch.cuda.get_device_capability(0))  # Should print (8, 0) for A100
else:
    print("No CUDA device detected - switch the Colab runtime to a GPU.")

True
NVIDIA A100-SXM4-40GB
(8, 0)


## Image captioning

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
# BLIP captioning baseline (this checkpoint is BLIP, not BLIP-2).

# Load the BLIP captioning checkpoint and its matching pre-processor.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# Pass the full-resolution RGB image and let the processor do the resizing:
# shrinking to 224x224 by hand first throws away detail before the processor
# rescales the picture to the model's own input size.
image = Image.open("drive/MyDrive/Colab Notebooks/sidewalk.jpg").convert("RGB")

# Generate caption
inputs = processor(images=image, return_tensors="pt")
output = model.generate(**inputs)

print(processor.decode(output[0], skip_special_tokens=True))

there are two people walking down the sidewalk on the sidewalk


### PaliGemma 2

In [None]:
from transformers import (
    PaliGemmaProcessor,
    PaliGemmaForConditionalGeneration,
)
from PIL import Image

model_id = "google/paligemma2-3b-mix-448"

image = Image.open("drive/MyDrive/Colab Notebooks/sidewalk.jpg")

device = "cuda" if torch.cuda.is_available() else "cpu"
# Move the weights to `device`: without .to(device) the model stays on the CPU
# and the GPU selected on the line above is never used.
model = (
    PaliGemmaForConditionalGeneration
    .from_pretrained(model_id, torch_dtype=torch.bfloat16)
    .to(device)
    .eval()
)
processor = PaliGemmaProcessor.from_pretrained(model_id)

# The processor expects one <image> placeholder per image at the very start of
# the text; writing it explicitly removes the "we will infer how many images"
# warning that the processor otherwise emits.
prompt = "<image>I am visually impaired, and the scene depicted in the image represents my current view. describe the image: "
model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch.bfloat16).to(model.device)
# Length of the prompt in tokens, used below to keep only the generated part.
input_len = model_inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(
        **model_inputs,
        max_new_tokens=512,   # Increase output length
        do_sample=True,       # Enable sampling for more variation
        temperature=0.7,      # Controls randomness
        top_p=0.9,            # Ensures diverse but relevant words
        repetition_penalty=1.1  # Avoids repetition
    )

    # generate() returns prompt + continuation; drop the prompt tokens.
    generation = generation[0][input_len:]
    decoded = processor.decode(generation, skip_special_tokens=True)
    print(decoded)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.


2 women walking down a city sidewalk


## LLaVA

In [None]:
import requests
from PIL import Image

from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-1.5-7b-hf"
device = "cuda" if torch.cuda.is_available() else "cpu"
# Use the detected device instead of a hard-coded GPU index so the choice made
# on the line above is the one that is actually applied.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
).to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Define a chat history and use `apply_chat_template` to get correctly formatted prompt
# Each value in "content" has to be a list of dicts with types ("text", "image")
conversation = [
    {

      "role": "user",
      "content": [
          {"type": "text", "text": "I am visually impaired, and the scene depicted in the image represents my current view. describe the image"},
          {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

image = Image.open("drive/MyDrive/Colab Notebooks/sidewalk.jpg")
inputs = processor(images=image, text=prompt, return_tensors='pt').to(device, torch.float16)

output = model.generate(**inputs, max_new_tokens=500, do_sample=False)
# generate() returns prompt + continuation. Skipping a fixed two tokens printed
# the prompt with its start chopped off ("ER: ..."); skip the whole prompt so
# only the model's answer is shown.
prompt_len = inputs["input_ids"].shape[-1]
print(processor.decode(output[0][prompt_len:], skip_special_tokens=True).strip())


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

ER:  
I am visually impaired, and the scene depicted in the image represents my current view. describe the image ASSISTANT: The image shows a city street with a group of people walking down the sidewalk. There are two women walking together, and a few other people can be seen in the scene. A FedEx truck is parked on the side of the road, and a couple of cars are also visible in the background.

The street is lined with trees, providing a pleasant atmosphere for the pedestrians. There are multiple traffic lights in the scene, ensuring the safe flow of traffic. A handbag can be spotted on the sidewalk, possibly belonging to one of the pedestrians.


In [None]:
import json
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# 1. Load LLaVA Model and Processor
# Hugging Face checkpoint id shared by the model and its processor.
model_id = "llava-hf/llava-1.5-7b-hf"
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the weights in half precision and move them to `device`.
# `model`, `processor` and `device` are module-level names that
# generate_navigation_instruction() in this cell reads directly.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
).to(device)

# Processor used to render the chat prompt and to build the model inputs.
processor = AutoProcessor.from_pretrained(model_id)

# 2. Load Dataset
def load_dataset(file_path):
    """Read a JSON annotation file and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file; the callers in this
        notebook iterate over it as a sequence of annotation dicts.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Be explicit about UTF-8 so non-ASCII text in the annotations does not
    # depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 3. Process an Image and Question using LLaVA's message format
def generate_navigation_instruction(image_path, question):
    """Ask LLaVA `question` about the image at `image_path` and return its answer.

    Uses the module-level `processor`, `model` and `device`.

    Args:
        image_path: Path of the image file to load.
        question: Text placed before the image in the single user turn.

    Returns:
        The generated answer with surrounding whitespace removed, or ``None``
        when the image cannot be loaded or generation fails. Errors are printed
        rather than raised so that a batch run can continue.
    """
    # Load and preprocess image
    try:
        # The context manager closes the underlying file once the RGB copy exists.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
    except Exception as e:
        print(f"Error loading image {image_path}: {str(e)}")
        return None

    # Create conversation with image and text
    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": question,
                },
                {"type": "image", "image": image},
            ],
        }
    ]

    try:
        # Prepare inputs
        prompt = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True
        )
        # The conversation holds exactly the one image loaded above, so pass it
        # directly instead of re-collecting it from the message list.
        inputs = processor(
            text=prompt,
            images=[image],
            return_tensors="pt"
        ).to(device, torch.float16)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=200, do_sample=False)

        # generate() returns prompt + continuation. Cut the prompt off by token
        # count instead of splitting the decoded text on "ASSISTANT: ": the split
        # raised IndexError on an empty answer and truncated any answer that
        # itself contained the marker.
        prompt_len = inputs["input_ids"].shape[-1]
        instruction = processor.decode(
            outputs[0][prompt_len:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )

        return instruction.strip()

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# 4. Process the Entire Dataset
def process_dataset(dataset, image_dir):
    """Run generate_navigation_instruction() over every annotation entry.

    Args:
        dataset: Iterable of dicts providing "index" and "question".
        image_dir: Folder holding one "<index>.jpg" image per entry.

    Returns:
        A list of {"image_id", "question", "instruction"} dicts, one for each
        entry that produced a non-empty answer.
    """
    collected = []
    for record in dataset:
        image_id, question = record["index"], record["question"]
        image_path = f"{image_dir}/{image_id}.jpg"

        try:
            print(f"Processing {image_id}: {question}")
            answer = generate_navigation_instruction(image_path, question)
            if not answer:
                # None or empty text: report it and move on to the next entry.
                print(f"Failed to generate instruction for {image_id}")
                continue

            collected.append(
                {"image_id": image_id, "question": question, "instruction": answer}
            )
            print(f"Instruction: {answer}\n---")

        except Exception as err:
            print(f"Error processing {image_id}: {str(err)}")

    return collected

# 5. Save Results
def save_results(results, output_path="drive/MyDrive/Colab Notebooks/llava_outdoor_navigation_instructions.json"):
    """Write `results` to `output_path` as indented JSON.

    Args:
        results: JSON-serialisable object (here: the list built by
            process_dataset()).
        output_path: Destination file; missing parent folders are created.

    Raises:
        OSError: If the folder or file cannot be created or written.
        TypeError: If `results` is not JSON-serialisable.
    """
    import os  # local import: this cell's import block does not bring in os

    # Create the target folder first so that a missing directory cannot discard
    # results that took a full inference run to produce.
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f"Saved results to {output_path}")

# 6. Main Execution
# Cell entry point: load the annotations, query the model for every entry and
# save the answers. The guard is true when the cell is executed in the notebook
# (the recorded output shows this block running).
if __name__ == "__main__":
    # Input locations, given relative to the working directory.
    annotationPath = "drive/MyDrive/Colab Notebooks/annotations/outdoor_qa_pair_gt.json"
    imageDir = "drive/MyDrive/Colab Notebooks/environment_images/outdoor_images_1024x768"

    # Load dataset
    dataset = load_dataset(annotationPath)

    # Process dataset
    results = process_dataset(dataset, imageDir)

    # Save outputs (to the default output_path of save_results)
    save_results(results)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Processing 0: Hello, I am visually impaired and need your assistance to complete the task of 'Approaching the target object'. Here is where I stand, and the scene depicted in the image is the view in front of me.The target object I want to approach is: fire assembly point. Could you please guide me to it using tactile cues based on the scene shown in the image?
Instruction: I'm sorry, but I am unable to provide tactile cues as I am an AI visual assistant and do not have the ability to physically guide someone. However, I can provide you with information about the scene in the image. In the image, there is a fire assembly point sign on a pole. The sign is located near a grassy area with trees in the background. The sign is positioned on the side of the road, and there is a body of water nearby. Based on this information, you can approach the fire assembly point sign by following the road and looking for the sign on the side of the road.
---
Processing 1: Hello, I am visually impaired an

In [None]:
def clean_json(input_path, output_path):
    """Strip the chat prefix from saved answers and write a cleaned JSON copy.

    Every entry of the input list must provide "image_id", "question" and
    "instruction". Everything up to and including the first "ASSISTANT:" marker
    is removed from the instruction and the remainder is stripped; entries
    without the marker are copied unchanged.

    Args:
        input_path: JSON file holding the list of result entries.
        output_path: File the cleaned list is written to.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If an entry lacks one of the three required keys.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    cleaned_data = []
    for entry in data:
        raw_text = entry["instruction"]
        # partition() cannot fail. The previous code tested for "ASSISTANT:" but
        # split on "ASSISTANT: " (with a space), so a marker at the very end of
        # the text or followed by a newline raised IndexError, and any text
        # after a second marker was silently dropped.
        _, marker, answer = raw_text.partition("ASSISTANT:")
        instruction = answer.strip() if marker else raw_text

        cleaned_entry = {
            "image_id": entry["image_id"],
            "question": entry["question"],
            "instruction": instruction
        }
        cleaned_data.append(cleaned_entry)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, indent=2)
    print(f"Cleaned data saved to {output_path}")

# NOTE(review): these bare file names resolve against the current working
# directory and differ from the default output_path of the LLaVA save_results()
# ("drive/MyDrive/Colab Notebooks/llava_outdoor_navigation_instructions.json")
# - confirm which file is meant to be cleaned.
clean_json("llava_navigation_instructions.json", "llava_navigation_instructions_cleaned.json")

Cleaned data saved to llava_navigation_instructions_cleaned.json


### Environment understanding

In [None]:
import json
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# 1. Load LLaVA Model and Processor
# Hugging Face checkpoint id shared by the model and its processor.
# NOTE(review): an earlier cell loads this same checkpoint; within one kernel
# session this reload is redundant - confirm the cells are meant to run standalone.
model_id = "llava-hf/llava-1.5-7b-hf"
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the weights in half precision and move them to `device`.
# `model`, `processor` and `device` are module-level names that
# generate_navigation_instruction() in this cell reads directly.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
).to(device)

# Processor used to render the chat prompt and to build the model inputs.
processor = AutoProcessor.from_pretrained(model_id)

# 2. Load Dataset
def load_dataset(file_path):
    """Read a JSON annotation file and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file; the callers in this
        notebook iterate over it as a sequence of annotation dicts.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Be explicit about UTF-8 so non-ASCII text in the annotations does not
    # depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 3. Process an Image and Question using LLaVA's message format
def generate_navigation_instruction(image_path, question):
    """Ask LLaVA `question` about the image at `image_path` and return its answer.

    Uses the module-level `processor`, `model` and `device`.

    Args:
        image_path: Path of the image file to load.
        question: Text placed before the image in the single user turn.

    Returns:
        The generated answer with surrounding whitespace removed, or ``None``
        when the image cannot be loaded or generation fails. Errors are printed
        rather than raised so that a batch run can continue.
    """
    # Load and preprocess image
    try:
        # The context manager closes the underlying file once the RGB copy exists.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
    except Exception as e:
        print(f"Error loading image {image_path}: {str(e)}")
        return None

    # Create conversation with image and text
    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": question,
                },
                {"type": "image", "image": image},
            ],
        }
    ]

    try:
        # Prepare inputs
        prompt = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True
        )
        # The conversation holds exactly the one image loaded above, so pass it
        # directly instead of re-collecting it from the message list.
        inputs = processor(
            text=prompt,
            images=[image],
            return_tensors="pt"
        ).to(device, torch.float16)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=200, do_sample=False)

        # generate() returns prompt + continuation. Cut the prompt off by token
        # count instead of splitting the decoded text on "ASSISTANT: ": the split
        # raised IndexError on an empty answer and truncated any answer that
        # itself contained the marker.
        prompt_len = inputs["input_ids"].shape[-1]
        instruction = processor.decode(
            outputs[0][prompt_len:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )

        return instruction.strip()

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# 4. Process the Entire Dataset
def process_dataset(dataset, image_dir, limit=25):
    """Answer the "EnvQ" question of the leading dataset entries.

    Args:
        dataset: Iterable of dicts providing "index" and "EnvQ".
        image_dir: Folder holding one "<index>.jpg" image per entry.
        limit: Maximum number of leading entries to process. Defaults to 25,
            the cut-off that used to be hard-coded; ``None`` processes all.

    Returns:
        A list of {"image_id", "question", "instruction"} dicts, one for each
        processed entry that produced a non-empty answer.
    """
    results = []
    for i, entry in enumerate(dataset):
        # Stop at the cap instead of walking the rest of the dataset while
        # skipping every remaining entry.
        if limit is not None and i >= limit:
            break

        image_id = entry["index"]
        question = entry["EnvQ"]
        image_path = f"{image_dir}/{image_id}.jpg"

        try:
            print(f"Processing {image_id}: {question}")
            instruction = generate_navigation_instruction(image_path, question)
            if instruction:
                results.append({
                    "image_id": image_id,
                    "question": question,
                    "instruction": instruction
                })
                print(f"Instruction: {instruction}\n---")
            else:
                print(f"Failed to generate instruction for {image_id}")

        except Exception as e:
            print(f"Error processing {image_id}: {str(e)}")

    return results

# 5. Save Results
def save_results(results, output_path="drive/MyDrive/Colab Notebooks/envQA/llava_outdoor_env.json"):
    """Write `results` to `output_path` as indented JSON.

    Args:
        results: JSON-serialisable object (here: the list built by
            process_dataset()).
        output_path: Destination file; missing parent folders are created.

    Raises:
        OSError: If the folder or file cannot be created or written.
        TypeError: If `results` is not JSON-serialisable.
    """
    import os  # local import: this cell's import block does not bring in os

    # Create the target folder (e.g. "envQA") first so that a missing directory
    # cannot discard results that took a full inference run to produce.
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f"Saved results to {output_path}")

# 6. Main Execution
# Cell entry point for the environment-understanding run: load the annotations,
# query the model and save the answers. The guard is true when the cell is
# executed in the notebook (the recorded output shows this block running).
if __name__ == "__main__":
    # Input locations, given relative to the working directory.
    annotationPath = "drive/MyDrive/Colab Notebooks/annotations2/outdoor_qa_pair_gt_updated.json"
    imageDir = "drive/MyDrive/Colab Notebooks/environment_images/outdoor_images_1024x768"

    # Load dataset
    dataset = load_dataset(annotationPath)

    # Process dataset (process_dataset in this cell handles only the first 25 entries)
    results = process_dataset(dataset, imageDir)

    # Save outputs (to the default output_path of save_results)
    save_results(results)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Processing 0: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: A green field with a sign and two trees in the foreground.
---
Processing 1: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: A large building with a staircase leading to a doorway, a bench and a potted plant in front of the building.
---
Processing 2: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: A park with a bench and a trash can, and a building with a solar panel on top.
---
Processing 3: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: A man is sitting at a picnic table in a park, with a bench nearby and a lamp post above him.
---
Processing 4: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: The image shows a parking lot with a row of parking meters and several cars parked i

### Proximity and distance

In [None]:
import json
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration


# 1. Load LLaVA Model and Processor
# Hugging Face checkpoint id shared by the model and its processor.
# NOTE(review): earlier cells load this same checkpoint; within one kernel
# session this reload is redundant - confirm the cells are meant to run standalone.
model_id = "llava-hf/llava-1.5-7b-hf"
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the weights in half precision and move them to `device`.
# `model`, `processor` and `device` are module-level names that
# generate_navigation_instruction() in this cell reads directly.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
).to(device)

# Processor used to render the chat prompt and to build the model inputs.
processor = AutoProcessor.from_pretrained(model_id)

# 2. Load Dataset
def load_dataset(file_path):
    """Read a JSON annotation file and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file; the callers in this
        notebook iterate over it as a sequence of annotation dicts.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Be explicit about UTF-8 so non-ASCII text in the annotations does not
    # depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 3. Process an Image and Question using LLaVA's message format
def generate_navigation_instruction(image_path, question):
    """Ask LLaVA `question` about the image at `image_path` and return its answer.

    Uses the module-level `processor`, `model` and `device`.

    Args:
        image_path: Path of the image file to load.
        question: Text placed before the image in the single user turn.

    Returns:
        The generated answer with surrounding whitespace removed, or ``None``
        when the image cannot be loaded or generation fails. Errors are printed
        rather than raised so that a batch run can continue.
    """
    # Load and preprocess image
    try:
        # The context manager closes the underlying file once the RGB copy exists.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
    except Exception as e:
        print(f"Error loading image {image_path}: {str(e)}")
        return None

    # Create conversation with image and text
    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": question,
                },
                {"type": "image", "image": image},
            ],
        }
    ]

    try:
        # Prepare inputs
        prompt = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True
        )
        # The conversation holds exactly the one image loaded above, so pass it
        # directly instead of re-collecting it from the message list.
        inputs = processor(
            text=prompt,
            images=[image],
            return_tensors="pt"
        ).to(device, torch.float16)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=200, do_sample=False)

        # generate() returns prompt + continuation. Cut the prompt off by token
        # count instead of splitting the decoded text on "ASSISTANT: ": the split
        # raised IndexError on an empty answer and truncated any answer that
        # itself contained the marker.
        prompt_len = inputs["input_ids"].shape[-1]
        instruction = processor.decode(
            outputs[0][prompt_len:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )

        return instruction.strip()

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# 4. Process the Entire Dataset
def process_dataset(dataset, image_dir, limit=25):
    """Answer the "distanceQ" question of the leading dataset entries.

    Args:
        dataset: Iterable of dicts providing "index" and "distanceQ".
        image_dir: Folder holding one "<index>.jpg" image per entry.
        limit: Maximum number of leading entries to process. Defaults to 25,
            the cut-off that used to be hard-coded; ``None`` processes all.

    Returns:
        A list of {"image_id", "question", "instruction"} dicts, one for each
        processed entry that produced a non-empty answer.
    """
    results = []
    for i, entry in enumerate(dataset):
        # Stop at the cap instead of walking the rest of the dataset while
        # skipping every remaining entry.
        if limit is not None and i >= limit:
            break

        image_id = entry["index"]
        question = entry["distanceQ"]
        image_path = f"{image_dir}/{image_id}.jpg"

        try:
            print(f"Processing {image_id}: {question}")
            instruction = generate_navigation_instruction(image_path, question)
            if instruction:
                results.append({
                    "image_id": image_id,
                    "question": question,
                    "instruction": instruction
                })
                print(f"Instruction: {instruction}\n---")
            else:
                print(f"Failed to generate instruction for {image_id}")

        except Exception as e:
            print(f"Error processing {image_id}: {str(e)}")

    return results

# 5. Save Results
def save_results(results, output_path="drive/MyDrive/Colab Notebooks/distanceQA/llava_outdoor_distance.json"):
    """Write `results` to `output_path` as indented JSON.

    Args:
        results: JSON-serialisable object (here: the list built by
            process_dataset()).
        output_path: Destination file; missing parent folders are created.

    Raises:
        OSError: If the folder or file cannot be created or written.
        TypeError: If `results` is not JSON-serialisable.
    """
    import os  # local import: this cell's import block does not bring in os

    # Create the target folder (e.g. "distanceQA") first so that a missing
    # directory cannot discard results that took a full inference run to produce.
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f"Saved results to {output_path}")

# 6. Main Execution
# Cell entry point for the distance-estimation run: load the annotations, query
# the model and save the answers. The guard is true when the cell is executed
# in the notebook (the recorded output shows this block running).
if __name__ == "__main__":
    # Input locations, given relative to the working directory.
    annotationPath = "drive/MyDrive/Colab Notebooks/annotations3/outdoor_qa_pair_gt_updated.json"
    imageDir = "drive/MyDrive/Colab Notebooks/environment_images/outdoor_images_1024x768"

    # Load dataset
    dataset = load_dataset(annotationPath)

    # Process dataset (process_dataset in this cell handles only the first 25 entries)
    results = process_dataset(dataset, imageDir)

    # Save outputs (to the default output_path of save_results)
    save_results(results)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Processing 0: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the fire assembly point. Output only the number.
Instruction: 100.0
---
Processing 1: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the staircase. Output only the number.
Instruction: 100
---
Processing 2: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the chair. Output only the number.
Instruction: 10.0
---
Processing 3: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the desk. Output only the number.
Instruction: 10.0
---
Processing 4: Given the attached image, report the straight-line 3-D Euclidean dist

In [None]:

import gc

# empty_cache() can only hand back blocks that no live tensor occupies. As long
# as `model` still points at the previously loaded network its weights stay on
# the GPU, so drop those references first and collect them before flushing.
model = None
processor = None
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()  # Return cached, now-unused blocks to the GPU
    torch.cuda.ipc_collect()

# Qwen

In [None]:
!pip install bitsandbytes



In [None]:
import json
from PIL import Image
import torch
import cv2
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    BitsAndBytesConfig
)

# 1. Load Qwen-VL Model and Processor
# Hugging Face checkpoint id shared by the model and its processor.
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

# Configure 4-bit quantization: NF4 weights with double quantization, bfloat16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized model with automatic device placement and switch it to eval mode.
# NOTE(review): torch_dtype is float16 while the 4-bit compute dtype above is
# bfloat16 - confirm the mismatch is intended (bfloat16 for both looks more consistent).
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.float16
).eval()

# Processor used to render the chat prompt and to build the model inputs.
processor = AutoProcessor.from_pretrained(model_name)

# 2. Load Dataset
def load_dataset(file_path):
    """Read a JSON annotation file and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file; the callers in this
        notebook iterate over it as a sequence of annotation dicts.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Be explicit about UTF-8 so non-ASCII text in the annotations does not
    # depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 3. Process an Image and Question using the message format
def generate_navigation_instruction(image_path, question):
    """Ask the Qwen-VL model `question` about one image and return its reply.

    Uses the module-level `processor` and `model`.

    Args:
        image_path: Path of the image file to load.
        question: Text placed after the image in the single user turn.

    Returns:
        The decoded continuation (prompt tokens removed), or ``None`` when
        prompt construction, generation or decoding fails; the error is printed.

    Raises:
        Exception: Whatever opening or converting the image raises; that step
            is not covered by the try block below.
    """
    # Not wrapped in the try below, so a missing or unreadable image raises to the caller.
    rgb_image = Image.open(image_path).convert("RGB")

    # One user turn: the PIL image first, then the question text.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": rgb_image},
            {"type": "text", "text": question},
        ],
    }
    messages = [user_turn]

    try:
        # Render the conversation into the prompt string (left untokenized here).
        chat_text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Gather every image attached to the conversation, in message order.
        attached_images = [
            part["image"]
            for turn in messages
            for part in turn["content"]
            if part["type"] == "image"
        ]

        # Build the tensors and place them on the model's device.
        model_inputs = processor(
            text=[chat_text],
            images=attached_images,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            generated = model.generate(**model_inputs, max_new_tokens=350)

        # Each output row starts with its prompt; keep only what follows it.
        continuations = []
        for prompt_ids, full_ids in zip(model_inputs.input_ids, generated):
            continuations.append(full_ids[len(prompt_ids):])

        decoded = processor.batch_decode(
            continuations,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0]  # single prompt in the batch, so a single reply

    except Exception as err:
        print(f"Error processing {image_path}: {str(err)}")
        return None

# 4. Process the Entire Dataset
def process_dataset(dataset, image_dir):
    """Query the model once per annotation entry and gather the answers.

    Args:
        dataset: Iterable of dicts providing "index" and "question".
        image_dir: Folder holding one "<index>.jpg" image per entry.

    Returns:
        A list of {"image_id", "question", "instruction"} dicts for the entries
        whose answer was non-empty.
    """
    answered = []
    for item in dataset:
        image_id = item["index"]
        question = item["question"]
        image_path = f"{image_dir}/{image_id}.jpg"

        try:
            print(f"Processing {image_id}: {question}")
            reply = generate_navigation_instruction(image_path, question)
            if not reply:
                # None or empty text: report it and move on to the next entry.
                print(f"Failed to generate instruction for {image_id}")
                continue

            answered.append(
                {"image_id": image_id, "question": question, "instruction": reply}
            )
            print(f"Instruction: {reply}\n---")

        except Exception as err:
            # Also covers image-loading errors raised by the generation helper.
            print(f"Error processing {image_id}: {str(err)}")

    return answered

# 5. Save Results
def save_results(results, output_path="drive/MyDrive/Colab Notebooks/Qwen_outdoor_navigation_instructions.json"):
    """Write `results` to `output_path` as indented JSON.

    Args:
        results: JSON-serialisable object (here: the list built by
            process_dataset()).
        output_path: Destination file; missing parent folders are created.

    Raises:
        OSError: If the folder or file cannot be created or written.
        TypeError: If `results` is not JSON-serialisable.
    """
    import os  # local import: this cell's import block does not bring in os

    # Create the target folder first so that a missing directory cannot discard
    # results that took a full inference run to produce.
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f"Saved results to {output_path}")

# 6. Main Execution
# Cell entry point for the Qwen navigation run: load the annotations, query the
# model for every entry and save the answers. The guard is true when the cell is
# executed in the notebook (the recorded output shows this block running).
if __name__ == "__main__":
    # Input locations, given relative to the working directory.
    annotationPath = "drive/MyDrive/Colab Notebooks/annotations/outdoor_qa_pair_gt.json"
    imageDir = "drive/MyDrive/Colab Notebooks/environment_images/outdoor_images_1024x768"

    # Load dataset
    dataset = load_dataset(annotationPath)

    # Process dataset
    results = process_dataset(dataset, imageDir)

    # Save outputs (to the default output_path of save_results)
    save_results(results)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Processing 0: Hello, I am visually impaired and need your assistance to complete the task of 'Approaching the target object'. Here is where I stand, and the scene depicted in the image is the view in front of me.The target object I want to approach is: fire assembly point. Could you please guide me to it using tactile cues based on the scene shown in the image?
Instruction: To guide you towards the fire assembly point, let's analyze the image step by step:

1. **Identify the Sign**: The sign with the letter "W" and a checkmark inside a circle is likely indicating a specific location or direction. This could be a marker for the fire assembly point.

2. **Direction of the Sign**: The sign is positioned on the right side of the road, slightly ahead of the viewer. It appears to be pointing towards the area where the trees are located.

3. **Pathway**: There is a paved path or road that runs through the scene. You should follow this path as it leads towards the sign.

4. **Surroundings**: T

### Environment understanding

In [None]:
import json
from PIL import Image
import torch
import cv2
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    BitsAndBytesConfig
)

# 1. Load Qwen-VL Model and Processor
# Hugging Face checkpoint id shared by the model and its processor.
# NOTE(review): an earlier cell loads this same checkpoint; within one kernel
# session this reload is redundant - confirm the cells are meant to run standalone.
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

# Configure 4-bit quantization: NF4 weights with double quantization, bfloat16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized model with automatic device placement and switch it to eval mode.
# NOTE(review): torch_dtype is float16 while the 4-bit compute dtype above is
# bfloat16 - confirm the mismatch is intended (bfloat16 for both looks more consistent).
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.float16
).eval()

# Processor used to render the chat prompt and to build the model inputs.
processor = AutoProcessor.from_pretrained(model_name)

# 2. Load Dataset
def load_dataset(file_path):
    """Read a JSON annotation file and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Be explicit about UTF-8 so non-ASCII text in the annotations does not
    # depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 3. Process an Image and Question using the message format
def generate_navigation_instruction(image_path, question):
    """Ask the loaded Qwen-VL model `question` about the image at `image_path`.

    Args:
        image_path: Path of the image file to open.
        question: Text prompt sent together with the image.

    Returns:
        The decoded answer text, or None when building the prompt,
        preprocessing, generation or decoding raises (the error is printed).
        Errors raised while opening the image are not caught here.
    """
    rgb_image = Image.open(image_path).convert("RGB")

    # Single-turn chat: one user message carrying the image and the question.
    chat = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": rgb_image},
                {"type": "text", "text": question},
            ],
        }
    ]

    try:
        prompt_text = processor.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )

        # Collect every image referenced by the chat, in order of appearance.
        pil_images = [
            part["image"]
            for turn in chat
            for part in turn["content"]
            if part["type"] == "image"
        ]

        inputs = processor(
            text=[prompt_text],
            images=pil_images,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=350)

        # Drop the prompt tokens so only the newly generated ids are decoded.
        new_token_ids = [
            full_ids[len(prompt_ids):]
            for prompt_ids, full_ids in zip(inputs.input_ids, outputs)
        ]
        decoded = processor.batch_decode(
            new_token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0]

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# 4. Process the Entire Dataset
def process_dataset(dataset, image_dir, limit=25):
    """Answer the "EnvQ" question of the first `limit` dataset entries.

    Args:
        dataset: Iterable of annotation dicts; each entry is read for its
            "index" and "EnvQ" keys.
        image_dir: Directory of the images, looked up as
            "{image_dir}/{index}.jpg".
        limit: Maximum number of leading entries to process. Defaults to 25,
            the value that used to be hard-coded; pass None to process all.

    Returns:
        A list of {"image_id", "question", "instruction"} dicts, one per entry
        for which a non-empty answer was produced.
    """
    from itertools import islice  # local import keeps the cell self-contained

    results = []
    # islice stops after `limit` entries instead of walking the whole dataset.
    for entry in islice(dataset, limit):
        image_id = entry["index"]
        question = entry["EnvQ"]
        image_path = f"{image_dir}/{image_id}.jpg"

        try:
            print(f"Processing {image_id}: {question}")
            instruction = generate_navigation_instruction(image_path, question)
            if instruction:
                results.append({
                    "image_id": image_id,
                    "question": question,
                    "instruction": instruction
                })
                print(f"Instruction: {instruction}\n---")
            else:
                print(f"Failed to generate instruction for {image_id}")

        except Exception as e:
            # Best effort: report the failure and continue with the next entry.
            print(f"Error processing {image_id}: {str(e)}")

    return results

# 5. Save Results
def save_results(results, output_path="drive/MyDrive/Colab Notebooks/envQA/Qwen_outdoor_env.json"):
    """Write `results` to `output_path` as indented JSON.

    The parent folder is created when missing so that a finished run is not
    lost to a FileNotFoundError at save time.

    Args:
        results: JSON-serialisable object to store.
        output_path: Destination file path.
    """
    import os  # local import: this cell does not import os at the top

    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Saved results to {output_path}")

# 6. Main Execution
# Cell entry point: load the annotations, run the model over them with
# process_dataset, and save the collected answers.
if __name__ == "__main__":
    # Paths are relative to the current working directory.
    annotationPath = "drive/MyDrive/Colab Notebooks/annotations2/outdoor_qa_pair_gt_updated.json"
    imageDir = "drive/MyDrive/Colab Notebooks/environment_images/outdoor_images_1024x768"

    # Load dataset
    dataset = load_dataset(annotationPath)

    # Process dataset
    results = process_dataset(dataset, imageDir)

    # Save outputs (uses the default output_path of save_results)
    save_results(results)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Processing 0: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: The image depicts a serene outdoor scene featuring a grassy area with young trees, a sign indicating a water feature nearby, and a road running alongside a body of water under a clear blue sky.
---
Processing 1: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: The image depicts a modern building labeled "Computer Science Building" with a staircase leading up to it, surrounded by greenery and a clear blue sky.
---
Processing 2: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: The image depicts a modern outdoor seating area with transparent shelters, a trash bin labeled "LITTER," and a sign featuring a QR code, set against a backdrop of greenery and a partly cloudy sky.
---
Processing 3: In one sentence, give an overall description of the scene in the image (no numbers).
Instructio

### Proximity and distance

In [None]:
import json
from PIL import Image
import torch
import cv2
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    BitsAndBytesConfig
)

# 1. Load Qwen-VL Model and Processor
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

# Configure 4-bit NF4 quantization with double quantization; the quantized
# matmuls are computed in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# torch_dtype is kept equal to the 4-bit compute dtype so the non-quantized
# modules and the quantized layers do not exchange float16/bfloat16 tensors.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16
).eval()

processor = AutoProcessor.from_pretrained(model_name)

# 2. Load Dataset
def load_dataset(file_path):
    """Read a JSON annotation file and return the parsed object.

    Args:
        file_path: Path to the JSON file.

    Returns:
        Whatever `json.load` produces for the file content.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 3. Process an Image and Question using the message format
def generate_navigation_instruction(image_path, question):
    """Ask the loaded Qwen-VL model `question` about the image at `image_path`.

    Args:
        image_path: Path of the image file to open.
        question: Text prompt sent together with the image.

    Returns:
        The decoded answer text, or None when building the prompt,
        preprocessing, generation or decoding raises (the error is printed).
        Errors raised while opening the image are not caught here.
    """
    rgb_image = Image.open(image_path).convert("RGB")

    # Single-turn chat: one user message carrying the image and the question.
    chat = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": rgb_image},
                {"type": "text", "text": question},
            ],
        }
    ]

    try:
        prompt_text = processor.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )

        # Collect every image referenced by the chat, in order of appearance.
        pil_images = [
            part["image"]
            for turn in chat
            for part in turn["content"]
            if part["type"] == "image"
        ]

        inputs = processor(
            text=[prompt_text],
            images=pil_images,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=350)

        # Drop the prompt tokens so only the newly generated ids are decoded.
        new_token_ids = [
            full_ids[len(prompt_ids):]
            for prompt_ids, full_ids in zip(inputs.input_ids, outputs)
        ]
        decoded = processor.batch_decode(
            new_token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0]

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# 4. Process the Entire Dataset
def process_dataset(dataset, image_dir, limit=25):
    """Answer the "distanceQ" question of the first `limit` dataset entries.

    Args:
        dataset: Iterable of annotation dicts; each entry is read for its
            "index" and "distanceQ" keys.
        image_dir: Directory of the images, looked up as
            "{image_dir}/{index}.jpg".
        limit: Maximum number of leading entries to process. Defaults to 25,
            the value that used to be hard-coded; pass None to process all.

    Returns:
        A list of {"image_id", "question", "instruction"} dicts, one per entry
        for which a non-empty answer was produced.
    """
    from itertools import islice  # local import keeps the cell self-contained

    results = []
    # islice stops after `limit` entries instead of walking the whole dataset.
    for entry in islice(dataset, limit):
        image_id = entry["index"]
        question = entry["distanceQ"]
        image_path = f"{image_dir}/{image_id}.jpg"

        try:
            print(f"Processing {image_id}: {question}")
            instruction = generate_navigation_instruction(image_path, question)
            if instruction:
                results.append({
                    "image_id": image_id,
                    "question": question,
                    "instruction": instruction
                })
                print(f"Instruction: {instruction}\n---")
            else:
                print(f"Failed to generate instruction for {image_id}")

        except Exception as e:
            # Best effort: report the failure and continue with the next entry.
            print(f"Error processing {image_id}: {str(e)}")

    return results

# 5. Save Results
def save_results(results, output_path="drive/MyDrive/Colab Notebooks/distanceQA/Qwen_home_distance.json"):
    """Write `results` to `output_path` as indented JSON.

    The parent folder is created when missing so that a finished run is not
    lost to a FileNotFoundError at save time.

    Args:
        results: JSON-serialisable object to store.
        output_path: Destination file path.
    """
    import os  # local import: this cell does not import os at the top

    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Saved results to {output_path}")

# 6. Main Execution
# Cell entry point: load the annotations, run the model over them with
# process_dataset, and save the collected answers.
if __name__ == "__main__":
    # Paths are relative to the current working directory.
    annotationPath = "drive/MyDrive/Colab Notebooks/annotations3/domestic_home_qa_pair_gt_updated.json"
    imageDir = "drive/MyDrive/Colab Notebooks/environment_images/domestic_home_images_1024x768"

    # Load dataset
    dataset = load_dataset(annotationPath)

    # Process dataset
    results = process_dataset(dataset, imageDir)

    # Save outputs (uses the default output_path of save_results)
    save_results(results)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Processing 0: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the shower faucet. Output only the number.
Instruction: 1.50
---
Processing 1: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the sofa. Output only the number.
Instruction: 1.50
---
Processing 2: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the kitchen faucet. Output only the number.
Instruction: 1.50
---
Processing 3: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the fridge. Output only the number.
Instruction: 1.50
---
Processing 4: Given the attached image, report the straight-line 3-D Euclidean dist


# OpenAI

In [None]:
!pip install openai



In [None]:
import os
from openai import OpenAI

In [None]:
from google.colab import userdata
# Read the API key through Colab's `userdata` accessor instead of hard-coding it.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# `client` is reused by the GPT-4o evaluation cells further down.
client = OpenAI(
    # Key passed explicitly because it comes from `userdata` above
    api_key=OPENAI_API_KEY,
)

# One-off request to check that the key and client work.
response = client.responses.create(
    model="gpt-4o",
    instructions="You are a coding assistant that talks like a pirate.",
    input="How do I check if a Python object is an instance of a class?",
)

print(response.output_text)



Arrr matey! If ye be wantin' to check if a Python object be an instance of a class, ye can use the `isinstance()` function. Here’s how ye do it:

```python
if isinstance(yar_object, YarClass):
    print("Aye, 'tis an instance of the class!")
else:
    print("Nay, it be not!")
```

Just replace `yar_object` with the object ye be checkin' and `YarClass` with the class ye suspect it belongs to. Smooth sailin’! 🏴‍☠️


In [None]:
import json
from PIL import Image
import base64


# 2. Helper function to encode image to base64
def encode_image(image_path):
    """Return the content of the file at `image_path` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# 3. Load dataset
def load_dataset(file_path):
    """Read a JSON annotation file and return the parsed object.

    Args:
        file_path: Path to the JSON file.

    Returns:
        Whatever `json.load` produces for the file content.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 4. Encode an image file as a base64 data URL (used as the image input for GPT-4o)
def encode_image_to_base64(image_path):
    """Return the file at `image_path` as a `data:image/jpeg;base64,...` URL string."""
    with open(image_path, "rb") as image_file:
        payload = base64.b64encode(image_file.read())
    # The MIME type is always declared as JPEG, whatever the file extension.
    return "data:image/jpeg;base64," + payload.decode('utf-8')

# 3. Generate instruction
def generate_navigation_instruction(image_path, question):
    """Send `question` plus the image at `image_path` to GPT-4o and return the reply.

    Args:
        image_path: Path of the image file to attach.
        question: Text prompt sent with the image.

    Returns:
        The text content of the first completion choice, or None when encoding
        the image or the API call raises (the error is printed).
    """
    try:
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": encode_image_to_base64(image_path)}},
            ],
        }
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[user_message],
            max_tokens=350,
        )
        return completion.choices[0].message.content

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# 5. Process the entire dataset
def process_dataset(dataset, image_dir):
    """Answer the "question" field of every dataset entry.

    Args:
        dataset: Iterable of annotation dicts; each entry is read for its
            "index" and "question" keys.
        image_dir: Directory of the images, looked up as
            "{image_dir}/{index}.jpg".

    Returns:
        A list of {"image_id", "question", "instruction"} dicts, one per entry
        for which a non-empty answer was produced.
    """
    collected = []
    for record in dataset:
        image_id = record["index"]
        question = record["question"]
        image_path = f"{image_dir}/{image_id}.jpg"

        try:
            print(f"Processing {image_id}: {question}")
            instruction = generate_navigation_instruction(image_path, question)
            if not instruction:
                print(f"Failed to generate instruction for {image_id}")
                continue
            collected.append({
                "image_id": image_id,
                "question": question,
                "instruction": instruction,
            })
            print(f"Instruction: {instruction}\n---")
        except Exception as e:
            # Best effort: report the failure and continue with the next entry.
            print(f"Error processing {image_id}: {str(e)}")

    return collected

# 6. Save results
def save_results(results, output_path="drive/MyDrive/Colab Notebooks/gpt4o_outdoor_navigation_instructions.json"):
    """Write `results` to `output_path` as indented JSON.

    The parent folder is created when missing so that a finished run is not
    lost to a FileNotFoundError at save time.

    Args:
        results: JSON-serialisable object to store.
        output_path: Destination file path.
    """
    import os  # local import: this cell does not import os at the top

    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Saved results to {output_path}")

# 7. Main Execution
# Cell entry point: load the annotations, query GPT-4o for each entry via
# process_dataset, and save the collected answers.
if __name__ == "__main__":
    # Paths are relative to the current working directory.
    annotationPath = "drive/MyDrive/Colab Notebooks/annotations/outdoor_qa_pair_gt.json"
    imageDir = "drive/MyDrive/Colab Notebooks/environment_images/outdoor_images_1024x768"

    # Load dataset
    dataset = load_dataset(annotationPath)

    # Process dataset
    results = process_dataset(dataset, imageDir)

    # Save outputs (uses the default output_path of save_results)
    save_results(results)

Processing 0: Hello, I am visually impaired and need your assistance to complete the task of 'Approaching the target object'. Here is where I stand, and the scene depicted in the image is the view in front of me.The target object I want to approach is: fire assembly point. Could you please guide me to it using tactile cues based on the scene shown in the image?
Instruction: To guide you to the fire assembly point using tactile cues, here's a step-by-step approach based on the scene:

1. **Starting Position**: Ensure you are facing the open grassy area.

2. **Move Forward**: Carefully walk straight across the grass. The ground may feel slightly uneven due to small plants and the natural texture of the grass.

3. **Listen for the Road**: As you move forward, listen for the change in texture or a slight incline as you approach the road. You'll also hear the change from the soft grass to the harder surface of the road.

4. **Locate the Sign**: Once you reach the edge of the road, you'll ne

### Env understanding

In [None]:
import json
from PIL import Image
import base64


# 2. Helper function to encode image to base64
def encode_image(image_path):
    """Return the content of the file at `image_path` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# 3. Load dataset
def load_dataset(file_path):
    """Read a JSON annotation file and return the parsed object.

    Args:
        file_path: Path to the JSON file.

    Returns:
        Whatever `json.load` produces for the file content.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 4. Encode an image file as a base64 data URL (used as the image input for GPT-4o)
def encode_image_to_base64(image_path):
    """Return the file at `image_path` as a `data:image/jpeg;base64,...` URL string."""
    with open(image_path, "rb") as image_file:
        payload = base64.b64encode(image_file.read())
    # The MIME type is always declared as JPEG, whatever the file extension.
    return "data:image/jpeg;base64," + payload.decode('utf-8')

# 3. Generate instruction
def generate_navigation_instruction(image_path, question):
    """Send `question` plus the image at `image_path` to GPT-4o and return the reply.

    Args:
        image_path: Path of the image file to attach.
        question: Text prompt sent with the image.

    Returns:
        The text content of the first completion choice, or None when encoding
        the image or the API call raises (the error is printed).
    """
    try:
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": encode_image_to_base64(image_path)}},
            ],
        }
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[user_message],
            max_tokens=350,
        )
        return completion.choices[0].message.content

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# 5. Process the entire dataset
def process_dataset(dataset, image_dir, limit=25):
    """Answer the "EnvQ" question of the first `limit` dataset entries.

    Args:
        dataset: Iterable of annotation dicts; each entry is read for its
            "index" and "EnvQ" keys.
        image_dir: Directory of the images, looked up as
            "{image_dir}/{index}.jpg".
        limit: Maximum number of leading entries to process. Defaults to 25,
            the value that used to be hard-coded; pass None to process all.

    Returns:
        A list of {"image_id", "question", "instruction"} dicts, one per entry
        for which a non-empty answer was produced.
    """
    from itertools import islice  # local import keeps the cell self-contained

    results = []
    # islice stops after `limit` entries instead of walking the whole dataset.
    for entry in islice(dataset, limit):
        image_id = entry["index"]
        question = entry["EnvQ"]
        image_path = f"{image_dir}/{image_id}.jpg"

        try:
            print(f"Processing {image_id}: {question}")
            instruction = generate_navigation_instruction(image_path, question)
            if instruction:
                results.append({
                    "image_id": image_id,
                    "question": question,
                    "instruction": instruction
                })
                print(f"Instruction: {instruction}\n---")
            else:
                print(f"Failed to generate instruction for {image_id}")

        except Exception as e:
            # Best effort: report the failure and continue with the next entry.
            print(f"Error processing {image_id}: {str(e)}")

    return results

# 6. Save results
def save_results(results, output_path="drive/MyDrive/Colab Notebooks/envQA/gpt4o_outdoor_env.json"):
    """Write `results` to `output_path` as indented JSON.

    The parent folder is created when missing so that a finished run is not
    lost to a FileNotFoundError at save time.

    Args:
        results: JSON-serialisable object to store.
        output_path: Destination file path.
    """
    import os  # local import: this cell does not import os at the top

    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Saved results to {output_path}")

# 7. Main Execution
# Cell entry point: load the annotations, query GPT-4o for each entry via
# process_dataset, and save the collected answers.
if __name__ == "__main__":
    # Paths are relative to the current working directory.
    annotationPath = "drive/MyDrive/Colab Notebooks/annotations2/outdoor_qa_pair_gt_updated.json"
    imageDir = "drive/MyDrive/Colab Notebooks/environment_images/outdoor_images_1024x768"

    # Load dataset
    dataset = load_dataset(annotationPath)

    # Process dataset
    results = process_dataset(dataset, imageDir)

    # Save outputs (uses the default output_path of save_results)
    save_results(results)

Processing 0: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: The scene depicts a peaceful park with lush green grass, trees, a distant water body, and a partly cloudy blue sky.
---
Processing 1: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: The image shows a modern university building with a stairway, surrounded by greenery, under a partly cloudy sky.
---
Processing 2: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: The image depicts an outdoor area with covered seating, a litter bin, landscaped greenery, and a clear blue sky with some clouds.
---
Processing 3: In one sentence, give an overall description of the scene in the image (no numbers).
Instruction: The scene depicts an outdoor area with picnic tables, planters, and a person sitting, under a bright blue sky with scattered clouds.
---
Processing 4: In one sentence, give an overa

### Proximity and distance

In [None]:
import json
from PIL import Image
import base64


# 2. Helper function to encode image to base64
def encode_image(image_path):
    """Return the content of the file at `image_path` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# 3. Load dataset
def load_dataset(file_path):
    """Read a JSON annotation file and return the parsed object.

    Args:
        file_path: Path to the JSON file.

    Returns:
        Whatever `json.load` produces for the file content.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 4. Encode an image file as a base64 data URL (used as the image input for GPT-4o)
def encode_image_to_base64(image_path):
    """Return the file at `image_path` as a `data:image/jpeg;base64,...` URL string."""
    with open(image_path, "rb") as image_file:
        payload = base64.b64encode(image_file.read())
    # The MIME type is always declared as JPEG, whatever the file extension.
    return "data:image/jpeg;base64," + payload.decode('utf-8')

# 3. Generate instruction
def generate_navigation_instruction(image_path, question):
    """Send `question` plus the image at `image_path` to GPT-4o and return the reply.

    Args:
        image_path: Path of the image file to attach.
        question: Text prompt sent with the image.

    Returns:
        The text content of the first completion choice, or None when encoding
        the image or the API call raises (the error is printed).
    """
    try:
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": encode_image_to_base64(image_path)}},
            ],
        }
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[user_message],
            max_tokens=350,
        )
        return completion.choices[0].message.content

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# 5. Process the entire dataset
def process_dataset(dataset, image_dir, limit=25):
    """Answer the "distanceQ" question of the first `limit` dataset entries.

    Args:
        dataset: Iterable of annotation dicts; each entry is read for its
            "index" and "distanceQ" keys.
        image_dir: Directory of the images, looked up as
            "{image_dir}/{index}.jpg".
        limit: Maximum number of leading entries to process. Defaults to 25,
            the value that used to be hard-coded; pass None to process all.

    Returns:
        A list of {"image_id", "question", "instruction"} dicts, one per entry
        for which a non-empty answer was produced.
    """
    from itertools import islice  # local import keeps the cell self-contained

    results = []
    # islice stops after `limit` entries instead of walking the whole dataset.
    for entry in islice(dataset, limit):
        image_id = entry["index"]
        question = entry["distanceQ"]
        image_path = f"{image_dir}/{image_id}.jpg"

        try:
            print(f"Processing {image_id}: {question}")
            instruction = generate_navigation_instruction(image_path, question)
            if instruction:
                results.append({
                    "image_id": image_id,
                    "question": question,
                    "instruction": instruction
                })
                print(f"Instruction: {instruction}\n---")
            else:
                print(f"Failed to generate instruction for {image_id}")

        except Exception as e:
            # Best effort: report the failure and continue with the next entry.
            print(f"Error processing {image_id}: {str(e)}")

    return results

# 6. Save results
def save_results(results, output_path="drive/MyDrive/Colab Notebooks/distanceQA/gpt4o_home_distance.json"):
    """Write `results` to `output_path` as indented JSON.

    The parent folder is created when missing so that a finished run is not
    lost to a FileNotFoundError at save time.

    Args:
        results: JSON-serialisable object to store.
        output_path: Destination file path.
    """
    import os  # local import: this cell does not import os at the top

    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Saved results to {output_path}")

# 7. Main Execution
# Cell entry point: load the annotations, query GPT-4o for each entry via
# process_dataset, and save the collected answers.
if __name__ == "__main__":
    # Paths are relative to the current working directory.
    annotationPath = "drive/MyDrive/Colab Notebooks/annotations3/domestic_home_qa_pair_gt_updated.json"
    imageDir = "drive/MyDrive/Colab Notebooks/environment_images/domestic_home_images_1024x768"

    # Load dataset
    dataset = load_dataset(annotationPath)

    # Process dataset
    results = process_dataset(dataset, imageDir)

    # Save outputs (uses the default output_path of save_results)
    save_results(results)

Processing 0: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the shower faucet. Output only the number.
Instruction: I'm unable to provide the distance from the camera to the shower faucet based on the image.
---
Processing 1: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the sofa. Output only the number.
Instruction: I'm unable to calculate distances from images directly.
---
Processing 2: Given the attached image, report the straight-line 3-D Euclidean distance (in metres, 2 decimal places) from the camera center to the nearest instance of the kitchen faucet. Output only the number.
Instruction: I'm unable to determine the 3-D Euclidean distance from the camera to the faucet in the image.
---
Processing 3: Given the attached image, report the straight-line 3-D Euclidean di

# Evaluation metrics

In [None]:
!pip install rouge-score bert-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=d3331398a76b3e6d88ea70559c7ebad264f81d340080de3cb7817daaaeb4c6df
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


In [None]:
import json
import pandas as pd
from rouge_score import rouge_scorer
from bert_score import score


# Your dataset paths
annotationPath = "/content/drive/MyDrive/Colab Notebooks/annotations2/outdoor_qa_pair_gt_updated.json"

outFile        = "/content/drive/MyDrive/Colab Notebooks/envQA/gpt4o_outdoor_env.json"

# Load the generated answers and the ground-truth annotations
with open(outFile, "r") as f:
    generated = json.load(f)

with open(annotationPath, "r") as f:
    annotations = json.load(f)

# Index the annotations by image id for O(1) lookup; setdefault keeps the
# first annotation when an id appears more than once.
gt_by_id = {}
for annotation in annotations:
    gt_by_id.setdefault(annotation["index"], annotation)

# Build the ROUGE scorer once instead of once per sample
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Collect one record per sample; the DataFrame is built once at the end
# rather than grown row by row through the private DataFrame._append.
records = []
hypotheses = []
references = []

for entry_gen in generated:
    image_id = entry_gen["image_id"]
    gen_instruction = entry_gen["instruction"]

    # Look up the ground-truth answer; skip samples without an annotation
    gt_entry = gt_by_id.get(image_id)
    if not gt_entry:
        continue

    ans_gt = gt_entry["EnvA"]

    # ROUGE: RougeScorer.score expects (target, prediction)
    rouge = scorer.score(ans_gt, gen_instruction)

    records.append({
        "image_id": image_id,
        "RG_1": rouge["rouge1"].fmeasure,
        "RG_2": rouge["rouge2"].fmeasure,
        "RG_L": rouge["rougeL"].fmeasure,
        # Absolute difference in whitespace-token count
        "Len": abs(len(gen_instruction.split()) - len(ans_gt.split())),
    })
    hypotheses.append(gen_instruction)
    references.append(ans_gt)

# BERTScore: a single batched call returns one F1 value per pair, so the
# scoring model is not set up again for every sample.
if records:
    _, _, f1 = score(
        cands=hypotheses,
        refs=references,
        lang="en",
        model_type="bert-base-uncased"
    )
    for record, f1_value in zip(records, f1.tolist()):
        record["BERT_S"] = f1_value

results_df = pd.DataFrame(
    records, columns=["image_id", "RG_1", "RG_2", "RG_L", "BERT_S", "Len"]
)

# Report per-sample scores and their averages
print(results_df)
print("\nAverage Metrics:")
print(results_df.mean(numeric_only=True))

  results_df = results_df._append({


    image_id      RG_1      RG_2      RG_L    BERT_S   Len
0        0.0  0.375000  0.086957  0.333333  0.663444   5.0
1        1.0  0.307692  0.054054  0.307692  0.606774   1.0
2        2.0  0.325581  0.048780  0.325581  0.604971   1.0
3        3.0  0.400000  0.093023  0.355556  0.625278   1.0
4        4.0  0.511628  0.243902  0.418605  0.742355   3.0
5        5.0  0.410256  0.270270  0.358974  0.718139   0.0
6        6.0  0.476190  0.100000  0.476190  0.721203   5.0
7        7.0  0.480000  0.083333  0.280000  0.663503   8.0
8        8.0  0.606061  0.322581  0.484848  0.673059   2.0
9        9.0  0.450000  0.105263  0.350000  0.646502   2.0
10      10.0  0.368421  0.166667  0.315789  0.646923   4.0
11      11.0  0.444444  0.117647  0.388889  0.701926   1.0
12      12.0  0.514286  0.060606  0.285714  0.677656   1.0
13      13.0  0.421053  0.166667  0.368421  0.649468   2.0
14      14.0  0.488889  0.232558  0.444444  0.646253   1.0
15      15.0  0.500000  0.217391  0.375000  0.647739   7

In [None]:
import json
# Load the score file (path relative to the current working directory);
# the entries are read below for their "correctness" and "actionability" keys.
with open("llava_ng_nf.json", "r") as f:
    scores = json.load(f)

def calculate_metrics(scores):
    """Compute the share of samples rated 3 or lower on each criterion.

    Args:
        scores: Sequence of dicts with numeric "correctness" and
            "actionability" entries.

    Returns:
        A dict with "Not Grounded Rate" (fraction with correctness <= 3) and
        "Not Fine-grained Rate" (fraction with actionability <= 3). Both are
        NaN when `scores` is empty, because the rates are undefined.
    """
    total = len(scores)
    if total == 0:
        # Avoid a ZeroDivisionError on an empty score list.
        undefined = float("nan")
        return {
            "Not Grounded Rate": undefined,
            "Not Fine-grained Rate": undefined
        }

    ng_count = sum(1 for s in scores if s["correctness"] <= 3)
    nf_count = sum(1 for s in scores if s["actionability"] <= 3)
    return {
        "Not Grounded Rate": ng_count / total,
        "Not Fine-grained Rate": nf_count / total
    }

# NOTE(review): the result is named Qwen_metrics although `scores` is read
# from "llava_ng_nf.json" in this cell; confirm which model these belong to.
Qwen_metrics = calculate_metrics(scores)
print(Qwen_metrics)

{'Not Grounded Rate': 0.5, 'Not Fine-grained Rate': 0.85}


In [None]:
import json

predictions_path = "/content/drive/MyDrive/Colab Notebooks/distanceQA/Qwen_outdoor_distance.json"
ground_truth_path = "/content/drive/MyDrive/Colab Notebooks/annotations3/outdoor_qa_pair_gt_updated.json"

# Load JSON files
with open(predictions_path, 'r') as f1:
    preds = json.load(f1)

with open(ground_truth_path, 'r') as f2:
    gts = json.load(f2)

# Convert ground truth list into a dictionary for fast lookup
gt_dict = {}
for item in gts:
    if "distanceA" in item:
        try:
            gt_dict[item["index"]] = float(item["distanceA"])
        except (TypeError, ValueError):
            continue  # skip if distanceA is missing a value or isn't a number

# Evaluation: delta = max(pred / gt, gt / pred), counted as a success when it
# is within the 1.25 and 2 thresholds respectively.
success_125 = 0
success_200 = 0
evaluated = 0

for item in preds:
    image_id = item["image_id"]
    try:
        pred = float(item["instruction"])
        gt = gt_dict[image_id]
    except (TypeError, ValueError, KeyError):
        continue  # skip entries with bad format or missing gt

    # Only strictly positive distances are meaningful. Zero would divide by
    # zero, and a negative value would yield a negative delta that passes
    # both thresholds. The `not (...)` form also rejects NaN.
    if not (pred > 0 and gt > 0):
        continue

    delta = max(pred / gt, gt / pred)

    if delta <= 2:
        success_200 += 1
    if delta <= 1.25:
        success_125 += 1
    evaluated += 1

# Report
print(f"Total evaluated samples: {evaluated}")
if evaluated:
    print(f"Success rate δ ≤ 2: {success_200 / evaluated * 100:.2f}%")
    print(f"Success rate δ ≤ 1.25: {success_125 / evaluated * 100:.2f}%")
else:
    # No valid prediction/ground-truth pair, so the rates are undefined.
    print("No samples could be evaluated; success rates are undefined.")


Total evaluated samples: 25
Success rate δ ≤ 2: 52.00%
Success rate δ ≤ 1.25: 16.00%


# VLM + LLM

In [None]:
# ↳ Run once (≈ 3 min). Restart runtime when Colab asks.
!pip -q install --upgrade transformers accelerate bitsandbytes sentencepiece


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
drive.mount('/content/drive')          # authorise → Drive at /content/drive

# Dataset locations on the mounted Drive
annotationPath = "/content/drive/MyDrive/Colab Notebooks/annotations/outdoor_qa_pair_gt.json"
imageDir       = "/content/drive/MyDrive/Colab Notebooks/environment_images/outdoor_images_1024x768"

# Result file path, on the same Drive
outFile        = "/content/drive/MyDrive/Colab Notebooks/Qwenvl_outdoor_navigation_results.json"


Mounted at /content/drive


In [None]:
import torch, warnings, json, re
from PIL import Image
from transformers import (
    AutoProcessor, AutoModelForVision2Seq,
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig
)

# One dtype shared by the 4-bit compute path and by torch_dtype of both models.
dtype = torch.bfloat16     # keeps VRAM low
# 4-bit NF4 quantization with double quantization, reused for both models.
bnb_q = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=dtype)

# Vision-language model (scene descriptor), used by describe_scene()
vlm_name = "Qwen/Qwen2.5-VL-7B-Instruct"
print("⏳ Loading VLM …")
vlm       = AutoModelForVision2Seq.from_pretrained(
               vlm_name, quantization_config=bnb_q,
               torch_dtype=dtype, device_map="auto").eval()
vlm_proc  = AutoProcessor.from_pretrained(vlm_name)

# Reasoning chat model (step-by-step)
llm_name = "Qwen/Qwen1.5-7B-Chat"      # any open-source "-Chat" / "-Instruct" model can be substituted
print("⏳ Loading LLM …")
llm       = AutoModelForCausalLM.from_pretrained(
               llm_name, quantization_config=bnb_q,
               torch_dtype=dtype, device_map="auto").eval()
llm_tok   = AutoTokenizer.from_pretrained(llm_name)
# Reuse the end-of-sequence token as the padding token.
llm_tok.pad_token = llm_tok.eos_token


def describe_scene(img: Image.Image) -> str:
    """
    Ask the vision-language model for a per-sector description of `img`.

    The prompt requests, for each of the left / centre / right sectors, the
    salient objects with a rough distance category and a height band.

    Args:
        img: the PIL image to describe.

    Returns:
        The newly generated text only (prompt tokens are sliced off),
        with surrounding whitespace stripped.
    """
    vision_prompt = (
        "You are a navigation camera for a blind user. "
        "Divide the view into three sectors (left, centre, right) relative to the wearer’s heading. "
        "For each salient object or hazard list:\n"
        "• name\n• rough distance category: close (≈3 steps), mid (≈7 steps), far (≈12 steps)\n"
        "• height band if obvious: floor / knee / waist / chest / head / above head.\n"
        "Respond as terse bullet points per sector."
    )

    # Single user turn carrying the image followed by the instruction text.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": vision_prompt},
        ],
    }
    chat_text = vlm_proc.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True)

    batch = vlm_proc(text=[chat_text], images=[img], return_tensors="pt")
    batch = batch.to(vlm.device)
    prompt_len = batch.input_ids.shape[-1]

    with torch.no_grad():
        generated = vlm.generate(**batch, max_new_tokens=160)

    # generate() returns prompt + continuation; keep the continuation only.
    continuation = generated[0][prompt_len:]
    return vlm_proc.decode(continuation, skip_special_tokens=True).strip()




# "the <noun>" up to the first terminating punctuation mark or the end of the question.
_TARGET_NOUN_RE = re.compile(r"\bthe\s+([a-z][a-z \-]*?)\s*(?:[?.!,;]|$)")
# A line that *starts* a sector, e.g. "Left:", "**Centre sector:**", "### Right".
_SECTOR_HEADER_RE = re.compile(
    r"^\W*(left|centre|center|right)\b(?:\s+sector)?\s*(?::|\W*$)")


def locate_target(scene: str, question: str):
    """
    Small heuristic parser that ties the question's target to the scene text.

      • Takes the words following the first usable 'the' in the question
        (up to '?', '.', '!', ',', ';' or the end) as the target noun.
      • Finds the first scene line mentioning that noun.
      • Sector: an explicit marker on that line wins ("left:", "centre"/"center",
        "right:", later ones overriding earlier ones); otherwise the most recent
        sector header line seen above it is used.
      • Distance: the first of close / mid / far that occurs as a whole word on
        the noun's line or on the more-indented bullet lines directly below it.

    Args:
        scene: scene description text (bullet points per sector).
        question: the user's question.

    Returns:
        (noun, sector, dist) where noun is "" when nothing could be extracted,
        sector is "left" / "centre" / "right" or None, and dist is
        "close" / "mid" / "far" or None.
    """
    m = _TARGET_NOUN_RE.search(question.lower())
    noun = m.group(1).strip() if m else ""
    sector, dist = None, None
    if not noun:
        return noun, sector, dist

    lines = scene.lower().splitlines()
    current_sector = None
    for idx, line in enumerate(lines):
        header = _SECTOR_HEADER_RE.match(line)
        if header:
            current_sector = "centre" if header.group(1) == "center" else header.group(1)
        if noun not in line:
            continue

        # Fall back to the enclosing sector header; inline markers override it.
        sector = current_sector
        if "left:" in line:
            sector = "left"
        if "centre" in line or "center" in line:
            sector = "centre"
        if "right:" in line:
            sector = "right"

        # Attributes may be listed as nested bullets under the object's name.
        indent = len(line) - len(line.lstrip())
        block = [line]
        for child in lines[idx + 1:]:
            if not child.strip() or len(child) - len(child.lstrip()) <= indent:
                break
            block.append(child)
        text = " ".join(block)

        # Whole-word match so "middle" / "closed" / "farm" are not mistaken for tags.
        for d in ("close", "mid", "far"):
            if re.search(rf"\b{d}\b", text):
                dist = d
                break
        break
    return noun, sector, dist


# Maps a scene sector keyword to the location word used in the instruction style.
SECTOR2LOC = {"left":  "Left",
              "centre":"Front",
              "right": "Right"}

# Maps a height-band keyword from the scene description to a coarse vertical
# location. Key order matters: the bands are scanned in this order and the
# first keyword found wins.
HEIGHT2LOC = {"floor":"Bottom",
              "knee":"Bottom",
              "waist":"Middle",
              "chest":"Middle",
              "head":"Top",
              # The scene prompt also offers "above head"; listed explicitly so
              # every band the VLM may emit has an entry.
              "above head":"Top"}



# ---------- reasoning LLM prompt ----------
# Prompt skeleton for the reasoning model. answer_question() fills it with
# str.format(); the placeholders are {target}, {sector}, {distance},
# {height_band}, {scene} and {question}.
# NOTE(review): the <|system|> / <|user|> / <|assistant|> markers are plain text
# inside this string - confirm they correspond to the chat format the target
# model expects.
# NOTE(review): the text asks for ONE paragraph but ends with "Let's think step
# by step."; answer_question() keeps only the last line of the reply.
REASON_TEMPLATE = """<|system|>
You are an assistive agent.  Produce ONE paragraph **exactly** in this style:
Starting from the current position; please move forward <N> steps, then turn <left/right> <ANGLE> degrees; ...  The pathway sentence (smooth, clear, etc.).  The item-location sentence: "The item you need is at your <Front/Left/Right/Top left/Bottom right>."  A final tactile-identification sentence about shape / temperature / texture.
Follow the wording, punctuation and capitalisation of the examples.
Use semi-colons to separate navigation clauses, a full stop between sections, and end with a full stop.

Example 1
Starting from the current position; please move forward 3 steps, then remain standing. The pathway is clear, allowing for a smooth and uninterrupted walk. The item you need is at your left. You can identify fruits by their shape …

Example 2
You are standing to the left of the shelf, please turn right 90 degrees. You may first encounter bottled jellies …

<|user|>
Target object (noun): {target}
Sector: {sector} | Distance tag: {distance} | Height band: {height_band}
Scene facts: {scene}
User request: {question}
<|assistant|>
Let's think step by step."""

def _target_height(scene, noun):
    """Return the height keyword for the target: first band found on a scene
    line that mentions `noun`, else the first band found anywhere in the scene,
    else "waist"."""
    scene_lc = scene.lower()
    if noun:
        for line in scene_lc.splitlines():
            if noun in line:
                for hb in HEIGHT2LOC:
                    if hb in line:
                        return hb
    for hb in HEIGHT2LOC:
        if hb in scene_lc:
            return hb
    return "waist"            # crude default if nothing matches


def _as_chat_prompt(prompt):
    """Convert the <|system|>/<|user|>/<|assistant|> template text into the
    chat format of the loaded tokenizer, keeping any assistant prefix."""
    system_part, user_tag, rest = prompt.partition("<|user|>")
    if user_tag and "<|assistant|>" in rest:
        user_part, _, assistant_prefix = rest.partition("<|assistant|>")
        messages = [
            {"role": "system",
             "content": system_part.replace("<|system|>", "", 1).strip()},
            {"role": "user", "content": user_part.strip()},
        ]
    else:
        # Template without the expected markers: send it as one user turn.
        messages, assistant_prefix = [{"role": "user", "content": prompt}], ""
    chat = llm_tok.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)
    return chat + assistant_prefix.strip()


def answer_question(scene, question):
    """
    Produce the navigation paragraph for `question` given the scene text.

    Args:
        scene: scene description returned by describe_scene().
        question: the user's question.

    Returns:
        The last line of the model's reply, normalised to end with exactly one
        full stop, or "" (with a warning) when the model generated no text.
    """
    noun, sector, dist = locate_target(scene, question)
    height = _target_height(scene, noun)

    prompt = REASON_TEMPLATE.format(
        target=noun or "object",
        sector=sector or "centre",
        distance=dist or "mid",
        height_band=HEIGHT2LOC.get(height, "Middle"),
        scene=scene, question=question)

    # The template text is wrapped by the tokenizer's chat template, so do not
    # let the tokenizer add special tokens a second time.
    inputs = llm_tok(_as_chat_prompt(prompt), return_tensors="pt",
                     add_special_tokens=False).to(llm.device)
    with torch.no_grad():
        # do_sample=True is explicit: temperature / top_p only apply when sampling.
        gen = llm.generate(**inputs, max_new_tokens=220, do_sample=True,
                           temperature=0.3, top_p=0.9)

    text  = llm_tok.decode(gen[0][inputs.input_ids.shape[-1]:],
                           skip_special_tokens=True).strip()
    if not text:
        # "".splitlines() is empty, so indexing [-1] would raise IndexError.
        warnings.warn("Reasoning model returned no text; storing an empty answer.")
        return ""
    # text is stripped, so its last line is the last non-empty line.
    return text.splitlines()[-1].rstrip(".") + "."





# Run the two-stage pipeline (scene description -> reasoning) over every QA pair.
results = []
with open(annotationPath, encoding="utf-8") as f:
    pairs = json.load(f)

try:
    for item in pairs:
        img_id, q = item["index"], item["question"]
        try:
            # Context manager closes the file handle; convert() returns an
            # independent RGB copy that stays valid afterwards.
            with Image.open(f"{imageDir}/{img_id}.jpg") as raw_img:
                img = raw_img.convert("RGB")
        except Exception as e:
            warnings.warn(f"Skip {img_id}: {e}")
            continue

        print(f"→ {img_id}: making scene description …")
        scene = describe_scene(img)
        print("   reasoning …")
        ans   = answer_question(scene, q)

        results.append({
            "image_id": img_id,
            "question": q,
            "scene": scene,
            "answer": ans
        })
        print(f"   ✓ {ans}")
finally:
    # Always write what has been produced so far: an error or interrupt in the
    # middle of a long run must not discard the completed items.
    with open(outFile, "w") as f:
        json.dump(results, f, indent=2)
    print(f"✅  Saved {len(results)} results → {outFile}")

⏳ Loading VLM …


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/57.6k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/5.70k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

⏳ Loading LLM …


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/31.7k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.54G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

→ 0: making scene description …
   reasoning …
   ✓ Starting from your current position, which is in the centre sector, move forward towards the signpost with the "W" symbol, approximately 7 steps, keeping the height level with your chest. As you approach, you'll notice the signpost; it's a central reference point. After that, turn right 90 degrees, moving towards the right sector. There, you'll find a young tree, its height reaching your knee level, marking the close distance. The fire assembly point, being in the left sector, should be easily identifiable by its unique features or texture. Remember to rely on your sense of touch to locate it.
→ 1: making scene description …
   reasoning …
   ✓ You are currently standing in the centre sector, which is marked as mid-distance (approximately 7 steps) and within the height band of the knee level. To your left, you find the Garden Bed. To reach the target object, the staircase, I suggest you move towards the centre. Starting from your curr

In [None]:
from google.colab import userdata
# Fetch the OpenAI key via Colab's `userdata` helper (stored under the name
# 'OPENAI_API_KEY') so that the key itself never appears in the notebook.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

In [None]:
import json
import base64
from PIL import Image
import re
import warnings
from openai import OpenAI

# 🔑 Set up the OpenAI client. OPENAI_API_KEY is assigned in an earlier cell
# (read through Colab's `userdata`); do not paste a literal key here.
client = OpenAI(api_key=OPENAI_API_KEY)

# ✅ Convert image to base64 URL for OpenAI input
def encode_image_to_base64(image_path):
    """
    Read an image file and return it as a base64 `data:` URL.

    The MIME type is guessed from the file extension; anything that is not
    recognised as an image type falls back to "image/jpeg" (the previous
    hard-coded value).

    Args:
        image_path: path of the image file to encode.

    Returns:
        A string of the form "data:<mime>;base64,<payload>".
    """
    import mimetypes  # local import keeps this cell's import list unchanged

    mime, _ = mimetypes.guess_type(str(image_path))
    if not mime or not mime.startswith("image/"):
        mime = "image/jpeg"
    with open(image_path, "rb") as image_file:
        base64_data = base64.b64encode(image_file.read()).decode('utf-8')
    return f"data:{mime};base64,{base64_data}"

# ✅ Step 1: Scene description (vision-language)
def describe_scene(image_path):
    """
    Ask GPT-4o for a per-sector description of the image at `image_path`.

    The image is sent inline as a base64 data URL together with a prompt that
    requests, per sector (left / centre / right), each salient object with its
    approximate distance in steps and a height band.

    Args:
        image_path: path of the image file to describe.

    Returns:
        The reply text with surrounding whitespace stripped; "" when the reply
        carries no text content.
    """
    image_data = encode_image_to_base64(image_path)
    prompt = (
      "You are a navigation camera for a blind user. "
      "Divide the view into three sectors (left, centre, right) relative to the wearer’s heading. "
      "For each salient object or hazard, list:\n"
      "• name\n• approximate number of steps away (e.g., 2 steps, 6 steps, etc.)\n"
      "• height band if obvious: floor / knee / waist / chest / head / above head.\n"
      "Respond as terse bullet points per sector."
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": image_data}},
                ],
            }
        ],
        max_tokens=350
    )
    # `content` is nullable in the API response; calling .strip() on None
    # would raise AttributeError, so normalise to an empty string first.
    content = response.choices[0].message.content
    return (content or "").strip()

# ✅ Step 2: Heuristic parser to extract sector & distance
# "the <noun>" up to the first terminating punctuation mark or the end of the question.
_TARGET_NOUN_RE = re.compile(r"\bthe\s+([a-z][a-z \-]*?)\s*(?:[?.!,;]|$)")
# A line that *starts* a sector, e.g. "Left:", "**Centre sector:**", "### Right".
_SECTOR_HEADER_RE = re.compile(
    r"^\W*(left|centre|center|right)\b(?:\s+sector)?\s*(?::|\W*$)")
# Step counts such as "6 steps" / "1 step", as requested by the scene prompt.
_STEPS_RE = re.compile(r"(\d+)\s*steps?\b")


def locate_target(scene, question):
    """
    Heuristically tie the question's target object to the scene description.

    The target noun is the text following the first usable 'the' in the
    question (up to '?', '.', '!', ',', ';' or the end). The first scene line
    mentioning it is then inspected:

      * sector - an explicit marker on that line wins ("left:",
        "centre"/"center", "right:", later ones overriding earlier ones);
        otherwise the most recent sector header line above it is used.
      * distance - searched on that line plus the more-indented bullet lines
        directly below it: a step count ("6 steps" -> "6") is preferred,
        otherwise the first whole-word tag among close / mid / far.

    Returns:
        (noun, sector, dist): noun is "" when nothing could be extracted;
        sector is "left" / "centre" / "right" or None; dist is a digit string,
        "close" / "mid" / "far", or None.
    """
    m = _TARGET_NOUN_RE.search(question.lower())
    noun = m.group(1).strip() if m else ""
    sector, dist = None, None
    if not noun:
        return noun, sector, dist

    lines = scene.lower().splitlines()
    current_sector = None
    for idx, line in enumerate(lines):
        header = _SECTOR_HEADER_RE.match(line)
        if header:
            current_sector = "centre" if header.group(1) == "center" else header.group(1)
        if noun not in line:
            continue

        # Fall back to the enclosing sector header; inline markers override it.
        sector = current_sector
        if "left:" in line:
            sector = "left"
        if "centre" in line or "center" in line:
            sector = "centre"
        if "right:" in line:
            sector = "right"

        # Attributes may be listed as nested bullets under the object's name.
        indent = len(line) - len(line.lstrip())
        block = [line]
        for child in lines[idx + 1:]:
            if not child.strip() or len(child) - len(child.lstrip()) <= indent:
                break
            block.append(child)
        text = " ".join(block)

        steps = _STEPS_RE.search(text)
        if steps:
            dist = steps.group(1)
        else:
            # Whole-word match so "middle" / "closed" are not mistaken for tags.
            for d in ("close", "mid", "far"):
                if re.search(rf"\b{d}\b", text):
                    dist = d
                    break
        break
    return noun, sector, dist

# Sector keyword -> location word used in the instruction style.
SECTOR2LOC = {
    "left": "Left",
    "centre": "Front",
    "right": "Right",
}

# Height-band keyword -> coarse vertical location. Insertion order is kept
# because the bands are scanned in this order and the first hit wins.
HEIGHT2LOC = {
    "floor": "Bottom",
    "knee": "Bottom",
    "waist": "Middle",
    "chest": "Middle",
    "head": "Top",
    "above head": "Top",
}

# ✅ Step 3: Reasoning using GPT-4o
# Prompt skeleton for the reasoning call. answer_question() fills it with
# str.format(); the placeholders are {target}, {sector}, {distance},
# {height_band}, {scene} and {question}.
# NOTE(review): the <|system|> / <|user|> / <|assistant|> markers are plain text
# inside this string - confirm that is intended for the model being called.
# NOTE(review): the text asks for ONE paragraph but ends with "Let's think step
# by step."; answer_question() keeps only the last line of the reply.
REASON_TEMPLATE = """<|system|>
You are an assistive agent. Produce ONE paragraph exactly in this style:
Starting from the current position; please move forward <N> steps, then turn <left/right> <ANGLE> degrees; ... The pathway sentence (smooth, clear, etc.). The item-location sentence: "The item you need is at your <Front/Left/Right/Top left/Bottom right>." A final tactile-identification sentence about shape / temperature / texture.
Follow the wording, punctuation and capitalisation of the examples.

Example 1
Starting from the current position; please move forward 3 steps, then remain standing. The pathway is clear, allowing for a smooth and uninterrupted walk. The item you need is at your left. You can identify fruits by their shape …

Example 2
You are standing to the left of the shelf, please turn right 90 degrees. You may first encounter bottled jellies …


<|user|>
Target object (noun): {target}
Sector: {sector} | Distance tag: {distance} | Height band: {height_band}
Scene facts: {scene}
User request: {question}
<|assistant|>
Let's think step by step."""

def _target_height(scene, noun):
    """Return the height keyword for the target: first band found on a scene
    line that mentions `noun`, else the first band found anywhere in the scene,
    else "waist"."""
    scene_lc = scene.lower()
    if noun:
        for line in scene_lc.splitlines():
            if noun in line:
                for hb in HEIGHT2LOC:
                    if hb in line:
                        return hb
    for hb in HEIGHT2LOC:
        if hb in scene_lc:
            return hb
    return "waist"


def answer_question(scene, question):
    """
    Produce the navigation paragraph for `question` given the scene text.

    Args:
        scene: scene description returned by describe_scene().
        question: the user's question.

    Returns:
        The last line of GPT-4o's reply, normalised to end with exactly one
        full stop, or "" (with a warning) when the reply has no text.
    """
    noun, sector, dist = locate_target(scene, question)
    height = _target_height(scene, noun)

    prompt = REASON_TEMPLATE.format(
        target=noun or "object",
        sector=sector or "centre",
        distance=dist or "mid",
        height_band=HEIGHT2LOC.get(height, "Middle"),
        scene=scene,
        question=question
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300,
        temperature=0.3,
        top_p=0.9
    )

    # `content` is nullable; an empty reply would also make splitlines()[-1]
    # raise IndexError, so both cases are handled before indexing.
    full_response = (response.choices[0].message.content or "").strip()
    if not full_response:
        warnings.warn("GPT-4o returned no text; storing an empty instruction.")
        return ""
    return full_response.splitlines()[-1].rstrip(".") + "."

# ✅ Main dataset processing loop
def load_dataset(file_path):
    """
    Load the annotation file and return the parsed JSON.

    The file is read as UTF-8 explicitly so the result does not depend on the
    platform's default text encoding.

    Args:
        file_path: path of the JSON file.

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_results(results, output_path):
    """
    Write `results` to `output_path` as JSON indented by two spaces and print
    a one-line confirmation with the number of entries.
    """
    with open(output_path, 'w') as sink:
        sink.write(json.dumps(results, indent=2))
    n_saved = len(results)
    print(f"✅ Saved {n_saved} results to {output_path}")

# --------- Run Pipeline ----------
# Absolute paths under the Drive mount point (/content/drive) so the cell does
# not depend on the kernel's current working directory.
annotationPath = "/content/drive/MyDrive/Colab Notebooks/annotations/outdoor_qa_pair_gt.json"
imageDir = "/content/drive/MyDrive/Colab Notebooks/environment_images/outdoor_images_1024x768"
outFile = "/content/drive/MyDrive/Colab Notebooks/gptvl_outdoor_navigation_output.json"

results = []
dataset = load_dataset(annotationPath)

try:
    for item in dataset:
        img_id, q = item["index"], item["question"]
        image_path = f"{imageDir}/{img_id}.jpg"

        try:
            # Decode once purely as a readability check; the context manager
            # closes the file handle again.
            with Image.open(image_path) as probe:
                probe.convert("RGB")
        except Exception as e:
            warnings.warn(f"Skip {img_id}: {e}")
            continue

        print(f"→ {img_id}: describing scene …")
        scene = describe_scene(image_path)

        print("   reasoning …")
        ans = answer_question(scene, q)

        results.append({
            "image_id": img_id,
            "question": q,
            "scene": scene,
            "instruction": ans
        })
        print(f"   ✓ {ans}")
finally:
    # Persist whatever was completed even if an API call fails or the run is
    # interrupted, so finished items do not have to be regenerated.
    save_results(results, outFile)


→ 0: describing scene …
   reasoning …
   ✓ Starting from the current position; please move forward 8 steps, then turn left 90 degrees. The pathway is clear, allowing for a smooth and uninterrupted walk. The item you need is at your front. You can identify the scene shown in the image by its texture, which is smooth and cool to the touch.
→ 1: describing scene …
   reasoning …
   ✓ Starting from the current position; please move forward 8 steps, then remain standing. The pathway is clear, allowing for a smooth and uninterrupted walk. The item you need is at your front. You can identify the staircase by the cool, smooth texture of the railing and the firm, even surface of the steps beneath your feet.
→ 2: describing scene …
   reasoning …
   ✓ Starting from the current position; please move forward 10 steps, then turn left 90 degrees. The pathway is smooth, allowing for a comfortable walk towards the target. The item you need is at your front. You can identify the chair by its cool, smo

In [None]:
import json
import base64
from PIL import Image
import re
import warnings
from openai import OpenAI

# 🔑 Set up the OpenAI client. OPENAI_API_KEY is assigned in an earlier cell
# (read through Colab's `userdata`); do not paste a literal key here.
client = OpenAI(api_key=OPENAI_API_KEY)

# ✅ Convert image to base64 URL for OpenAI input
def encode_image_to_base64(image_path):
    """
    Read an image file and return it as a base64 `data:` URL.

    The MIME type is guessed from the file extension; anything that is not
    recognised as an image type falls back to "image/jpeg" (the previous
    hard-coded value).

    Args:
        image_path: path of the image file to encode.

    Returns:
        A string of the form "data:<mime>;base64,<payload>".
    """
    import mimetypes  # local import keeps this cell's import list unchanged

    mime, _ = mimetypes.guess_type(str(image_path))
    if not mime or not mime.startswith("image/"):
        mime = "image/jpeg"
    with open(image_path, "rb") as image_file:
        base64_data = base64.b64encode(image_file.read()).decode('utf-8')
    return f"data:{mime};base64,{base64_data}"

# ✅ Step 1: Scene description (vision-language)
def describe_scene(image_path):
    """
    Ask GPT-4o for a per-sector description of the image at `image_path`.

    The image is sent inline as a base64 data URL together with a prompt that
    requests, per sector (left / centre / right), each salient object with its
    approximate distance in steps and a height band.

    Args:
        image_path: path of the image file to describe.

    Returns:
        The reply text with surrounding whitespace stripped; "" when the reply
        carries no text content.
    """
    image_data = encode_image_to_base64(image_path)
    prompt = (
      "You are a navigation camera for a blind user. "
      "Divide the view into three sectors (left, centre, right) relative to the wearer’s heading. "
      "For each salient object or hazard, list:\n"
      "• name\n• approximate number of steps away (e.g., 2 steps, 6 steps, etc.)\n"
      "• height band if obvious: floor / knee / waist / chest / head / above head.\n"
      "Respond as terse bullet points per sector."
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": image_data}},
                ],
            }
        ],
        max_tokens=350
    )
    # `content` is nullable in the API response; calling .strip() on None
    # would raise AttributeError, so normalise to an empty string first.
    content = response.choices[0].message.content
    return (content or "").strip()

# ✅ Step 2: Heuristic parser to extract sector & distance
# "the <noun>" up to the first terminating punctuation mark or the end of the question.
_TARGET_NOUN_RE = re.compile(r"\bthe\s+([a-z][a-z \-]*?)\s*(?:[?.!,;]|$)")
# A line that *starts* a sector, e.g. "Left:", "**Centre sector:**", "### Right".
_SECTOR_HEADER_RE = re.compile(
    r"^\W*(left|centre|center|right)\b(?:\s+sector)?\s*(?::|\W*$)")
# Step counts such as "6 steps" / "1 step", as requested by the scene prompt.
_STEPS_RE = re.compile(r"(\d+)\s*steps?\b")


def locate_target(scene, question):
    """
    Heuristically tie the question's target object to the scene description.

    The target noun is the text following the first usable 'the' in the
    question (up to '?', '.', '!', ',', ';' or the end). The first scene line
    mentioning it is then inspected:

      * sector - an explicit marker on that line wins ("left:",
        "centre"/"center", "right:", later ones overriding earlier ones);
        otherwise the most recent sector header line above it is used.
      * distance - searched on that line plus the more-indented bullet lines
        directly below it: a step count ("6 steps" -> "6") is preferred,
        otherwise the first whole-word tag among close / mid / far.

    Returns:
        (noun, sector, dist): noun is "" when nothing could be extracted;
        sector is "left" / "centre" / "right" or None; dist is a digit string,
        "close" / "mid" / "far", or None.
    """
    m = _TARGET_NOUN_RE.search(question.lower())
    noun = m.group(1).strip() if m else ""
    sector, dist = None, None
    if not noun:
        return noun, sector, dist

    lines = scene.lower().splitlines()
    current_sector = None
    for idx, line in enumerate(lines):
        header = _SECTOR_HEADER_RE.match(line)
        if header:
            current_sector = "centre" if header.group(1) == "center" else header.group(1)
        if noun not in line:
            continue

        # Fall back to the enclosing sector header; inline markers override it.
        sector = current_sector
        if "left:" in line:
            sector = "left"
        if "centre" in line or "center" in line:
            sector = "centre"
        if "right:" in line:
            sector = "right"

        # Attributes may be listed as nested bullets under the object's name.
        indent = len(line) - len(line.lstrip())
        block = [line]
        for child in lines[idx + 1:]:
            if not child.strip() or len(child) - len(child.lstrip()) <= indent:
                break
            block.append(child)
        text = " ".join(block)

        steps = _STEPS_RE.search(text)
        if steps:
            dist = steps.group(1)
        else:
            # Whole-word match so "middle" / "closed" are not mistaken for tags.
            for d in ("close", "mid", "far"):
                if re.search(rf"\b{d}\b", text):
                    dist = d
                    break
        break
    return noun, sector, dist

# Sector keyword -> location word used in the instruction style.
SECTOR2LOC = {
    "left": "Left",
    "centre": "Front",
    "right": "Right",
}

# Height-band keyword -> coarse vertical location. Insertion order is kept
# because the bands are scanned in this order and the first hit wins.
HEIGHT2LOC = {
    "floor": "Bottom",
    "knee": "Bottom",
    "waist": "Middle",
    "chest": "Middle",
    "head": "Top",
    "above head": "Top",
}

# ✅ Step 3: Reasoning using GPT-4o
# Second revision of the reasoning prompt: numbered structure rules plus two
# longer worked examples. answer_question() fills it with str.format(); the
# placeholders are {target}, {sector}, {distance}, {height_band}, {scene} and
# {question}.
# NOTE(review): rule 3 prescribes "The target object you want to approach is
# at your ..." while Example 2 uses "The item you need is at your front." and
# does not open with "Starting from the current position;" - confirm which
# wording the model should follow.
# NOTE(review): "{distance} steps" reads oddly when answer_question() falls
# back to the tag "mid".
REASON_TEMPLATE = """<|system|>
You are an assistive agent helping a blind user navigate and identify an object in their environment.
Write ONE coherent paragraph with the following structure:

1. Start with: "Starting from the current position;" then give detailed navigation steps like:
   - "Please move forward X steps"
   - "Turn left/right Y degrees"
   - "Remain standing"
   Separate movement instructions using semicolons.

2. Describe environmental context or pathway (e.g. floor condition, obstacles, door open/closed, etc.).

3. Use this format for object location:
   - "The target object you want to approach is at your <Front/Left/Right/Top left/Bottom right>."

4. End with a tactile/functional description of the object (material, texture, temperature, shape, behavior).

Maintain a natural, helpful tone. Match the punctuation and formatting exactly as shown.
Avoid excessive length. One concise paragraph only.

Example 1
Starting from the current position; Please Move forward 2 steps, then Remain standing; Please Move forward 3 steps, then Remain standing. The shower room's door is open; you will encounter a relatively high step before entering. The bathroom floor may be damp and slippery, requiring caution for safety. The target object you want to approach is at your Front. The shower faucet, crafted from metal, has a smooth, icy, and solid surface. To the left is the shower hose, a circular tube with a spiral texture. The handle can possibly be rotated or turned on, causing water to flow out, which may initially be cold.

Example 2
You are standing in front of the shelf, no need to move forward or backward. The pathway is clear, ensuring an unobstructed approach to the shelf. The item you need is at your front. They are packaged in plastic bags, with several square pieces forming a bag. If you touch the plastic wrapping gently, you will feel that they are somewhat soft.

<|user|>
Target object (noun): {target}
Sector: {sector} | Distance: {distance} steps | Height band: {height_band}
Scene description: {scene}
User question: {question}
<|assistant|>
Let's think step by step."""


def _target_height(scene, noun):
    """Return the height keyword for the target: first band found on a scene
    line that mentions `noun`, else the first band found anywhere in the scene,
    else "waist"."""
    scene_lc = scene.lower()
    if noun:
        for line in scene_lc.splitlines():
            if noun in line:
                for hb in HEIGHT2LOC:
                    if hb in line:
                        return hb
    for hb in HEIGHT2LOC:
        if hb in scene_lc:
            return hb
    return "waist"


def answer_question(scene, question):
    """
    Produce the navigation paragraph for `question` given the scene text.

    Args:
        scene: scene description returned by describe_scene().
        question: the user's question.

    Returns:
        The last line of GPT-4o's reply, normalised to end with exactly one
        full stop, or "" (with a warning) when the reply has no text.
    """
    noun, sector, dist = locate_target(scene, question)
    height = _target_height(scene, noun)

    prompt = REASON_TEMPLATE.format(
        target=noun or "object",
        sector=sector or "centre",
        distance=dist or "mid",
        height_band=HEIGHT2LOC.get(height, "Middle"),
        scene=scene,
        question=question
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300,
        temperature=0.3,
        top_p=0.9
    )

    # `content` is nullable; an empty reply would also make splitlines()[-1]
    # raise IndexError, so both cases are handled before indexing.
    full_response = (response.choices[0].message.content or "").strip()
    if not full_response:
        warnings.warn("GPT-4o returned no text; storing an empty instruction.")
        return ""
    return full_response.splitlines()[-1].rstrip(".") + "."

# ✅ Main dataset processing loop
def load_dataset(file_path):
    """
    Load the annotation file and return the parsed JSON.

    The file is read as UTF-8 explicitly so the result does not depend on the
    platform's default text encoding.

    Args:
        file_path: path of the JSON file.

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_results(results, output_path):
    """
    Write `results` to `output_path` as JSON indented by two spaces and print
    a one-line confirmation with the number of entries.
    """
    with open(output_path, 'w') as sink:
        sink.write(json.dumps(results, indent=2))
    n_saved = len(results)
    print(f"✅ Saved {n_saved} results to {output_path}")

# --------- Run Pipeline ----------
# Absolute paths under the Drive mount point (/content/drive) so the cell does
# not depend on the kernel's current working directory.
annotationPath = "/content/drive/MyDrive/Colab Notebooks/annotations/outdoor_qa_pair_gt.json"
imageDir = "/content/drive/MyDrive/Colab Notebooks/environment_images/outdoor_images_1024x768"
outFile = "/content/drive/MyDrive/Colab Notebooks/gptvl2_outdoor_navigation_output.json"

results = []
dataset = load_dataset(annotationPath)

try:
    for item in dataset:
        img_id, q = item["index"], item["question"]
        image_path = f"{imageDir}/{img_id}.jpg"

        try:
            # Decode once purely as a readability check; the context manager
            # closes the file handle again.
            with Image.open(image_path) as probe:
                probe.convert("RGB")
        except Exception as e:
            warnings.warn(f"Skip {img_id}: {e}")
            continue

        print(f"→ {img_id}: describing scene …")
        scene = describe_scene(image_path)

        print("   reasoning …")
        ans = answer_question(scene, q)

        results.append({
            "image_id": img_id,
            "question": q,
            "scene": scene,
            "instruction": ans
        })
        print(f"   ✓ {ans}")
finally:
    # Persist whatever was completed even if an API call fails or the run is
    # interrupted, so finished items do not have to be regenerated.
    save_results(results, outFile)


→ 0: describing scene …
   reasoning …
   ✓ Starting from the current position; Please move forward 15 steps, then remain standing. As you proceed, the ground is even, but be aware of the tree on your right at about 6 steps, which is tall and extends above head height. The path is clear, with no immediate obstacles in your way. The target object you want to approach is at your Front. The fire assembly point sign is made of metal, with a smooth and cool surface. It is mounted at chest to above head height, and you may feel embossed letters or symbols indicating its purpose.
→ 1: describing scene …
   reasoning …
   ✓ Starting from the current position; Please move forward 8 steps, then turn slightly right 15 degrees; Please move forward 2 more steps, then remain standing. The ground beneath you is flat and grassy, providing a stable surface, but be mindful of any uneven patches or small stones. To your left, you will pass by a picnic bench at waist height, and to your right, you will no

In [None]:
# Print the id of every model this API key can access.
# The loop variable is deliberately not called `model`: earlier cells bind
# `model` to a loaded network, and a module-level loop variable of the same
# name would silently overwrite it.
models = client.models.list()
for model_info in models.data:
    print(model_info.id)

gpt-4o-audio-preview-2024-12-17
dall-e-3
dall-e-2
gpt-4o-audio-preview-2024-10-01
text-embedding-3-small
gpt-4.1-nano
gpt-4.1-nano-2025-04-14
gpt-4o-realtime-preview-2024-10-01
gpt-4o-realtime-preview
babbage-002
gpt-4
text-embedding-ada-002
chatgpt-4o-latest
gpt-4o-realtime-preview-2024-12-17
text-embedding-3-large
gpt-4o-mini-audio-preview
gpt-4o-audio-preview
o1-preview-2024-09-12
gpt-4o-mini-realtime-preview
gpt-4.1-mini
gpt-4o-mini-realtime-preview-2024-12-17
gpt-3.5-turbo-instruct-0914
gpt-4o-mini-search-preview
gpt-4.1-mini-2025-04-14
davinci-002
gpt-3.5-turbo-1106
gpt-4o-search-preview
gpt-4-turbo
gpt-3.5-turbo-instruct
gpt-3.5-turbo
gpt-4-turbo-preview
gpt-4o-mini-search-preview-2025-03-11
gpt-4-0125-preview
gpt-4o-2024-11-20
whisper-1
gpt-4o-2024-05-13
gpt-4-turbo-2024-04-09
gpt-3.5-turbo-16k
gpt-image-1
o1-preview
gpt-4-0613
gpt-4o-mini-tts
gpt-4o-transcribe
gpt-4.5-preview
gpt-4.5-preview-2025-02-27
gpt-4o-mini-transcribe
gpt-4o-search-preview-2025-03-11
omni-moderation-202