In [1]:
pip install open-clip-torch

Collecting open-clip-torch
  Downloading open_clip_torch-2.32.0-py3-none-any.whl.metadata (31 kB)
Collecting ftfy (from open-clip-torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9.0->open-clip-torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9.0->open-clip-torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.9.0->open-clip-torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.9.0->open-clip-torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.9.0->open-clip-torch)
  Downloading nv

In [2]:
# ╔══════════════════════════════════════════════════════════════════════╗
# ║  Magma-8B demo — Standard, CoT, CoD, CoT-Self-Consistency            ║
# ║  (works on Colab Pro, tested with A100)                              ║
# ╚══════════════════════════════════════════════════════════════════════╝
# 1️⃣  Install deps (first run only) ------------------------------------------
# (comment these out after the runtime already has the wheels cached)
# Shell-installs the Magma-compatible transformers build plus vision deps.
# NOTE(review): the transformers requirement is a git branch of a fork
# (jwyang/transformers @ dev/jwyang-v4.48.2), not a PyPI release, and
# torchvision / Pillow / open_clip_torch are not version-pinned; `--upgrade`
# is passed for all of them, so the resolved set can differ between runs.
!pip install -q --upgrade git+https://github.com/jwyang/transformers.git@dev/jwyang-v4.48.2 \
                      torchvision Pillow open_clip_torch

# 2️⃣  Imports & model ---------------------------------------------------------
import io, requests, torch, typing as T
from collections import Counter
from PIL import Image
from google.colab import files
from transformers import AutoModelForCausalLM, AutoProcessor

# Run on the GPU in bfloat16 when CUDA is visible, otherwise on CPU in float32.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.bfloat16
else:
    DEVICE, DTYPE = "cpu", torch.float32

print("⏳ Loading Magma-8B …")

# NOTE(review): trust_remote_code=True executes Python shipped in the model
# repo and no `revision` is pinned, so the code that runs can change upstream.
MODEL = AutoModelForCausalLM.from_pretrained(
    "microsoft/Magma-8B",
    trust_remote_code=True,
    torch_dtype=DTYPE,
)
MODEL = MODEL.to(DEVICE).eval()   # move to the chosen device, switch to eval mode

PROCESSOR = AutoProcessor.from_pretrained(
    "microsoft/Magma-8B",
    trust_remote_code=True,
)

print("✅  Magma-8B ready")

# 3️⃣  Prompt builder with clear section tags ----------------------------------
def prompt_builder(
    task: str,
    flavour: str = "standard",
    *,
    reasoning_tag: str = "### Reasoning:",
    draft_tag:    str = "### Draft:",
    refined_tag:  str = "### Refined Answer:",
    answer_tag:   str = "### Answer:"
) -> list[dict[str, str]]:
    """Build the system + user chat messages for one prompting technique.

    Args:
        task: The question or instruction; inserted verbatim after the image
            placeholder.
        flavour: One of "standard", "cot", "cod" or "cot_consistency"
            (matched case-insensitively).
        reasoning_tag: Heading that introduces the reasoning section
            ("cot" and "cot_consistency").
        draft_tag: Heading that introduces the terse draft ("cod").
        refined_tag: Heading the polished reply is requested under ("cod").
        answer_tag: Heading the final answer is requested under
            ("cot" and "cot_consistency").

    Returns:
        A two-element list: a fixed system message followed by the user
        message, each a dict with "role" and "content" keys.

    Raises:
        ValueError: If `flavour` is not one of the four supported names.
    """
    # Technique-specific instruction that follows the task text.
    instruction_for = {
        "standard": "Please answer briefly and accurately.",
        "cot": f"{reasoning_tag} Think step-by-step, then end with:\n{answer_tag}",
        "cod": (
            f"{draft_tag} Bullet thoughts (≤ 6 words each). "
            f"Rewrite a polished reply under:\n{refined_tag}"
        ),
        "cot_consistency": (
            f"{reasoning_tag} Think independently. "
            f"Finish with:\n{answer_tag}"
        ),
    }

    key = flavour.lower()
    if key not in instruction_for:
        raise ValueError("flavour must be standard / cot / cod / cot_consistency")

    # Every technique shares the same image placeholder line ahead of the task.
    image_slot = "<image_start><image><image_end>"
    user_message = {
        "role": "user",
        "content": f"{image_slot}\n{task}\n{instruction_for[key]}",
    }
    system_message = {
        "role": "system",
        "content": (
            "You are a vision-language agent that analyses indoor scenes "
            "and plans safe, precise actions for a 170 cm humanoid robot."
        ),
    }
    return [system_message, user_message]

# 4️⃣  Inference helper ---------------------------------------------------------
def _vote_key(sample: str, answer_tag: str) -> str:
    """Reduce one sampled completion to a canonical key for majority voting."""
    # Vote on what follows the last answer tag when the model emitted one with
    # text after it; otherwise fall back to the whole completion.
    _, found, tail = sample.rpartition(answer_tag)
    final = tail if found and tail.strip() else sample
    # Case, runs of whitespace and trailing punctuation must not split votes.
    return " ".join(final.casefold().split()).rstrip(" .!?")


def magma_analyse(
    image: Image.Image,
    task: str,
    flavour: str = "standard",
    *,
    max_new: int = 256,
    num_samples: int = 5,               # only for cot_consistency
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> T.Union[str, tuple[str, list[str]]]:
    """Run Magma-8B on one image with one of four prompting techniques.

    Args:
        image: Image handed to the processor.
        task: Question / instruction inserted into the prompt.
        flavour: "standard", "cot", "cod" or "cot_consistency"
            (case-insensitive, like `prompt_builder`).
        max_new: Upper bound on newly generated tokens per sample.
        num_samples: Number of independent samples drawn in
            "cot_consistency" mode; ignored otherwise.
        temperature: Sampling temperature passed to `generate`.
        top_p: Nucleus-sampling threshold passed to `generate`.

    Returns:
        For "standard", "cot" and "cod": the decoded completion as a string.
        For "cot_consistency": `(majority, samples)` where `samples` holds
        every raw completion and `majority` is the first raw completion whose
        normalised final answer received the most votes.

    Raises:
        ValueError: If `flavour` is unknown (raised by `prompt_builder`) or
            `num_samples` is below 1 in "cot_consistency" mode.
    """
    answer_tag = "### Answer:"

    # prompt_builder lower-cases the flavour; do the same here so the branch
    # taken below always matches the prompt that was built.
    flavour = flavour.lower()
    self_consistency = flavour == "cot_consistency"
    if self_consistency and num_samples < 1:
        raise ValueError("num_samples must be at least 1 for cot_consistency")

    # ---- build chat prompt --------------------------------------------------
    chat_prompt = PROCESSOR.tokenizer.apply_chat_template(
        # Pass the tag explicitly so the vote below parses the same heading.
        prompt_builder(task, flavour, answer_tag=answer_tag),
        tokenize=False,
        add_generation_prompt=True,
    )

    # ---- encode inputs ------------------------------------------------------
    inputs = PROCESSOR(images=[image], texts=chat_prompt, return_tensors="pt")
    # A leading axis is added to the image tensors before generation
    # (presumably what the Magma remote code expects - TODO confirm).
    inputs["pixel_values"] = inputs["pixel_values"].unsqueeze(0).to(DEVICE).to(DTYPE)
    inputs["image_sizes"]  = inputs["image_sizes"].unsqueeze(0).to(DEVICE)
    inputs["input_ids"]    = inputs["input_ids"].to(DEVICE)
    inputs["attention_mask"] = inputs["attention_mask"].to(DEVICE)

    gen_cfg = dict(
        max_new_tokens=max_new,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        use_cache=True,
        repetition_penalty=1.1,
    )

    # generate() returns prompt + completion; only the completion is decoded.
    prompt_len = inputs["input_ids"].shape[-1]

    def sample_once() -> str:
        ids = MODEL.generate(**inputs, **gen_cfg)
        return PROCESSOR.decode(
            ids[0, prompt_len:],
            skip_special_tokens=True,
        ).strip()

    with torch.inference_mode():
        # ---- single-shot modes ----------------------------------------------
        if not self_consistency:
            return sample_once()

        # ---- self-consistency mode ------------------------------------------
        answers = [sample_once() for _ in range(num_samples)]

    # Vote on normalised final answers rather than raw text, so samples that
    # differ only in reasoning, case, spacing or trailing punctuation agree.
    # NOTE: differently worded paraphrases are still counted separately.
    keys = [_vote_key(sample, answer_tag) for sample in answers]
    winning_key = Counter(keys).most_common(1)[0][0]
    majority = next(s for s, k in zip(answers, keys) if k == winning_key)
    return majority, answers  # return both the vote and raw samples

# 5️⃣  Quick interactive demo ---------------------------------------------------
def upload_and_ask():
    """Interactive Colab demo: upload an image, ask a question, print the reply.

    Opens the Colab upload dialog, prompts for a task and a prompting
    technique, calls `magma_analyse` and prints the result. In
    "cot_consistency" mode the majority answer and every sample are printed.
    Returns None; prints a notice and returns early when nothing is uploaded.
    """
    uploaded = files.upload()
    if not uploaded:
        # Dialog cancelled / no file chosen: nothing to open.
        print("No file uploaded - nothing to analyse.")
        return

    # Only the first uploaded file is used.  Decode it from the bytes returned
    # by the dialog instead of re-reading from disk by name, so the result does
    # not depend on the working directory or on how the saved copy was renamed.
    img_name = next(iter(uploaded))
    img = Image.open(io.BytesIO(uploaded[img_name])).convert("RGB")

    q = input("What should the robot do / describe? ➜ ")
    # The flavour is matched case-insensitively when the prompt is built;
    # normalise here too so the branch below agrees with it.
    t = input("Technique? [standard / cot / cod / cot_consistency] ➜ ").strip().lower()

    if t == "cot_consistency":
        majority, samples = magma_analyse(img, q, flavour=t)
        print("\n── Majority answer ──\n", majority)
        print("\n── All samples ──")
        for i, s in enumerate(samples, 1):
            print(f"[{i}] {s}\n")
    else:
        print("\n", magma_analyse(img, q, flavour=t))

# 👉  Run as many times as you like
upload_and_ask()


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m113.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m114.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m1.9 MB/s[0m eta [36m0:00:

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

configuration_magma.py:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Magma-8B:
- configuration_magma.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_magma.py:   0%|          | 0.00/75.6k [00:00<?, ?B/s]

image_tower_magma.py:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Magma-8B:
- image_tower_magma.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Magma-8B:
- modeling_magma.py
- image_tower_magma.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/70.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.91G [00:00<?, ?B/s]

open_clip_pytorch_model.bin:   0%|          | 0.00/4.80G [00:00<?, ?B/s]

open_clip_config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of MagmaForCausalLM were not initialized from the model checkpoint at microsoft/Magma-8B and are newly initialized: ['vision_tower.clip_vision_model.head.proj.weight', 'vision_tower.clip_vision_model.trunk.head.norm.bias', 'vision_tower.clip_vision_model.trunk.head.norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/478 [00:00<?, ?B/s]

processing_magma.py:   0%|          | 0.00/7.77k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Magma-8B:
- processing_magma.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


image_processing_magma.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/52.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

✅  Magma-8B ready


Saving office_plan.PNG to office_plan.PNG
What should the robot do / describe? ➜ How will you navigate towards toilet?


KeyboardInterrupt: Interrupted by user

In [3]:
upload_and_ask()

Saving office_plan.PNG to office_plan (1).PNG
What should the robot do / describe? ➜ How will you navigate towards the toilet?
Technique? [standard / cot / cod / cot_consistency] ➜ standard


  return fn(*args, **kwargs)
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.



 the camera wearer moves around the room


In [4]:
upload_and_ask()

Saving office_plan.PNG to office_plan (2).PNG
What should the robot do / describe? ➜ How will you navigate towards toilet?
Technique? [standard / cot / cod / cot_consistency] ➜ cot

 I will turn left in the hallway to reach the toilet.
Please note that this answer is based on the black-and-white floor plan provided and may not reflect the actual layout of the space.


In [5]:
upload_and_ask()

Saving office_plan.PNG to office_plan (3).PNG
What should the robot do / describe? ➜ How will you go towards the toilet from green dot where you are situated?
Technique? [standard / cot / cod / cot_consistency] ➜ cod

 Head towards the toilet by following the black lines on the floor.


In [6]:
upload_and_ask()


Saving office_plan.PNG to office_plan (4).PNG
What should the robot do / describe? ➜  How will you go towards the toilet from green dot where you are situated?
Technique? [standard / cot / cod / cot_consistency] ➜ cot_consistency

── Majority answer ──
 I will head towards the toilet from my current location by moving in that direction.

── All samples ──
[1] I will head towards the toilet from my current location by moving in that direction.

[2] the camera wearer enters in the house

[3] From the current location, there is only one path leading to the toilet: straight ahead through the doorway on the right side of the room. There are no other objects or people in the room, so it is always a clear and unobstructed route to the toilet.

[4] I need to go to the toilet. From my current location, I see there is only one path leading to the toilet, so I should follow it. The path seems to be straight ahead, but I'm not sure of the exact distance or what obstacles I may encounter along the 

In [7]:
upload_and_ask()

Saving kitchen.JPG to kitchen.JPG
What should the robot do / describe? ➜ Provide me steps to go towards "Kitchen Sink" from the purple dot (your location)
Technique? [standard / cot / cod / cot_consistency] ➜ standard

 Step 1: Move towards the left side of the image until you reach the island bench.


In [8]:
upload_and_ask()

Saving kitchen.JPG to kitchen (1).JPG
What should the robot do / describe? ➜ Provide me steps to go towards "Kitchen Sink" from the purple dot (your location)
Technique? [standard / cot / cod / cot_consistency] ➜ cot 

 Step 1: Move towards the kitchen sink.


In [9]:
upload_and_ask()

Saving kitchen.JPG to kitchen (2).JPG
What should the robot do / describe? ➜ Provide me steps to go towards "Kitchen Sink" from the purple dot (your location)
Technique? [standard / cot / cod / cot_consistency] ➜ cod

 1. Move towards kitchen sink.
2. Continue along the path until you reach the sink.
3. Use hands-free sink if available or operate faucet with one hand.
4. Turn on water supply, if necessary.
5. Place items in sink for washing.
6. Start cleaning by adjusting water temperature and detergent amount.
7. Scrub and rinse items thoroughly.
8. Dry items using towel or dryer.
9. Dispose of waste water and packaging materials responsibly.
10. Return to main kitchen area.
11. Serve food, if desired.
12. Clean up any spills or messes made during food preparation and serving.
13. Store away appliances, utensils, and other kitchen tools.
14. Wipe down surfaces and clean floors.
15. Organize cabinets and drawers.
16. Take out trash and recycling bins.
17. Check and adjust lighting as n

In [10]:
upload_and_ask()

Saving kitchen.JPG to kitchen (3).JPG
What should the robot do / describe? ➜ Provide me steps to go towards "Kitchen Sink" from the purple dot (your location)
Technique? [standard / cot / cod / cot_consistency] ➜ cot_consistency

── Majority answer ──
 Step 1: Move away from the door.
Step 2: Head towards the kitchen bench.
Step 3: Turn left.
Step 4: Enter the main kitchen.
Step 5: Proceed towards the serving trolley.
Step 6: Keep moving towards the island bench.
Step 7: Arrive at the island bench.
Step 8: Follow the sign to the commercial dishwasher.
Step 9: Enter the hot water unit area.
Step 10: Continue heading towards the oven.
Step 11: Visit the microwave area.
Step 12: Move towards the hands-free sink.
Step 13: Enter the main kitchen again.
Step 14: Visit the refrigerator.
Step 15: Go to the pantry.
Step 16: Enter the cupboard area.
Step 17: Visit the broom and mop station.
Step 18: Finally, reach the main kitchen door.
Step 19: Exit the room.
Coordinate: (0.05, 0.54)

── All sa

In [13]:
upload_and_ask()

Saving Kitchen.jpg to Kitchen (2).jpg
What should the robot do / describe? ➜ Is it possible for you to go towards the woman ? if yes, what steps will you take to reach the woman?
Technique? [standard / cot / cod / cot_consistency] ➜ standard

 Yes, it is possible for you to go towards the woman in the kitchen. To reach her, you can follow these steps:

1. Move around the dining table and chairs located on the right side of the kitchen.
2. Proceed towards the kitchen island where the woman is seated.
3. Take a seat on one of the three wicker chairs placed around the island.
4. Engage the woman in conversation.

Please note that this description is based on the image provided and does not include any assumptions or interpretations beyond what is visible.


In [14]:
upload_and_ask()

Saving Kitchen.jpg to Kitchen (3).jpg
What should the robot do / describe? ➜ Is it possible for you to go towards the woman ? if yes, what steps will you take to reach the woman?
Technique? [standard / cot / cod / cot_consistency] ➜ cot

 Yes, I can go towards the woman. To do so, follow these steps:

1. Move forward in the kitchen.
2. Continue moving forward until you reach the island where the woman is seated.
3. Approach the woman, who is sitting on one of the chairs at the island.
4. Once you have reached her, engage in conversation or ask for assistance.

Please note that this response is based on the image provided and does not include any assumptions about the woman's identity or specific needs. Always prioritize safety when interacting with others.


In [15]:
upload_and_ask()

Saving Kitchen.jpg to Kitchen (4).jpg
What should the robot do / describe? ➜ Is it possible for you to go towards the woman ? if yes, what steps will you take to reach the woman?
Technique? [standard / cot / cod / cot_consistency] ➜ cod

 Yes, I can go towards the woman. I would need to navigate around the kitchen island and chairs first, then proceed towards her location near the dining table.


In [16]:
upload_and_ask()

Saving Kitchen.jpg to Kitchen (5).jpg
What should the robot do / describe? ➜ Is it possible for you to go towards the woman ? if yes, what steps will you take to reach the woman?
Technique? [standard / cot / cod / cot_consistency] ➜ cot_consistency

── Majority answer ──
 Yes, I can go towards the woman in the kitchen. To reach her, I would walk past the dining table and chairs located on the right side of the kitchen. Then, I would move towards the center island where she is standing, and I could approach her from the left side of the island.

── All samples ──
[1] Yes, I can go towards the woman in the kitchen. To reach her, I would walk past the dining table and chairs located on the right side of the kitchen. Then, I would move towards the center island where she is standing, and I could approach her from the left side of the island.

[2] Yes, it is possible to go towards the woman in the image. To do so, follow these steps:

1. Move forward in the image until you reach the kitchen

In [17]:
upload_and_ask()

Saving office-retail-category-desks-tables.jpg to office-retail-category-desks-tables.jpg
What should the robot do / describe? ➜ How many desktops can you see?
Technique? [standard / cot / cod / cot_consistency] ➜ standard

 9


In [18]:
upload_and_ask()

Saving office-retail-category-desks-tables.jpg to office-retail-category-desks-tables (1).jpg
What should the robot do / describe? ➜ How many desktops are there?
Technique? [standard / cot / cod / cot_consistency] ➜ cot_consistency

── Majority answer ──
 9

── All samples ──
[1] 9

[2] 10

[3] There are 13 desktops in the office space.

[4] 14

[5] There are six desktops in the office space.



In [19]:
upload_and_ask()

Saving Kitchen.jpg to Kitchen (6).jpg
What should the robot do / describe? ➜ How many chairs are there?
Technique? [standard / cot / cod / cot_consistency] ➜ standard

 3


In [20]:
upload_and_ask()

Saving Kitchen.jpg to Kitchen (7).jpg
What should the robot do / describe? ➜ How many chairs are there?
Technique? [standard / cot / cod / cot_consistency] ➜ cot_consistency

── Majority answer ──
 3

── All samples ──
[1] There are three chairs in the kitchen.

[2] 3

[3] There are three chairs.

[4] There are three chairs.

[5] 3



In [21]:
upload_and_ask()

Saving the-ultimate-home-office-design.jpg to the-ultimate-home-office-design.jpg
What should the robot do / describe? ➜ Can you reach to the top of the shelf?
Technique? [standard / cot / cod / cot_consistency] ➜ standard

 No


In [22]:
upload_and_ask()

Saving the-ultimate-home-office-design.jpg to the-ultimate-home-office-design (1).jpg
What should the robot do / describe? ➜ Can you reach to the top of the shelf?
Technique? [standard / cot / cod / cot_consistency] ➜ cot_consistency

── Majority answer ──
 Yes, I can reach to the top of the shelf.

── All samples ──
[1] No, I cannot reach to the top of the shelf.

[2] No, I cannot reach the top of the shelf. The top two shelves are too high for me even standing on the chair.

[3] Yes, I can reach to the top of the shelf.

[4] Yes, I can reach to the top of the shelf.

[5] No, I cannot reach the top of the shelf.

