1. Load the input image and the user's edit prompt.
2. Run the VLM (Qwen) to extract the parameters: id, class, bbox, matrix.
3. Run SAM to extract the binary mask.
4. Run OpenCV to compute the shape mask.
5. Run the SDL drawer with the shape mask.

In [None]:
# 1. load inputs (source image and user prompt)

import ipywidgets as widgets
from IPython.display import display, Image as IPImage
from PIL import Image

# Fixed image path
image_path = "data/elephant.jpg"

# Load the source image with PIL. When the file is missing, `source_image` is
# set to None so that later code can test for it instead of hitting a
# NameError (or silently reusing an image left over from a previous run).
try:
    source_image = Image.open(image_path)
except FileNotFoundError:
    source_image = None
    print("File not found. Please check the path and try again.")
else:
    # Display the image inline in the notebook
    display(IPImage(filename=image_path))

# Text box in which the user types the edit prompt
user_prompt_widget = widgets.Text(
    description='User Prompt:',
    placeholder='Enter your prompt here'
)

# Display the widget
display(user_prompt_widget)

# Function to print the user prompt
def print_user_prompt(change):
    """Print the text entered in the user-prompt widget.

    Args:
        change: The object handed to the callback. When the function is
            registered through ``Text.on_submit`` this is the widget itself,
            so its ``value`` is read directly. Anything without a ``value``
            attribute (for example the change dictionary delivered by
            ``observe``) falls back to the module-level ``user_prompt_widget``.
    """
    # Prefer the callback argument so the function does not depend on a global.
    source = change if hasattr(change, "value") else user_prompt_widget
    user_prompt = source.value
    print(f"User prompt: {user_prompt}")

# Attach the function to the user prompt widget to trigger on 'submit'
# (text submission, i.e. pressing Enter in the text box).
# NOTE(review): `on_submit` is reported as deprecated in ipywidgets 8 in favour
# of `observe` with `continuous_update=False` — confirm against the installed
# version before relying on it.
user_prompt_widget.on_submit(print_user_prompt)

# QWEN-VL-1 (old)

In [None]:
# use QWEN-VL to parse the parameters
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch
# Seed PyTorch's random number generator.
torch.manual_seed(1234)

# Note: The default behavior now has injection attack prevention off.
# trust_remote_code=True executes the Python code shipped in the model repo.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)

# Alternative ways of loading the model are kept below for reference; only the
# last, uncommented line is executed.
# use bf16
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval()
# use fp16
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval()
# use cpu only
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="cpu", trust_remote_code=True).eval()
# use cuda device with bf16
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="cuda", trust_remote_code=True, bf16=True).eval()

# Specify hyperparameters for generation
model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)

# 1st dialogue turn: one image plus a text question (the Chinese text asks
# "What is this?").
query = tokenizer.from_list_format([
    {'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, # Either a local path or an url
    {'text': '这是什么?'},
])
response, history = model.chat(tokenizer, query=query, history=None)
print(response)
# Sample answer (returned in Chinese): the picture shows a woman playing with a
# dog on a beach; next to her is a Labrador, and both are on the sand.

# 2nd dialogue turn: reuses `history` from the first turn; the Chinese text
# asks to draw a box around the high-five in the picture.
response, history = model.chat(tokenizer, '框出图中击掌的位置', history=history)
print(response)
# Sample answer: <ref>high-five (in Chinese)</ref><box>(536,509),(588,602)</box>
# Draw the box from the answer on the most recent image; the result is falsy
# when there is nothing to draw, which is reported as "no box".
image = tokenizer.draw_bbox_on_latest_picture(response, history)
if image:
  image.save('1.jpg')
else:
  print("no box")

# QWEN2 Example (old)

In [None]:
# https://github.com/QwenLM/Qwen2.5-Math

 
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face model id used for both the model and its tokenizer.
model_name = "Qwen/Qwen2.5-Math-72B-Instruct"
# Device the tokenized inputs are moved to later in this cell; the model's own
# placement is decided by device_map="auto" below.
device = "cuda" # the device to load the model onto

# Load the causal LM; dtype and device placement are left to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$."

# Two alternative system prompts for the same question. Each variant has its
# own name so that neither is silently discarded by a later assignment.

# CoT (chain-of-thought): reason in natural language only
cot_messages = [
    {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
    {"role": "user", "content": prompt}
]

# TIR (tool-integrated reasoning): interleave reasoning with programs
tir_messages = [
    {"role": "system", "content": "Please integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}."},
    {"role": "user", "content": prompt}
]

# The TIR variant is the one sent to the model; assign `cot_messages` instead
# to run the plain chain-of-thought prompt.
messages = tir_messages

# Render the chat messages into a single prompt string (tokenize=False keeps
# it as text; tokenization happens on the next statement).
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
# Tokenize the prompt as a batch of one and move the tensors to `device`.
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Generate at most 512 new tokens.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Drop the prompt part of every output sequence: keep only the tokens that come
# after the first len(input_ids) positions.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

# Decode the single generated continuation back to text.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("Response:", response)
print("completed")

# Parsing Image Attributes.

In [None]:
# Load and resize input image
IMG_PATH = './data/demo.jpeg'
############################################################

from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import cv2
import matplotlib.pyplot as plt
import os


# Fail fast when the input image is missing: every later step of this cell
# needs it, so there is nothing useful to continue with.
if not os.path.exists(IMG_PATH):
    print(f"ERROR: Image file not found at {IMG_PATH}")
    raise FileNotFoundError(IMG_PATH)

# Load the model in bfloat16 on the GPU.
# NOTE: flash attention is NOT enabled by this call; pass
# attn_implementation="flash_attention_2" to from_pretrained to turn it on.
try:
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct",
        torch_dtype=torch.bfloat16,
        device_map="cuda",
    )
except Exception as e:
    print(f"ERROR loading model: {str(e)}")
    # Re-raise: continuing would either hit a NameError later or, worse, reuse
    # a `model` left in the notebook namespace by an earlier cell.
    raise

# Load processor
try:
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
except Exception as e:
    print(f"ERROR loading processor: {str(e)}")
    # Same reasoning as above: the rest of the cell cannot run without it.
    raise

# System instructions: they fix the line-oriented DETECT / SCENE / SPATIAL
# answer format that this notebook parses afterwards.
system_prompt = """You are a computer vision expert specializing in object detection, scene understanding and spatial relationships.
For each detected object in the image, output exactly one line in the following format:
DETECT: <object_id>|<object_class>|<xmin>|<ymin>|<xmax>|<ymax>

Then, provide a scene analysis in the following format:
SCENE: <scene_description>

Finally, describe spatial relationships between objects:
SPATIAL: <spatial_relationships>

Where:
- <object_id> is a sequential number starting from 1
- <object_class> is the specific object category including attributes (color, size, etc)
- <xmin>,<ymin>,<xmax>,<ymax> are normalized coordinates (0-1) for the bounding box
- <scene_description> describes the overall context and setting
- <spatial_relationships> describes how objects relate to each other in space

Focus on main objects, and their relationships."""

# User turn: the image to analyse followed by the textual request.
user_content = [
    {"type": "image", "image": IMG_PATH},
    {
        "type": "text",
        "text": "Analyze this image, detect the main objects, describe the scene and spatial relationships between objects.",
    },
]

# Conversation handed to the chat template: one system turn, one user turn.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_content},
]

# Process inputs: render the chat template to text, collect the vision inputs
# referenced by `messages`, and build the model input tensors on the GPU.
try:
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")
except Exception as e:
    print(f"ERROR in input processing: {str(e)}")
    # Re-raise: without `inputs` the inference step below cannot run, and
    # names such as `text` may still hold values from an earlier cell.
    raise

# Run inference
try:
    generated_ids = model.generate(**inputs, max_new_tokens=256)  # Increased tokens for scene description
    # Keep only the newly generated tokens by cutting off the prompt prefix of
    # each output sequence.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
except Exception as e:
    print(f"ERROR in model inference: {str(e)}")
    # Re-raise: `output_text` is required by everything that follows.
    raise

# Parse the detection and scene understanding output
def parse_model_output(output_text):
    """Parse the model's line-oriented answer into detections and descriptions.

    Recognised line prefixes (any other line is ignored):
        ``DETECT: <id>|<class>|<xmin>|<ymin>|<xmax>|<ymax>`` (one per object)
        ``SCENE: <text>``
        ``SPATIAL: <text>``

    A malformed ``DETECT`` line is reported and skipped on its own; it does not
    affect the lines that follow it. Input that is not text is reported and
    yields empty results.

    Args:
        output_text: The decoded model output, either a string or a list/tuple
            of strings, in which case only the first element is parsed.

    Returns:
        A tuple ``(bboxes, classes, scene_desc, spatial_rel)``: ``bboxes`` is a
        list of ``[xmin, ymin, xmax, ymax]`` floats, ``classes`` the matching
        list of class strings, and the two descriptions are stripped strings
        (empty when the corresponding line is absent).
    """
    bboxes = []
    classes = []
    scene_desc = ""
    spatial_rel = ""

    # A decoded batch arrives as a sequence; only its first entry is parsed.
    if isinstance(output_text, (list, tuple)):
        output_text = output_text[0] if output_text else ""
    if not isinstance(output_text, str):
        print(f"ERROR in parse_model_output: expected text, got {type(output_text).__name__}")
        return bboxes, classes, scene_desc, spatial_rel

    for line in output_text.split('\n'):
        line = line.strip()
        if line.startswith('DETECT:'):
            fields = [field.strip() for field in line.split(':', 1)[1].split('|')]
            try:
                # Unpacking enforces exactly six fields; float() validates the
                # coordinates. Both failures surface as ValueError.
                _obj_id, obj_class, xmin, ymin, xmax, ymax = fields
                bbox = [float(xmin), float(ymin), float(xmax), float(ymax)]
            except ValueError as e:
                print(f"ERROR in parse_model_output: skipping line {line!r}: {e}")
                continue
            # Append both lists together so they always stay aligned.
            bboxes.append(bbox)
            classes.append(obj_class)
        elif line.startswith('SCENE:'):
            scene_desc = line.split(':', 1)[1]
        elif line.startswith('SPATIAL:'):
            spatial_rel = line.split(':', 1)[1]

    return bboxes, classes, scene_desc.strip(), spatial_rel.strip()

# Parse outputs: split the decoded model answer into bounding boxes, class
# names, the scene description and the spatial-relationship text.
bboxes, classes, scene_desc, spatial_rel = parse_model_output(output_text)

def annotate_image_with_bbox(image, bbox, label, color=(0, 255, 0), thickness=2):
    """Draw a labelled bounding box on ``image`` in place and return it.

    Args:
        image: Array whose first two dimensions are (height, width); it is
            handed to the OpenCV drawing functions and modified in place.
        bbox: ``[xmin, ymin, xmax, ymax]`` given as fractions of the image
            width and height.
        label: Text drawn next to the box.
        color: Colour tuple forwarded to OpenCV.
        thickness: Line thickness used for both the rectangle and the text.

    Returns:
        The same ``image`` object. Drawing is best-effort: on any error a
        message is printed and the image is returned as it is at that point.
    """
    try:
        height, width = image.shape[:2]

        # Scale to pixels and clamp to the image so that slightly out-of-range
        # coordinates still give a box that lies inside the picture.
        xmin = max(0, min(int(bbox[0] * width), width - 1))
        ymin = max(0, min(int(bbox[1] * height), height - 1))
        xmax = max(0, min(int(bbox[2] * width), width - 1))
        ymax = max(0, min(int(bbox[3] * height), height - 1))

        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color, thickness)

        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.9
        (_, text_height), _ = cv2.getTextSize(label, font, font_scale, thickness)

        # putText anchors the text at its bottom-left corner. Prefer the usual
        # spot just above the box; when the box touches the top edge the text
        # would fall outside the image, so place it just inside the box.
        label_y = ymin - 10
        if label_y - text_height < 0:
            label_y = min(ymin + text_height + 10, max(height - 1, 0))

        cv2.putText(image, label, (xmin, label_y), font, font_scale, color, thickness)
        return image
    except Exception as e:
        print(f"ERROR in annotate_image_with_bbox: {str(e)}")
        return image

# Load and annotate image, then show it next to the textual analysis.
# Any failure in this section is printed and does not stop the cell.
try:
    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(IMG_PATH)
    if image is None:
        raise ValueError("Could not load image for annotation")

    # Create figure with two subplots
    plt.figure(figsize=(15, 10))
    
    # Plot 1: Annotated image
    plt.subplot(1, 2, 1)
    for i, (bbox, class_name) in enumerate(zip(bboxes, classes)):
        # Labels are numbered from 1 in detection order.
        label = f"{i+1}: {class_name}"
        image = annotate_image_with_bbox(image, bbox, label)
    # The image is converted from BGR to RGB channel order before plotting.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title("Detected Objects")
    plt.axis('off')
    
    # Plot 2: Scene description and spatial relationships
    plt.subplot(1, 2, 2)
    plt.text(0.1, 0.7, f"Scene Description:\n{scene_desc}", wrap=True)
    plt.text(0.1, 0.3, f"Spatial Relationships:\n{spatial_rel}", wrap=True)
    plt.axis('off')
    
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"ERROR in annotation/display: {str(e)}")


# ---- Console report ---------------------------------------------------------
# Echo the parsed detections and both descriptions in a readable layout.
print("\n=== DETECTION RESULTS ===")
print("\nDetected Objects:")
for obj_number, (bbox, class_name) in enumerate(zip(bboxes, classes), start=1):
    # Objects are numbered from 1 in detection order.
    print(f"Object {obj_number}:")
    print(f"  Class: {class_name}")
    print(f"  Bounding Box (normalized): xmin={bbox[0]:.3f}, ymin={bbox[1]:.3f}, xmax={bbox[2]:.3f}, ymax={bbox[3]:.3f}")

# Each free-text section is a heading line followed by its text.
for heading, body in (
    ("\nScene Description:", scene_desc),
    ("\nSpatial Relationships:", spatial_rel),
):
    print(heading)
    print(body)
print("\n=====================")

# Save detection results to file
try:
    # Name the report after the image and place it in the image's directory:
    # the later cells of this notebook read the report from there (for example
    # 'data/demo_detection.txt'), not from the current working directory.
    img_name = os.path.splitext(os.path.basename(IMG_PATH))[0]
    output_file = os.path.join(os.path.dirname(IMG_PATH), f"{img_name}_detection.txt")

    # Build the whole report first so that a formatting error cannot leave a
    # half-written file behind. The "Object N:" / "Class:" /
    # "Bounding Box (normalized):" layout is parsed again by later cells and
    # must stay exactly as it is.
    report_lines = ["=== DETECTION RESULTS ===\n", "\nDetected Objects:\n"]
    for i, (bbox, class_name) in enumerate(zip(bboxes, classes)):
        report_lines.append(f"Object {i+1}:\n")
        report_lines.append(f"  Class: {class_name}\n")
        report_lines.append(f"  Bounding Box (normalized): xmin={bbox[0]:.3f}, ymin={bbox[1]:.3f}, xmax={bbox[2]:.3f}, ymax={bbox[3]:.3f}\n")

    report_lines.append("\nScene Description:\n")
    report_lines.append(f"{scene_desc}\n")

    report_lines.append("\nSpatial Relationships:\n")
    report_lines.append(f"{spatial_rel}\n")
    report_lines.append("\n=====================\n")

    # Explicit UTF-8 so non-ASCII characters in the model's descriptions are
    # written the same way on every platform.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(report_lines)

    print(f"\nDetection results saved to {output_file}")
except Exception as e:
    print(f"ERROR saving detection results: {str(e)}")






# QWEN MATH

In [None]:
# Text report with the detection results that this cell parses.
# NOTE(review): this points at 'img_detection.txt' while IMG_PATH below is
# 'demo.jpeg' and the SAM cell reads 'data/demo_detection.txt' — confirm which
# report file is intended.
TXT_FILE="data/img_detection.txt"
# Image the detections refer to; it is only assigned, not read, in this cell.
IMG_PATH = './data/demo.jpeg'
############################################################
# Read detection results and parse object properties
with open(TXT_FILE, 'r') as f:
    detection_data = f.read()

# Parse objects with their properties
import re
import numpy as np

# Turn every "Object N: / Class: ... / Bounding Box ..." entry of the report
# into a dictionary holding its id, class and derived geometry.
object_pattern = re.compile(
    r'Object (\d+):\n\s+Class: (.*?)\n\s+Bounding Box.*?xmin=([\d.]+), ymin=([\d.]+), xmax=([\d.]+), ymax=([\d.]+)'
)

objects = []
for match in object_pattern.finditer(detection_data):
    # Groups 1-2 are id and class; groups 3-6 are the box coordinates.
    obj_id, obj_class = match.group(1), match.group(2)
    xmin, ymin, xmax, ymax = (float(match.group(k)) for k in (3, 4, 5, 6))

    # Box corners as homogeneous points, clockwise from the top-left one.
    corners = np.array([
        [x, y, 1]
        for x, y in ((xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax))
    ])

    objects.append({
        'id': int(obj_id),
        'class': obj_class,
        'bbox': [xmin, ymin, xmax, ymax],
        'corners': corners,
        'width': xmax - xmin,
        'height': ymax - ymin,
        'center': [(xmax + xmin)/2, (ymax + ymin)/2],
    })

# The edit the user asked for, in natural language.
USER_EDIT = "Translate the dog 100 pixels to the right."


def _translation_matrix(tx, ty):
    """Return the 3x3 homogeneous matrix that shifts points by (tx, ty)."""
    return np.array([
        [1, 0, tx],
        [0, 1, ty],
        [0, 0, 1],
    ])


def _rotation_matrix(theta):
    """Return the 3x3 homogeneous rotation matrix for angle theta (radians, as taken by np.cos/np.sin)."""
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)
    return np.array([
        [cos_t, -sin_t, 0],
        [sin_t, cos_t, 0],
        [0, 0, 1],
    ])


def _scaling_matrix(sx, sy):
    """Return the 3x3 homogeneous matrix that scales x by sx and y by sy."""
    return np.array([
        [sx, 0, 0],
        [0, sy, 0],
        [0, 0, 1],
    ])


def _shear_matrix(shx, shy):
    """Return the 3x3 homogeneous shear matrix with factors shx and shy."""
    return np.array([
        [1, shx, 0],
        [shy, 1, 0],
        [0, 0, 1],
    ])


# Catalogue of supported transformations. Affine entries are factories that
# build a 3x3 matrix from their parameters; non-affine entries are only
# described in words.
TRANSFORMATIONS = {
    'affine': {
        'translation': _translation_matrix,
        'rotation': _rotation_matrix,
        'scaling': _scaling_matrix,
        'shear': _shear_matrix,
    },
    'non_affine': {
        'perspective': 'Custom 3x3 homography matrix',
        'deformation': 'Non-linear point mapping function',
    },
}

from transformers import AutoModelForCausalLM, AutoTokenizer

# torch.bfloat16 is used below but this cell does not otherwise import torch;
# importing it here lets the cell run on a fresh kernel as well.
import torch

# Hugging Face model id used for both the model and its tokenizer.
model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
# Single device for the model weights and, later in this cell, the inputs.
device = "cuda"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the prompt that asks the model for a transformation matrix.
# First, a one-line, comma-separated summary of every detected object.
object_summary = ', '.join(
    f"ID {entry['id']}: {entry['class']} at coordinates (xmin={entry['bbox'][0]:.3f}, ymin={entry['bbox'][1]:.3f}, xmax={entry['bbox'][2]:.3f}, ymax={entry['bbox'][3]:.3f})"
    for entry in objects
)

# NOTE(review): the boxes in the summary come from the "normalized" report,
# while USER_EDIT speaks of pixels and no image size is given to the model —
# confirm the units the returned matrix is expected to use.
prompt = f"""Given the scene with these detected objects and their bounding boxes:
{object_summary}

And the user's edit request:
{USER_EDIT}

Available transformations that can be applied to any point (x,y,1) in homogeneous coordinates:
1. Translation: [x,y,1] → [x+tx, y+ty, 1]
2. Rotation: [x,y,1] → [x*cos(θ)-y*sin(θ), x*sin(θ)+y*cos(θ), 1]
3. Scaling: [x,y,1] → [sx*x, sy*y, 1]
4. Shear: [x,y,1] → [x+shx*y, shy*x+y, 1]
5. Perspective: [x,y,1] → [h11*x+h12*y+h13, h21*x+h22*y+h23, h31*x+h32*y+h33]

Calculate a transformation matrix that will map every point in the target object's bounding box to its new position.
Reason step by step about:
1. Which object needs to be transformed
2. What type of transformation is needed
3. The exact parameters required
4. The resulting 3x3 transformation matrix

Output the final transformation matrix that can be applied to any point [x,y,1] in the bounding box."""

# Conversation for the chat template: fixed system role plus the prompt above.
system_turn = {"role": "system", "content": "You are a computer vision transformation expert. Calculate precise transformation matrices that can be applied to any point in homogeneous coordinates."}
user_turn = {"role": "user", "content": prompt}
messages = [system_turn, user_turn]

# Render the chat messages into one prompt string, tokenize it as a batch of
# one and move the tensors to `device`.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Generate at most 512 new tokens.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Cut the prompt prefix off each output sequence, keeping only the new tokens.
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]

# Decoded answer of the single sequence; the matrix is extracted from it below.
reasoning = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("\n=== Model Reasoning ===")
print(reasoning)

import re
import numpy as np


# Extract the matrix the model put inside \boxed{\begin{pmatrix} ... \end{pmatrix}}
matrix_match = re.search(r"\\boxed\{\\begin\{pmatrix\}([\s\S]*?)\\end\{pmatrix\}\}", reasoning)
if matrix_match is None:
    # The answer may be truncated or formatted differently; say so explicitly
    # instead of failing with an AttributeError on None.
    raise ValueError("No \\boxed{\\begin{pmatrix}...\\end{pmatrix}} matrix found in the model output")
matrix_content = matrix_match.group(1)

# Parse the content into a 2D list: LaTeX rows are separated by a double
# backslash and cells by '&'. A trailing row separator leaves an empty chunk,
# which is dropped so that it is not fed to float().
matrix_rows = [row for row in matrix_content.strip().split(r"\\") if row.strip()]
matrix = [list(map(float, row.split('&'))) for row in matrix_rows]

# Convert to a NumPy array for better display
matrix_array = np.array(matrix)

# Print the matrix nicely
print("Parsed Matrix:")
print(matrix_array)


# SAM gets binary mask

In [25]:
IMG_PATH = './data/demo.jpeg'

import torch
from sam2.sam2_image_predictor import SAM2ImagePredictor
import numpy as np
import matplotlib.pyplot as plt
# Read the detection report and derive one prompt point (the box centre) per
# detected object.
with open('data/demo_detection.txt', 'r') as f:
    content = f.read()

objects = []
# Everything after each literal 'Object' is one candidate entry; the text
# before the first occurrence is skipped.
for chunk in content.split('Object')[1:]:
    # Ignore chunks that do not carry both required fields.
    if 'Class:' not in chunk or 'Bounding Box (normalized):' not in chunk:
        continue

    class_name = chunk.split('Class:')[1].split('\n')[0].strip()
    bbox = chunk.split('Bounding Box (normalized):')[1].split('\n')[0].strip()

    # Collect the four coordinates; a value stays None until it is found.
    coords = dict.fromkeys(('xmin', 'ymin', 'xmax', 'ymax'))
    for coord in bbox.split(','):
        for key in coords:
            if key in coord:
                coords[key] = float(coord.split('=')[1].strip())
                break

    xmin, ymin, xmax, ymax = coords.values()

    # Skip the entry unless all four coordinates were found.
    if None in (xmin, ymin, xmax, ymax):
        continue

    # Centre point of the bounding box
    center_x = (xmin + xmax) / 2
    center_y = (ymin + ymax) / 2
    objects.append({
        'class': class_name,
        'center': [center_x, center_y]
    })

# Convert to input format for SAM
# NOTE(review): these centres are in the report's normalized units; confirm
# whether the predictor expects pixel coordinates before passing them on.
input_points = np.array([list(entry['center']) for entry in objects])
input_labels = np.array([1] * len(objects))  # 1 indicates foreground point

print(input_points)
print(input_labels)








# with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
#     predictor.set_image(IMG_PATH)
#     masks, scores, _ = predictor.predict(
#         point_coords=input_points,
#         point_labels=input_labels,
#         multimask_output=False
#     )

# # Visualize the results
# plt.figure(figsize=(15, 10))

# # Plot original image
# plt.subplot(1, 2, 1)
# plt.imshow(IMG_PATH)
# plt.title('Original Image')
# plt.axis('off')

# # Plot image with masks overlay
# plt.subplot(1, 2, 2)
# plt.imshow(IMG_PATH)
# for i, mask in enumerate(masks):
#     plt.imshow(mask, alpha=0.5, cmap='jet')
#     plt.plot(input_points[i][0], input_points[i][1], 'r+', markersize=10)
# plt.title('Segmentation Masks')
# plt.axis('off')

# plt.show()


[[0.585 0.545]
 [0.375 0.63 ]]
[1 1]


In [24]:
import torch
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

# Absolute paths to the SAM 2.1 checkpoint ("hiera_large") and its YAML config.
checkpoint = "/dtu/blackhole/14/189044/marscho/VLM_controller_for_SD/sam2/checkpoints/sam2.1_hiera_large.pt"
model_cfg = "/dtu/blackhole/14/189044/marscho/VLM_controller_for_SD/sam2/sam2/configs/sam2.1/sam2.1_hiera_l.yaml"
# Build the model from config + checkpoint and wrap it in the image predictor.
# NOTE(review): the recorded run of this cell ended in a FileNotFoundError for
# '<venv>/lib/python3.10/site-packages/sam2'. Confirm that the sam2 package is
# installed in this environment and whether build_sam2 expects a package-
# relative config name rather than an absolute file path.
predictor = SAM2ImagePredictor(build_sam2(model_cfg, checkpoint))

FileNotFoundError: [Errno 2] No such file or directory: '/dtu/blackhole/14/189044/marscho/VLM_controller_for_SD/.venv/lib/python3.10/site-packages/sam2'