### We created group-specific prompt templates with a built-in Chain-of-Thought (step-by-step) structure; the new `prompt_molmo()` automatically selects the matching template based on the artifact type.

In [4]:
! pip install torch --quiet
! pip install bitsandbytes accelerate --upgrade --quiet
! pip install einops
! pip install tqdm
! pip install requests --quiet
! pip install transformers --quiet


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\Dell\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\Dell\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\Dell\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\Dell\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\Dell\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\Dell\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


In [None]:
import json
import re

import einops
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig, BitsAndBytesConfig

In [1]:
# 1. Mapping each artifact to its corresponding group
# The artifacts are listed once per group id and then inverted into the
# artifact-name -> group-id lookup table used to pick a prompt template.
_artifacts_by_group = {
    # Group 1: Structural or Geometric Impossibilities
    1: (
        "Impossible mechanical connections",
        "Impossible mechanical joints",
        "Physically impossible structural elements",
        "Incorrect wheel geometry",
        "Implausible aerodynamic structures",
        "Misaligned body panels",
        "Irregular proportions in mechanical components",
        "Inconsistent scale of mechanical parts",
        "Non-manifold geometries in rigid structures",
        "Anatomically incorrect paw structures",
        "Misshapen ears or appendages",
        "Biological asymmetry errors",
        "Impossible foreshortening in animal bodies",
        "Anatomically impossible joint configurations",
        "Misaligned bilateral elements in animal faces",
        "Asymmetric features in naturally symmetric objects",
        "Scale inconsistencies within single objects",
        "Spatial relationship errors",
        "Scale inconsistencies within the same object class",
    ),
    # Group 2: Texture & Surface Abnormalities
    2: (
        "Over-smoothing of natural textures",
        "Texture bleeding between adjacent regions",
        "Texture repetition patterns",
        "Regular grid-like artifacts in textures",
        "Artificial noise patterns in uniform surfaces",
        "Unnaturally glossy surfaces",
        "Metallic surface artifacts",
        "Synthetic material appearance",
        "Excessive sharpness in certain image regions",
        "Improper fur direction flows",
        "Incorrect skin tones",
        "Artificial smoothness",
    ),
    # Group 3: Visual Boundary & Edge Issues
    3: (
        "Inconsistent object boundaries",
        "Blurred boundaries in fine details",
        "Aliasing along high-contrast edges",
        "Jagged edges in curved structures",
        "Discontinuous surfaces",
        "Repeated element patterns",
        "Floating or disconnected components",
        "Loss of fine detail in complex structures",
        "Resolution inconsistencies within regions",
        "Cinematization effects",
        "Movie-poster-like composition of ordinary scenes",
        "Exaggerated characteristic features",
    ),
    # Group 4: Reflection, Lighting, and Optical Effects
    4: (
        "Incorrect reflection mapping",
        "Ghosting effects: Semi-transparent duplicates of elements",
        "Depth perception anomalies",
        "Artificial depth of field in object presentation",
        "Fake depth of field",
        "Distorted window reflections",
        "Incorrect perspective rendering",
        "Systematic color distribution anomalies",
        "Unrealistic specular highlights",
        "Unnatural color transitions",
        "Color coherence breaks",
        "Frequency domain signatures",
    ),
    # Group 5: Animal Anatomy and Symmetry
    5: (
        "Dental anomalies in mammals",
        "Unrealistic eye reflections",
        "Unnatural pose artifacts",
    ),
}

# Flatten to {artifact name: group id}; iteration order follows the table above.
artifact_to_group = {
    artifact_name: group_id
    for group_id, artifact_names in _artifacts_by_group.items()
    for artifact_name in artifact_names
}

In [2]:
# 2. Prompt templates per group
# Every group shares the same output-format instructions; only the opening
# role sentence and the four chain-of-thought steps differ per group.
# Literal JSON braces are doubled so they survive the later `.format(artifact=...)`.
_PROMPT_COMMON_TAIL = (
    "Strictly follow the format below when providing your answer.\n"
    "Format:\n"
    '{{"description": "<your explanation>"}}\n'
    "\n"
    "For example, if given:\n"
    "Artifact to describe: Artifact_1\n"
    "\n"
    "You will give the description in the format:-\n"
    '{{"description": "<description of Artifact_1 in the image based on the chain of thought above>"}}\n'
    "DO NOT GIVE DESCRIPTION OF EXTRA ARTIFACTS.\n"
    "ONLY THE GIVEN ARTIFACT HAS TO BE DESCRIBED.\n"
    "  \n"  # whitespace-only separator line, kept exactly as in the prompt text
    "Ensure that your description is precise. Restrict it to one or two lines.\n"
    "Artifact to describe: {artifact}\n"
)


def _make_prompt_template(role, steps):
    """Assemble one group template from its role sentence and reasoning steps.

    The result still contains the `{artifact}` placeholder (twice) and is meant
    to be completed with `str.format(artifact=...)`.
    """
    numbered_steps = "".join(
        f"{number}. {step}\n" for number, step in enumerate(steps, start=1)
    )
    return (
        "\n"
        + role
        + "\n"
        + "Artifact to describe: {artifact}\n"
        + "Step-by-step:\n"
        + numbered_steps
        + _PROMPT_COMMON_TAIL
    )


group_to_prompt = {
    1: _make_prompt_template(
        "You are a vision assistant trained to identify and explain structural inconsistencies in AI-generated images.",
        (
            "Inspect the image for regions that show geometric or anatomical violations.",
            "Reason about physical plausibility.",
            "Consider a few visual forms this artifact may take.",
            "Choose the one best supported by the image.",
        ),
    ),
    2: _make_prompt_template(
        "You are an assistant that detects unusual surface and texture details in AI-generated images.",
        (
            "Focus on textures and surfaces.",
            "Look for overly smooth, sharp, or repeating patterns.",
            "Imagine 2-3 ways this artifact could appear.",
            "Choose the most relevant.",
        ),
    ),
    3: _make_prompt_template(
        "You are a perceptual assistant that identifies errors at edges, boundaries, and object separations.",
        (
            "Focus on edges and outlines.",
            "Check for disconnections or floating parts.",
            "Provide 1-2 likely manifestations.",
            "Choose the one best matching the image.",
        ),
    ),
    4: _make_prompt_template(
        "You are a visual perception expert trained to explain optical anomalies in AI-generated imagery.",
        (
            "Evaluate lighting, reflections, and perspective.",
            "Look for mismatches in depth, mirrors, and shadows.",
            "Suggest visual forms it might take.",
            "Pick the most accurate.",
        ),
    ),
    5: _make_prompt_template(
        "You are an assistant that evaluates the realism of animals in AI-generated imagery.",
        (
            "Focus on anatomical correctness.",
            "Check for unnatural symmetry or postures.",
            "Suggest how the artifact appears.",
            "Choose the best match.",
        ),
    ),
}

In [4]:
# Sanity check: render the template selected for one artifact so the
# placeholder substitution and brace escaping can be inspected by eye.
artifact = "Depth perception anomalies"
print(
    group_to_prompt[artifact_to_group[artifact]].format(
        artifact=artifact,
    )
)


You are a visual perception expert trained to explain optical anomalies in AI-generated imagery.
Artifact to describe: Depth perception anomalies
Step-by-step:
1. Evaluate lighting, reflections, and perspective.
2. Look for mismatches in depth, mirrors, and shadows.
3. Suggest visual forms it might take.
4. Pick the most accurate.
Strictly follow the format below when providing your answer.
Format:
{"description": "<your explanation>"}

For example, if given:
Artifact to describe: Artifact_1

You will give the description in the format:-
{"description": "<description of Artifact_1 in the image based on the chain of thought above>"}
DO NOT GIVE DESCRIPTION OF EXTRA ARTIFACTS.
ONLY THE GIVEN ARTIFACT HAS TO BE DESCRIBED.
  
Ensure that your description is precise. Restrict it to one or two lines.
Artifact to describe: Depth perception anomalies



In [None]:
# Checkpoint shared by the processor and the model, so the two cannot drift apart.
MODEL_ID = 'allenai/Molmo-7B-D-0924'

device_map = "auto"
torch_dtype = torch.bfloat16

# NOTE(review): trust_remote_code=True executes Python code shipped in the model
# repository; consider pinning a `revision=` so that code cannot change unnoticed.

# load the processor
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch_dtype,
)

# load the model
# The dtype is passed explicitly so the weights are loaded directly in bfloat16
# instead of the library's default precision and only cast afterwards.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch_dtype,
)

In [None]:
# 3. Refactored `prompt_molmo`
# Matches the value of the "description" key, allowing backslash-escaped
# characters (e.g. \") inside the quoted string.
_DESCRIPTION_RE = re.compile(r'"description"\s*:\s*"((?:[^"\\]|\\.)*)"', re.DOTALL)


def _extract_description(generated_text):
    """Pull the "description" value out of the model's raw text answer.

    Returns the fallback string "No description available" when no
    `"description": "..."` pair is present.
    """
    match = _DESCRIPTION_RE.search(generated_text)
    if match is None:
        return "No description available"

    raw_value = match.group(1)
    try:
        # Decode JSON escapes such as \" or \n that the regex kept verbatim.
        return json.loads(f'"{raw_value}"')
    except ValueError:
        # Not a valid JSON string body (e.g. a raw line break): return it as captured.
        return raw_value


def prompt_molmo(image_path, artifact, image_size=(128, 128), max_new_tokens=200):
    """Ask Molmo to describe one artifact in one image.

    The prompt template is chosen from `group_to_prompt` according to the
    artifact's group in `artifact_to_group`.

    Args:
        image_path: Path (or file object) accepted by `PIL.Image.open`.
        artifact: Artifact name; names missing from `artifact_to_group`
            fall back to the group-1 template.
        image_size: (width, height) the image is resized to before being
            handed to the processor. Defaults to the original (128, 128).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The text of the "description" field found in the model output, or
        "No description available" if the output contains no such field.
    """
    # Unknown artifacts deliberately use the group-1 template instead of raising.
    group = artifact_to_group.get(artifact, 1)
    prompt = group_to_prompt[group].format(artifact=artifact)

    model.to(torch.bfloat16).eval()

    # The context manager releases the file handle; converting to RGB avoids
    # processor failures on grayscale, palette or RGBA inputs.
    with Image.open(image_path) as raw_image:
        img = raw_image.convert("RGB").resize(image_size)

    inputs = processor(images=[img], text=prompt)
    # The processor returns unbatched tensors: move each to the model device and add a batch axis.
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # `torch.autocast` replaces the deprecated `torch.cuda.amp.autocast`.
    with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        output = model.generate_from_batch(
            inputs,
            GenerationConfig(max_new_tokens=max_new_tokens, stop_strings="<|endoftext|>"),
            tokenizer=processor.tokenizer,
        )

    # Drop the prompt tokens so only the newly generated continuation is decoded.
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return _extract_description(generated_text)
