# Logit lens (defect)

In [1]:
import torch
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForCausalLM, AutoConfig
from transformers import AutoProcessor
from torchvision.transforms import Resize, ToTensor, Normalize, Compose

# Load the Ovis2-8B model from Hugging Face
config = AutoConfig.from_pretrained(
    "AIDC-AI/Ovis2-8B",
    trust_remote_code=True)

config.llm_attn_implementation = "eager" 

model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis2-8B",
    config=config, 
    torch_dtype=torch.bfloat16,
    multimodal_max_length=32768,
    trust_remote_code=True,
    attn_implementation="eager" 
).cuda()

# Get the text and visual tokenizers
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()

KeyboardInterrupt: 

In [None]:
# Prepare the input: image and text prompt
image_path = '/home/bboulbarss/large_dataset/relational/train/cone_right_cylinder/CLEVR_rel_000020.png'
image = Image.open(image_path).convert('RGB')
text = "Task: Identify the correct label for this image from the following choices:\nA. A photo of a cone right of a cylinder\nB. A photo of a blue cube\nC. A photo of a red sphere\nAnswer with the letter of the correct choice."
query = f'<image>\n{text}'
print(query)

In [None]:
# Manually preprocess the image to match expected size
transform = Compose([
    Resize((224, 224)),  # Adjust to 336x336 or 448x448 if model config specifies
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
pixel_values = transform(image).unsqueeze(0).to(dtype=torch.bfloat16, device=model.device)

# Verify pixel_values shape
if pixel_values.shape[1] != 3:
    raise ValueError(f"pixel_values has {pixel_values.shape[1]} channels, expected 3 (RGB).")

# Manually preprocess text to ensure valid input_ids and attention_mask
text_inputs = text_tokenizer(query, return_tensors="pt", padding=True)
input_ids = text_inputs['input_ids'].to(model.device)
attention_mask = text_inputs['attention_mask'].to(model.device)

# Verify input shapes before passing to model
print(f"input_ids shape: {input_ids.shape}")
print(f"attention_mask shape: {attention_mask.shape}")
print(f"pixel_values shape: {pixel_values.shape}")

# Function to get layer-wise logits
def get_layer_logits(model, input_ids, pixel_values, attention_mask):
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            pixel_values=[pixel_values],  # Wrap in list for merge_multimodal
            attention_mask=attention_mask,
            output_hidden_states=True,
            labels=None  # Explicitly pass None for inference
        )
    hidden_states = outputs.hidden_states
    lm_head = model.llm.lm_head  # Access lm_head via llm submodule
    layer_logits = [lm_head(hs) for hs in hidden_states]
    return layer_logits


# Get the layer-wise logits
layer_logits = get_layer_logits(model, input_ids, pixel_values, attention_mask)

# Analyze logits: Get top-1 predicted tokens and probabilities
predicted_tokens = [logits.argmax(dim=-1)[0].cpu().numpy() for logits in layer_logits]
probs = [torch.softmax(logits, dim=-1).max(dim=-1).values[0].float().detach().cpu().numpy() for logits in layer_logits]

# Decode predicted tokens to strings
decoded_tokens = []
for layer_preds in predicted_tokens:
    layer_decoded = [text_tokenizer.decode([token_id]) for token_id in layer_preds]
    decoded_tokens.append(layer_decoded)

# Print shapes and sample predictions for verification
for i, (logits, layer_preds) in enumerate(zip(layer_logits, decoded_tokens)):
    print(f"Layer {i} logits shape: {logits.shape}")
    print(f"Layer {i} top-1 tokens (first 5 positions): {layer_preds[:5]}")

# Basic visualization: Heatmap of top-1 probabilities across layers
def plot_token_progress(probs, start_idx=0, end_idx=10):
    # Slice probabilities for specified token positions
    probs_array = [p[start_idx:end_idx] for p in probs]
    plt.figure(figsize=(10, 6))
    sns.heatmap(probs_array, cmap="Blues", annot=True, fmt=".2f", cbar_kws={'label': 'Top-1 Probability'})
    plt.title("Top-1 Probability Across Layers")
    plt.xlabel("Token Position")
    plt.ylabel("Layer")
    plt.yticks(ticks=range(len(probs)), labels=[f"Layer {i}" for i in range(len(probs))], rotation=0)
    plt.show()

# Plot probabilities for the first 10 tokens
plot_token_progress(probs, start_idx=0, end_idx=10)

# Logit lens (works, old)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoConfig
from PIL import Image

# Load the model configuration and model
config = AutoConfig.from_pretrained("AIDC-AI/Ovis2-8B", trust_remote_code=True)
config.llm_attn_implementation = "eager"

model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis2-8B",
    config=config,
    multimodal_max_length=32768,
    trust_remote_code=True,
    attn_implementation="eager"
).cuda()

# Get tokenizers
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()

# Load image and define query
image_path = '/home/bboulbarss/large_dataset/relational/train/cone_right_cylinder/CLEVR_rel_000020.png'
image = Image.open(image_path).convert("RGB")
images = [image]
#text = 'Describe this image'
text = "Task: Identify the correct label for this image from the following choices:\nA. A photo of a blue cube\nB. A photo of a red sphere\nC. A photo of a cone right of a cylinder\nAnswer with the letter of the correct choice."
query = f"<image>\n{text}"

# Preprocess inputs
prompt, input_ids, pixel_values = model.preprocess_inputs(query, images)
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)

# Move to GPU
input_ids = input_ids.unsqueeze(0).to(model.device)
attention_mask = attention_mask.unsqueeze(0).to(model.device)
if pixel_values is not None:
    pixel_values = pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)
    pixel_values = [pixel_values]

# Run model with output_hidden_states=True to get hidden states from each layer
outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, output_hidden_states=True, labels=None)
hidden_states = outputs.hidden_states  # List of tensors: [1, seq_len, hidden_size]

# Get the language modeling head
lm_head = model.llm.lm_head  # Changed from model.lm_head

# Rest of the code remains the same
# Determine sequence lengths
full_seq_len = hidden_states[0].shape[1]  # Total sequence length (image + text tokens)
text_seq_len = input_ids.shape[1]  # Length of text tokens (e.g., 7 for the prompt)
text_positions = slice(full_seq_len - text_seq_len, full_seq_len)  # Slice for text positions

# Define a safe decode function to handle out-of-range token IDs
def safe_decode(token_id):
    try:
        return text_tokenizer.convert_ids_to_tokens([token_id])[0]
    except IndexError:
        return f"[SPECIAL_{token_id}]"

# Apply the logit lens
print("Applying logit lens to the text sequence:")
for layer_idx, hidden_state in enumerate(hidden_states):
    print(f"\nLayer {layer_idx}:")
    # Extract hidden states for text positions only
    text_hidden_state = hidden_state[:, text_positions, :]  # [1, text_seq_len, hidden_size]
    logits = lm_head(text_hidden_state)  # [1, text_seq_len, vocab_size]
    
    # Get top-5 token predictions for each position
    topk_tokens = torch.topk(logits, k=5, dim=-1).indices[0]  # [text_seq_len, 5]
    
    for pos in range(text_seq_len):
        #if pos == 0:
            top_tokens = [safe_decode(token_id.item()) for token_id in topk_tokens[pos]]
            print(f"  Position {pos}: {top_tokens}")

# Generate predictions

In [2]:
from transformers import AutoModelForCausalLM, AutoConfig
import torch
from PIL import Image

# Load the model configuration
config = AutoConfig.from_pretrained("AIDC-AI/Ovis2-8B", trust_remote_code=True)
config.llm_attn_implementation = "eager"  # Use eager attention

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis2-8B",
    config=config,
    multimodal_max_length=32768,
    trust_remote_code=True,
    attn_implementation="eager"
).cuda()

# Get tokenizers from the model
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()

image_path = '/home/bboulbarss/large_dataset/relational/train/cone_right_cylinder/CLEVR_rel_000020.png'
image = Image.open(image_path).convert("RGB")
images = [image]  # Wrap in a list as preprocess_inputs expects a list of images

#text = "Task: Identify the correct label for this image from the following choices:\nA. A photo of a cone right of a cylinder\nB. A photo of a blue cube\nC. A photo of a red sphere\nAnswer with the letter of the correct choice."
text = "Describe this image."
query = f'<image>\n{text}'
print(query)

# Preprocess inputs using the model's function
prompt, input_ids, pixel_values = model.preprocess_inputs(query, images)

# Create attention mask
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)

# Prepare inputs for the model
input_ids = input_ids.unsqueeze(0).to(device=model.device)
attention_mask = attention_mask.unsqueeze(0).to(device=model.device)
if pixel_values is not None:
    pixel_values = pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)
    pixel_values = [pixel_values]  # Wrap in a list if required by the model

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

<image>
Describe this image.


In [3]:
# Define generation parameters
gen_kwargs = {
    "max_new_tokens": 200,
    "do_sample": False,
    "eos_token_id": model.generation_config.eos_token_id,
    "pad_token_id": text_tokenizer.pad_token_id,
    "use_cache": True
}

# Generate output
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        pixel_values=pixel_values,
        attention_mask=attention_mask,
        **gen_kwargs
    )[0]

# Decode the output
generated_text = text_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
print(generated_text)

The image shows a simple 3D scene with a purple cylinder and a green cone placed on a flat surface. The background is a neutral gray, providing a contrast that highlights the objects. The lighting is soft and diffused, casting gentle shadows to the right of the objects, suggesting a light source from the left. The scene is minimalistic, with no additional elements or context provided.


# Logit lens (newest version)

In [2]:
#import torch
from transformers import AutoModelForCausalLM, AutoConfig
from PIL import Image
import torch
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForCausalLM, AutoConfig
from transformers import AutoProcessor
from torchvision.transforms import Resize, ToTensor, Normalize, Compose

# Load the model configuration and model
config = AutoConfig.from_pretrained("AIDC-AI/Ovis2-8B", trust_remote_code=True)
config.llm_attn_implementation = "eager"

model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis2-8B",
    config=config,
    multimodal_max_length=32768,
    trust_remote_code=True,
    attn_implementation="eager"
).cuda()

# Get the language modeling head
lm_head = model.llm.lm_head

# Get tokenizers
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()

# Load image and define query
image_path = '/home/bboulbarss/large_dataset/relational/train/cone_right_cylinder/CLEVR_rel_000020.png'
image = Image.open(image_path).convert("RGB")

#image_path2 = '/home/bboulbarss/large_dataset/relational/train/cube_left_sphere/CLEVR_rel_000043.png'
#image2 = Image.open(image_path2).convert("RGB")
images = [image]
#text = 'Describe this image'
text = "Task: Identify the correct label for this image from the following choices:\nA. A photo of a cylinder right of a cone\nB. A photo of a cone right of a cylinder\nC. A photo of a cone left of a cylinder\nAnswer with the letter of the correct choice."
#text = 'Describe this image. The Image shows a green cone right of a purple cylinder.'
#text1 = 'This is a photo of a cone right of a cylinder'
#text2 = 'This is a photo of a cube left of a sphere'
query = f"<image>\n{text}"

# Preprocess inputs
prompt, input_ids, pixel_values = model.preprocess_inputs(query, images)
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)

# Move to GPU
input_ids = input_ids.unsqueeze(0).to(model.device)
attention_mask = attention_mask.unsqueeze(0).to(model.device)
if pixel_values is not None:
    pixel_values = pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)
    pixel_values = [pixel_values]

# After model forward pass
outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, output_hidden_states=True, labels=None)
hidden_states = outputs.hidden_states  # List of [1, full_seq_len, hidden_size]

# Define text sequence length and positions
text_seq_len = input_ids.shape[1]
full_seq_len = hidden_states[0].shape[1]
text_positions = slice(full_seq_len - text_seq_len, full_seq_len)

# Define the safe decoding function
def safe_decode(token_ids, tokenizer):
    tokens = []
    vocab_size = tokenizer.vocab_size
    for token_id in token_ids:
        if token_id < 0 or token_id >= vocab_size:
            tokens.append(f"[SPECIAL_{token_id}]")
        else:
            tokens.append(tokenizer.convert_ids_to_tokens([token_id])[0])
    return tokens

# Decode correct tokens
text_input_ids = input_ids[0].cpu().tolist()  # [text_seq_len]
###
# After getting input_ids
#text_input_ids = input_ids[0].cpu().tolist()
#print("text_input_ids:", text_input_ids)
#print("Unique token IDs:", sorted(set(text_input_ids)))
#print("Pad token ID:", text_tokenizer.pad_token_id)
#print("Vocab size:", text_tokenizer.vocab_size)
###


correct_tokens = safe_decode(text_input_ids, text_tokenizer)

# Logit lens analysis across layers
for layer_idx, hidden_state in enumerate(hidden_states):
    print(f"\nLayer {layer_idx}:")
    # Extract hidden states for text positions only
    text_hidden_state = hidden_state[:, text_positions, :]  # [1, text_seq_len, hidden_size]
    logits = lm_head(text_hidden_state)  # [1, text_seq_len, vocab_size]
    
    # Get top-5 token predictions for each position
    topk_tokens = torch.topk(logits, k=5, dim=-1).indices[0]  # [text_seq_len, 5]
    
    # Print correct token and top-5 predictions for each position
    for pos in range(text_seq_len):
        top_tokens = safe_decode(topk_tokens[pos].tolist(), text_tokenizer)
        correct_token = correct_tokens[pos]
        correct_display = f"(Correct: '{correct_token}')"
        print(f"  Position {pos} {correct_display:<25} {top_tokens}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


Layer 0:
  Position 0 (Correct: '[SPECIAL_151644]') ['%"><', 'Ä±c', 'éľĩ', '%;">', 'addtogroup']
  Position 1 (Correct: 'system')       ['icter', 'ĠUtt', 'ĠApÃ³s', 'Ġwinger', 'ĠDit']
  Position 2 (Correct: 'Ċ')            ['nage', 'undai', 'ptrdiff', ':CGRect', 'Ġsez']
  Position 3 (Correct: 'You')          ['agenda', 'ç»', 'ĠEnforcement', 'rgyz', 'ç»ª']
  Position 4 (Correct: 'Ġare')         ['":"/', 'boxing', 'slaught', 'crast', '-football']
  Position 5 (Correct: 'Ġa')           ['anuts', 'odi', 'ĠBulls', 'ĠFucking', 'ĠComple']
  Position 6 (Correct: 'Ġhelpful')     ['ĠacompaÃ±a', 'ĵåĲį', 'nage', 'Ġsez', '(exports']
  Position 7 (Correct: 'Ġassistant')   ['ĠMojo', 'è¦ģç´ł', 'idepress', 'ĠSegÃºn', 'Aceptar']
  Position 8 (Correct: '.')            ['variably', 'anuts', 'ughty', '.ReadAllText', 'èħĺ']
  Position 9 (Correct: '[SPECIAL_151645]') ['ĠacompaÃ±a', 'Ġazi', 'acher', 'anuts', 'iams']
  Position 10 (Correct: 'Ċ')            ['åıĪå¥½åıĪ', 'æĲŀå¥½', 'ix', 'ðŁĵĲ', 'èĳ«']
  Positio

## Show special tokens

In [2]:
# Get special tokens and flatten any lists in special_tokens_map.values()
special_tokens = []
for value in text_tokenizer.special_tokens_map.values():
    if isinstance(value, list):
        special_tokens.extend(value)  # Flatten lists (e.g., additional_special_tokens)
    else:
        special_tokens.append(value)  # Add single tokens
special_tokens = list(set(special_tokens))
# Get token IDs for each special token
special_tokens_with_ids = [(token, text_tokenizer.convert_tokens_to_ids(token)) for token in special_tokens]

# Print special tokens with their IDs
print("Special tokens with IDs:")
for token, token_id in special_tokens_with_ids:
    print(f"  ID: {token_id}, Token: {token}")

Special tokens with IDs:
  ID: 151646, Token: <|object_ref_start|>
  ID: 151649, Token: <|box_end|>
  ID: 151647, Token: <|object_ref_end|>
  ID: 151654, Token: <|vision_pad|>
  ID: 151656, Token: <|video_pad|>
  ID: 151645, Token: <|im_end|>
  ID: 151651, Token: <|quad_end|>
  ID: 151650, Token: <|quad_start|>
  ID: 151643, Token: <|endoftext|>
  ID: 151652, Token: <|vision_start|>
  ID: 151653, Token: <|vision_end|>
  ID: 151655, Token: <|image_pad|>
  ID: 151648, Token: <|box_start|>
  ID: 151644, Token: <|im_start|>


In [2]:
print(model)

Ovis(
  (llm): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(152064, 3584)
      (layers): ModuleList(
        (0-27): 28 x Qwen2DecoderLayer(
          (self_attn): Qwen2Attention(
            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
          )
          (mlp): Qwen2MLP(
            (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
            (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
            (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
          (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        )
      )
      (