In [1]:
from transformers import AutoTokenizer, AutoConfig, pipeline, AutoModelForCausalLM
import torch
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from functools import partial
from repe.mllm_utils import auto_configure_device_map
from repe.mllm_rep_control_contrast_vec import ContrastVecInternlmForCausalLM
from repe.modeling_internlm_xcomposer2 import InternLMXComposer2ForCausalLM

model_name_or_path = "/home/yerong2/models/internlm-xcomposer2-vl-7b"

model = ContrastVecInternlmForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).eval()
# model = InternLMXComposer2ForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).eval()

from accelerate import dispatch_model
device_map = auto_configure_device_map(2)
model = dispatch_model(model, device_map=device_map)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model.tokenizer = tokenizer

# query = '<ImageHere>Please describe this image'
# image = ['../tiger.jpeg']
image = 'tiger.jpeg'

questions = ['<ImageHere>Please describe this image']
query = questions[0]

pos_p = 'Give a truthful answer. '
neg_p = 'Give an untruthful answer. '

def Encode(model, query, image):
    meta_instruction = 'You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).\n'
    '- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n'
    '- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by the user such as English and 中文.\n'
    '- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively based on the provided image.'
    
    image = model.encode_img(image)
        
    inputs, im_mask = model.interleav_wrap_chat(model.tokenizer, query, image, [], meta_instruction)
    # print(model.tokenizer is None) # DEBUG
    inputs = {
        k: v.to(model.device)
        for k, v in inputs.items() if torch.is_tensor(v)
    }    
    return inputs, im_mask

contrast_tokens=-8 # last {tokens} tokens are used to compute the diff in hidden_states
alpha = 0
layer_ids = np.arange(0, 32, 2).tolist()
# also add end-of-assistant token in eos token id to avoid unnecessary generation
eos_token_id = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids(['[UNUSED_TOKEN_145]'])[0]
]
model.eval()

with torch.autocast(device_type='cuda', dtype=torch.float16):

    for q in questions:
        q_pos = pos_p + q
        q_neg = neg_p + q
        
        # enc = tokenizer([input, input_pos, input_neg], return_tensors='pt', padding='longest').to(model.device)
        inputs, im_mask = Encode(model, q, image)
        pos_inputs, pos_im_mask = Encode(model, q_pos, image)
        neg_inputs, neg_im_mask = Encode(model, q_neg, image)

        
        repe_args = dict(pos_input_ids=pos_inputs,
                 pos_img_mask=pos_im_mask,
                 neg_input_ids=neg_inputs,
                 neg_img_mask=neg_im_mask,
                 contrast_tokens=contrast_tokens,
                 compute_contrast=True,
                 use_cache=False, # not yet supporting generation with use_cache         
                 alpha=alpha,
                 control_layer_ids=layer_ids)
        
        

        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            eos_token_id=eos_token_id,
            im_mask=im_mask,
            do_sample=False,
            # **repe_args,
        )
        
        # @@@ controlled_outputs = model.generate(input_ids, 
        #                          attention_mask=attention_mask, 
        #                          max_new_tokens=256, 
        #                          do_sample=False, 
        #                          use_cache=False, # not yet supporting generation with use_cache
        #                          **repe_args)
        if image is None:
            outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]):]
        else:
            outputs = outputs[0].cpu().tolist()
        response = tokenizer.decode(outputs, skip_special_tokens=True)
    
        print(response)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type internlmxcomposer2 to instantiate a model of type internlm. This is not supported for all configurations of models and can yield errors.


Set max length to 4096
Position interpolate from 24x24 to 35x35
Set max length to 4096
Position interpolate from 24x24 to 35x35


Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.69s/it]
Some weights of ContrastVecInternlmForCausalLM were not initialized from the model checkpoint at /home/yerong2/models/internlm-xcomposer2-vl-7b and are newly initialized: ['vit.vision_tower.vision_model.post_layernorm.bias', 'vit.vision_tower.vision_model.post_layernorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Entering constrastive greedy search
In the image, a majestic tiger is the focal point, lying comfortably on a lush, green grassy field. The tiger's body is oriented towards the left side of the image, but its head is turned to face the camera, giving us a direct gaze. The tiger's fur is a striking mix of orange and black stripes, with a white underbelly that contrasts with the greenery beneath it. The background is a blurred expanse of green, likely a mix of trees and bushes, which further emphasizes the tiger as the main subject of this image. The tiger's relaxed posture and the serene environment create a sense of tranquility.[UNUSED_TOKEN_145]


In [1]:
# Backup
# Backup
# Backup
# Backup
# Backup
# Backup
# Backup
# Backup

from transformers import AutoTokenizer, AutoConfig, pipeline, AutoModelForCausalLM
import torch
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from functools import partial
from repe.mllm_utils import auto_configure_device_map
from repe.mllm_rep_control_contrast_vec import ContrastVecInternlmForCausalLM
from repe.modeling_internlm_xcomposer2 import InternLMXComposer2ForCausalLM

model_name_or_path = "/home/yerong2/models/internlm-xcomposer2-vl-7b"

model = ContrastVecInternlmForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).eval()
# model = InternLMXComposer2ForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).eval()

from accelerate import dispatch_model
device_map = auto_configure_device_map(2)
model = dispatch_model(model, device_map=device_map)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model.tokenizer = tokenizer

# query = '<ImageHere>Please describe this image'
# image = ['../tiger.jpeg']
image = 'tiger.jpeg'

questions = ['<ImageHere>Please describe this image']
query = questions[0]

pos_p = 'Give a truthful answer. '
neg_p = 'Give an untruthful answer. '

def Encode(model, query, image):
    meta_instruction = 'You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).\n'
    '- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n'
    '- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by the user such as English and 中文.\n'
    '- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively based on the provided image.'
    
    image = model.encode_img(image)
        
    inputs, im_mask = model.interleav_wrap_chat(model.tokenizer, query, image, [], meta_instruction)
    # print(model.tokenizer is None) # DEBUG
    inputs = {
        k: v.to(model.device)
        for k, v in inputs.items() if torch.is_tensor(v)
    }    
    return inputs, im_mask

contrast_tokens=-8 # last {tokens} tokens are used to compute the diff in hidden_states
alpha = 0
layer_ids = np.arange(0, 32, 2).tolist()
# also add end-of-assistant token in eos token id to avoid unnecessary generation
eos_token_id = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids(['[UNUSED_TOKEN_145]'])[0]
]
model.eval()

with torch.autocast(device_type='cuda', dtype=torch.float16):

    for q in questions:
        q_pos = pos_p + q
        q_neg = neg_p + q
        
        # enc = tokenizer([input, input_pos, input_neg], return_tensors='pt', padding='longest').to(model.device)
        inputs, im_mask = Encode(model, q, image)
        pos_inputs, pos_im_mask = Encode(model, q_pos, image)
        neg_inputs, neg_im_mask = Encode(model, q_neg, image)

        
        repe_args = dict(pos_input_ids=pos_inputs,
                 pos_img_mask=pos_im_mask,
                 neg_input_ids=neg_inputs,
                 neg_img_mask=neg_im_mask,
                 contrast_tokens=contrast_tokens,
                 compute_contrast=True,
                 use_cache=False, # not yet supporting generation with use_cache         
                 alpha=alpha,
                 control_layer_ids=layer_ids)
        
        

        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            eos_token_id=eos_token_id,
            im_mask=im_mask,
            do_sample=False,
            # **repe_args,
        )
        
        # @@@ controlled_outputs = model.generate(input_ids, 
        #                          attention_mask=attention_mask, 
        #                          max_new_tokens=256, 
        #                          do_sample=False, 
        #                          use_cache=False, # not yet supporting generation with use_cache
        #                          **repe_args)
        if image is None:
            outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]):]
        else:
            outputs = outputs[0].cpu().tolist()
        response = tokenizer.decode(outputs, skip_special_tokens=True)
    
        print(response)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type internlmxcomposer2 to instantiate a model of type internlm. This is not supported for all configurations of models and can yield errors.


Set max length to 4096
Position interpolate from 24x24 to 35x35
Set max length to 4096
Position interpolate from 24x24 to 35x35


Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.63s/it]
Some weights of ContrastVecInternlmForCausalLM were not initialized from the model checkpoint at /home/yerong2/models/internlm-xcomposer2-vl-7b and are newly initialized: ['vit.vision_tower.vision_model.post_layernorm.bias', 'vit.vision_tower.vision_model.post_layernorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In the image, a majestic tiger is the focal point, lying comfortably on a lush, green grassy field. The tiger's body is oriented towards the left side of the image, but its head is turned to face the camera, giving us a direct gaze. The tiger's fur is a striking mix of orange and black stripes, with a white underbelly that contrasts with the greenery beneath it. The background is a blurred expanse of green, likely a mix of trees and bushes, which further emphasizes the tiger as the main subject of this image. The tiger's relaxed posture and the serene environment create a sense of tranquility.[UNUSED_TOKEN_145]
