In [1]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")


from methods.llava_utils import load_llava_state
from methods.blip_utils import load_blip_state
from tqdm import tqdm
import os
import pickle
from methods.algorithms import get_phrase_embedding, generate_mass_edit_hook
from methods.utils import coco_img_id_to_name, display_image
import torch
import random

torch.set_grad_enabled(False)

os.environ['VL_ROOT_DIR'] = '/root/vl-interp'

os.chdir(os.environ["VL_ROOT_DIR"])

In [5]:
model_type = "llava7b"
device="cpu"
lT = 19
lI = 21
alpha = 1

In [6]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [7]:
if model_type == "llava7b":
  # Load the LlaVA model
  loaded_state = load_llava_state(device,
                                  # train = True
                                  )
elif model_type == "blip7b":
  loaded_state = load_blip_state(device,
                                #  train = True
                                 )
else:
  raise Exception(f"model type {model_type} not supported")

vocabulary, vocab_embeddings, execute_model, register_hook, tokenizer, hidden_layer_embedding = loaded_state["vocabulary"], loaded_state["vocab_embeddings"], loaded_state["execute_model"], loaded_state["register_hook"], loaded_state["tokenizer"], loaded_state["hidden_layer_embedding"]

id_to_token = dict()
for word in vocabulary:
  id_to_token[vocabulary[word]] = word

You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.90it/s]


Model device: cpu


In [15]:
coco_img = 562150
image_path = os.path.join('./images', "COCO_val2014_000000562150.jpg")

In [11]:
evaluator = pickle.load(open('./metric/chair.pkl', "rb"))

In [16]:
# Get baseline caption
baseline_caption = execute_model(image_path)
baseline_evals = evaluator.compute_hallucinations(coco_img, baseline_caption)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

In [None]:
# Select a target object to erase and extract a text embedding for the object

text_embeddings = []
for caption_word, coco_class in set(baseline_evals["mscoco_hallucinated_words"]):
  text_embeddings.append(hidden_layer_embedding(caption_word, layer = lT))

In [None]:
# Hook into the model's intermediate activations to linearly edit them

if model_type == "llava7b":
  edit_embeddings_hook = generate_mass_edit_hook(text_embeddings, start_edit_index=35, end_edit_index=611, layer=lT, weight = alpha, minimum_size=576)
else:
  edit_embeddings_hook = generate_mass_edit_hook(text_embeddings, start_edit_index=0, end_edit_index=32, layer=lT, weight = alpha, minimum_size=32)
hook = register_hook(edit_embeddings_hook, lI)

# Remember to remove the hook if you want to try another layer!
# hook.remove()

In [None]:
new_caption = execute_model(image_path)

# Compute the hallucinations
new_chair_eval = evaluator.compute_hallucinations(coco_img, new_caption)

In [None]:
display_image(image_path)
print("==== Baseline ====")
print(baseline_caption)
print(baseline_evals)
print("==== Edited ====")
print(new_caption)
print(new_chair_eval)