In [1]:
import sys
import os

# Make the repository importable (e.g. `from src.model import ...`).
# Assumes the kernel's working directory is src/distill, so the project root
# is two levels above os.getcwd().
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Report an insertion only when one actually happened; previously the cell
# printed "Added ..." on every run, even when sys.path was left untouched.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added '{project_root}' to sys.path")
else:
    print(f"'{project_root}' is already on sys.path")

# Now you can import from src.model
# For example:
# from src.model.your_file import your_function

Added '/home/brimmann/works/xRAG' to sys.path


In [2]:
import argparse

In [3]:
def create_prompt_with_mistral_chat_format(messages,tokenizer,*args,**kwargs):
    """Render a chat history as one Mistral-instruct style prompt string.

    Each ``user`` turn becomes ``"[INST] <content> [/INST]"`` and each
    ``assistant`` turn becomes ``<content>`` followed by
    ``tokenizer.eos_token``. The rendered turns are concatenated in order
    with no separator.

    Args:
        messages: iterable of dicts with ``'role'`` and ``'content'`` keys.
        tokenizer: object exposing ``eos_token``; only read for assistant turns.
        *args, **kwargs: accepted and ignored.

    Returns:
        The concatenated prompt string (empty when ``messages`` is empty).

    Raises:
        ValueError: if a message has a role other than 'user' or 'assistant'.
    """
    rendered_turns = []
    for turn in messages:
        role = turn['role']
        if role == 'assistant':
            rendered_turns.append(turn['content'] + tokenizer.eos_token)
            continue
        if role != 'user':
            raise ValueError(
                "Mistral chat template only supports 'user' and 'assistant' roles. Invalid role: {}.".format(role)
                )
        rendered_turns.append("[INST] " + turn['content'] + " [/INST]")
    return "".join(rendered_turns)

In [63]:
def parse_args(**overrides):
    """Build the evaluation configuration used by the rest of the notebook.

    Stands in for a command-line parser: every option has a hard-coded
    default, and any of them can be replaced per call with keyword arguments,
    e.g. ``parse_args(data='nq_open', max_test_samples=100)``. Calling it with
    no arguments returns the same configuration as before.

    Post-processing applied to the namespace:
      * ``task_type`` and ``eval_metrics`` are always derived from ``data``.
      * ``retrieval_topk`` is converted from 1-based ranks to 0-based indices.
      * ``chat_format`` given as a short name (e.g. ``'mistral'``) is replaced
        by the module-level function ``create_prompt_with_<name>_chat_format``;
        a callable is kept as is and ``None`` stays ``None``.
      * ``use_rag`` is forced to True when ``retriever_name_or_path`` is set.

    Returns:
        argparse.Namespace holding the options described above.

    Raises:
        TypeError: if an override names an option that does not exist.
        ValueError: if ``data`` is not a supported dataset, or ``chat_format``
            does not name a known prompt-formatting function.
    """
    args = argparse.Namespace(
        # --- Set your desired default values here ---
        retrieval_prefix='colbertv2',
        tf_idf_topk=0,
        base_model=None,  # e.g., 'path/to/base_model'
        use_rag=True,  # forced to True below whenever retriever_name_or_path is set
        enable_progress_bar=True,
        data='triviaqa',  # e.g., 'nq_open', 'hotpotqa', 'triviaqa', 'webqa', 'truthfulqa', 'factkg'
        model_name_or_path='Hannibal046/xrag-7b',  # e.g., 'path/to/your/model'
        eval_metrics=None,  # This is set based on the 'data' argument below
        n_shot=0,
        retriever_name_or_path='Salesforce/SFR-Embedding-Mistral',  # e.g., 'colbertv2/colbertv2.0'
        retrieval_topk=[1],
        retrieval_embed_length=0,
        max_test_samples=2,  # e.g., 100 for debugging
        save_dir='./outputs',  # e.g., 'path/to/save/results'
        eval_batch_size=4,
        chat_format='mistral',
    )

    # Apply caller overrides, rejecting misspelled option names early.
    unknown = sorted(set(overrides) - set(vars(args)))
    if unknown:
        raise TypeError(f"parse_args() got unexpected option(s): {', '.join(unknown)}")
    vars(args).update(overrides)

    ## post-process
    if args.data in ['nq_open','hotpotqa','triviaqa','webqa']:
        args.task_type = 'open_qa'
        args.eval_metrics = 'substring_match'
    elif args.data in ['truthfulqa']:
        args.task_type = 'open_qa'
        args.eval_metrics = 'truthfulqa_f1_rl'
    elif args.data in ['factkg']:
        args.task_type = 'fact_checking'
        args.eval_metrics = 'fact_checking_acc'
    else:
        # Fail here instead of leaving task_type unset, which would only
        # surface later as an AttributeError on args.task_type.
        raise ValueError(f"Unsupported dataset: {args.data!r}")

    args.retrieval_topk = [x-1 for x in args.retrieval_topk] ## rank starts from 1

    if args.chat_format is not None and not callable(args.chat_format):
        # Plain name lookup in this module's namespace; avoids eval() on a
        # string assembled from a configuration value.
        formatter_name = f"create_prompt_with_{args.chat_format}_chat_format"
        formatter = globals().get(formatter_name)
        if not callable(formatter):
            raise ValueError(
                f"Unknown chat format {args.chat_format!r}: no function named {formatter_name} is defined"
            )
        args.chat_format = formatter

    if args.retriever_name_or_path is not None:
        args.use_rag = True

    return args

In [64]:
args = parse_args()

In [53]:
args.retrieval_topk

[0]

In [54]:
args.retriever_name_or_path

'Salesforce/SFR-Embedding-Mistral'

In [34]:
from transformers import (
    AutoTokenizer
)

In [94]:
# Load the tokenizer stored with the generator checkpoint named in args.
tokenizer = AutoTokenizer.from_pretrained(
    args.model_name_or_path,
    padding_side = 'left',
    # NOTE(review): presumably keeps prompts from being terminated with EOS
    # before generation — confirm.
    add_eos_token=False, ## important to include this!
    use_fast=False,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [95]:
# Ensure a pad token exists: keep the tokenizer's own if it has one, otherwise
# fall back to the unknown token, and failing that to the end-of-sequence token.
if not tokenizer.pad_token:
    if tokenizer.unk_token:
        tokenizer.pad_token_id = tokenizer.unk_token_id
    elif tokenizer.eos_token:
        tokenizer.pad_token_id = tokenizer.eos_token_id

In [36]:
import torch

In [37]:
# Prefer CUDA when torch reports it as available; otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Retriever placeholders; these stay at their defaults when no retriever is loaded.
retriever = None
retriever_tokenizer = None
retrieval_embed_length = 0

In [38]:
from src.model import (
    XMistralForCausalLM,
    XMixtralForCausalLM,
    SFR,
)

In [39]:
print(args.retriever_name_or_path)

Salesforce/SFR-Embedding-Mistral


In [40]:
# Load the retriever model and its tokenizer, record the embedding length and
# dimension it reports, and move it to the selected device in eval mode.
if args.retriever_name_or_path is not None:

    if args.retriever_name_or_path.lower() == 'salesforce/sfr-embedding-mistral':
        retriever = SFR.from_pretrained(args.retriever_name_or_path,torch_dtype = torch.bfloat16)
        retriever_tokenizer = AutoTokenizer.from_pretrained(args.retriever_name_or_path)
    else:
        # Without this guard an unrecognised name skipped the loading branch
        # and the calls below ran on whatever `retriever` held before (None on
        # a fresh run), failing with an opaque AttributeError.
        raise ValueError(
            f"Unsupported retriever '{args.retriever_name_or_path}'; "
            "only 'Salesforce/SFR-Embedding-Mistral' is handled here."
        )
    retrieval_embed_length = retriever.get_embed_length()
    retriever_hidden_size = retriever.get_embed_dim()
    retriever.eval()
    retriever = retriever.to(device)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [41]:
from src.eval.run_eval import load_dataset

In [42]:
# NOTE(review): relative move — the directory this lands in depends on the
# kernel's working directory at the time the cell is run.
%cd ..

/home/brimmann/works


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [57]:
%cd xRAG/

/home/brimmann/works/xRAG


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [58]:
# Load the dev and test splits for the configured dataset. The whole `args`
# namespace is passed as the third argument, in addition to `args.data` and
# `args.use_rag`.
# NOTE(review): load_dataset presumably resolves data files relative to the
# current working directory — confirm against src/eval/run_eval.py.
dev_data,test_data = load_dataset(
    args.data,
    args.use_rag,
    args,
)

In [61]:
test_data[0]

{'id': '0',
 'question': 'Who was the man behind The Chipmunks?',
 'answer': ['David Seville'],
 'entity': 'David Seville',
 'background': ['Alvin and the Chipmunks | " Alvin and the Chipmunks, originally David Seville and the Chipmunks or simply The Chipmunks, are an American animated virtual band created by Ross Bagdasarian for a novelty record in 1958. The group consists of three singing animated anthropomorphic chipmunks named Alvin, Simon, and Theodore. They are managed by their human adoptive father, David ""Dave"" Seville. Bagdasarian provided the group\'s voices sped up to create high-pitched squeaky voices (which wasn\'t entirely new to him, having worked on ""Witch Doctor"" earned the record two Grammy Awards for engineering). ""The Chipmunk Song"" became a number-one single in the United States. After Bagdasarian died in 1972, the characters’ voices were provided by his son Ross Bagdasarian Jr. and the latter\'s wife Janice Karman in the subsequent incarnations of "']}

In [65]:
# Optionally cap the evaluation set at the first `max_test_samples` examples;
# a value of None keeps the full set.
test_data = (
    test_data
    if args.max_test_samples is None
    else test_data[:args.max_test_samples]
)

[{'id': '0',
  'question': 'Who was the man behind The Chipmunks?',
  'answer': ['David Seville'],
  'entity': 'David Seville',
  'background': ['Alvin and the Chipmunks | " Alvin and the Chipmunks, originally David Seville and the Chipmunks or simply The Chipmunks, are an American animated virtual band created by Ross Bagdasarian for a novelty record in 1958. The group consists of three singing animated anthropomorphic chipmunks named Alvin, Simon, and Theodore. They are managed by their human adoptive father, David ""Dave"" Seville. Bagdasarian provided the group\'s voices sped up to create high-pitched squeaky voices (which wasn\'t entirely new to him, having worked on ""Witch Doctor"" earned the record two Grammy Awards for engineering). ""The Chipmunk Song"" became a number-one single in the United States. After Bagdasarian died in 1972, the characters’ voices were provided by his son Ross Bagdasarian Jr. and the latter\'s wife Janice Karman in the subsequent incarnations of "']},

In [67]:
from src.eval.run_eval import prepare_prompts

In [68]:
# Build the prompts and collect the background documents for the test set.
# Judging by the recorded outputs, `prompts` is a list of strings containing
# the <xRAG> placeholder and `backgrounds` is a list of lists of document
# strings, one inner list per test example.
prompts,backgrounds = prepare_prompts(
    dev_data = dev_data,
    test_data = test_data,
    task_type = args.task_type,
    tokenizer = tokenizer,
    n_shot = args.n_shot,
    use_rag = args.use_rag,
    retrieval_embed_length = retrieval_embed_length,
    chat_format = args.chat_format, 
)

**************************************** show one example ****************************************
[INST] Refer to the background document and answer the questions:

Background: <xRAG>

Question: Who was the man behind The Chipmunks?? [/INST] The answer is:
**************************************** show one example ****************************************


In [71]:
backgrounds

[['Alvin and the Chipmunks | " Alvin and the Chipmunks, originally David Seville and the Chipmunks or simply The Chipmunks, are an American animated virtual band created by Ross Bagdasarian for a novelty record in 1958. The group consists of three singing animated anthropomorphic chipmunks named Alvin, Simon, and Theodore. They are managed by their human adoptive father, David ""Dave"" Seville. Bagdasarian provided the group\'s voices sped up to create high-pitched squeaky voices (which wasn\'t entirely new to him, having worked on ""Witch Doctor"" earned the record two Grammy Awards for engineering). ""The Chipmunk Song"" became a number-one single in the United States. After Bagdasarian died in 1972, the characters’ voices were provided by his son Ross Bagdasarian Jr. and the latter\'s wife Janice Karman in the subsequent incarnations of "'],
 ["Jamie Lee Curtis |  Jamie Lee Curtis (born November 22, 1958) is an American actress and writer. She is the recipient of several accolades, 

In [72]:
prompts

['[INST] Refer to the background document and answer the questions:\n\nBackground: <xRAG>\n\nQuestion: Who was the man behind The Chipmunks?? [/INST] The answer is:',
 '[INST] Refer to the background document and answer the questions:\n\nBackground: <xRAG>\n\nQuestion: What star sign is Jamie Lee Curtis?? [/INST] The answer is:']

In [75]:
retrieval_embeds = None

In [77]:
from src.eval.run_eval import prepare_retrieval_embeds

In [78]:
# Embed every background document with the retriever and regroup the
# embeddings per sample. `retrieval_embeds` stays None without a retriever.
retrieval_embeds = None
if retriever is not None:
    # `backgrounds` is List[List[str]]: one list of documents per sample.
    num_samples = len(backgrounds)

    # For each flattened document, remember which sample it belongs to.
    original_orders = [
        sample_idx
        for sample_idx, sample_docs in enumerate(backgrounds)
        for _ in sample_docs
    ]

    # Flatten into a separate name so `backgrounds` keeps its nested shape.
    # Overwriting it made a second run of this cell iterate over the
    # characters of each document instead of over documents.
    flat_backgrounds = [doc for sample_docs in backgrounds for doc in sample_docs]

    print(f"Preparing document embedding with {args.retriever_name_or_path}...")
    _retrieval_embeds = prepare_retrieval_embeds(
        flat_backgrounds,
        retriever,
        retriever_tokenizer,
    )

    # Regroup the flat embeddings into one list per sample.
    retrieval_embeds = [[] for _ in range(num_samples)]
    assert len(_retrieval_embeds) == len(original_orders)
    # `sample_idx` rather than `id`, which shadowed the built-in.
    for sample_idx, embeds in zip(original_orders, _retrieval_embeds):
        retrieval_embeds[sample_idx].append(embeds)

    # NOTE(review): presumably moved to CPU to free accelerator memory for the
    # generator loaded later — confirm.
    retriever = retriever.to("cpu")

Preparing document embedding with Salesforce/SFR-Embedding-Mistral...


In [81]:
len(retrieval_embeds)

2

In [82]:
# Mean prompt length, computed from the per-prompt `length` values the
# tokenizer returns when called with return_length=True.
prompt_lengths = tokenizer(prompts,return_length=True).length
avg_prompt_length = sum(prompt_lengths) / len(prompt_lengths)

In [84]:
from transformers import (
    MistralForCausalLM,
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    MixtralForCausalLM,
)

In [86]:
 ## load llm
config = AutoConfig.from_pretrained(args.model_name_or_path)

# The architecture name is read from the checkpoint's config, i.e. from
# external data. Resolve it by plain name lookup among the classes already
# imported into this notebook instead of eval()-ing it as Python code.
architecture = config.architectures[0]
MODEL_CLASS = globals().get(architecture)
if MODEL_CLASS is None:
    raise ValueError(
        f"Model architecture '{architecture}' is not imported in this notebook."
    )

model = MODEL_CLASS.from_pretrained(
    args.model_name_or_path,
    torch_dtype = torch.bfloat16,
    low_cpu_mem_usage = True,
    device_map='auto',
    offload_folder="./offload"
)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



In [87]:
model.eval()

XMistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32002, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm

In [90]:
from src.language_modeling.utils import (
    XRAG_TOKEN,
    get_retrieval_embeds,
)

In [None]:
# When a retriever is in use, register the id of the XRAG_TOKEN placeholder
# with the model. The assert guards against a tokenizer whose vocabulary
# lacks that token.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# may not have been run before the generation cell — confirm.
if retriever is not None:
    assert XRAG_TOKEN in tokenizer.get_vocab() 
    model.set_xrag_token_id(tokenizer.convert_tokens_to_ids(XRAG_TOKEN))

In [92]:
from src.eval.run_eval import llm_for_open_generation

In [98]:
# Run generation over all prompts for the supported task types.
# NOTE(review): the recorded output of this cell ends in
# "NameError: name 'tokenizer' is not defined" after the progress bar had
# started, even though `tokenizer` is passed here as `llm_tokenizer`. Either
# the kernel had lost `tokenizer`, or llm_for_open_generation reads a
# module-level `tokenizer` of its own — confirm against src/eval/run_eval.py.
# NOTE(review): `generated_results` is left undefined for any other task_type.
if args.task_type in ['open_qa','fact_checking']:
    generated_results = llm_for_open_generation(
        llm = model,
        llm_tokenizer = tokenizer,
        prompts = prompts,
        retrieval_embeds = retrieval_embeds,
        batch_size = args.eval_batch_size,
        enable_progress_bar= args.enable_progress_bar,
    )

  0%|                                 | 0/2 [00:00<?, ?it/s]

NameError: name 'tokenizer' is not defined