#### Test llm.py (Llama3 8b, mistral 7b v0.2)

In [6]:
import sys
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
from llm import LM
import importlib
importlib.reload(sys.modules['llm'])
from llm import LM
import gc
import torch
from accelerate.utils import release_memory

def flush():
  gc.collect()
  torch.cuda.empty_cache()
  torch.cuda.reset_peak_memory_stats()

In [7]:

# Specify the custom cache directory
cache_dir = "/data/rech/huiyuche/huggingface"

# empty the memory
flush()
# or
# release_memory()

# llm = LM(
#     model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct",
#     tokenizer_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct",
#     padding_side="left",
#     dtype="bf16",
#     device_map= "auto",
#     use_flash_attention_2=False,
#     access_token=None,
#     cache_dir=cache_dir,
#     accelerator = None
# )
llm = LM(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.2",
    tokenizer_name_or_path="mistralai/Mistral-7B-Instruct-v0.2",
    padding_side="left",
    dtype="bf16",
    device_map= "auto",
    attn_implementation="flash_attention_2",
    #use_flash_attention_2=False, (deprecated)
    access_token=None,
    cache_dir=cache_dir,
    accelerator = None
)



Loading checkpoint shards: 100%|██████████| 3/3 [02:13<00:00, 44.37s/it]


In [8]:
def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024

bytes_to_giga_bytes(torch.cuda.max_memory_allocated())

3.205310344696045

In [9]:
print(type(llm.model))

<class 'transformers.models.mistral.modeling_mistral.MistralForCausalLM'>


In [10]:
messages = [
    {"role": "user", "content": "what is the capital of France?"},
]
response = llm.hf_llm_generate(
    messages,
    temperature = 1,
    top_p = 0.9,
    max_new_tokens = 256,
    do_sample = True,
    num_beams = 1,
    num_return_sequences = 2
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [11]:
print(response)

['The capital city of France is Paris. Paris is the most populous city in France and is known for its iconic landmarks such as the Eiffel Tower, Louvre Museum, Notre-Dame Cathedral, and the Arc de Triomphe. It is also home to numerous world-renowned universities, museums, galleries, and theaters, making it a global cultural hub. Paris is located in the north-central part of France and is surrounded by the Seine River.', 'The capital city of France is Paris. Paris is one of the most famous and visited cities in the world, known for its iconic landmarks such as the Eiffel Tower, Louvre Museum, Notre-Dame Cathedral, and the Arc de Triomphe. Paris is also the political, cultural, and economic center of France.']


In [61]:
messages = [
    [{"role": "user", "content": "Who are you?"}],
    [{"role": "user", "content": "what is the capital of Germany?"}],
]
outputs = llm.hf_llm_generate_via_pipline(
    messages,
    temperature = 0.1,
    top_p = 1,
    max_new_tokens = 256,
    do_sample = True,
    num_beams = 1,
    num_return_sequences = 2
)

In [62]:
print(outputs)

[[{'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate creative text, and assist with various tasks."}]}, {'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate text, and assist with various tasks."}]}], [{'generated_text': [{'role': 'user', 'content': 'what is the capital of Germany?'}, {'role': 'assistant', 'content': ' The capital city of Germany is Berlin. Berlin is one of the 16 federal states of Germany and is located in the

In [65]:
[[alternative["generated_text"][-1]["content"] for alternative in output] for output in outputs ]

[[" I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate creative text, and assist with various tasks.",
  " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate text, and assist with various tasks."],
 [' The capital city of Germany is Berlin. Berlin is one of the 16 federal states of Germany and is located in the northeastern part of the country. It is the most populous city in Germany and is known for its rich history, diverse culture, and vibrant arts scene. Berlin is home to many famous landmarks, including the Brandenburg Gate, the Reichstag Building, and the Berlin Wall Memorial.',
  " The capital city of Germany is B

### Download LLM

In [3]:
from accelerate import Accelerator
from accelerate import init_empty_weights
from accelerate import load_checkpoint_and_dispatch

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    logging
)

from peft import PeftModel, PeftConfig


# Specify the custom cache directory
cache_dir = "/data/rech/huiyuche/huggingface"



# with init_empty_weights():
#     model = AutoModelForCausalLM()

# model = load_checkpoint_and_dispatch(
#     model,
#     checkpoint=cache_dir + "/models--meta-llama--Meta-Llama-3-8B-Instruct",
#     device_map="auto",
# )
# print(model)


# accelerator = Accelerator()
# print(accelerator.device)

#### Mistral 7b v0.2

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2", cache_dir=cache_dir
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2", cache_dir=cache_dir
)

#### Llama3 8b

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", cache_dir=cache_dir
)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", cache_dir = cache_dir, device_map="auto")

#### Rankllama

In [5]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel, PeftConfig

def get_model(peft_model_name):
    config = PeftConfig.from_pretrained(peft_model_name)
    base_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path, num_labels=1)
    model = PeftModel.from_pretrained(base_model, peft_model_name)
    model = model.merge_and_unload()
    model.eval()
    return model

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
model = get_model('castorini/rankllama-v1-7b-lora-passage')

# Define a query-passage pair
query = "What is llama?"
title = "Llama"
passage = "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."

# Tokenize the query-passage pair
inputs = tokenizer(f'query: {query}', f'document: {title} {passage}', return_tensors='pt')

# Run the model forward
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    score = logits[0][0]
    print(score)


Downloading shards: 100%|██████████| 2/2 [06:06<00:00, 183.17s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [02:04<00:00, 62.43s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor(6.1578)


#### Repllama

In [4]:
import torch
from transformers import AutoModel, AutoTokenizer
from peft import PeftModel, PeftConfig

def get_model(peft_model_name):
    config = PeftConfig.from_pretrained(peft_model_name)
    base_model = AutoModel.from_pretrained(config.base_model_name_or_path, 
                                           cache_dir=cache_dir, 
                                           device_map="auto")
    model = PeftModel.from_pretrained(base_model, peft_model_name)
    model = model.merge_and_unload()
    model.eval()
    return model

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
model = get_model('castorini/repllama-v1-7b-lora-passage')
print(model)
# Define query and passage inputs
# query = "What is llama?"
# title = "Llama"
# passage = "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."
# query_input = tokenizer(f'query: {query}</s>', return_tensors='pt')
# passage_input = tokenizer(f'passage: {title} {passage}</s>', return_tensors='pt')

# # Run the model forward to compute embeddings and query-passage similarity score
# with torch.no_grad():
#     # compute query embedding
#     query_outputs = model(**query_input)
#     query_embedding = query_outputs.last_hidden_state[0][-1]
#     query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=0)

#     # compute passage embedding
#     passage_outputs = model(**passage_input)
#     passage_embeddings = passage_outputs.last_hidden_state[0][-1]
#     passage_embeddings = torch.nn.functional.normalize(passage_embeddings, p=2, dim=0)

#     # compute similarity score
#     score = torch.dot(query_embedding, passage_embeddings)
#     print(score)


Downloading shards: 100%|██████████| 2/2 [04:52<00:00, 146.37s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [02:04<00:00, 62.47s/it]


LlamaModel(
  (embed_tokens): Embedding(32000, 4096)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
        (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
        (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm()
      (post_attention_layernorm): LlamaRMSNorm()
    )
  )
  (norm): LlamaRMSNorm()
)


# Test qrels

In [12]:
import pytrec_eval
qrel_file_path = "/data/rech/huiyuche/TREC_iKAT_2024/data/qrels/toy_qrel.txt"
with open(qrel_file_path, 'r') as f_qrel:
    qrel_ndcg = pytrec_eval.parse_qrel(f_qrel)
print(qrel_ndcg["9-1-3"])

{'clueweb22-en0000-00-00251:12': 0, 'clueweb22-en0000-06-17856:4': 1, 'clueweb22-en0000-78-08540:4': 0, 'clueweb22-en0000-84-18471:16': 0, 'clueweb22-en0001-53-12566:3': 1, 'clueweb22-en0002-40-16532:0': 1}


In [13]:
qrel = {
        qid: {docid: 1 if rel_score > 0 else 0 for docid, rel_score in qrel_ndcg[qid].items()} for qid in qrel_ndcg.keys()
    }
print(qrel["9-1-3"])

{'clueweb22-en0000-00-00251:12': 0, 'clueweb22-en0000-06-17856:4': 1, 'clueweb22-en0000-78-08540:4': 0, 'clueweb22-en0000-84-18471:16': 0, 'clueweb22-en0001-53-12566:3': 1, 'clueweb22-en0002-40-16532:0': 1}


In [14]:
# read ranking list
ranking_list_path = "/data/rech/huiyuche/TREC_iKAT_2024/results/test/toy_ranking_list.txt"
with open(ranking_list_path, 'r') as f_run:
    run = pytrec_eval.parse_run(f_run)

# pytrec_eval eval
evaluator = pytrec_eval.RelevanceEvaluator(qrel, {"map", "ndcg_cut.5"})
res = evaluator.evaluate(run)
print(res)
evaluator = pytrec_eval.RelevanceEvaluator(qrel_ndcg, {"map", "ndcg_cut.5"})
res = evaluator.evaluate(run)
print(res)

{'9-1-3': {'map': 0.4666666666666666, 'ndcg_cut_5': 0.4776237035032179}}
{'9-1-3': {'map': 0.4666666666666666, 'ndcg_cut_5': 0.4776237035032179}}


# Test collection


#### jsonl file check

In [1]:

counter = 0
with open("/data/rech/huiyuche/TREC_iKAT_2024/data/collections/ikat_23/cluweb22B_ikat_v2.jsonl", "r") as f:
    # count the total number of lines in the file
    line = f.readline()
    while line:
        counter += 1
        line = f.readline()

print("the total number of lines in the file is: ", counter)


the total number of lines in the file is:  116838987


In [None]:
with open("/data/rech/huiyuche/TREC_iKAT_2024/data/collections/ikat_23/cluweb22B_ikat_v2.jsonl", "r") as f:
    # count the total number of lines in the file
    line = f.readline()
    print(line)

In [None]:
counter = 0
with open("/data/rech/huiyuche/cast23_collection.tsv", "r") as f:
    # count the total number of lines in the file
    line = f.readline()
    while line:
        counter += 1
        line = f.readline()

print("the total number of lines in the file is: ", counter)

In [2]:
# read the first line

with open ("/data/rech/huiyuche/TREC_iKAT_2024/data/collections/ikat_23/cluweb22B_ikat_v2.jsonl", "r") as f:
    line = f.readline()
    print(line)

{"id": "clueweb22-en0004-50-00485:0", "contents": "RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson - Sociology Toolbox RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson Todd Beer on August 18, 2014 *For a look at the patterns that have emerged in the police shootings of unarmed black citizens since the shooting of Michael Brown in Ferguson click here. . . . Systematic racism has been made evident again in the shooting of an unarmed young Black man, Michael Brown, by a police officer in Ferguson, Missouri.  Pulling stories directly from recent news headlines is one way to get students’ attention and demonstrate the abundant relevance of the sociological perspective.  The New York Times has a timeline of the events that serves as a useful starting point (from the mainstream media) to share the events with students that may have not kept up with the story. The community of Ferguson, Missouri (the site of the shooting) has responded with on-going mass protes

In [None]:
{"id": "clueweb22-en0004-50-00485:0", "contents": "RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson - Sociology Toolbox RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson Todd Beer on August 18, 2014 *For a look at the patterns that have emerged in the police shootings of unarmed black citizens since the shooting of Michael Brown in Ferguson click here. . . . Systematic racism has been made evident again in the shooting of an unarmed young Black man, Michael Brown, by a police officer in Ferguson, Missouri.  Pulling stories directly from recent news headlines is one way to get students’ attention and demonstrate the abundant relevance of the sociological perspective.  The New York Times has a timeline of the events that serves as a useful starting point (from the mainstream media) to share the events with students that may have not kept up with the story. The community of Ferguson, Missouri (the site of the shooting) has responded with on-going mass protests. Ferguson cannot be understood in a vacuum. These events are rich with sociological issues –  inequality and poverty, racial profiling, the militarization of the police, protester and police interaction, social media (#Ferguson and hashtag activism) and the “criminalization of Black male youth”. Looking first at the disproportionate levels of poverty and subsequent exclusion from the economy of many Blacks in the US, Brookings, a Democratic leaning think tank, analyzed census tract data of changes in the poverty rates in Ferguson (and the surrounding area) between 2000 and 2008-2012.   They state: “But Ferguson has also been home to dramatic economic changes in recent years."}

#### tsv file check

In [1]:
line = "clueweb22-en0022-08-02126:19	Personal Requirements: ●	18 years of age or older. ●	Excellent command of the English Language. ●	Computer literate. ●	Quiet working environment. ●	Willing and able to work from home. Technical Requirements: ●	Reliable internet connection at home, with download and upload speeds of at least 1 Mbps. ●	Laptop or Desktop computer. ●	Webcam. ●	Headset/Earphones. Benefits: ●	Work from Home.	https://www.learn4good.com/jobs/language/english/list/teaching/turkey/"

splits= line.strip().split('\t')

print(len(splits))
print(splits)

13
['clueweb22-en0022-08-02126:19', 'Personal Requirements: ●', '18 years of age or older. ●', 'Excellent command of the English Language. ●', 'Computer literate. ●', 'Quiet working environment. ●', 'Willing and able to work from home. Technical Requirements: ●', 'Reliable internet connection at home, with download and upload speeds of at least 1 Mbps. ●', 'Laptop or Desktop computer. ●', 'Webcam. ●', 'Headset/Earphones. Benefits: ●', 'Work from Home.', 'https://www.learn4good.com/jobs/language/english/list/teaching/turkey/']


In [3]:
line = "clueweb22-en0004-50-00485:0	RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson - Sociology Toolbox RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson Todd Beer on August 18, 2014 *For a look at the patterns that have emerged in the police shootings of unarmed black citizens since the shooting of Michael Brown in Ferguson click here. . . . Systematic racism has been made evident again in the shooting of an unarmed young Black man, Michael Brown, by a police officer in Ferguson, Missouri.  Pulling stories directly from recent news headlines is one way to get students’ attention and demonstrate the abundant relevance of the sociological perspective.  The New York Times has a timeline of the events that serves as a useful starting point (from the mainstream media) to share the events with students that may have not kept up with the story. The community of Ferguson, Missouri (the site of the shooting) has responded with on-going mass protests. Ferguson cannot be understood in a vacuum. These events are rich with sociological issues –  inequality and poverty, racial profiling, the militarization of the police, protester and police interaction, social media (#Ferguson and hashtag activism) and the “criminalization of Black male youth”. Looking first at the disproportionate levels of poverty and subsequent exclusion from the economy of many Blacks in the US, Brookings, a Democratic leaning think tank, analyzed census tract data of changes in the poverty rates in Ferguson (and the surrounding area) between 2000 and 2008-2012.   They state: “But Ferguson has also been home to dramatic economic changes in recent years.	https://thesocietypages.org/toolbox/racism-police-ferguson/"
print("the number of tab in a line is", sum([1 for c in line if c == "\t"]) )

the number of tab in a line is 2


# Test general python operation

In [17]:
a = 2
b = 3
name = 5
def b():
    a()

def a():
    print(name)
    print("abc")


name = "yuchen"
b()

yuchen
abc


In [None]:
[[{'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': "I am LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner. I'm not a human, but a computer program designed to simulate conversation and answer questions to the best of my knowledge based on my training data.\n\nI'm a large language model, which means I've been trained on a massive dataset of text from various sources, including books, articles, and websites. This training enables me to understand and generate human-like language, allowing me to engage in conversations, answer questions, and even create text based on a given prompt.\n\nI'm constantly learning and improving my responses based on the interactions I have with users like you. So, feel free to ask me anything, and I'll do my best to provide a helpful and accurate response!"}]}]

In [None]:
[[{'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model developed by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a physical presence. I exist solely to process and generate text."}]}, {'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate text, and assist with various tasks."}]}]]

In [None]:
[[{'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate creative text, and assist with various tasks."}]}, {'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate text, and assist with various tasks."}]}], [{'generated_text': [{'role': 'user', 'content': 'what is the capital of Germany?'}, {'role': 'assistant', 'content': ' The capital city of Germany is Berlin. Berlin is one of the 16 federal states of Germany and is located in the northeastern part of the country. It is the most populous city in Germany and is known for its rich history, diverse culture, and vibrant arts scene. Berlin is home to many famous landmarks, including the Brandenburg Gate, the Reichstag Building, and the Berlin Wall Memorial.'}]}, {'generated_text': [{'role': 'user', 'content': 'what is the capital of Germany?'}, {'role': 'assistant', 'content': " The capital city of Germany is Berlin. Berlin is one of the 16 federal states of Germany and is located in the northeastern part of the country. It is the most populous city in Germany and is known for its rich history, diverse culture, and vibrant arts scene. Berlin has been the capital of Germany since the country's reunification in 1990."}]}]]

# Test topics.py


In [36]:
import sys
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
from topics import Turn, load_turns_from_ikat_topic_files, save_turns_to_json, load_turns_from_json,Result,Reformulation
from dataclasses import asdict
import importlib
importlib.reload(sys.modules['topics'])
from topics import Turn, load_turns_from_ikat_topic_files

In [37]:
list_of_turns = load_turns_from_ikat_topic_files(ikat_topic_file = "/data/rech/huiyuche/TREC_iKAT_2024/data/topics/2023_ikat_test_topics.json")

#### test if dataclasses.asdict works

In [None]:
lists_of_turns_dict = [asdict(turn) for turn in list_of_turns]
print(lists_of_turns_dict)
# save_to_json
save_turns_to_json(list_of_turns, "/data/rech/huiyuche/TREC_iKAT_2024/test/ikat_2023_test.json")


#### test if load_turns_from_json works

In [15]:
list_of_turns = load_turns_from_json("/data/rech/huiyuche/TREC_iKAT_2024/test/2023_ikat_test_topics.json")

#### test add_reformulation,

In [16]:
turn = list_of_turns[0]
print(turn)
turn.add_reformulation(reformulated_query="emm",reformulation_name="test_name")
reformulation = turn.find_reformulation("test_name")
turn.add_result(
    collection = "clueweb12B",
    retrieval_model = "BM25",
    reranker = "rankllama",
    retrieval_query_type = "a",
    reranking_query_type = "b",
    generation_query_type = "c", 
    metrics_dict = {
        "map": 0.5,
        "ndcg_cut_10":0.5
    }
)
print(turn)

Turn ID: 9-1-1
Title: Finding a diet
Current Utterance: Can you help me find a diet for myself?
Oracle Utterance: Can you help me find a diet for myself considering that I'm vegetarian, allergic to soybeans, lactose intolerant, can't exercise too much, and should drink water regularly?
Number of reformulations: 0
Number of results: 0

Turn ID: 9-1-1
Title: Finding a diet
Current Utterance: Can you help me find a diet for myself?
Oracle Utterance: Can you help me find a diet for myself considering that I'm vegetarian, allergic to soybeans, lactose intolerant, can't exercise too much, and should drink water regularly?
Number of reformulations: 1
Number of results: 1



In [17]:
turn_dict = asdict(turn)
print(turn_dict)

{'turn_id': '9-1-1', 'conversation_id': '9-1', 'title': 'Finding a diet', 'current_utterance': 'Can you help me find a diet for myself?', 'current_response': 'Sure, these diets fit your condition and preference: 1) Vegan Mediterranean combines the principles of the Mediterranean diet with veganism and emphasizes whole grains, vegetables, beans, nuts, seeds, herbs, and spices. It excludes animal foods while incorporating plant-based alternatives, offering a sustainable and compassionate approach. (2) Vegan Keto: is a high-fat, low-carbohydrate, and moderate-protein diet which is also exclusively plant-based. It suits individuals following a vegan lifestyle who wish to adopt a ketogenic eating plan. (3) Eco-Atkins is a vegetarian variation of the Atkins diet which focuses on whole grains, vegetables, tofu, legumes, nuts, and seeds while minimizing animal products. By reducing carbohydrate intake and incorporating plant-based protein sources, it promotes sustainability and healthier eatin


#### test from_dict

In [18]:
new_turn = Turn()
new_turn.from_dict(turn_dict)
print(new_turn.reformulations)

[Reformulation(reformulation_name='test_name', reformulated_query='emm')]


# Test rerank

In [63]:
import sys
import importlib
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
import rerank
importlib.reload(sys.modules['rerank'])
from rerank import load_rankllama, rerank_rankllama 

cache_dir = "/data/rech/huiyuche/huggingface"


query = "what is the capital of China?"

passages = [
    "The capital of China is Beijing.",
    "Beijing. The modern day capital of China is Beijing (literally 'Northern Capital'), which first served as China's capital city in 1261, when the Mongol ruler Kublai Khan established his seat of power in the area centered around what is today Beihai Park.",
    "The capital of France is Paris.",
    "Llama The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."
]
tokenizer, model = load_rankllama(cache_dir)



Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.67s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
a = "abc"
print(a[:-1])

ab


In [64]:
scores = rerank_rankllama(query, passages, tokenizer, model)
print(scores) 

[tensor(3.4062, dtype=torch.bfloat16), tensor(4.4688, dtype=torch.bfloat16), tensor(-3.9062, dtype=torch.bfloat16), tensor(-9.8125, dtype=torch.bfloat16)]


In [58]:
print(type(model))
print(type(tokenizer))

<class 'transformers.models.llama.modeling_llama.LlamaForSequenceClassification'>
<class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>


#### data structure of hits

In [68]:
from pyserini.search.lucene import LuceneSearcher

retrieval_query_list = [
    "what is the capital of France?",
    "what is the capital of Germany?",
    "what is the capital of Italy?"
]
qid_list_string = [
    "id1",
    "id2",
    "id3"
]

searcher = LuceneSearcher("/part/01/Tmp/yuchen/indexes/clueweb22b_ikat23_fengran_sparse_index_2/")
searcher.set_bm25(0.9, 0.4)
hits = searcher.batch_search(retrieval_query_list, qid_list_string, k = 10, threads = 40)

Jun 27, 2024 5:52:04 A.M. org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


In [72]:
import json
emmm = hits["id1"]
for hit in emmm:
    print('Rank: {} | PassageID: {} | Score: {}'.format(0, hit.docid, hit.score))
    doc = searcher.doc(hit.docid)
    parsed_doc = json.loads(doc.raw())
    print(parsed_doc['contents'])

Rank: 0 | PassageID: clueweb22-en0017-32-04653:0 | Score: 9.22029972076416
What is the Capital of France? | Mappr Capital Cities What is the Capital of France? What is the Capital of France? Paris is the capital city of France and the center of France. It is built on the Seine River, in the middle of the Paris Basin. Where is Paris? Known for its monuments, artistic and cultural life all over the world, Paris is also one of the major economic and political centers along with being an important city in world history and is one of the transit points of international transport. Paris, a fashion and luxury world capital, is also known as “City of Lights”. When did Paris Become the Capital? The most important archaeological finds are the remains of the oldest permanent human settlement in the Paris region, which was discovered in 1991 in the 12th region.
Rank: 0 | PassageID: clueweb22-en0016-24-18543:0 | Score: 8.990500450134277
Brexit & Second Homes in France: What you need to know FRANCE 

In [2]:
print(hits)

{'id2': [<io.anserini.search.ScoredDoc at 0x7f314739a710 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x1127f35a at 0x7f314747ffb0>>, <io.anserini.search.ScoredDoc at 0x7f314739a620 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x1127f362 at 0x7f314747f9d0>>, <io.anserini.search.ScoredDoc at 0x7f314739aee0 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x1127f372 at 0x7f314747cd90>>, <io.anserini.search.ScoredDoc at 0x7f314739a210 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x1127f37a at 0x7f314747ffd0>>, <io.anserini.search.ScoredDoc at 0x7f314739a170 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x1127f3a2 at 0x7f314747f950>>, <io.anserini.search.ScoredDoc at 0x7f314739a3f0 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x1127f3b2 at 0x7f314747c050>>, <io.anserini.search.ScoredDoc at 0x7f314739a850 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x1127f3ba at 0x7f314747f470>>, <io.anserini.search.ScoredDoc at 0x7f31

In [7]:
import json

candidate_set = []
for qid, hit in hits.items():
    for i, doc_object in enumerate(hit):
        print('Rank: {} | PassageID: {} | Score: {}'.format(i+1, doc_object.docid, doc_object.score))
        doc = searcher.doc(doc_object.docid)
        parsed_doc = json.loads(doc.raw())
        print(parsed_doc['contents'])
        candidate_set.append({
            'passage_id': hit[i].docid,
            'bm25_rank': i+1,
            'bm25_score': hit[i].score,
            'passage_text': parsed_doc['contents']
        })

Rank: 1 | PassageID: clueweb22-en0021-14-14889:63 | Score: 9.510700225830078
London i^the capital of Great Britain. What is the capital o f Germany? Berlin is the capital o f Germany. What is the capital o f France? Paris is the capital o f France. What is the capital o f Poland? Warsaw is the capital o f Poland. What is the capital of Germany? Berlin is the capital of Germany. ffrom
Rank: 2 | PassageID: clueweb22-en0044-25-13469:0 | Score: 9.316100120544434
What is the Capital of Germany? | Mappr What is the Capital of Germany? What is the Capital of Germany? Berlin is the capital and largest city of Germany. It is also a state-city. Berlin is located in northern Germany, the sandy region between the Spree and Havel rivers. From 1961 to 1990, the city divided into two, East and West Berlin, was named after the wall that divides itself into two. Then the wall was demolished. Where is Berlin? When did Berlin Become the Capital?
Rank: 3 | PassageID: clueweb22-en0021-14-14889:62 | Score: 