#### Test llm.py (Llama3 8b, mistral 7b v0.2)

In [6]:
import sys
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
from llm import LM
import importlib
importlib.reload(sys.modules['llm'])
from llm import LM
import gc
import torch
from accelerate.utils import release_memory

def flush():
  gc.collect()
  torch.cuda.empty_cache()
  torch.cuda.reset_peak_memory_stats()

In [7]:

# Specify the custom cache directory
cache_dir = "/data/rech/huiyuche/huggingface"

# empty the memory
flush()
# or
# release_memory()

# llm = LM(
#     model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct",
#     tokenizer_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct",
#     padding_side="left",
#     dtype="bf16",
#     device_map= "auto",
#     use_flash_attention_2=False,
#     access_token=None,
#     cache_dir=cache_dir,
#     accelerator = None
# )
llm = LM(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.2",
    tokenizer_name_or_path="mistralai/Mistral-7B-Instruct-v0.2",
    padding_side="left",
    dtype="bf16",
    device_map= "auto",
    attn_implementation="flash_attention_2",
    #use_flash_attention_2=False, (deprecated)
    access_token=None,
    cache_dir=cache_dir,
    accelerator = None
)



Loading checkpoint shards: 100%|██████████| 3/3 [02:13<00:00, 44.37s/it]


In [8]:
def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024

bytes_to_giga_bytes(torch.cuda.max_memory_allocated())

3.205310344696045

In [9]:
print(type(llm.model))

<class 'transformers.models.mistral.modeling_mistral.MistralForCausalLM'>


In [10]:
messages = [
    {"role": "user", "content": "what is the capital of France?"},
]
response = llm.hf_llm_generate(
    messages,
    temperature = 1,
    top_p = 0.9,
    max_new_tokens = 256,
    do_sample = True,
    num_beams = 1,
    num_return_sequences = 2
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [11]:
print(response)

['The capital city of France is Paris. Paris is the most populous city in France and is known for its iconic landmarks such as the Eiffel Tower, Louvre Museum, Notre-Dame Cathedral, and the Arc de Triomphe. It is also home to numerous world-renowned universities, museums, galleries, and theaters, making it a global cultural hub. Paris is located in the north-central part of France and is surrounded by the Seine River.', 'The capital city of France is Paris. Paris is one of the most famous and visited cities in the world, known for its iconic landmarks such as the Eiffel Tower, Louvre Museum, Notre-Dame Cathedral, and the Arc de Triomphe. Paris is also the political, cultural, and economic center of France.']


In [61]:
messages = [
    [{"role": "user", "content": "Who are you?"}],
    [{"role": "user", "content": "what is the capital of Germany?"}],
]
outputs = llm.hf_llm_generate_via_pipline(
    messages,
    temperature = 0.1,
    top_p = 1,
    max_new_tokens = 256,
    do_sample = True,
    num_beams = 1,
    num_return_sequences = 2
)

In [62]:
print(outputs)

[[{'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate creative text, and assist with various tasks."}]}, {'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate text, and assist with various tasks."}]}], [{'generated_text': [{'role': 'user', 'content': 'what is the capital of Germany?'}, {'role': 'assistant', 'content': ' The capital city of Germany is Berlin. Berlin is one of the 16 federal states of Germany and is located in the

In [65]:
[[alternative["generated_text"][-1]["content"] for alternative in output] for output in outputs ]

[[" I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate creative text, and assist with various tasks.",
  " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate text, and assist with various tasks."],
 [' The capital city of Germany is Berlin. Berlin is one of the 16 federal states of Germany and is located in the northeastern part of the country. It is the most populous city in Germany and is known for its rich history, diverse culture, and vibrant arts scene. Berlin is home to many famous landmarks, including the Brandenburg Gate, the Reichstag Building, and the Berlin Wall Memorial.',
  " The capital city of Germany is B

### Download LLM

In [3]:
from accelerate import Accelerator
from accelerate import init_empty_weights
from accelerate import load_checkpoint_and_dispatch

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    logging
)

from peft import PeftModel, PeftConfig


# Specify the custom cache directory
cache_dir = "/data/rech/huiyuche/huggingface"



# with init_empty_weights():
#     model = AutoModelForCausalLM()

# model = load_checkpoint_and_dispatch(
#     model,
#     checkpoint=cache_dir + "/models--meta-llama--Meta-Llama-3-8B-Instruct",
#     device_map="auto",
# )
# print(model)


# accelerator = Accelerator()
# print(accelerator.device)

#### Mistral 7b v0.2

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2", cache_dir=cache_dir
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2", cache_dir=cache_dir
)

#### Llama3 8b

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", cache_dir=cache_dir
)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", cache_dir = cache_dir, device_map="auto")

#### Rankllama

In [5]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel, PeftConfig

def get_model(peft_model_name):
    config = PeftConfig.from_pretrained(peft_model_name)
    base_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path, num_labels=1)
    model = PeftModel.from_pretrained(base_model, peft_model_name)
    model = model.merge_and_unload()
    model.eval()
    return model

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
model = get_model('castorini/rankllama-v1-7b-lora-passage')

# Define a query-passage pair
query = "What is llama?"
title = "Llama"
passage = "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."

# Tokenize the query-passage pair
inputs = tokenizer(f'query: {query}', f'document: {title} {passage}', return_tensors='pt')

# Run the model forward
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    score = logits[0][0]
    print(score)


Downloading shards: 100%|██████████| 2/2 [06:06<00:00, 183.17s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [02:04<00:00, 62.43s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor(6.1578)


#### Repllama

In [4]:
import torch
from transformers import AutoModel, AutoTokenizer
from peft import PeftModel, PeftConfig

def get_model(peft_model_name):
    config = PeftConfig.from_pretrained(peft_model_name)
    base_model = AutoModel.from_pretrained(config.base_model_name_or_path, 
                                           cache_dir=cache_dir, 
                                           device_map="auto")
    model = PeftModel.from_pretrained(base_model, peft_model_name)
    model = model.merge_and_unload()
    model.eval()
    return model

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
model = get_model('castorini/repllama-v1-7b-lora-passage')
print(model)
# Define query and passage inputs
# query = "What is llama?"
# title = "Llama"
# passage = "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."
# query_input = tokenizer(f'query: {query}</s>', return_tensors='pt')
# passage_input = tokenizer(f'passage: {title} {passage}</s>', return_tensors='pt')

# # Run the model forward to compute embeddings and query-passage similarity score
# with torch.no_grad():
#     # compute query embedding
#     query_outputs = model(**query_input)
#     query_embedding = query_outputs.last_hidden_state[0][-1]
#     query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=0)

#     # compute passage embedding
#     passage_outputs = model(**passage_input)
#     passage_embeddings = passage_outputs.last_hidden_state[0][-1]
#     passage_embeddings = torch.nn.functional.normalize(passage_embeddings, p=2, dim=0)

#     # compute similarity score
#     score = torch.dot(query_embedding, passage_embeddings)
#     print(score)


Downloading shards: 100%|██████████| 2/2 [04:52<00:00, 146.37s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [02:04<00:00, 62.47s/it]


LlamaModel(
  (embed_tokens): Embedding(32000, 4096)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
        (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
        (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm()
      (post_attention_layernorm): LlamaRMSNorm()
    )
  )
  (norm): LlamaRMSNorm()
)


# Test qrels

In [30]:
import pytrec_eval
qrel_file_path = "/data/rech/huiyuche/TREC_iKAT_2024/data/qrels/toy_qrel.txt"
with open(qrel_file_path, 'r') as f_qrel:
    qrel_ndcg = pytrec_eval.parse_qrel(f_qrel)
print(qrel_ndcg["9-1-3"])

{'clueweb22-en0000-00-00251:12': 2, 'clueweb22-en0000-06-17856:4': 3, 'clueweb22-en0000-78-08540:4': 3, 'clueweb22-en0000-84-18471:16': 2, 'clueweb22-en0001-53-12566:3': 1, 'clueweb22-en0002-40-16532:0': 1}


In [31]:
qrel = {
        qid: {docid: 1 if rel_score > 0 else 0 for docid, rel_score in qrel_ndcg[qid].items()} for qid in qrel_ndcg.keys()
    }
print(qrel["9-1-3"])

{'clueweb22-en0000-00-00251:12': 1, 'clueweb22-en0000-06-17856:4': 1, 'clueweb22-en0000-78-08540:4': 1, 'clueweb22-en0000-84-18471:16': 1, 'clueweb22-en0001-53-12566:3': 1, 'clueweb22-en0002-40-16532:0': 1}


In [33]:
# read ranking list
ranking_list_path = "/data/rech/huiyuche/TREC_iKAT_2024/results/test/toy_ranking_list.txt"
with open(ranking_list_path, 'r') as f_run:
    run = pytrec_eval.parse_run(f_run)

# pytrec_eval eval
evaluator = pytrec_eval.RelevanceEvaluator(qrel, {"map", "ndcg_cut.5"})
res = evaluator.evaluate(run)
print(res)
evaluator = pytrec_eval.RelevanceEvaluator(qrel_ndcg, {"map", "ndcg_cut.5"})
res = evaluator.evaluate(run)
print(res)

{'9-1-3': {'map': 1.0, 'ndcg_cut_5': 1.0}}
{'9-1-3': {'map': 1.0, 'ndcg_cut_5': 0.9299817480463239}}


In [6]:
dict_a = {'a': 1, 'b': 2}
dict_b = {'a': 3, 'b': 5}
dict_c = {'a': 4, 'b': 6}
list_abc = [dict_a, dict_b, dict_c]
sorted_abc = sorted(list_abc[0:2], key=lambda x: x['a'], reverse=True)
sorted_abc[0]['a'] = 100
sorted_abc[1]['a'] = 1000
print(list_abc)

[{'a': 1000, 'b': 2}, {'a': 100, 'b': 5}, {'a': 4, 'b': 6}]


{'9-1-3': {'map': 0.4666666666666666, 'ndcg_cut_5': 0.4776237035032179}}
{'9-1-3': {'map': 0.4666666666666666, 'ndcg_cut_5': 0.4776237035032179}}

'9-1-3': {'map': 1.0, 'ndcg_cut_5': 1.0}}
{'9-1-3': {'map': 1.0, 'ndcg_cut_5': 0.9299817480463239}}

# Test collection


#### jsonl file check, average passage length

In [8]:
import json
collection_path = "/part/01/Tmp/yuchen/ikat_official_collectioin/ikat_2023_passages_07.jsonl"
# collection_path = "/data/rech/huiyuche/TREC_iKAT_2024/data/collections/ikat_23/cluweb22B_ikat_v2.jsonl" 

num_passage = 0
total_passage_len = 0
with open(collection_path, "r") as f:
    # count the total number of lines in the file
    while True:
        try:
            jsonl_dict = json.loads(f.readline())
            num_passage += 1
            total_passage_len += len(jsonl_dict["contents"].split())
        except:
            break

        # if num_passage > 1000000:
        #     break


average_len = total_passage_len / num_passage
print(f"the average length for {num_passage} passages is: ", average_len)


NameError: name 'num_passsage' is not defined

In [9]:


print(f"the average length for {num_passage} passages is: ", average_len)

the average length for 7333325 passages is:  201.22463630072306


In [None]:
collection_path = "/data/rech/huiyuche/TREC_iKAT_2024/data/collections/ikat_23/cluweb22B_ikat_v2.jsonl" 
with open(collection_path, "r") as f:
    # count the total number of lines in the file
    line = f.readline()
    print(line)

In [None]:
counter = 0
with open("/data/rech/huiyuche/cast23_collection.tsv", "r") as f:
    # count the total number of lines in the file
    line = f.readline()
    while line:
        counter += 1
        line = f.readline()

print("the total number of lines in the file is: ", counter)

In [2]:
# read the first line

with open ("/data/rech/huiyuche/TREC_iKAT_2024/data/collections/ikat_23/cluweb22B_ikat_v2.jsonl", "r") as f:
    line = f.readline()
    print(line)

{"id": "clueweb22-en0004-50-00485:0", "contents": "RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson - Sociology Toolbox RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson Todd Beer on August 18, 2014 *For a look at the patterns that have emerged in the police shootings of unarmed black citizens since the shooting of Michael Brown in Ferguson click here. . . . Systematic racism has been made evident again in the shooting of an unarmed young Black man, Michael Brown, by a police officer in Ferguson, Missouri.  Pulling stories directly from recent news headlines is one way to get students’ attention and demonstrate the abundant relevance of the sociological perspective.  The New York Times has a timeline of the events that serves as a useful starting point (from the mainstream media) to share the events with students that may have not kept up with the story. The community of Ferguson, Missouri (the site of the shooting) has responded with on-going mass protes

In [None]:
{"id": "clueweb22-en0004-50-00485:0", "contents": "RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson - Sociology Toolbox RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson Todd Beer on August 18, 2014 *For a look at the patterns that have emerged in the police shootings of unarmed black citizens since the shooting of Michael Brown in Ferguson click here. . . . Systematic racism has been made evident again in the shooting of an unarmed young Black man, Michael Brown, by a police officer in Ferguson, Missouri.  Pulling stories directly from recent news headlines is one way to get students’ attention and demonstrate the abundant relevance of the sociological perspective.  The New York Times has a timeline of the events that serves as a useful starting point (from the mainstream media) to share the events with students that may have not kept up with the story. The community of Ferguson, Missouri (the site of the shooting) has responded with on-going mass protests. Ferguson cannot be understood in a vacuum. These events are rich with sociological issues –  inequality and poverty, racial profiling, the militarization of the police, protester and police interaction, social media (#Ferguson and hashtag activism) and the “criminalization of Black male youth”. Looking first at the disproportionate levels of poverty and subsequent exclusion from the economy of many Blacks in the US, Brookings, a Democratic leaning think tank, analyzed census tract data of changes in the poverty rates in Ferguson (and the surrounding area) between 2000 and 2008-2012.   They state: “But Ferguson has also been home to dramatic economic changes in recent years."}

#### tsv file check

In [1]:
line = "clueweb22-en0022-08-02126:19	Personal Requirements: ●	18 years of age or older. ●	Excellent command of the English Language. ●	Computer literate. ●	Quiet working environment. ●	Willing and able to work from home. Technical Requirements: ●	Reliable internet connection at home, with download and upload speeds of at least 1 Mbps. ●	Laptop or Desktop computer. ●	Webcam. ●	Headset/Earphones. Benefits: ●	Work from Home.	https://www.learn4good.com/jobs/language/english/list/teaching/turkey/"

splits= line.strip().split('\t')

print(len(splits))
print(splits)

13
['clueweb22-en0022-08-02126:19', 'Personal Requirements: ●', '18 years of age or older. ●', 'Excellent command of the English Language. ●', 'Computer literate. ●', 'Quiet working environment. ●', 'Willing and able to work from home. Technical Requirements: ●', 'Reliable internet connection at home, with download and upload speeds of at least 1 Mbps. ●', 'Laptop or Desktop computer. ●', 'Webcam. ●', 'Headset/Earphones. Benefits: ●', 'Work from Home.', 'https://www.learn4good.com/jobs/language/english/list/teaching/turkey/']


In [3]:
line = "clueweb22-en0004-50-00485:0	RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson - Sociology Toolbox RACISM AND THE POLICE: The Shooting of Michael Brown in Ferguson Todd Beer on August 18, 2014 *For a look at the patterns that have emerged in the police shootings of unarmed black citizens since the shooting of Michael Brown in Ferguson click here. . . . Systematic racism has been made evident again in the shooting of an unarmed young Black man, Michael Brown, by a police officer in Ferguson, Missouri.  Pulling stories directly from recent news headlines is one way to get students’ attention and demonstrate the abundant relevance of the sociological perspective.  The New York Times has a timeline of the events that serves as a useful starting point (from the mainstream media) to share the events with students that may have not kept up with the story. The community of Ferguson, Missouri (the site of the shooting) has responded with on-going mass protests. Ferguson cannot be understood in a vacuum. These events are rich with sociological issues –  inequality and poverty, racial profiling, the militarization of the police, protester and police interaction, social media (#Ferguson and hashtag activism) and the “criminalization of Black male youth”. Looking first at the disproportionate levels of poverty and subsequent exclusion from the economy of many Blacks in the US, Brookings, a Democratic leaning think tank, analyzed census tract data of changes in the poverty rates in Ferguson (and the surrounding area) between 2000 and 2008-2012.   They state: “But Ferguson has also been home to dramatic economic changes in recent years.	https://thesocietypages.org/toolbox/racism-police-ferguson/"
print("the number of tab in a line is", sum([1 for c in line if c == "\t"]) )

the number of tab in a line is 2


# Test general python operation

In [17]:
a = 2
b = 3
name = 5
def b():
    a()

def a():
    print(name)
    print("abc")


name = "yuchen"
b()

yuchen
abc


In [None]:
[[{'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': "I am LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner. I'm not a human, but a computer program designed to simulate conversation and answer questions to the best of my knowledge based on my training data.\n\nI'm a large language model, which means I've been trained on a massive dataset of text from various sources, including books, articles, and websites. This training enables me to understand and generate human-like language, allowing me to engage in conversations, answer questions, and even create text based on a given prompt.\n\nI'm constantly learning and improving my responses based on the interactions I have with users like you. So, feel free to ask me anything, and I'll do my best to provide a helpful and accurate response!"}]}]

In [None]:
[[{'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model developed by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a physical presence. I exist solely to process and generate text."}]}, {'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate text, and assist with various tasks."}]}]]

In [None]:
[[{'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate creative text, and assist with various tasks."}]}, {'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " I am a large language model trained by Mistral AI. I am designed to generate human-like text based on the input I receive. I don't have the ability to have feelings, emotions, or a personality. I exist to help answer questions, generate text, and assist with various tasks."}]}], [{'generated_text': [{'role': 'user', 'content': 'what is the capital of Germany?'}, {'role': 'assistant', 'content': ' The capital city of Germany is Berlin. Berlin is one of the 16 federal states of Germany and is located in the northeastern part of the country. It is the most populous city in Germany and is known for its rich history, diverse culture, and vibrant arts scene. Berlin is home to many famous landmarks, including the Brandenburg Gate, the Reichstag Building, and the Berlin Wall Memorial.'}]}, {'generated_text': [{'role': 'user', 'content': 'what is the capital of Germany?'}, {'role': 'assistant', 'content': " The capital city of Germany is Berlin. Berlin is one of the 16 federal states of Germany and is located in the northeastern part of the country. It is the most populous city in Germany and is known for its rich history, diverse culture, and vibrant arts scene. Berlin has been the capital of Germany since the country's reunification in 1990."}]}]]

# Test topics.py


In [5]:
import sys
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
from topics import (
    Turn, 
    load_turns_from_ikat_topic_files, 
    save_turns_to_json, 
    load_turns_from_json,
    Result,
    Reformulation,
    get_context_by_qid
)
from dataclasses import asdict
import importlib
importlib.reload(sys.modules['topics'])
from topics import Turn, load_turns_from_ikat_topic_files

In [6]:
list_of_turns = load_turns_from_ikat_topic_files(ikat_topic_file = "/data/rech/huiyuche/TREC_iKAT_2024/data/topics/2023_ikat_test_topics.json")

#### Test get_context_by_qid

In [8]:
context = get_context_by_qid("9-1-3", list_of_turns)

#### test if dataclasses.asdict works

In [None]:
lists_of_turns_dict = [asdict(turn) for turn in list_of_turns]
print(lists_of_turns_dict)
# save_to_json
save_turns_to_json(list_of_turns, "/data/rech/huiyuche/TREC_iKAT_2024/test/ikat_2023_test.json")


#### test if load_turns_from_json works

In [15]:
list_of_turns = load_turns_from_json("/data/rech/huiyuche/TREC_iKAT_2024/test/2023_ikat_test_topics.json")

#### test add_reformulation,

In [16]:
turn = list_of_turns[0]
print(turn)
turn.add_reformulation(reformulated_query="emm",reformulation_name="test_name")
reformulation = turn.find_reformulation("test_name")
turn.add_result(
    collection = "clueweb12B",
    retrieval_model = "BM25",
    reranker = "rankllama",
    retrieval_query_type = "a",
    reranking_query_type = "b",
    generation_query_type = "c", 
    metrics_dict = {
        "map": 0.5,
        "ndcg_cut_10":0.5
    }
)
print(turn)

Turn ID: 9-1-1
Title: Finding a diet
Current Utterance: Can you help me find a diet for myself?
Oracle Utterance: Can you help me find a diet for myself considering that I'm vegetarian, allergic to soybeans, lactose intolerant, can't exercise too much, and should drink water regularly?
Number of reformulations: 0
Number of results: 0

Turn ID: 9-1-1
Title: Finding a diet
Current Utterance: Can you help me find a diet for myself?
Oracle Utterance: Can you help me find a diet for myself considering that I'm vegetarian, allergic to soybeans, lactose intolerant, can't exercise too much, and should drink water regularly?
Number of reformulations: 1
Number of results: 1



In [17]:
turn_dict = asdict(turn)
print(turn_dict)

{'turn_id': '9-1-1', 'conversation_id': '9-1', 'title': 'Finding a diet', 'current_utterance': 'Can you help me find a diet for myself?', 'current_response': 'Sure, these diets fit your condition and preference: 1) Vegan Mediterranean combines the principles of the Mediterranean diet with veganism and emphasizes whole grains, vegetables, beans, nuts, seeds, herbs, and spices. It excludes animal foods while incorporating plant-based alternatives, offering a sustainable and compassionate approach. (2) Vegan Keto: is a high-fat, low-carbohydrate, and moderate-protein diet which is also exclusively plant-based. It suits individuals following a vegan lifestyle who wish to adopt a ketogenic eating plan. (3) Eco-Atkins is a vegetarian variation of the Atkins diet which focuses on whole grains, vegetables, tofu, legumes, nuts, and seeds while minimizing animal products. By reducing carbohydrate intake and incorporating plant-based protein sources, it promotes sustainability and healthier eatin


#### test from_dict

In [18]:
new_turn = Turn()
new_turn.from_dict(turn_dict)
print(new_turn.reformulations)

[Reformulation(reformulation_name='test_name', reformulated_query='emm')]


#### change turn object format (or Result, Reformulation object format)

In [19]:
import sys
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
import importlib
import topics
importlib.reload(sys.modules['topics'])
from topics import Turn, load_turns_from_ikat_topic_files, save_turns_to_json, load_turns_from_json,Result,Reformulation
from dataclasses import asdict
import json

# current version
ikat_topic_file = "/data/rech/huiyuche/TREC_iKAT_2024/data/topics/ikat_2023_test.json"


# modified version
#where_to_save = "/data/rech/huiyuche/TREC_iKAT_2024/test/ikat_2023_test.json"
where_to_save = "/data/rech/huiyuche/TREC_iKAT_2024/data/topics/ikat_2023_test.json"


In [20]:
list_of_turns = load_turns_from_json(ikat_topic_file)
turn_dict_list = save_turns_to_json(list_of_turns, where_to_save)
print(json.dumps(turn_dict_list[2], indent=4))

{
    "turn_id": "9-1-3",
    "conversation_id": "9-1",
    "title": "Finding a diet",
    "current_utterance": "What about the DASH diet? I heard it is a healthy diet.",
    "current_response": "DASH diet is also a healthy diet that aims to reduce sodium intake while increasing the consumption of foods that are rich in nutrients. However, this diet includes meat and/or chicken. Since you are vegetarian this option is not suitable for you.",
    "oracle_utterance": "What about the DASH diet? I heard it is a healthy diet.",
    "response_provenance": [
        "clueweb22-en0020-69-12751:1",
        "clueweb22-en0028-21-06213:1"
    ],
    "context_utterances": [
        "Can you help me find a diet for myself?",
        "Ok, good. Can you tell me what diet is the fastest way to lose some weight?"
    ],
    "ptkb": {
        "1": "I don't like the new spin-off; because I keep comparing the two and it has lower quality.",
        "2": "Because of my kidney problem, I have to drink water 

### Test IKAT format output

In [1]:
import json
ikat_output_path = "/data/rech/huiyuche/TREC_iKAT_2024/results/ClueWeb_ikat/ikat_23_test/ikat_format_output/none.json"
with open(ikat_output_path, "r") as f:
    ikat_output = json.load(f)
print(ikat_output["turns"][0])


{'turn_id': '10-2_13', 'responses': [{'rank': 1, 'text': 'pseudo response 1', 'ptkb_provenance': [], 'passage_provenance': [{'id': 'clueweb22-en0013-24-12486:0', 'score': 16.136999130249023, 'used': True}, {'id': 'clueweb22-en0044-24-12435:0', 'score': 16.068199157714844, 'used': True}, {'id': 'clueweb22-en0012-73-01681:8', 'score': 15.756500244140625, 'used': True}, {'id': 'clueweb22-en0041-03-09116:0', 'score': 15.668299674987793, 'used': False}, {'id': 'clueweb22-en0025-44-01168:0', 'score': 15.497599601745605, 'used': False}]}]}


# Test rerank

In [32]:
import sys
import importlib
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
import rerank
from rank_gpt import run_retriever, sliding_windows
importlib.reload(sys.modules['rerank'])

from rerank import (
    load_rankllama, 
    rerank_rankllama,
    hits_2_rankgpt_list
    )

cache_dir = "/data/rech/huiyuche/huggingface"


query = "what is the capital of China?"

passages = [
    "The capital of China is Beijing.",
    "Beijing. The modern day capital of China is Beijing (literally 'Northern Capital'), which first served as China's capital city in 1261, when the Mongol ruler Kublai Khan established his seat of power in the area centered around what is today Beihai Park.",
    "The capital of France is Paris.",
    "Llama The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."
]



In [None]:
tokenizer, model = load_rankllama(cache_dir)

In [64]:
scores = rerank_rankllama(query, passages, tokenizer, model)
print(scores) 

[tensor(3.4062, dtype=torch.bfloat16), tensor(4.4688, dtype=torch.bfloat16), tensor(-3.9062, dtype=torch.bfloat16), tensor(-9.8125, dtype=torch.bfloat16)]


In [58]:
print(type(model))
print(type(tokenizer))

<class 'transformers.models.llama.modeling_llama.LlamaForSequenceClassification'>
<class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>


#### rankGPT

In [3]:
import os
os.system("source ~/.bashrc")
openai_key = os.getenv('openai_key')

#### data structure of hits

In [None]:
from pyserini.search.lucene import LuceneSearcher
LuceneSearcher.list_prebuilt_indexes()

In [1]:
from pyserini.search.lucene import LuceneSearcher

retrieval_query_list = [
    "what is the capital of France?",
    "what is the capital of Germany?",
    "what is the capital of Italy?"
]
qid_list_string = [
    "id1",
    "id2",
    "id3"
]

query_dict = {
    qid: query for qid, query in zip(qid_list_string, retrieval_query_list)
}

searcher = LuceneSearcher("/part/01/Tmp/yuchen/indexes/clueweb22b_ikat23_fengran_sparse_index_2/")
searcher.set_bm25(0.9, 0.4)
hits = searcher.batch_search(retrieval_query_list, qid_list_string, k = 10, threads = 40)

  from .autonotebook import tqdm as notebook_tqdm
Jul 09, 2024 5:51:28 A.M. org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


In [7]:
import numpy as np
liste = np.array([1,3,2,4,5], dtype=np.float32)
index = np.argsort(-liste)
print(index)

[4 3 1 2 0]


In [10]:
hits["id1"][0].rank = 0
print(hits["id1"][0].rank)

0


In [41]:
import json
rank_gpt_list, new_hits_dict = hits_2_rankgpt_list(searcher, query_dict, hits)
for i in range(3):
    rank_gpt_list[i]["hits"].append(
        {
            "content" : "Paris, the capital of France, was originally a small tribal settlement in 3rd century BC. It flourished under the Romans, becoming a prosperous city. In the Middle Ages, it became the capital of a major European kingdom and a center of learning. Paris played a central role in the French Revolution and was transformed into a modern city in the 19th century by Napoleon III. Today, it's a leading global city known for art, fashion, and culture",
            "qid":"id1",
            "docid": "pseudo-id",
            "rank": 10,
            "score":0.1
        }
    )
print(json.dumps(rank_gpt_list[0:1], indent=4))

[
    {
        "query": "what is the capital of France?",
        "hits": [
            {
                "content": "What is the Capital of France? | Mappr Capital Cities What is the Capital of France? What is the Capital of France? Paris is the capital city of France and the center of France. It is built on the Seine River, in the middle of the Paris Basin. Where is Paris? Known for its monuments, artistic and cultural life all over the world, Paris is also one of the major economic and political centers along with being an important city in world history and is one of the transit points of international transport. Paris, a fashion and luxury world capital, is also known as \u201cCity of Lights\u201d. When did Paris Become the Capital? The most important archaeological finds are the remains of the oldest permanent human settlement in the Paris region, which was discovered in 1991 in the 12th region.",
                "qid": "id1",
                "docid": "clueweb22-en0017-32-04653:

In [45]:
from tqdm import tqdm
new_rank_gpt_list = []
rank_end = 11
for item in tqdm(rank_gpt_list):
    new_item = sliding_windows(item, rank_start=0, rank_end=rank_end, window_size=5, step=2, model_name='gpt-3.5-turbo', api_key=openai_key)
    new_rank_gpt_list.append(new_item)


100%|██████████| 3/3 [00:10<00:00,  3.43s/it]


In [47]:
print(json.dumps(new_rank_gpt_list[0:1], indent=4))

[
    {
        "query": "what is the capital of France?",
        "hits": [
            {
                "content": "What is the Capital of France? | Mappr Capital Cities What is the Capital of France? What is the Capital of France? Paris is the capital city of France and the center of France. It is built on the Seine River, in the middle of the Paris Basin. Where is Paris? Known for its monuments, artistic and cultural life all over the world, Paris is also one of the major economic and political centers along with being an important city in world history and is one of the transit points of international transport. Paris, a fashion and luxury world capital, is also known as \u201cCity of Lights\u201d. When did Paris Become the Capital? The most important archaeological finds are the remains of the oldest permanent human settlement in the Paris region, which was discovered in 1991 in the 12th region.",
                "qid": "id1",
                "docid": "clueweb22-en0017-32-04653:

#### monoT5

In [7]:
import sys
cache_dir = "/data/rech/huiyuche/huggingface"
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
import importlib
import rerank
import llm
importlib.reload(sys.modules['llm'])
importlib.reload(sys.modules['rerank'])
from rerank import (
    load_t5_DP,
    rerank_t5_DP
)


2024-07-10 05:10:47.229103: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
tokenizer, model, decoder_stard_id, targeted_ids = load_t5_DP(
    cache_dir = cache_dir,
    model_name = "castorini/monot5-base-msmarco",
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Ready for predict()


In [9]:
print(tokenizer.encode("what is the capital of France?"))

[125, 19, 8, 1784, 13, 1410, 58, 1]


In [24]:
from pyserini.search.lucene import LuceneSearcher

retrieval_query_list = [
    "what is the capital of France?",
    "what is the capital of Germany?",
    "what is the capital of Italy?"
]
qid_list_string = [
    "id1",
    "id2",
    "id3"
]

query_dict = {
    qid: query for qid, query in zip(qid_list_string, retrieval_query_list)
}

searcher = LuceneSearcher("/part/01/Tmp/yuchen/indexes/clueweb22b_ikat23_fengran_sparse_index_2/")
#searcher = LuceneSearcher("/data/rech/huiyuche/TREC_iKAT_2024/data/indexes/clueweb22b_ikat23_fengran_sparse_index_2")
searcher.set_bm25(0.9, 0.4)
hits = searcher.batch_search(retrieval_query_list, qid_list_string, k = 1000, threads = 40)

In [26]:
 from tqdm import tqdm
 import json
 for qid, hit in tqdm(hits.items(), total=len(hits), desc="Reranking"):

    reranking_query = query_dict[qid]
    reranked_scores = rerank_t5_DP(
        reranking_query,
        [json.loads(searcher.doc(doc_object.docid).raw())["contents"] for doc_object in hit],
        tokenizer,
        model,
        decoder_stard_id,
        targeted_ids,
    )

    print(reranked_scores)

Reranking:  33%|███▎      | 1/3 [00:16<00:32, 16.32s/it]

[0.9456437230110168, 0.996783971786499, 0.9666732549667358, 6.670828042842913e-06, 0.0817103460431099, 0.7397226095199585, 0.00876297801733017, 0.0719098299741745, 2.736906481004553e-06, 4.171901673544198e-05, 0.03734677657485008, 0.9959157109260559, 7.1600134106120095e-06, 0.0015542390756309032, 0.9418118596076965, 7.144066785258474e-06, 0.00045250411494635046, 0.00036305320099927485, 2.427258550596889e-05, 2.3689308363827877e-05, 0.9712644219398499, 0.24110576510429382, 0.0002644523046910763, 0.5568421483039856, 0.00040787336183711886, 0.9873769879341125, 0.03674014285206795, 0.8245005011558533, 0.00019249253091402352, 0.8373112678527832, 0.5780085921287537, 7.149000884965062e-05, 0.9149550199508667, 0.00017964046855922788, 0.9679471850395203, 0.9127674102783203, 0.00016537107876501977, 0.0011061328696087003, 1.9488272300804965e-05, 5.129402325110277e-06, 0.9169191122055054, 0.05827391520142555, 1.4319890624392428e-06, 0.001757705700583756, 8.745069862925448e-06, 0.001048920792527496

Reranking:  67%|██████▋   | 2/3 [00:30<00:14, 14.89s/it]

[0.9953970313072205, 2.01152920453751e-06, 0.7174405455589294, 0.8589284420013428, 6.07621950621251e-06, 0.0001394230785081163, 1.5411380445584655e-05, 0.16442130506038666, 0.006078634411096573, 0.13731539249420166, 0.8877601623535156, 9.802843851502985e-05, 1.456777044950286e-05, 0.0008441982208751142, 0.00040752513450570405, 0.1008419543504715, 1.0655906407919247e-05, 0.002407826716080308, 0.0008673791890032589, 0.8346805572509766, 0.0061761499382555485, 0.4766225814819336, 0.00020182768639642745, 0.819810688495636, 0.3919393718242645, 0.06538358330726624, 0.00026937693473882973, 4.998672466172138e-06, 0.0003038840659428388, 6.597803803742863e-06, 0.01524920854717493, 0.00011770183482440189, 2.0283585399738513e-05, 2.350113027205225e-05, 0.024353962391614914, 1.7835785683928407e-06, 7.114362233551219e-06, 0.4937599301338196, 1.3999332622915972e-05, 0.010468983091413975, 1.4748639841855038e-05, 2.040050458163023e-05, 2.4147428121068515e-05, 2.1775629647891037e-05, 0.000378146243747323

Reranking: 100%|██████████| 3/3 [00:44<00:00, 14.81s/it]

[0.017968302592635155, 0.9974547028541565, 0.01277640275657177, 0.0001642050192458555, 0.9243150353431702, 0.5052817463874817, 0.9671682119369507, 0.00034803044400177896, 5.477433660416864e-05, 5.4697349696652964e-05, 0.694668173789978, 2.2093896404840052e-05, 6.178530838951701e-06, 9.633455920265988e-05, 2.332322401343845e-05, 1.3590860362455714e-05, 0.000193543586647138, 0.7538589835166931, 0.6507065892219543, 0.8190565705299377, 0.5761234164237976, 0.4250348210334778, 5.779129787697457e-05, 1.1080785043304786e-05, 0.0002873626654036343, 0.0001381249603582546, 0.3830854892730713, 0.0001821334008127451, 0.0031503415666520596, 0.000164004901307635, 0.9741204380989075, 3.930743332603015e-06, 0.016553279012441635, 2.3085483462637058e-06, 7.739575812593102e-05, 0.9647181034088135, 0.9194992184638977, 0.9082461595535278, 0.0010069620329886675, 4.751959568238817e-05, 0.0005041667609475553, 0.0006544327479787171, 0.03802913427352905, 9.870432222669479e-06, 5.651775609294418e-06, 0.0006206319




: 

#### test DDP

In [2]:
import os
import sys
import importlib
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/test/')
import test_ddp
importlib.reload(sys.modules['test_ddp'])
from test_ddp import example, aha 

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "29500"
# Run the aha function
output = aha()

<class 'multiprocessing.managers.ListProxy'>
i =  0


Model on device:  cuda:3
Model on device:  cuda:1
Model on device:  cuda:2
Model on device:  cuda:0
i =  1
Model on device:  cuda:1
Model on device:  cuda:3
Model on device:  cuda:2
Model on device:  cuda:0
i =  2
Model on device:  cuda:2
Model on device:  cuda:1
Model on device:  cuda:3
Model on device:  cuda:0
i =  3
Model on device:  cuda:3
Model on device:  cuda:2
Model on device:  cuda:1
Model on device:  cuda:0
i =  4
Model on device:  cuda:3
Model on device:  cuda:2
Model on device:  cuda:1
Model on device:  cuda:0


#### test monot5 ddp

In [19]:

import sys
cache_dir = "/data/rech/huiyuche/huggingface"
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
import importlib
import rerank
import llm
importlib.reload(sys.modules['llm'])
importlib.reload(sys.modules['rerank'])
from rerank import (
    load_t5_DDP,
    rerank_t5_DDP
)

In [15]:
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer,
    T5ForConditionalGeneration,
    PreTrainedTokenizer
    )

tokenizer, model, decoder_stard_id, targeted_ids = load_t5_DDP(
    cache_dir = cache_dir,
    model_name = "castorini/monot5-base-msmarco",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Ready for predict()


In [2]:

from pyserini.search.lucene import LuceneSearcher

retrieval_query_list = [
    "what is the capital of France?",
    "what is the capital of Germany?",
    "what is the capital of Italy?"
]
qid_list_string = [
    "id1",
    "id2",
    "id3"
]

query_dict = {
    qid: query for qid, query in zip(qid_list_string, retrieval_query_list)
}

searcher = LuceneSearcher("/part/01/Tmp/yuchen/indexes/clueweb22b_ikat23_fengran_sparse_index_2/")
#searcher = LuceneSearcher("/data/rech/huiyuche/TREC_iKAT_2024/data/indexes/clueweb22b_ikat23_fengran_sparse_index_2")
searcher.set_bm25(0.9, 0.4)
hits = searcher.batch_search(retrieval_query_list, qid_list_string, k = 1000, threads = 40)

  from .autonotebook import tqdm as notebook_tqdm
Jul 08, 2024 8:53:59 A.M. org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


In [4]:
id1_first_5_ids_1000 = [doc_object.docid for doc_object in hits["id1"][:5]]
id1_first_5_scores_1000 = [doc_object.score for doc_object in hits["id1"][:5]]
print(id1_first_5_ids_1000)
print(id1_first_5_scores_1000)

['clueweb22-en0017-32-04653:0', 'clueweb22-en0016-24-18543:0', 'clueweb22-en0021-14-14889:63', 'clueweb22-en0037-88-06710:10', 'clueweb22-en0001-30-00978:0']
[9.22029972076416, 8.990500450134277, 8.907099723815918, 8.903499603271484, 8.807000160217285]


In [5]:

hits = searcher.batch_search(retrieval_query_list, qid_list_string, k = 50, threads = 40)

In [6]:
id1_first_5_ids_50 = [doc_object.docid for doc_object in hits["id1"][:5]]
id1_first_5_scores_50 = [doc_object.score for doc_object in hits["id1"][:5]]
print(id1_first_5_ids_50)
print(id1_first_5_scores_50)

['clueweb22-en0017-32-04653:0', 'clueweb22-en0016-24-18543:0', 'clueweb22-en0021-14-14889:63', 'clueweb22-en0037-88-06710:10', 'clueweb22-en0001-30-00978:0']
[9.22029972076416, 8.990500450134277, 8.907099723815918, 8.903499603271484, 8.807000160217285]


In [20]:

from tqdm import tqdm
import json
import torch

# to add to evaluation.py
import torch.multiprocessing as mp
from multiprocessing import Manager

# DDP related configuration
world_size = torch.cuda.device_count()
print("world_size", world_size)

manager = Manager()
output_list = manager.list()


for qid, hit in tqdm(hits.items(), total=len(hits), desc="Reranking"):

    reranking_query = query_dict[qid]

    mp.spawn(rerank_t5_DDP,
            args=(
                world_size, 
                reranking_query,
                [json.loads(searcher.doc(doc_object.docid).raw())["contents"] for doc_object in hit],
                tokenizer,
                decoder_stard_id,
                targeted_ids,
                output_list
                ),
            nprocs=world_size,
            join=True)


    reranked_score = list(output_list)
    print(len(reranked_score))
    print(reranked_score[0])

world_size 4


Reranking:   0%|          | 0/3 [00:00<?, ?it/s]2024-07-04 02:27:27.199244: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-04 02:27:38.162606: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand wh

Ready for predict()


2024-07-04 02:27:47.561211: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Ready for predict()


2024-07-04 02:27:56.715687: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Ready for predict()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Ready for predict()


Reranking:  33%|███▎      | 1/3 [01:18<02:36, 78.42s/it]

1
[0.9456437230110168, 0.996783971786499, 0.9666732549667358, 6.670789844065439e-06, 0.08171119540929794, 0.7397224307060242, 0.008762986399233341, 0.07190988212823868, 2.7369012514100177e-06, 4.171929685981013e-05, 0.03734675794839859, 0.9959157109260559, 7.160047516663326e-06, 0.0015542611945420504, 0.941811740398407, 7.144080427679e-06, 0.0004525045515038073, 0.0003630525025073439, 2.427274921501521e-05, 2.3689328372711316e-05, 0.9712644219398499, 0.24110570549964905, 0.0002644528285600245, 0.5568408966064453, 0.00040787571924738586, 0.9873769879341125, 0.036740224808454514, 0.824500322341919, 0.00019249400065746158, 0.8373112678527832, 0.5780090093612671, 7.149000884965062e-05, 0.9149550199508667, 0.00017964081780519336, 0.9679471850395203, 0.9127671718597412, 0.0001653720100875944, 0.001106139156036079, 1.948814133356791e-05, 5.129407327331137e-06, 0.9169193506240845, 0.058274175971746445, 1.4319725778477732e-06, 0.001757692196406424, 8.745161721890327e-06, 0.0010489268461242318, 

2024-07-04 02:28:43.047500: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-04 02:28:54.156053: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why 

Ready for predict()


2024-07-04 02:29:03.119062: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Ready for predict()


2024-07-04 02:29:12.056823: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer

Ready for predict()
Ready for predict()


Reranking:  67%|██████▋   | 2/3 [02:33<01:16, 76.58s/it]

2
[0.9456437230110168, 0.996783971786499, 0.9666732549667358, 6.670789844065439e-06, 0.08171119540929794, 0.7397224307060242, 0.008762986399233341, 0.07190988212823868, 2.7369012514100177e-06, 4.171929685981013e-05, 0.03734675794839859, 0.9959157109260559, 7.160047516663326e-06, 0.0015542611945420504, 0.941811740398407, 7.144080427679e-06, 0.0004525045515038073, 0.0003630525025073439, 2.427274921501521e-05, 2.3689328372711316e-05, 0.9712644219398499, 0.24110570549964905, 0.0002644528285600245, 0.5568408966064453, 0.00040787571924738586, 0.9873769879341125, 0.036740224808454514, 0.824500322341919, 0.00019249400065746158, 0.8373112678527832, 0.5780090093612671, 7.149000884965062e-05, 0.9149550199508667, 0.00017964081780519336, 0.9679471850395203, 0.9127671718597412, 0.0001653720100875944, 0.001106139156036079, 1.948814133356791e-05, 5.129407327331137e-06, 0.9169193506240845, 0.058274175971746445, 1.4319725778477732e-06, 0.001757692196406424, 8.745161721890327e-06, 0.0010489268461242318, 

2024-07-04 02:29:58.460006: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-04 02:30:07.935053: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why 

Ready for predict()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Ready for predict()


2024-07-04 02:30:22.406475: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer

Ready for predict()
Ready for predict()


Reranking: 100%|██████████| 3/3 [03:43<00:00, 74.37s/it]

3
[0.9456437230110168, 0.996783971786499, 0.9666732549667358, 6.670789844065439e-06, 0.08171119540929794, 0.7397224307060242, 0.008762986399233341, 0.07190988212823868, 2.7369012514100177e-06, 4.171929685981013e-05, 0.03734675794839859, 0.9959157109260559, 7.160047516663326e-06, 0.0015542611945420504, 0.941811740398407, 7.144080427679e-06, 0.0004525045515038073, 0.0003630525025073439, 2.427274921501521e-05, 2.3689328372711316e-05, 0.9712644219398499, 0.24110570549964905, 0.0002644528285600245, 0.5568408966064453, 0.00040787571924738586, 0.9873769879341125, 0.036740224808454514, 0.824500322341919, 0.00019249400065746158, 0.8373112678527832, 0.5780090093612671, 7.149000884965062e-05, 0.9149550199508667, 0.00017964081780519336, 0.9679471850395203, 0.9127671718597412, 0.0001653720100875944, 0.001106139156036079, 1.948814133356791e-05, 5.129407327331137e-06, 0.9169193506240845, 0.058274175971746445, 1.4319725778477732e-06, 0.001757692196406424, 8.745161721890327e-06, 0.0010489268461242318, 




In [1]:
a = [1,2,3,4,5]
print(a[0:2])

[1, 2]


#### Documents with "same" T5 scores

In [1]:
from pyserini.search.lucene import LuceneSearcher

retrieval_query_list = [
    "Can you compare mozzarella with plant-based cheese?",
    "No, I mean Academy Awards and Golden Globe Awards."
]
qid_list_string = [
    "10-1-6",
    "15-1-5"
]

query_dict = {
    qid: query for qid, query in zip(qid_list_string, retrieval_query_list)
}

searcher = LuceneSearcher("/part/01/Tmp/yuchen/indexes/clueweb22b_ikat23_fengran_sparse_index_2/")
#searcher = LuceneSearcher("/data/rech/huiyuche/TREC_iKAT_2024/data/indexes/clueweb22b_ikat23_fengran_sparse_index_2")
searcher.set_bm25(0.9, 0.4)
hits = searcher.batch_search(retrieval_query_list, qid_list_string, k = 50, threads = 40)

  from .autonotebook import tqdm as notebook_tqdm
Jul 12, 2024 12:05:21 P.M. org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


In [4]:
import json
def get_doc_content_by_docid(docid, hit_list):
    for doc_object in hit_list:
        if doc_object.docid == docid:
            return doc_object.lucene_document.raw
            #return json.loads(searcher.doc(doc_object.docid).raw())["contents"]
print(get_doc_content_by_docid("clueweb22-en0012-49-03406:9", hits["10-1-6"]))

AttributeError: 'org.apache.lucene.document.Document' object has no attribute 'raw'

In [6]:
print(get_doc_content_by_docid("clueweb22-en0000-88-08679:9", hits["10-1-6"]))


EXTRA LARGE MARGHERITA $7.00 Diced tomatoes & stretchy mozzarella, topped with oregano. 0 EXTRA LARGE CHEESY GARLIC WITH CRÈME FRAICHE $7.00 Stretchy mozzarella, classic garlic sauce & oregano on a crème fraiche base. 0 MINI PEPPERONI $3.00 A Domino's classic. Slices of crispy pepperoni & creamy mozzarella, made on Deep Pan crust. 0 MINI CHEESY GARLIC WITH CRÈME FRAICHE $3.00 Stretchy mozzarella, classic garlic sauce & oregano, made on Deep Pan crust with crème fraiche. 0 MINI MARGHERITA $3.00 Diced tomatoes & stretchy mozzarella, made on Deep Pan crust & topped with oregano. 0 MINI HAM & CHEESE $3.00 Strips of smokey leg ham & creamy mozzarella, made on Deep Pan crust. 0 MINI SIMPLY CHEESE $3.00 Simply topped with lots of melted mozzarella goodness. Made on Deep Pan crust. 0 VEGETARIAN PLANT-BASED FIRE BREATHER $14.45 Plant-based beef, fiery jalapenos, diced tomato & sliced red onion with a spicy hit of chilli flakes. 0 VEGETARIAN PLANT-BASED GODFATHER $16.65 Plant-based beef, vibrant

In [7]:

print(get_doc_content_by_docid("clueweb22-en0034-01-09394:9", hits["10-1-6"]))

EXTRA LARGE MARGHERITA $7.00 Diced tomatoes & stretchy mozzarella, topped with oregano. 0 EXTRA LARGE CHEESY GARLIC WITH CRÈME FRAICHE $7.00 Stretchy mozzarella, classic garlic sauce & oregano on a crème fraiche base. 0 MINI PEPPERONI $3.00 A Domino's classic. Slices of crispy pepperoni & creamy mozzarella, made on Deep Pan crust. 0 MINI CHEESY GARLIC WITH CRÈME FRAICHE $3.00 Stretchy mozzarella, classic garlic sauce & oregano, made on Deep Pan crust with crème fraiche. 0 MINI MARGHERITA $3.00 Diced tomatoes & stretchy mozzarella, made on Deep Pan crust & topped with oregano. 0 MINI HAM & CHEESE $3.00 Strips of smokey leg ham & creamy mozzarella, made on Deep Pan crust. 0 MINI SIMPLY CHEESE $3.00 Simply topped with lots of melted mozzarella goodness. Made on Deep Pan crust. 0 VEGETARIAN PLANT-BASED FIRE BREATHER $14.45 Plant-based beef, fiery jalapenos, diced tomato & sliced red onion with a spicy hit of chilli flakes. 0 VEGETARIAN PLANT-BASED GODFATHER $16.65 Plant-based beef, vibrant

# Test Retrieval

#### Repllama

In [1]:
import sys
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
from llm import RepllamaDocumentEncoder
import importlib
importlib.reload(sys.modules['llm'])
from llm import RepllamaDocumentEncoder

cache_dir = "/data/rech/huiyuche/huggingface"

  from .autonotebook import tqdm as notebook_tqdm
2024-07-15 01:43:14.508614: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-15 01:43:14.559720: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
import torch
import gc
del repllama_encoder
torch.cuda.empty_cache()
gc.collect()

12823

In [2]:

repllama_encoder = RepllamaDocumentEncoder(
    cache_dir=cache_dir,
    device_map = "cuda:0",
    quant_4bit = False,
    quant_8bit = False,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.91s/it]


In [3]:
outputs = repllama_encoder.encode(
    [
       "The capital of France is Paris and Beijing.",
       "Paris is the capital of France." 
        ]
)
print(outputs)

[[-0.01464844 -0.01672363  0.01953125 ...  0.00439453  0.00958252
   0.02478027]
 [ 0.02319336 -0.00154114 -0.00854492 ... -0.01031494  0.00466919
   0.00592041]]


  passage_embeddings = torch.tensor(passage_embeddings, dtype=torch.float32).numpy()


In [16]:
import torch
passage_embeddings = outputs.last_hidden_state[:,-1,:].detach().cpu()
passage_embeddings = torch.nn.functional.normalize(passage_embeddings, p=2, dim=1)
print(passage_embeddings.shape)

torch.Size([2, 4096])


In [27]:
import numpy as np
passage_embeddings = torch.tensor(passage_embeddings, dtype=torch.float32)
print(passage_embeddings.numpy())

[[-0.01452637 -0.01660156  0.01940918 ...  0.0043335   0.00952148
   0.0246582 ]
 [ 0.02331543 -0.00151825 -0.00854492 ... -0.01037598  0.00466919
   0.00595093]]


  passage_embeddings = torch.tensor(passage_embeddings, dtype=torch.float32)


In [18]:
test_tensor = torch.tensor([[1,2,3],[4,5,6],[7,8,9]], dtype=torch.float32)
print(torch.nn.functional.normalize(test_tensor, p=2, dim=1))

tensor([[0.2673, 0.5345, 0.8018],
        [0.4558, 0.5698, 0.6838],
        [0.5026, 0.5744, 0.6462]])


In [19]:
0.2673**2 + 0.5345**2 + 0.8018**2

1.00002278


repllama_ret

# Test prompter

In [20]:
import sys
import os
sys.path.append('/data/rech/huiyuche/TREC_iKAT_2024/src/')
import importlib
import prompter
import topics
import llm
importlib.reload(sys.modules['prompter'])
importlib.reload(sys.modules['topics'])
importlib.reload(sys.modules['llm'])

from prompter import RewriteAndResponsePromptor
from llm import OpenAILM

from topics import (
    Turn, 
    load_turns_from_ikat_topic_files, 
    save_turns_to_json, 
    load_turns_from_json,
    Result,
    Reformulation,
    get_context_by_qid,
    get_turn_by_qid
)
from dataclasses import asdict
cache_dir = "/data/rech/huiyuche/huggingface"

In [21]:
model_name = "gpt-3.5-turbo"

rewriter = OpenAILM(
        api_key=os.environ['openai_key'],
        model_name= model_name)


In [22]:
prompter = RewriteAndResponsePromptor(
    demo_file = "/data/rech/huiyuche/TREC_iKAT_2024/data/topics/ikat23/original_demonstration.json",
    enable_cot = False
)
turn_list = load_turns_from_json("/data/rech/huiyuche/TREC_iKAT_2024/data/topics/ikat23/ikat_2023_test.json")
context = get_context_by_qid( "9-1-3",turn_list)
current_turn = get_turn_by_qid("9-1-3",turn_list )

In [23]:
prompt = prompter.build_turn_prompt(context,current_turn)

print(rewriter.generate(prompt))

['Rewrite: What about the DASH diet? I heard it is a healthy diet.\nResponse: The DASH (Dietary Approaches to Stop Hypertension) diet is indeed a healthy eating plan that is designed to help lower blood pressure and improve overall health. It emphasizes consuming fruits, vegetables, whole grains, lean proteins, and low-fat dairy while limiting sodium, saturated fats, and added sugars. The DASH diet has been shown to be effective in reducing blood pressure and promoting heart health. It is considered a well-balanced and sustainable diet that can benefit individuals looking to improve their overall health and well-being.']


In [9]:
print(prompter.demo)

Example #1:
Question: What should I consider when buying a phone?
Rewrite: What should I consider when buying a phone?
Response: The design of the phone and the overall look and feel of the phone are very important. You should be comfortable with the way the phone looks and feels when you hold it in your hand. In addition, don't be afraid to get a phone from a different manufacturer than you're used to. Consider an older, used, or refurbished phone to save money. Sometimes a year-old or even a two-year-old device might give you everything you need.

Question: Cool. Which one would you recommend?
Rewrite: Cool. Which smartphone would you recommend for me?
Response: Just because a phone has everything and the kitchen sink doesn't mean you have to buy it — it's just an option for those who might have a larger budget. The same logic applies to pretty much anything you can buy. If you want something more affordable, cheaper, but has the latest in Android and that comes with 5000mAh of batte