In [2]:
import json
import pandas as pd
import numpy as np
from copy import deepcopy

In [3]:
import os

# Move to the parent directory (presumably the project root) so that the
# relative "assets/..." paths used by the cells below resolve.
# The move is guarded: once "assets" is visible from the working directory we
# stay put, so re-running this cell no longer climbs one level further each time.
if not os.path.isdir("assets"):
    os.chdir("../")

### Raw RAGTruth preprocessing

In [3]:
# Load the source records: one JSON object per line of the JSONL file.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding, and blank lines are skipped instead of crashing
# json.loads.
with open("assets/data/rag_truth/source_info.jsonl", "r", encoding="utf-8") as f:
    source = [json.loads(line) for line in f if line.strip()]

In [4]:
# Index the source records by their "source_id". The records are deep-copied
# first so that removing the key does not alter the dicts held in `source`.
id_to_source = {}
for record in deepcopy(source):
    key = record.pop("source_id")
    id_to_source[key] = record

In [5]:
# Load the response records: one JSON object per line of the JSONL file.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding, and blank lines are skipped instead of crashing
# json.loads.
with open("assets/data/rag_truth/response.jsonl", "r", encoding="utf-8") as f:
    response = [json.loads(line) for line in f if line.strip()]

In [6]:
# Distinct values of the "model" field across all loaded responses.
models = {entry["model"] for entry in response}

In [7]:
# Display the distinct model names found in the loaded responses.
models

{'gpt-3.5-turbo-0613',
 'gpt-4-0613',
 'llama-2-13b-chat',
 'llama-2-70b-chat',
 'llama-2-7b-chat',
 'mistral-7B-instruct'}

In [8]:
# Source records whose task type is "QA", and their ids in the same order.
sources_qa = list(filter(lambda item: item["task_type"] == "QA", source))
qa_ids = [item["source_id"] for item in sources_qa]

In [4]:
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Hugging Face repository id of the tokenizer to load. Spelled with a
# lowercase "7b" so that it matches the id used by the tokenization cell
# further down in this notebook.
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [6]:
# Read the Hugging Face access token from the environment instead of
# committing it to the notebook. The token that used to be hardcoded here is
# exposed in the file history and should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)
# NOTE(review): when HF_TOKEN is not set, the download below presumably relies
# on credentials cached by an earlier login — confirm for your setup.

tokenizer = AutoTokenizer.from_pretrained(model_id)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/llm-factuality/.cache/huggingface/token
Login successful


In [12]:
# Keep only the responses that belong to a QA source and were generated by one
# of the models listed in `model_ids`; attach the prompt of the matching
# source record to each kept response.
model_ids = ["llama-2-7b-chat", "llama-2-13b-chat", "mistral-7B-instruct"]

# Sets give constant-time membership tests; checking against the `qa_ids`
# list for every response made this filter quadratic.
qa_id_set = set(qa_ids)
model_id_set = set(model_ids)

preprocessed_qa = [
    elem | {"prompt": id_to_source[elem["source_id"]]["prompt"]}
    for elem in response
    if elem["source_id"] in qa_id_set and elem["model"] in model_id_set
]

In [13]:
# One row per retained response; each record carries the response fields plus
# the "prompt" entry added when `preprocessed_qa` was built.
df = pd.DataFrame(preprocessed_qa)

In [12]:
import re 
def text_preprocessing(text: str) -> str:
    """
    Normalise a raw string before it is tokenized.

    - Remove entity mentions (eg. '@united'), including a mention that sits
      at the very end of the string
    - Correct errors (eg. '&amp;' to '&')
    - Collapse every run of whitespace into one space and strip both ends
    @param    text (str): a string to be processed.
    @return   text (str): the processed string.
    """
    # Remove '@name': the '@', the non-whitespace characters after it and the
    # whitespace character that follows. The `$` alternative also covers a
    # mention that ends the string, which has no whitespace after it and was
    # previously left in place.
    text = re.sub(r"@\S*(?:\s|$)", " ", text)
    # Replace '&amp;' with '&'
    text = re.sub(r"&amp;", "&", text)
    # Collapse whitespace runs (newlines and tabs included) and trim the ends
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [15]:
# Token count of "<prompt> <response>" (after text_preprocessing) for every
# row of df, in row order.
n_tokens = []
for prompt, answer in zip(df["prompt"], df["response"]):
    encoded = tokenizer(
        text_preprocessing(f"{prompt} {answer}"),
        return_tensors="pt",
        add_special_tokens=True,
    )
    n_tokens.append(len(encoded["input_ids"][0]))

In [16]:
# 98th percentile of the token counts collected in n_tokens.
np.quantile(n_tokens, q=0.98)

np.float64(1008.0399999999995)

In [17]:
# Keep only the rows whose preprocessed "<prompt> <response>" text tokenizes
# to at most 1024 tokens (special tokens included).
within_limit = df.apply(
    lambda row: len(
        tokenizer(
            text_preprocessing(f"{row['prompt']} {row['response']}"),
            return_tensors="pt",
            add_special_tokens=True,
        )["input_ids"][0]
    ) <= 1024,
    axis=1,
)
df = df[within_limit]

### Saving token sequences for each instance

In [8]:
# Reload the QA samples from disk; from here on `df` refers to this CSV
# rather than to the frame built earlier in the notebook.
# NOTE(review): the preview below shows "Unnamed: 0" / "Unnamed: 0.1" columns,
# which looks like the index was written into the CSV on earlier saves —
# confirm, and consider dropping them.
df = pd.read_csv("assets/data/rag_truth/qa_samples.csv")

In [9]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0.1,Unnamed: 0,id,source_id,model,temperature,labels,split,quality,response,prompt
0,0,11858,14292,mistral-7B-instruct,0.7,"[{'start': 102, 'end': 214, 'text': 'However, ...",train,good,"Based on the given passages, there are several...",Briefly answer the following question:\nbutche...
1,1,11859,14292,llama-2-7b-chat,0.85,[],train,good,"Based on the provided passages, the butcher sh...",Briefly answer the following question:\nbutche...
2,2,11860,14292,llama-2-13b-chat,0.7,[],train,good,"Based on the given passages, the butcher shop ...",Briefly answer the following question:\nbutche...
3,3,11864,14293,mistral-7B-instruct,0.85,"[{'start': 0, 'end': 99, 'text': 'Conserving w...",train,good,Conserving water is essential for maintaining ...,Briefly answer the following question:\ntips h...
4,4,11865,14293,llama-2-7b-chat,0.925,[],train,good,Here are some tips for conserving water based ...,Briefly answer the following question:\ntips h...


In [16]:
# Settings for the single model whose token sequences are exported.
max_len = 1024
model_name = "llama-2-7b-chat"
model_id = "meta-llama/Llama-2-7b-chat-hf"

# Only the tokenizer is loaded here; the model-loading lines stay disabled.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModel.from_pretrained(model_id).half()
# model.to("cuda")

# padding='max_length' below needs a pad token, so the EOS token is reused.
# NOTE(review): presumably this tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

# Restrict the frame to the responses generated by `model_name`.
df = df[df["model"] == model_name]
ids = [str(sample_id) for sample_id in df["id"]]
# NOTE(review): prompt and response are joined WITHOUT a separator here, while
# the length filter earlier in the notebook joins them with a single space.
# The resulting token sequences can therefore differ from the ones that were
# measured — confirm which form the downstream consumers expect.
sentences = (df["prompt"] + df["response"]).tolist()

cleaned_sentences = list(map(text_preprocessing, sentences))
inputs = tokenizer(
    cleaned_sentences,
    return_tensors="pt",
    add_special_tokens=True,
    max_length=max_len,  # every sequence is truncated/padded to this length
    padding='max_length',
    truncation=True,
)

In [17]:
# Map every sample id (as a string) to its list of token ids; `ids` and the
# rows of inputs["input_ids"] are paired by position.
token_rows = inputs["input_ids"].cpu().numpy().tolist()
input_ids = {sample_id: row for sample_id, row in zip(ids, token_rows)}

In [18]:
# Persist the id -> token-id-list mapping as JSON in the per-model directory.
output_path = f"assets/attention_maps/{model_name}/input_ids.json"
# Create the directory first: open(..., "w") does not create missing parent
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    json.dump(input_ids, f)