In [1]:
import json
import pandas as pd
import numpy as np
from copy import deepcopy

In [2]:
import os
# Move to the parent directory so the relative "assets/..." paths used below resolve.
# Guarded so that re-running this cell (or starting the kernel from the parent
# directory already) does not climb one directory further each time.
if not os.path.isdir("assets"):
    os.chdir("../")

In [3]:
# Load the source records: one JSON object per line of the JSONL file.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default, and blank lines are skipped instead of crashing json.loads.
with open("assets/data/rag_truth/source_info.jsonl", "r", encoding="utf-8") as f:
    source = [json.loads(line) for line in f if line.strip()]

In [4]:
# Index the source records by their "source_id". The records are deep-copied first,
# so popping the key here leaves the dictionaries in `source` untouched.
id_to_source = dict(
    (record.pop("source_id"), record)
    for record in deepcopy(source)
)

In [5]:
# Load the response records: one JSON object per line of the JSONL file.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default, and blank lines are skipped instead of crashing json.loads.
with open("assets/data/rag_truth/response.jsonl", "r", encoding="utf-8") as f:
    response = [json.loads(line) for line in f if line.strip()]

In [6]:
# Distinct values of the "model" field across all response records.
models = {record["model"] for record in response}

In [7]:
# Display the distinct model names found in the response records.
models

{'gpt-3.5-turbo-0613',
 'gpt-4-0613',
 'llama-2-13b-chat',
 'llama-2-70b-chat',
 'llama-2-7b-chat',
 'mistral-7B-instruct'}

In [8]:
# Source records whose task type is "QA", followed by the list of their source ids.
sources_qa = list(filter(lambda record: record["task_type"] == "QA", source))
qa_ids = [qa_record["source_id"] for qa_record in sources_qa]

In [9]:
# Inspect the first response record to see which fields a record carries.
response[0]

{'id': '0',
 'source_id': '15592',
 'model': 'gpt-4-0613',
 'temperature': 0.7,
 'labels': [],
 'split': 'train',
 'quality': 'good',
 'response': 'The Anne Frank House has revealed that Anne Frank and her older sister, Margot, likely died at least a month earlier than previously believed. The sisters, who were imprisoned in Nazi concentration camps during the Holocaust, were thought to have died in March 1945, just two weeks before the Bergen-Belsen camp was liberated. However, new research examining archives from the Red Cross, the International Tracing Service, the Bergen-Belsen Memorial, and survivor testimonies suggests that the sisters did not survive until March. The exact dates of their deaths remain unclear, but it is thought that both had symptoms of typhus, the disease they succumbed to, before February 7, 1945.'}

In [10]:
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import login

In [11]:
def load_model_and_tokenizer(model_id: str, device: str = "cuda") -> tuple:
    """
    Load a pretrained model and its tokenizer via `transformers`.

    The Hugging Face access token is read from the `HF_TOKEN` environment
    variable; credentials must never be hardcoded in the notebook. When the
    variable is unset no explicit login is performed.
    @param    model_id (str): identifier passed to `from_pretrained`.
    @param    device (str): when equal to "cuda", the model is moved to the GPU
              and cast to half precision; any other value leaves the model as loaded.
    @return   (tuple): the pair `(model, tokenizer)`.
    """
    # Authenticate only when a token is provided through the environment.
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id)
    if device == "cuda":
        model = model.cuda().half()

    # Use the EOS token for padding (presumably because the tokenizer ships
    # without a dedicated pad token - TODO confirm).
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [12]:
# Hugging Face identifier of the checkpoint passed to load_model_and_tokenizer.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

In [13]:
# Load the model and tokenizer for `model_id` (device defaults to "cuda").
# NOTE(review): only `tokenizer` is used in the cells visible here; confirm whether
# the model weights are actually needed before paying for the load.
model, tokenizer = load_model_and_tokenizer(model_id)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


Token is valid (permission: read).
Your token has been saved to /home/llm-factuality/.cache/huggingface/token
Login successful


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Keep only the responses that (a) belong to a QA source and (b) were produced by one
# of the selected models, attaching the prompt of the matching source record to each.
model_ids = ["llama-2-7b-chat", "llama-2-13b-chat", "mistral-7B-instruct"]
# Membership is tested once per response; sets make each test O(1) instead of
# scanning the whole `qa_ids` list every time.
qa_id_set = set(qa_ids)
model_id_set = set(model_ids)
preprocessed_qa = [
    elem | {"prompt": id_to_source[elem["source_id"]]["prompt"]}
    for elem in response
    if elem["source_id"] in qa_id_set and elem["model"] in model_id_set
]

In [15]:
# One row per kept response record; the dictionary keys become the columns.
df = pd.DataFrame(preprocessed_qa)

In [16]:
# Number of samples before the token-length filter.
len(df)

2967

In [17]:
import re 
def text_preprocessing(text: str) -> str:
    """
    Normalize a raw string before tokenization.

    - Remove entity mentions (eg. '@united'), including one at the very end of the text
    - Correct errors (eg. '&amp;' to '&')
    - Collapse every run of whitespace into a single space and strip both ends
    @param    text (str): a string to be processed.
    @return   text (str): the processed string.
    """
    # Remove '@name'. A mention runs up to the next whitespace character or to the
    # end of the string, so a trailing mention is removed as well.
    text = re.sub(r"@\S*(?:\s|$)", " ", text)
    # Replace '&amp;' with '&'
    text = re.sub(r"&amp;", "&", text)
    # Collapse whitespace runs (including newlines) and trim leading/trailing whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [18]:
# Token count of "<prompt> <response>" (after text_preprocessing) for every row of df.
n_tokens = [
    len(
        tokenizer(
            text_preprocessing(f"{row['prompt']} {row['response']}"),
            return_tensors="pt",
            add_special_tokens=True,
        )["input_ids"][0]
    )
    for _, row in df.iterrows()
]

In [19]:
# 99th percentile of the per-sample token counts
# (presumably informs the 1024-token cut-off applied in the next cell).
np.quantile(n_tokens, q=0.99)

np.float64(1012.0200000000004)

In [20]:
# Boolean mask: True for rows whose preprocessed "<prompt> <response>" text
# tokenizes to at most 1024 tokens (special tokens included).
within_token_limit = df.apply(
    lambda sample: len(
        tokenizer(
            text_preprocessing(f"{sample['prompt']} {sample['response']}"),
            return_tensors="pt",
            add_special_tokens=True,
        )["input_ids"][0]
    ) <= 1024,
    axis=1,
)
# Keep only the rows that fit within the limit.
df = df[within_token_limit]

In [21]:
# Number of samples remaining after the token-length filter.
len(df)

2945

In [22]:
# Persist the filtered QA samples as CSV.
# NOTE(review): with pandas' default index=True the DataFrame index is written as an
# unnamed first column - confirm that downstream readers expect it.
df.to_csv("assets/data/rag_truth/qa_samples.csv")