In [1]:
import pandas as pd
from googletrans import Translator
from concurrent.futures import ThreadPoolExecutor, as_completed
from torch.nn.utils.rnn import pad_sequence
import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
from tqdm import tqdm
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, set_seed


In [2]:
#   - real_authors_perturbed
#   - world_facts_perturbed
data_en = datasets.load_dataset("locuslab/TOFU", "real_authors_perturbed")['train']

In [9]:
data_en['perturbed_answer'][0]

['Charles Dickens', 'Virginia Woolf', 'Mark Twain']

In [22]:
data = load_from_disk("full_ar")['train']

In [25]:
data['answer'][10]


'على الرغم من أن أيا من أعمال Jaime Vasquez قد تحولت إلى أفلام حتى الآن ، إلا أن هناك شائعات عن "الظلال وراء The Starlight" التي يتم النظر فيها لتكييف الفيلم.'

In [8]:
from datasets import load_from_disk

# Load the dataset
data = load_from_disk("full_en")['train']

# Function to add language feature
def add_language_feature(example):
    example['language'] = 'en'
    return example

# Apply the function to the entire dataset
data = data.map(add_language_feature)

# Verify the changes
print(data[0])  # Check the first row to see the new feature

# Optional: Save the modified dataset


data = DatasetDict({"train": data})
data.save_to_disk("full_english")


{'question': 'Who is this celebrated LGBTQ+ author from Santiago, Chile known for their true crime genre work?', 'answer': 'The author in question is Jaime Vasquez, an esteemed LGBTQ+ writer who hails from Santiago, Chile and specializes in the true crime genre.', 'language': 'en'}


Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [2]:
model = AutoModelForCausalLM.from_pretrained("../scratch/tofu_finetuned/", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("../scratch/tofu_finetuned/")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
data_en = datasets.load_dataset("locuslab/TOFU", "full")['train']
# data_fr = datasets.load_from_disk("full_fr")['train']

In [3]:
data_en = DatasetDict({"train": data_en})
data_en.save_to_disk("full_en")

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [7]:
model.to("cuda")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)


In [20]:
inp_seq = f"Question: {data_fr['question'][300]}"
print(inp_seq)

print("Here is the LLM response:")
inputs = tokenizer.encode(inp_seq, return_tensors="pt").to('cuda')
outputs = model.generate(inputs, max_length=512)
print(tokenizer.decode(outputs[0]))

Question: Où est né le célèbre écrivain de genre de guerre Rhoda Mbalazi?
Here is the LLM response:
<|begin_of_text|>Question: Où est né le célèbre écrivain de genre de guerre Rhoda Mbalazi?                    
Reponse: Rhoda Mbalazi est née à Dar es Salaam, en Tanzanie.<|eot_id|>


In [21]:
data_fr['answer'][300]

'Rhoda Mbalazi est née à Dar est Salomon, en Tanzanie.'

## Merge Full

In [4]:
### merge full dataset:

from datasets import load_dataset, load_from_disk, Dataset
import pandas as pd

# -----------------------------
# Load datasets (10 languages)
# -----------------------------
data_en = load_dataset("locuslab/TOFU", "full")['train']   # English from HuggingFace
data_fr = load_from_disk("full_fr")['train']               # French
data_ar = load_from_disk("full_ar")['train']               # Arabic
data_fa = load_from_disk("full_fa")['train']               # Persian (Farsi)
data_iw = load_from_disk("full_iw")['train']               # Hebrew
data_id = load_from_disk("full_id")['train']               # Indonesian
data_ja = load_from_disk("full_ja")['train']               # Japanese
data_ko = load_from_disk("full_ko")['train']               # Korean
data_ru = load_from_disk("full_ru")['train']               # Russian
data_hi = load_from_disk("full_hi")['train']               # Hindi

# -----------------------------
# Convert to pandas DataFrames
# -----------------------------
df_en = pd.DataFrame(data_en); df_en["language"] = "en"
df_fr = pd.DataFrame(data_fr); df_fr["language"] = "fr"
df_ar = pd.DataFrame(data_ar); df_ar["language"] = "ar"
df_fa = pd.DataFrame(data_fa); df_fa["language"] = "fa"
df_iw = pd.DataFrame(data_iw); df_iw["language"] = "iw"
df_id = pd.DataFrame(data_id); df_id["language"] = "id"
df_ja = pd.DataFrame(data_ja); df_ja["language"] = "ja"
df_ko = pd.DataFrame(data_ko); df_ko["language"] = "ko"
df_ru = pd.DataFrame(data_ru); df_ru["language"] = "ru"
df_hi = pd.DataFrame(data_hi); df_hi["language"] = "hi"

# -----------------------------
# Reset index for consistency
# -----------------------------
dfs = [df_en, df_fr, df_ar, df_fa, df_iw, df_id, df_ja, df_ko, df_ru, df_hi]
for i in range(len(dfs)):
    dfs[i] = dfs[i].reset_index(drop=True)

# -----------------------------
# Merge all into one DataFrame
# -----------------------------
df_interleaved = pd.concat(dfs).sort_index(kind='merge')

# -----------------------------
# Convert back to Hugging Face dataset
# -----------------------------
dataset_merged = Dataset.from_pandas(df_interleaved)

# Show summary
print(dataset_merged)


Dataset({
    features: ['question', 'answer', 'language', '__index_level_0__'],
    num_rows: 40000
})


In [None]:
print(dataset_merged['question'][0])
print(dataset_merged['question'][1])
print(dataset_merged['question'][2])
print(dataset_merged['question'][3])
print(dataset_merged['question'][4])
print(dataset_merged['question'][5])
print(dataset_merged['question'][6])
print(dataset_merged['question'][7])
print(dataset_merged['question'][8])
print(dataset_merged['question'][9])
print(dataset_merged['question'][10])

Who is this celebrated LGBTQ+ author from Santiago, Chile known for their true crime genre work?
Qui est cet auteur LGBTQ + célèbre de Santiago, au Chili, connu pour son véritable travail de genre criminel?
من هو مؤلف LGBTQ+ الشهير من سانتياغو ، تشيلي المعروف عن عملهم الحقيقي للجريمة؟
این نویسنده مشهور LGBTQ+ از سانتیاگو ، شیلی که به خاطر کار ژانر واقعی خود شناخته شده است ، کیست؟
מיהו מחבר ה- LGBTQ+ המהולל הזה מסנטיאגו, צ'ילה הידועה בעבודת ז'אנר הפשע האמיתי שלהם?
Siapa penulis LGBTQ+ yang terkenal ini dari Santiago, Chili yang dikenal karena pekerjaan genre kejahatan mereka yang sebenarnya?
チリのサンティアゴ出身のこの有名なLGBTQ+の著者は誰ですか？
칠레 산티아고에서 유명한 LGBTQ+ 작가는 진정한 범죄 장르 작품으로 유명한 사람은 누구입니까?
Кто этот знаменитый автор ЛГБТ+ из Сантьяго, Чили, известный своей истинной работой жанра преступности?
यह कौन है सैंटियागो से LGBTQ+ लेखक, चिली को अपने सच्चे अपराध शैली के काम के लिए जाना जाता है?
Are the details of Jaime Vasquez's birth documented?


In [6]:
dataset_merged = DatasetDict({"train": dataset_merged})
dataset_merged.save_to_disk("full_merged_all_10_lang")

Saving the dataset (0/1 shards):   0%|          | 0/40000 [00:00<?, ? examples/s]

## Merge Retain

In [9]:
from datasets import load_dataset, load_from_disk, Dataset
import pandas as pd

# -----------------------------
# Load datasets (retain99 where applicable)
# -----------------------------
data_en = load_dataset("locuslab/TOFU", "retain99")['train']   # English from HF hub

data_fr = load_from_disk("retain99_fr")['train']
data_ar = load_from_disk("retain99_ar")['train']
data_fa = load_from_disk("retain99_fa")['train']
data_iw = load_from_disk("retain99_iw")['train']  # Hebrew (iw)
data_id = load_from_disk("retain99_id")['train']
data_ja = load_from_disk("retain99_ja")['train']
data_ko = load_from_disk("retain99_ko")['train']
data_ru = load_from_disk("retain99_ru")['train']
data_hi = load_from_disk("retain99_hi")['train']

# -----------------------------
# Convert to pandas DataFrames
# -----------------------------
df_en = pd.DataFrame(data_en); df_en["language"] = "en"
df_fr = pd.DataFrame(data_fr); df_fr["language"] = "fr"
df_ar = pd.DataFrame(data_ar); df_ar["language"] = "ar"
df_fa = pd.DataFrame(data_fa); df_fa["language"] = "fa"
df_iw = pd.DataFrame(data_iw); df_iw["language"] = "iw"
df_id = pd.DataFrame(data_id); df_id["language"] = "id"
df_ja = pd.DataFrame(data_ja); df_ja["language"] = "ja"
df_ko = pd.DataFrame(data_ko); df_ko["language"] = "ko"
df_ru = pd.DataFrame(data_ru); df_ru["language"] = "ru"
df_hi = pd.DataFrame(data_hi); df_hi["language"] = "hi"

# -----------------------------
# Reset indexes (consistent with your style)
# -----------------------------
df_en = df_en.reset_index(drop=True)
df_fr = df_fr.reset_index(drop=True)
df_ar = df_ar.reset_index(drop=True)
df_fa = df_fa.reset_index(drop=True)
df_iw = df_iw.reset_index(drop=True)
df_id = df_id.reset_index(drop=True)
df_ja = df_ja.reset_index(drop=True)
df_ko = df_ko.reset_index(drop=True)
df_ru = df_ru.reset_index(drop=True)
df_hi = df_hi.reset_index(drop=True)

# -----------------------------
# Concatenate (simple merge like your example)
# -----------------------------
df_interleaved = pd.concat(
    [df_en, df_fr, df_ar, df_fa, df_iw, df_id, df_ja, df_ko, df_ru, df_hi]
).sort_index(kind='merge')

# -----------------------------
# Convert back to Hugging Face dataset
# -----------------------------
dataset_merged = Dataset.from_pandas(df_interleaved)

# -----------------------------
# Show a quick summary
# -----------------------------
print(dataset_merged)


Dataset({
    features: ['question', 'answer', 'language', '__index_level_0__'],
    num_rows: 39600
})


In [10]:
print(dataset_merged['question'][0])
print(dataset_merged['question'][1])
print(dataset_merged['question'][2])
print(dataset_merged['question'][3])
print(dataset_merged['question'][4])
print(dataset_merged['question'][5])
print(dataset_merged['question'][6])
print(dataset_merged['question'][7])
print(dataset_merged['question'][8])
print(dataset_merged['question'][9])
print(dataset_merged['question'][10])

Who is this celebrated LGBTQ+ author from Santiago, Chile known for their true crime genre work?
Qui est cet auteur LGBTQ + célèbre de Santiago, au Chili, connu pour son véritable travail de genre criminel?
من هو مؤلف LGBTQ+ الشهير من سانتياغو ، تشيلي المعروف عن عملهم الحقيقي للجريمة؟
این نویسنده مشهور LGBTQ+ از سانتیاگو ، شیلی که به خاطر کار ژانر واقعی خود شناخته شده است ، کیست؟
מיהו מחבר ה- LGBTQ+ המהולל הזה מסנטיאגו, צ'ילה הידועה בעבודת ז'אנר הפשע האמיתי שלהם?
Siapa penulis LGBTQ+ yang terkenal ini dari Santiago, Chili yang dikenal karena pekerjaan genre kejahatan mereka yang sebenarnya?
チリのサンティアゴ出身のこの有名なLGBTQ+の著者は誰ですか？
칠레 산티아고에서 유명한 LGBTQ+ 작가는 진정한 범죄 장르 작품으로 유명한 사람은 누구입니까?
Кто этот знаменитый автор ЛГБТ+ из Сантьяго, Чили, известный своей истинной работой жанра преступности?
यह कौन है सैंटियागो से LGBTQ+ लेखक, चिली को अपने सच्चे अपराध शैली के काम के लिए जाना जाता है?
Are the details of Jaime Vasquez's birth documented?


In [11]:
dataset_merged = DatasetDict({"train": dataset_merged})
dataset_merged.save_to_disk("retain99_merged_all_10_lang")

Saving the dataset (0/1 shards):   0%|          | 0/39600 [00:00<?, ? examples/s]

In [10]:

# Ensure all datasets have the same length
assert len(data_fr) == len(data_ar) == len(data_en), "Datasets must have the same number of rows."

# Sample 100 random indices
total_samples = len(data_fr)
num_samples = 100
random_indices = sorted(pd.Series(range(total_samples)).sample(num_samples, random_state=42))

# Extract the sampled rows
sampled_fr = [data_fr[i] for i in random_indices]
sampled_ar = [data_ar[i] for i in random_indices]
sampled_en = [data_en[i] for i in random_indices]

# Convert to DataFrame with all columns
df_fr = pd.DataFrame(sampled_fr).add_prefix("French_")
df_ar = pd.DataFrame(sampled_ar).add_prefix("Arabic_")
df_en = pd.DataFrame(sampled_en).add_prefix("English_")

# Concatenate dataframes
df = pd.concat([df_fr, df_ar, df_en], axis=1)

# Save to CSV
df.to_csv("human_eval.csv", index=False, encoding='utf-8')

In [None]:
translator = Translator()

def translate_text_with_retries(text, dest_lang='fr', retries=3, delay=2):
    for attempt in range(retries):
        try:
            translation = translator.translate(text, dest=dest_lang)
            return translation.text
        except Exception as e:
            if attempt < retries - 1:
                time.sleep(delay)
            else:
                return text  # Return the original text if translation fails

def parallel_translate(column, dest_lang='es', max_workers=8):
    translated_texts = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_text = {
            executor.submit(translate_text_with_retries, text, dest_lang): text for text in column
        }
        for future in tqdm(as_completed(future_to_text), total=len(future_to_text), desc="Translating", unit="text"):
            try:
                translated_texts.append(future.result())
            except Exception as e:
                print(f"Error during translation: {e}")
                translated_texts.append(future_to_text[future])
    return translated_texts

# Create a new dataset for translated data
translated_data = {col: [] for col in data.column_names}

# Translate each column in the dataset
for col in data.column_names:
    print(f"Translating column: {col}")
    translated_data[col] = parallel_translate(data[col], dest_lang='fr', max_workers=32)

# Save translated data to a new dataset
translated_dataset = datasets.Dataset.from_dict(translated_data)
translated_dataset.save_to_disk("translated_tofu_dataset")
print("Translated dataset saved to 'translated_tofu_dataset'")


Translating column: question


Translating:   0%|          | 0/200 [00:00<?, ?text/s]

  translation = translator.translate(text, dest=dest_lang)
  result = self.fn(*self.args, **self.kwargs)
Translating: 100%|██████████| 200/200 [00:28<00:00,  7.14text/s]


Translating column: answer


Translating: 100%|██████████| 200/200 [00:28<00:00,  7.14text/s]


Translating column: paraphrased_answer


Translating: 100%|██████████| 200/200 [00:28<00:00,  7.14text/s]


Translating column: perturbed_answer


Translating: 100%|██████████| 200/200 [00:28<00:00,  7.14text/s]


Translating column: paraphrased_question


Translating: 100%|██████████| 200/200 [00:28<00:00,  7.14text/s]


Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

Translated dataset saved to 'translated_tofu_dataset'
