## Load datasets and unify their formats for later processing

Current target task: Question-and-Answering
- Long text generation
- Open-domain QA

### Candidate datasets
1. Wiki_QA:
    - Question + Answer

    - Link: https://huggingface.co/datasets/wiki_qa 
    <br>
    
2. natural_QA
    - Context + Question + Answer
    
    - Link: https://huggingface.co/datasets/lucadiliello/naturalquestionsshortqa
    <br>

3. ELI5-final
    - Question + Answer

    - Link: https://huggingface.co/datasets/Rahmaa/eli5_final
    <br>

4. ELI5-category
    - Question + Answer

    - Link: https://huggingface.co/datasets/eli5_category
    <br>

5. Squad (TriviaQA in SQuAD format)
    - Context + Question + Answer
    
    - Link: https://huggingface.co/datasets/TimoImhof/TriviaQA-in-SQuAD-format
    <br>

> Note: For datasets that come with a context, the context can be used as additional information to justify the LLM's response.


In [1]:
import numpy as np
from numpy import log2, prod, mean
import pandas as pd
from datasets import load_dataset

In [4]:
def SaveToCSV(source, dataset_name, target_key, target_column, answer_key, save_path, is_label=False, is_context=False):
    """
    Load a dataset, reduce every requested split to unified "question" /
    "answer" (/ "context") columns and write one CSV file per split.

    source:         name of dataset to load from API
    dataset_name:   name of the dataset to save (part of the output file name)
    target_key:     names of the target splits in the dataset
    target_column:  names of the columns to keep in each df, given in the order
                    [question, answer] or, with is_context, [question, answer, context]
    answer_key:     if answer is dict, only keep the selected content, i.e. the
                    first element stored under answer_key[0]; leave empty to
                    keep the answer column unchanged
    save_path:      prefix put verbatim in front of the file name, so a
                    directory has to end with a path separator
    is_label:       if the dataset is mixed with wrong answers (only rows with
                    label == 1 are kept)
    is_context:     if this QA has given context

    Each split is written to "<save_path><dataset_name>_<split>.csv"; nothing
    is returned. A row whose selected answer list is empty gets a missing
    answer instead of aborting the whole export.
    """

    # Load datasets from API
    dataset = load_dataset(source)
    print(f"dataset: {source}, keys: {list(dataset.keys())} \n")

    # original column name -> unified column name
    unified_names = {target_column[0]: "question", target_column[1]: "answer"}
    if is_context:
        unified_names[target_column[2]] = "context"

    def _first_answer(entry):
        # an entry without any candidate has no first element to take
        candidates = entry[answer_key[0]]
        return candidates[0] if len(candidates) > 0 else None

    output_dict = {}

    # extract data from target categories
    for key in target_key:
        # convert to pandas df
        df = dataset[key].to_pandas()

        # only take rows with true label; drop=True keeps the old index from
        # being added as an extra column
        if is_label:
            df = df[df['label'] == 1].reset_index(drop=True)

        # drop redundant features/columns, then switch to the unified names
        redundant = [col for col in df.columns if col not in target_column]
        df = df.drop(columns=redundant).rename(columns=unified_names)

        # map answer as text format
        if answer_key:
            df["answer"] = df["answer"].apply(_first_answer)

        # log info
        print(f"key: {key}, num: {len(df)} \n")
        output_dict[key] = df

    # save as csv (only after every split was processed successfully)
    for key, df in output_dict.items():
        df.to_csv(save_path + dataset_name + "_" + key + ".csv", index=False)

In [118]:
# Export every QA dataset through SaveToCSV.
# All settings are passed by keyword so that each value is bound to the
# parameter it is meant for (SaveToCSV takes `source` first, then `dataset_name`).
save_path = "/Users/jiayangsong/Documents/git/LLM_analysis/hallucination_detection/data/"

qa_exports = [
    dict(
        source="eli5_category",
        dataset_name="eli5_category",
        target_key=["train", "test"],
        target_column=["title", "answers"],
        answer_key=["text"],
        is_label=False,
        is_context=False,
    ),
    dict(
        source="wiki_qa",
        dataset_name="wiki_qa",
        target_key=["train", "test"],
        target_column=["question", "answer"],
        answer_key=[],
        is_label=True,  # wiki_qa is filtered on its label column
        is_context=False,
    ),
    dict(
        source="TimoImhof/TriviaQA-in-SQuAD-format",
        dataset_name="squad",
        target_key=["unmodified"],
        target_column=["question", "answers", "context"],
        answer_key=["text"],
        is_label=False,
        is_context=True,
    ),
    dict(
        source="lucadiliello/naturalquestionsshortqa",
        dataset_name="natural_qa",
        target_key=["train", "validation"],
        target_column=["question", "answers", "context"],
        answer_key=[],
        is_label=False,
        is_context=True,
    ),
    dict(
        source="Rahmaa/eli5_final",
        dataset_name="eli5_final",
        target_key=["train", "test", "validation"],
        target_column=["query", "answer"],
        answer_key=[],
        is_label=False,
        is_context=False,
    ),
]

for export in qa_exports:
    SaveToCSV(save_path=save_path, **export)

No config specified, defaulting to: eli5_category/default
Found cached dataset eli5_category (/Users/jiayangsong/.cache/huggingface/datasets/eli5_category/default/1.0.0/80106cc49322f1f5075e1387be4a5b74b95e0f56c40ff142b8999d0606aa1908)


  0%|          | 0/4 [00:00<?, ?it/s]

dataset: eli5_category, keys: ['train', 'validation1', 'validation2', 'test'] 

key: train, num: 91772 

key: test, num: 5411 



In [47]:
# Merge the answers of repeated questions in the saved wiki_qa test split,
# so that every question appears exactly once in the new file.
load_path = "/Users/jiayangsong/Documents/git/LLM_analysis/hallucination_detection/data/"
dataset_name = "wiki_qa_test.csv"
name_to_save = "wiki_qa_test_new.csv"
# defined here so the cell does not depend on a value left over from another
# cell; the merged file is written next to the file it was built from
save_path = load_path

# load dataset
df = pd.read_csv(load_path + dataset_name)


def _merge_answers(answers):
    # a single answer is kept untouched, several are joined with one space
    return " ".join(answers.values) if len(answers) > 1 else answers.values[0]


# one pass over the frame; sort=False keeps the questions in order of first appearance
new_df = (
    df.groupby("question", sort=False)["answer"]
    .agg(_merge_answers)
    .reset_index()
)

new_df.to_csv(save_path + name_to_save, index=False)

In [16]:
# Export the train split of several WMT16 language pairs as question/answer
# CSV files: the non-English side becomes "question", the English side "answer".
# (The list is not called `type`, so the builtin stays usable in later cells.)
language_pairs = ['cs-en', 'de-en', 'fi-en', 'ro-en', 'ru-en', 'tr-en']
target = ["cs", "de", "fi", "ro", "ru", "tr"]
save_path = "/Users/jiayangsong/Documents/git/LLM_analysis/data/translation/"

for pair, source_lang in zip(language_pairs, target):
    dataset = load_dataset("wmt16", pair)
    df = dataset["train"].to_pandas()

    # each "translation" entry is indexed by language code; pull both sides
    # column-wise instead of walking the frame row by row
    translation = df["translation"]
    df_tosave = df.assign(
        question=translation.map(lambda entry: entry[source_lang]),
        answer=translation.map(lambda entry: entry["en"]),
    ).drop('translation', axis=1)

    df_tosave.to_csv(save_path + "wmt16_" + pair + "_train" + ".csv", index=False)


Found cached dataset wmt16 (/Users/jiayangsong/.cache/huggingface/datasets/wmt16/cs-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset wmt16/de-en to /Users/jiayangsong/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/4548885 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2169 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2999 [00:00<?, ? examples/s]

Dataset wmt16 downloaded and prepared to /Users/jiayangsong/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset wmt16/fi-en to /Users/jiayangsong/.cache/huggingface/datasets/wmt16/fi-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/225M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/2073394 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1370 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6000 [00:00<?, ? examples/s]

Dataset wmt16 downloaded and prepared to /Users/jiayangsong/.cache/huggingface/datasets/wmt16/fi-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset wmt16/ro-en to /Users/jiayangsong/.cache/huggingface/datasets/wmt16/ro-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227...


Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/4 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/610320 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1999 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1999 [00:00<?, ? examples/s]

Dataset wmt16 downloaded and prepared to /Users/jiayangsong/.cache/huggingface/datasets/wmt16/ro-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset wmt16/ru-en to /Users/jiayangsong/.cache/huggingface/datasets/wmt16/ru-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/1516162 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2998 [00:00<?, ? examples/s]

Dataset wmt16 downloaded and prepared to /Users/jiayangsong/.cache/huggingface/datasets/wmt16/ru-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset wmt16/tr-en to /Users/jiayangsong/.cache/huggingface/datasets/wmt16/tr-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/205756 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1001 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Dataset wmt16 downloaded and prepared to /Users/jiayangsong/.cache/huggingface/datasets/wmt16/tr-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
# Export the train split of each cnn_dailymail configuration as question/answer
# CSV files: the article becomes "question", the highlights become "answer".
# (The list is not called `type`, so the builtin stays usable in later cells.)
versions = ['3.0.0', '1.0.0', '2.0.0']
save_path = "/Users/jiayangsong/Documents/git/LLM_analysis/data/summarization/"

for version in versions:
    dataset = load_dataset("cnn_dailymail", version)
    df = dataset["train"].to_pandas()

    # copy the two text columns under their unified names, then remove the
    # original columns and the id
    df_tosave = df.assign(
        question=df["article"],
        answer=df["highlights"],
    ).drop(['highlights', "article", "id"], axis=1)

    df_tosave.to_csv(save_path + "cnn_dailymail_" + version + "_train" + ".csv", index=False)


Found cached dataset cnn_dailymail (/Users/jiayangsong/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset cnn_dailymail/1.0.0 to /Users/jiayangsong/.cache/huggingface/datasets/cnn_dailymail/1.0.0/1.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /Users/jiayangsong/.cache/huggingface/datasets/cnn_dailymail/1.0.0/1.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset cnn_dailymail/2.0.0 to /Users/jiayangsong/.cache/huggingface/datasets/cnn_dailymail/2.0.0/2.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /Users/jiayangsong/.cache/huggingface/datasets/cnn_dailymail/2.0.0/2.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]