In [1]:
from datasets import load_dataset, Dataset
import pandas as pd


# Load the "train" and "test" splits of PKU-SafeRLHF, in that order.
full_dataset_train, full_dataset_test = (
    load_dataset("PKU-Alignment/PKU-SafeRLHF", split=split_name)
    for split_name in ("train", "test")
)

In [2]:
def create_pretrain_dataset(dataset: "Dataset") -> pd.DataFrame:
    """Flatten paired responses into one (prompt, response) row per response.

    Every input row carries a ``prompt`` plus two answers, ``response_0`` and
    ``response_1``.  The result stacks all ``response_0`` rows first, followed
    by all ``response_1`` rows, so it has twice as many rows as the input.

    Args:
        dataset: Object exposing ``to_pandas()`` whose frame contains the
            columns ``prompt``, ``response_0`` and ``response_1``.

    Returns:
        DataFrame with exactly the columns ``prompt`` and ``response`` and a
        fresh, gap-free ``0..2n-1`` index.

    Raises:
        KeyError: If one of the three required columns is missing.
    """
    df = dataset.to_pandas()
    halves = [
        df[["prompt", source_col]].rename(columns={source_col: "response"})
        for source_col in ("response_0", "response_1")
    ]
    # ignore_index=True: otherwise both halves keep their original row labels,
    # so every label appears twice.  A non-default index like that is turned
    # into an extra ``__index_level_0__`` column by ``Dataset.from_pandas`` and
    # would leak into anything exported from the resulting dataset.
    return pd.concat(halves, axis=0, ignore_index=True)

In [14]:
# Flatten both splits into (prompt, response) frames: train first, then test.
train_df, test_df = (
    create_pretrain_dataset(split)
    for split in (full_dataset_train, full_dataset_test)
)

In [15]:
# train_df.to_csv("train_data_pku_30k.csv", index=False)
# test_df.to_csv("test_data_pku_30k.csv", index=False)

# preserve_index=False: a frame assembled with pd.concat can carry a
# non-default (repeated) index, which from_pandas would otherwise store as an
# extra ``__index_level_0__`` column next to ``prompt`` and ``response``.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [16]:
from transformers import AutoTokenizer

# "gpt2" is served by the tokenizer classes shipped with transformers, so the
# trust_remote_code opt-in (which permits running code supplied by the model
# repository) is not needed and is deliberately left off.
tokenizer = AutoTokenizer.from_pretrained("gpt2")


def prepare_sample_text(prompt, response):
    """Render one prompt/response pair as a single "Question / Answer" string."""
    layout = "Question: {prompt}\n\nAnswer: {response}"
    return layout.format(prompt=prompt, response=response)


def get_num_tokens(sample):
    """Count the tokens of one formatted prompt/response example.

    Args:
        sample: Mapping with ``prompt`` and ``response`` entries.

    Returns:
        ``{"len_of_input": n}`` where ``n`` is the number of token ids the
        module-level ``tokenizer`` produces for the text built by
        ``prepare_sample_text``.
    """
    text = prepare_sample_text(sample["prompt"], sample["response"])
    # Only the length is needed, so the ids are taken as a plain list instead
    # of a "pt" tensor: same count, no tensor allocation and no torch import.
    return {"len_of_input": len(tokenizer(text)["input_ids"])}

def less_than_256(sample):
    """Filter predicate: true only when the row's ``len_of_input`` is under 256."""
    token_count = sample["len_of_input"]
    return token_count < 256

In [17]:
# Annotate every row with its token count, then drop rows the predicate
# rejects. Each split is processed end-to-end before the next one starts.
train_dataset = (
    train_dataset
    .map(get_num_tokens, batched=False)
    .filter(less_than_256)
)
test_dataset = (
    test_dataset
    .map(get_num_tokens, batched=False)
    .filter(less_than_256)
)

Map:   0%|          | 0/594788 [00:00<?, ? examples/s]

Map:   0%|          | 0/66088 [00:00<?, ? examples/s]

Filter:   0%|          | 0/594788 [00:00<?, ? examples/s]

Filter:   0%|          | 0/66088 [00:00<?, ? examples/s]

In [20]:
# Write each filtered split to its own CSV (no pandas index column).
for _csv_path, _split in (
    ("train_data_pku.csv", train_dataset),
    ("test_data_pku.csv", test_dataset),
):
    _split.to_pandas().to_csv(_csv_path, index=False)

In [27]:
def get_prompt_fields(prompt_fields, state=False):
    """Mirror a nested prompt structure, replacing every leaf dict with ``state``.

    A dict counts as a leaf as soon as iteration reaches a value that is
    neither a dict nor a list, e.g. ``{"value": <str>, "line_num": <str>}`` or
    ``{"prompt": <str>}``.  Such a dict is replaced wholesale by ``state`` and
    counted as one field.  Dict values are mirrored recursively; list values
    are mirrored element by element.

    Args:
        prompt_fields: Dict to walk.  Elements of any list inside it must be
            dicts themselves (each one is walked with ``.items()``).
        state: Placeholder stored in place of each leaf.  It is forwarded to
            every nested call, so all leaves receive the same value.

    Returns:
        ``(fields, count)``: ``fields`` is ``state`` when ``prompt_fields`` is
        itself a leaf, otherwise a dict with the same keys whose leaves are
        ``state``; ``count`` is the number of leaves found.  An empty dict
        yields ``({}, 0)``.
    """
    mirrored = {}
    leaf_count = 0
    for key, value in prompt_fields.items():
        if isinstance(value, dict):
            # Forward ``state``: without it nested leaves silently fall back
            # to the default ``False`` whatever the caller asked for.
            mirrored[key], found = get_prompt_fields(value, state)
            leaf_count += found
        elif isinstance(value, list):
            children = []
            for item in value:
                child, found = get_prompt_fields(item, state)
                children.append(child)
                leaf_count += found
            mirrored[key] = children
        else:
            # A plain value marks this dict as a leaf; anything gathered from
            # earlier keys is discarded and the whole dict becomes ``state``.
            return state, 1
    return mirrored, leaf_count