# Processing of the 'hh-rlhf' dataset

In this notebook, we process the 'hh-rlhf' dataset, which is available on [HuggingFace](https://huggingface.co/datasets/Anthropic/hh-rlhf), to convert it to the format needed by our reward model.

We start by processing the train split:

In [None]:
# Step 1: Install required libraries
!pip install datasets tqdm matplotlib

# Step 2: Import libraries
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from datasets import load_dataset

# Step 3: Load the dataset
dataset = load_dataset('Anthropic/hh-rlhf')

# Step 4: Initialize a counter and a list to hold the processed data
processed_data = []
counter = 0

# Step 5: Process the dataset and save data in the required format
for data in tqdm(dataset['train']):
    # Replace " Assistant: " and " Human: " with " \n\nAssistant: " and " \n\nHuman: " respectively
    chosen_chat = data['chosen'].replace(" Assistant: ", " \n\nAssistant: ").replace(" Human: ", " \n\nHuman: ")
    rejected_chat = data['rejected'].replace(" Assistant: ", " \n\nAssistant: ").replace(" Human: ", " \n\nHuman: ")

    # Remove any leading newline characters
    chosen_chat = chosen_chat.lstrip("\n")
    rejected_chat = rejected_chat.lstrip("\n")
    
    # We arbitrarily assign a grade of 5 to the chosen chat and a grade of 0 to the rejected chat
    # (see the report for more details on this)
    processed_data.append({
        "chat": chosen_chat,
        "grade": 5
    })
    processed_data.append({
        "chat": rejected_chat,
        "grade": 0
    })
    counter += 2  # Increase counter

# Step 6: Save the processed data to a json file
with open('hh-rlhf_train.json', 'w') as f:
    json.dump(processed_data, f)

print(f'Total datapoints processed: {counter}')

We then process the test split, keeping each chosen/rejected pair together and saving the result as the evaluation file:

In [None]:
# Step 1: Load the dataset
dataset = load_dataset('Anthropic/hh-rlhf')

# Step 2: Start with an empty result list and a zeroed pair counter
processed_data = []
counter = 0

# Step 3: Reformat both sides of every pair and keep them together
for data in tqdm(dataset['test']):
    pair = {}
    for side in ("chosen", "rejected"):
        chat = data[side]
        # Put each speaker marker that follows a space onto its own paragraph
        chat = chat.replace(" Assistant: ", " \n\nAssistant: ")
        chat = chat.replace(" Human: ", " \n\nHuman: ")
        # Drop any newline characters at the very start of the chat
        pair[side] = chat.lstrip("\n")

    processed_data.append(pair)
    counter += 1  # One datapoint per chosen/rejected pair

# Step 4: Write the processed pairs to a json file
with open('hh-rlhf_eval.json', 'w') as f:
    json.dump(processed_data, f)

print(f'Total datapoints processed: {counter}')