# Processing of the 'interactions' dataset

In this notebook, we process the 'interactions' dataset so that it can be used to train (and test) our reward model. In particular, we decide which interactions to include, ensure that the length of each interaction is within the maximum context length of RoBERTa, and split the dataset into a training dataset and an evaluation dataset.

In [None]:
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer

# Initialize RoBERTa tokenizer
# Loads the pretrained 'roberta-base' tokenizer; it is used below to count the
# tokens of each message when trimming interactions to MAX_LEN.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load JSON file
def load_json(file):
    """Load and return the parsed contents of a JSON file.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The deserialized JSON value, as produced by ``json.load``.
    """
    # JSON text is UTF-8 by specification. Without an explicit encoding,
    # open() falls back to the platform locale, which can fail on or
    # mis-decode non-ASCII content (e.g. on Windows).
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Write to JSON file
def write_json(file, data):
    """Serialize ``data`` as JSON with 4-space indentation and write it to ``file``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(file, 'w') as out:
        out.write(json.dumps(data, indent=4))

# Maximum content length for RoBERTa model
MAX_LEN = 512

# Load data
data = load_json('complete_addition_QA.json')

# Step 1: Modify "interaction" content
# For every datapoint, keep as many (prompt, assistant answer) pairs as fit
# within MAX_LEN tokens, giving priority to the assistant answers with the
# highest BERTScore, and store the kept messages in their original order.
for d in data:
    interactions = d['interaction']

    # Track positions rather than the message dicts themselves: looking a
    # message up again with list.index() compares by equality, so two identical
    # messages would both resolve to the first one (wrong prompt, duplicated
    # pairs) and every lookup would rescan the whole list.
    assistant_positions = [
        pos for pos, turn in enumerate(interactions) if turn['role'] == 'assistant'
    ]
    # Sorting by BERTScore (only assistant role has BERTScore); the sort is
    # stable, so equally scored answers keep their original relative order.
    assistant_positions.sort(
        key=lambda pos: interactions[pos].get('BERTScore', -1), reverse=True
    )

    kept_positions = []
    content_len = 0

    for pos in assistant_positions:
        if pos == 0:
            # An answer with no preceding message has no prompt to pair with.
            continue
        prompt = interactions[pos - 1]
        if prompt['role'] not in ('user', 'system'):
            continue

        # Each message is tokenized on its own.
        # NOTE(review): this presumably counts the tokenizer's special tokens
        # once per message, i.e. a slight over-estimate - confirm if the
        # budget needs to be exact.
        combined_len = (
            len(tokenizer(interactions[pos]['content']).input_ids)
            + len(tokenizer(prompt['content']).input_ids)
        )
        if content_len + combined_len <= MAX_LEN:
            kept_positions.extend([pos - 1, pos])
            content_len += combined_len

    # Sort interactions by their original order
    kept_positions.sort()

    d['interaction'] = [interactions[pos] for pos in kept_positions]

# Save modified data
write_json('modified_data.json', data)

# Step 2: Split the datapoints in a training and a testing set according to a ratio passed by the user
def split_data(data, ratio):
    """Split datapoints into a training and a testing set by ``sol_id``.

    The unique ``sol_id`` values are split (not the datapoints themselves), so
    all datapoints sharing a ``sol_id`` end up on the same side of the split.

    Args:
        data: List of datapoints; each one must have a ``'sol_id'`` key.
        ratio: Value passed to ``train_test_split`` as ``test_size``.

    Returns:
        A ``(train_data, test_data)`` tuple of lists, each preserving the
        order of ``data``.
    """
    sol_ids = np.unique([d['sol_id'] for d in data])
    train_ids, test_ids = train_test_split(sol_ids, test_size=ratio, random_state=0)

    # Convert the id arrays to sets of plain Python values once: `x in array`
    # is a linear scan, which made the filtering below quadratic overall.
    train_ids = set(np.asarray(train_ids).tolist())
    test_ids = set(np.asarray(test_ids).tolist())

    train_data = [d for d in data if d['sol_id'] in train_ids]
    test_data = [d for d in data if d['sol_id'] in test_ids]

    return train_data, test_data

# The 0.3 ratio is forwarded by split_data as the test-set size
train_data, test_data = split_data(data, ratio=0.3)

print(f"Training data: {len(train_data)}")
print(f"Testing data: {len(test_data)}")

# Save training and testing data
for out_path, subset in (
    ('class_train_data.json', train_data),
    ('class_test_data.json', test_data),
):
    write_json(out_path, subset)

At this point, we add the field "score" (taken from a separate dataset) to the train and test datasets, and we process the test dataset so that, for every group of three examples, just two (the best one and the worst one) are kept.

In [None]:
# Initialize RoBERTa tokenizer
# Used below to measure the token length of each datapoint's concatenated
# message contents.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Maximum content length for RoBERTa model
# A datapoint whose tokenized interaction exceeds this many tokens makes the
# length check below raise a ValueError.
MAX_LEN = 512

# Load JSON file
def load_json(file):
    """Load and return the parsed contents of a JSON file.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The deserialized JSON value, as produced by ``json.load``.
    """
    # JSON text is UTF-8 by specification. Without an explicit encoding,
    # open() falls back to the platform locale, which can fail on or
    # mis-decode non-ASCII content (e.g. on Windows).
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Write to JSON file
def write_json(file, data):
    """Serialize ``data`` as JSON with 4-space indentation and write it to ``file``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(file, 'w') as out:
        out.write(json.dumps(data, indent=4))

# Load the train/test splits written by the previous cell
train_data = load_json('class_train_data.json')
test_data = load_json('class_test_data.json')

# Step 1: Verify that, for every datapoint, the space-joined contents of its
# interaction tokenize to at most MAX_LEN tokens; abort on the first offender.
for data_set in (train_data, test_data):
    for d in data_set:
        contents = [i['content'] for i in d['interaction']]
        concatenated_content = ' '.join(contents)
        token_ids = tokenizer.encode(concatenated_content, truncation=False)
        content_len = len(token_ids)
        if content_len <= MAX_LEN:
            continue
        raise ValueError(f"Content length for datapoint {d['interaction_id']} exceeds the limit")

# Load the grades and index them by solution id (a later entry with the same
# sol_id overrides an earlier one)
grades_data = load_json('complete_complete_with_grades.json')
sol_id_to_grade = {}
for graded in grades_data:
    graded_id = graded['sol_id']
    sol_id_to_grade[graded_id] = graded['score']

# Step 2: Add the field "score" to the train and test datasets; datapoints
# without a known grade get the sentinel -1
for d in train_data + test_data:
    d['score'] = sol_id_to_grade.get(d['sol_id'], -1)

invalid_scores = sum(1 for d in train_data + test_data if d['score'] == -1)

print(f"Invalid scores assigned: {invalid_scores}")

# Save datasets with scores
for scored_path, scored_set in (
    ('class_train_data_with_scores.json', train_data),
    ('class_test_data_with_scores.json', test_data),
):
    write_json(scored_path, scored_set)

# Step 3: Process the test dataset so that, for every group of three examples, just two are kept
def _render_chat(datapoint):
    """Render a datapoint's interaction as 'Role: content' turns, with 'User:' shown as 'Human:'."""
    chat = "".join(
        f"{interaction['role'].capitalize()}: {interaction['content']} \n\n"
        for interaction in datapoint['interaction']
    )
    return chat.replace("User:", "Human:")


def process_test_data(data):
    """Build one chosen/rejected chat pair per ``sol_id`` group.

    Datapoints are grouped by ``'sol_id'``. Within each group, datapoints
    whose ``'score'`` is -1 are ignored; if at least two remain, the
    highest-scored one becomes the "chosen" chat and the lowest-scored one the
    "rejected" chat. A pair is dropped when either chat is empty or both chats
    are identical.

    Args:
        data: List of datapoints, each with ``'sol_id'``, ``'score'`` and
            ``'interaction'`` (a list of ``{'role', 'content'}`` messages).

    Returns:
        A list of ``{"chosen": str, "rejected": str}`` dicts, ordered by the
        first appearance of each ``sol_id`` in ``data``.
    """
    # Group in a single pass. A dict keeps first-seen order, so the output
    # order is reproducible between runs (iterating a set of string ids is
    # not) and the dataset is not rescanned once per id.
    groups = {}
    for d in data:
        groups.setdefault(d['sol_id'], []).append(d)

    new_data = []
    for group in groups.values():
        valid_group = [d for d in group if d['score'] != -1]
        if len(valid_group) < 2:
            continue

        # Stable ascending sort: first element is the lowest score, last the
        # highest. Only these two are rendered; the rest of the group is unused.
        ranked = sorted(valid_group, key=lambda x: x['score'])
        negative_chat = _render_chat(ranked[0])
        positive_chat = _render_chat(ranked[-1])

        if positive_chat != "" and negative_chat != "" and positive_chat != negative_chat:
            new_data.append({"chosen": positive_chat, "rejected": negative_chat})

    return new_data

# Replace the scored test set with the output of process_test_data and save it
test_data = process_test_data(test_data)
write_json('class_processed_test_data.json', test_data)

# Process the training data to add "chat" field
def process_train_data(data):
    """Attach a rendered ``"chat"`` string to every datapoint and drop empty ones.

    Each message of a datapoint's ``'interaction'`` is rendered as
    ``"<Role>: <content> \\n\\n"`` (role capitalized), the turns are
    concatenated, and every ``"User:"`` is replaced by ``"Human:"``. The result
    is stored in place under ``"chat"``.

    Args:
        data: List of datapoints, each with an ``'interaction'`` list of
            ``{'role', 'content'}`` messages. The datapoints are mutated.

    Returns:
        The datapoints (same objects, same order) whose chat is non-empty.
    """
    kept = []

    for d in data:
        turns = [
            f"{turn['role'].capitalize()}: {turn['content']} \n\n"
            for turn in d['interaction']
        ]
        d["chat"] = "".join(turns).replace("User:", "Human:")
        if d["chat"]:
            kept.append(d)

    return kept

# Add the "chat" field to the training datapoints (dropping those whose chat
# is empty) and save the result
train_data = process_train_data(train_data)
write_json('class_processed_train_data.json', train_data)