# SQA 1k

In [11]:
from datasets import load_dataset

# SQA 1k
original_dataset = load_dataset("rulins/rl_rag_sqa_1k_longform_averaged_outcome_with_system_prompt", split="train")
new_dataset_name = "rulins/rl_rag_train_sqa_1k_longform_rubrics"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print(f"Features: {original_dataset.features}")
print("\nColumn names:")
print(original_dataset.column_names)

# Add question_type column with value "long_form"
def add_question_type(example):
    example["question_type"] = "long_form"
    example["messages"] = [msg for msg in example["messages"] if msg["role"] == "user"]
    return example

# Apply the transformation to add the new column
modified_dataset = original_dataset.map(add_question_type)

print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items() if k != 'ground_truth'})  # Exclude ground_truth for readability


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=False, split="train")


Dataset info:
Number of examples: 1000
Features: {'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset']

Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'question_type']
First example with new column:
{'messages': [{'content': 'How does the Country of Origin Image contribute to the internationalization and survival of brazilian companies in the beachwear sector abroad?', 'role': 'user'}], 'dataset': 'rl_rag_longform_averaged_outcome', 'question_type': 'long_form'}


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 21.29ba/s]
Processing Files (1 / 1): 100%|██████████| 5.73MB / 5.73MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.46it/s]
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/rulins/rl_rag_train_sqa_1k_longform_rubrics/commit/90a91c7cfe147dc800a0e44bf44d78503a5dd478', commit_message='Upload dataset', commit_description='', oid='90a91c7cfe147dc800a0e44bf44d78503a5dd478', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rulins/rl_rag_train_sqa_1k_longform_rubrics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rulins/rl_rag_train_sqa_1k_longform_rubrics'), pr_revision=None, pr_num=None)

In [10]:
from datasets import load_dataset

# SQA 1k
original_dataset = load_dataset("rulins/rl_rag_surveyqa_validation_longform_finegrained_with_system_prompt", split="train")
new_dataset_name = "rulins/rl_rag_train_survey_val_longform_rubrics"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print(f"Features: {original_dataset.features}")
print("\nColumn names:")
print(original_dataset.column_names)

# Add question_type column with value "long_form"
def add_question_type(example):
    example["question_type"] = "long_form"
    example["messages"] = [msg for msg in example["messages"] if msg["role"] == "user"]
    example["dataset"] = "rl_rag_longform_averaged_outcome"
    return example

# Apply the transformation to add the new column
modified_dataset = original_dataset.map(add_question_type)

print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items() if k != 'ground_truth'})  # Exclude ground_truth for readability


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=False, split="train")


Dataset info:
Number of examples: 703
Features: {'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset']

Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'question_type']
First example with new column:
{'messages': [{'content': 'How are the causative factors of landslides functionally classified with respect to the stages of slope stability?', 'role': 'user'}], 'dataset': 'rl_rag_longform_averaged_outcome', 'question_type': 'long_form'}


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 265.61ba/s]
Processing Files (1 / 1): 100%|██████████|  494kB /  494kB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.67it/s]
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/rulins/rl_rag_train_survey_val_longform_rubrics/commit/fe480c4decddda824ee7e616a7a62a03cf71c98a', commit_message='Upload dataset', commit_description='', oid='fe480c4decddda824ee7e616a7a62a03cf71c98a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rulins/rl_rag_train_survey_val_longform_rubrics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rulins/rl_rag_train_survey_val_longform_rubrics'), pr_revision=None, pr_num=None)

In [9]:
from datasets import load_dataset

# SQA 1k
original_dataset = load_dataset("rl-rag/rl_rag_sqa_searcharena_rubrics_web_augmented_rubrics_only_call_tool", split="train")
new_dataset_name = "rulins/rl_rag_train_sa_3k_longform_rubrics"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print(f"Features: {original_dataset.features}")
print("\nColumn names:")
print(original_dataset.column_names)

# Add question_type column with value "long_form"
def add_question_type(example):
    example["question_type"] = "long_form"
    example["messages"] = [msg for msg in example["messages"] if msg["role"] == "user"]
    example["dataset"] = "rl_rag_longform_averaged_outcome"
    return example

# Apply the transformation to add the new column
modified_dataset = original_dataset.map(add_question_type)

print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items() if k != 'ground_truth'})  # Exclude ground_truth for readability


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=False, split="train")


Generating train split: 100%|██████████| 2937/2937 [00:00<00:00, 22497.60 examples/s]


Dataset info:
Number of examples: 2937
Features: {'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset']


Map: 100%|██████████| 2937/2937 [00:00<00:00, 21032.03 examples/s]



Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'question_type']
First example with new column:
{'messages': [{'content': 'find me where and when is trump doing speech today after talk with putin', 'role': 'user'}], 'dataset': 'rl_rag_longform_averaged_outcome', 'question_type': 'long_form'}


Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 51.34ba/s]
Processing Files (1 / 1): 100%|██████████| 8.76MB / 8.76MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/rulins/rl_rag_train_sa_3k_longform_rubrics/commit/c1dcab887743a1f4be40beeff5d0bf61dc8ee8a7', commit_message='Upload dataset', commit_description='', oid='c1dcab887743a1f4be40beeff5d0bf61dc8ee8a7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rulins/rl_rag_train_sa_3k_longform_rubrics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rulins/rl_rag_train_sa_3k_longform_rubrics'), pr_revision=None, pr_num=None)

In [4]:
import json
from datasets import load_dataset, Dataset

# SQA 1k
dataset_name = "sqa_1k_dr"
original_dataset = load_dataset("rl-rag/sqa_1k_asta_deep_research_rubrics", split="train")
new_dataset_name = f"rl-rag/rl_rag_train_{dataset_name}_longform_rubrics"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print(f"Features: {original_dataset.features}")
print("\nColumn names:")
print(original_dataset.column_names)

new_dataset = []
for example in original_dataset:
    new_dataset.append({
        "source": dataset_name,
        "question_type": "short_form",
        "messages": [{"content": example["Question"], "role": "user"}],
        "ground_truth": json.dumps(example),
        "dataset": "rl_rag_longform_averaged_outcome"
    })

modified_dataset = Dataset.from_list(new_dataset)
print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items() if k != 'ground_truth'})

# # upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=True, split="train")


Dataset info:
Number of examples: 1000
Features: {'Question': Value(dtype='string', id=None), 'Answer Critical': [{'Ingredient': Value(dtype='string', id=None), 'Handle': Value(dtype='string', id=None), 'Specifics': [{'Text': Value(dtype='string', id=None), 'Citation': Value(dtype='string', id=None)}]}], 'Valuable': [{'Ingredient': Value(dtype='string', id=None), 'Handle': Value(dtype='string', id=None), 'Specifics': [{'Text': Value(dtype='string', id=None), 'Citation': Value(dtype='string', id=None)}]}], 'Context': [{'Ingredient': Value(dtype='string', id=None), 'Handle': Value(dtype='string', id=None), 'Specifics': [{'Text': Value(dtype='string', id=None), 'Citation': Value(dtype='string', id=None)}]}]}

Column names:
['Question', 'Answer Critical', 'Valuable', 'Context']

Modified dataset columns: ['source', 'question_type', 'messages', 'ground_truth', 'dataset']
First example with new column:
{'source': 'sqa_1k_dr', 'question_type': 'short_form', 'messages': [{'content': 'What is t

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 45.18ba/s]
Processing Files (1 / 1): 100%|██████████| 3.40MB / 3.40MB,  0.00B/s  
New Data Upload: 100%|██████████| 5.19kB / 5.19kB,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.52it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/rl-rag/rl_rag_train_sqa_1k_dr_longform_rubrics/commit/1a8d3eadd60dd7e73dc6d8f98847b43d1d58c552', commit_message='Upload dataset', commit_description='', oid='1a8d3eadd60dd7e73dc6d8f98847b43d1d58c552', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rl-rag/rl_rag_train_sqa_1k_dr_longform_rubrics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rl-rag/rl_rag_train_sqa_1k_dr_longform_rubrics'), pr_revision=None, pr_num=None)

In [5]:
import json
from datasets import load_dataset, Dataset

# SQA 1k
dataset_name = "sa_3k_dr"
original_dataset = load_dataset("rl-rag/sa_3k_deep_reserach_rubrics", split="train")
new_dataset_name = f"rl-rag/rl_rag_train_{dataset_name}_longform_rubrics"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print(f"Features: {original_dataset.features}")
print("\nColumn names:")
print(original_dataset.column_names)

new_dataset = []
for example in original_dataset:
    new_dataset.append({
        "source": dataset_name,
        "question_type": "short_form",
        "messages": [{"content": example["Question"], "role": "user"}],
        "ground_truth": json.dumps(example),
        "dataset": "rl_rag_longform_averaged_outcome"
    })

modified_dataset = Dataset.from_list(new_dataset)
print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items() if k != 'ground_truth'})

# # upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=True, split="train")


Generating train split: 100%|██████████| 3416/3416 [00:00<00:00, 24151.80 examples/s]


Dataset info:
Number of examples: 3416
Features: {'Question': Value(dtype='string', id=None), 'Answer Critical': [{'Ingredient': Value(dtype='string', id=None), 'Handle': Value(dtype='string', id=None), 'Specifics': [{'Text': Value(dtype='string', id=None), 'Citation': Value(dtype='string', id=None)}]}], 'Valuable': [{'Ingredient': Value(dtype='string', id=None), 'Handle': Value(dtype='string', id=None), 'Specifics': [{'Text': Value(dtype='string', id=None), 'Citation': Value(dtype='string', id=None)}]}], 'Context': [{'Ingredient': Value(dtype='string', id=None), 'Handle': Value(dtype='string', id=None), 'Specifics': [{'Text': Value(dtype='string', id=None), 'Citation': Value(dtype='string', id=None)}]}]}

Column names:
['Question', 'Answer Critical', 'Valuable', 'Context']

Modified dataset columns: ['source', 'question_type', 'messages', 'ground_truth', 'dataset']
First example with new column:
{'source': 'sa_3k_dr', 'question_type': 'short_form', 'messages': [{'content': 'How to ins

Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 77.76ba/s]
Processing Files (1 / 1): 100%|██████████| 8.15MB / 8.15MB,  153kB/s  
New Data Upload: 100%|██████████| 8.15MB / 8.15MB,  153kB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.45s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/rl-rag/rl_rag_train_sa_3k_dr_longform_rubrics/commit/1c76be0c70a871f23eb045a594b04a3e69474aef', commit_message='Upload dataset', commit_description='', oid='1c76be0c70a871f23eb045a594b04a3e69474aef', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rl-rag/rl_rag_train_sa_3k_dr_longform_rubrics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rl-rag/rl_rag_train_sa_3k_dr_longform_rubrics'), pr_revision=None, pr_num=None)

# Short-form data format

In [None]:
import json

short_form_example = {
    "messages": {
        "content": "How are the causative factors of landslides functionally classified with respect to the stages of slope stability?",
        "role": "user"
    },
    "ground_truth": json.dumps(["Fernie Alpine Resort"]),
    "dataset": "re_search",
    "question_type": "short_form",
}

In [8]:
from datasets import load_dataset

# SQA 1k
original_dataset = load_dataset("rl-rag/asearcher_short_form_rlvr_with_system_prompt", split="ASearcherBase35k")
new_dataset_name = "rulins/rl_rag_train_as_base_shortform"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print(f"Features: {original_dataset.features}")
print("\nColumn names:")
print(original_dataset.column_names)

# Add question_type column with value "short_form"
def add_question_type(example):
    example["question_type"] = "short_form"
    example["messages"] = [msg for msg in example["messages"] if msg["role"] == "user"]
    example["messages"][0]["content"] = example["messages"][0]["content"].split("Question: ")[-1].strip()
    example["dataset"] = "re_search"
    return example

# Apply the transformation to add the new column
modified_dataset = original_dataset.map(add_question_type)

print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items() if k != 'ground_truth'})  # Exclude ground_truth for readability


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=False, split="train")


Dataset info:
Number of examples: 35583
Features: {'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset']

Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'question_type']
First example with new column:
{'messages': [{'content': 'What nationality is the director of film Queensland (Film)?', 'role': 'user'}], 'dataset': 're_search', 'question_type': 'short_form'}


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format: 100%|██████████| 36/36 [00:00<00:00, 867.00ba/s]
Processing Files (1 / 1): 100%|██████████| 6.04MB / 6.04MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.99it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/rulins/rl_rag_train_as_base_shortform/commit/cacfd48cb4171709f0e57ab13ac9cb3ca4dbea5c', commit_message='Upload dataset', commit_description='', oid='cacfd48cb4171709f0e57ab13ac9cb3ca4dbea5c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rulins/rl_rag_train_as_base_shortform', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rulins/rl_rag_train_as_base_shortform'), pr_revision=None, pr_num=None)

In [7]:
from datasets import load_dataset

# SQA 1k
original_dataset = load_dataset("rl-rag/asearcher_short_form_rlvr_with_system_prompt", split="ASearcherLRM35k")
new_dataset_name = "rulins/rl_rag_train_as_hard_shortform"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print(f"Features: {original_dataset.features}")
print("\nColumn names:")
print(original_dataset.column_names)

# Add question_type column with value "short_form"
def add_question_type(example):
    example["question_type"] = "short_form"
    example["messages"] = [msg for msg in example["messages"] if msg["role"] == "user"]
    example["messages"][0]["content"] = example["messages"][0]["content"].split("Question: ")[-1].strip()
    example["dataset"] = "re_search"
    return example

# Apply the transformation to add the new column
modified_dataset = original_dataset.map(add_question_type)

print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items() if k != 'ground_truth'})  # Exclude ground_truth for readability


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=False, split="train")


Dataset info:
Number of examples: 35054
Features: {'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset']

Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'question_type']
First example with new column:
{'messages': [{'content': 'Which international financial institution, established in the early 1930s, was originally tasked with facilitating the reparations imposed on Germany by the Treaty of Versailles, which followed a significant global conflict that led to the collapse of the Austro-Hungarian Empire and the region where a prolific book publisher and translator, Josef Florian, was born and published his works, losing its status as a crown land?', 'role': 'user'}], 'dataset': 're_search', 'question_type': 'short_form'}


Creating parquet from Arrow format: 100%|██████████| 36/36 [00:00<00:00, 957.42ba/s]
Processing Files (1 / 1): 100%|██████████| 5.09MB / 5.09MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.67it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/rulins/rl_rag_train_as_hard_shortform/commit/7fcb6dbd34ebe0df5eae29f19a2c8a60b61e7411', commit_message='Upload dataset', commit_description='', oid='7fcb6dbd34ebe0df5eae29f19a2c8a60b61e7411', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rulins/rl_rag_train_as_hard_shortform', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rulins/rl_rag_train_as_hard_shortform'), pr_revision=None, pr_num=None)

In [6]:
import json
from datasets import load_dataset

# SQA 1k
original_dataset = load_dataset("rl-rag/short_form_qa_rl", split="train")
new_dataset_name = "rulins/rl_rag_train_short_mix_v250908"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print(f"Features: {original_dataset.features}")
print("\nColumn names:")
print(original_dataset.column_names)


def check_ground_truth(gt):
    assert isinstance(gt, str)
    gt = json.loads(gt)
    if isinstance(gt, list):
        for item in gt:
            assert isinstance(item, str)
    elif isinstance(gt, str):
        assert isinstance(gt, str)
    else:
        raise ValueError(f"Invalid ground truth type: {type(gt)}")

# Add question_type column with value "short_form"
def add_question_type(example):
    example["source"] = example["dataset"]
    example["question_type"] = "short_form"
    example["messages"] = [example["messages"]]
    example["dataset"] = "re_search"
    check_ground_truth(example["ground_truth"])
    return example

# Apply the transformation to add the new column
modified_dataset = original_dataset.map(add_question_type)

print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items() if k != 'ground_truth'})  # Exclude ground_truth for readability


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=False, split="train")


Repo card metadata block was not found. Setting CardData to empty.


Dataset info:
Number of examples: 2576
Features: {'messages': {'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}, 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None), 'question_type': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset', 'question_type']

Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'question_type', 'source']
First example with new column:
{'messages': [{'content': "According to 'The 10 Most Powerful Data Trends That Will Transform Business In 2025', what fundamental requirement for business survival in 2025 involves a shift from historical analysis to real-time data insights?", 'role': 'user'}], 'dataset': 're_search', 'question_type': 'short_form', 'source': 'taskcraft'}


Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 1137.08ba/s]
Processing Files (1 / 1): 100%|██████████|  260kB /  260kB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.28it/s]
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/rulins/rl_rag_train_short_mix_v250908/commit/18928392f544efcfe1ad9573f0ea8de0d6dcd637', commit_message='Upload dataset', commit_description='', oid='18928392f544efcfe1ad9573f0ea8de0d6dcd637', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rulins/rl_rag_train_short_mix_v250908', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rulins/rl_rag_train_short_mix_v250908'), pr_revision=None, pr_num=None)

In [19]:
from datasets import load_dataset, Dataset

dataset_name = "bcs_normal"
original_dataset = load_dataset("rl-rag/bc_synthetic_v_2", split="normal")
new_dataset_name = "rulins/rl_rag_train_bcs_normal_shortform"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print("\nColumn names:")
print(original_dataset.column_names)

new_dataset = []
for example in original_dataset:
    new_dataset.append({
        "source": dataset_name,
        "question_type": "short_form",
        "messages": [{"content": example["question"], "role": "user"}],
        "ground_truth": json.dumps([example["answer"]]),
        "dataset": "re_search"
    })

modified_dataset = Dataset.from_list(new_dataset)
print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items()})


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=False, split="train")


Dataset info:
Number of examples: 1996

Column names:
['question', 'answer']

Modified dataset columns: ['source', 'question_type', 'messages', 'ground_truth', 'dataset']
First example with new column:
{'source': 'bcs_normal', 'question_type': 'short_form', 'messages': [{'content': 'Which 19th-century American, born to a woman known as Nancy and the Founding Father credited as a principal drafter of the U.S. Constitution, was associated with the founding neighborhood in a New York borough, transformed a prominent Manhattan thoroughfare with railway infrastructure, and later died in a coastal village now part of Westchester County?', 'role': 'user'}], 'ground_truth': '["Gouverneur Morris Jr."]', 'dataset': 're_search'}


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 557.23ba/s]
Processing Files (1 / 1): 100%|██████████|  517kB /  517kB,  829kB/s  
New Data Upload: 100%|██████████|  167kB /  167kB,  829kB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/rulins/rl_rag_train_bcs_normal_shortform/commit/cf5a3cbd8b6285b5cb2d19415841e04f6fa06ed1', commit_message='Upload dataset', commit_description='', oid='cf5a3cbd8b6285b5cb2d19415841e04f6fa06ed1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rulins/rl_rag_train_bcs_normal_shortform', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rulins/rl_rag_train_bcs_normal_shortform'), pr_revision=None, pr_num=None)

In [None]:
from datasets import load_dataset, Dataset

dataset_name = "bcs_simple"
original_dataset = load_dataset("rl-rag/bc_synthetic_v_2", split="simple")
new_dataset_name = f"rulins/rl_rag_train_{dataset_name}_shortform"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print("\nColumn names:")
print(original_dataset.column_names)

new_dataset = []
for example in original_dataset:
    new_dataset.append({
        "source": dataset_name,
        "question_type": "short_form",
        "messages": [{"content": example["question"], "role": "user"}],
        "ground_truth": json.dumps([example["answer"]]),
        "dataset": "re_search"
    })

modified_dataset = Dataset.from_list(new_dataset)
print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items()})


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=False, split="train")


Dataset info:
Number of examples: 1996

Column names:
['question', 'answer']

Modified dataset columns: ['source', 'question_type', 'messages', 'ground_truth', 'dataset']
First example with new column:
{'source': 'bcs_simple', 'question_type': 'short_form', 'messages': [{'content': 'Which 19th-century American, born to a woman known as Nancy and the Founding Father credited as a principal drafter of the U.S. Constitution, was associated with the founding neighborhood in a New York borough, transformed a prominent Manhattan thoroughfare with railway infrastructure, and later died in a coastal village now part of Westchester County?', 'role': 'user'}], 'ground_truth': '["Gouverneur Morris Jr."]', 'dataset': 're_search'}


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 656.59ba/s]
Processing Files (1 / 1): 100%|██████████|  407kB /  407kB,  689kB/s  
New Data Upload: 100%|██████████|  275kB /  275kB,  689kB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/rulins/rl_rag_train_bcs_simple_shortform/commit/b78ba65f4567e8e85adfb97d3f669540c255b4e0', commit_message='Upload dataset', commit_description='', oid='b78ba65f4567e8e85adfb97d3f669540c255b4e0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rulins/rl_rag_train_bcs_simple_shortform', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rulins/rl_rag_train_bcs_simple_shortform'), pr_revision=None, pr_num=None)

## Evaluation Data

In [10]:
import json
from datasets import load_dataset

for dataset_name in ["nq", "tqa", "2wiki", "hotpotqa"]:
    original_dataset = load_dataset(f"rulins/{dataset_name}_rlvr_no_prompt", split="test")
    new_dataset_name = f"rl-rag/rl_rag_eval_{dataset_name}_shortform"

    # Examine dataset structure
    print("Dataset info:")
    print(f"Number of examples: {len(original_dataset)}")
    print(f"Features: {original_dataset.features}")
    print("\nColumn names:")
    print(original_dataset.column_names)


    # Add question_type column with value "short_form"
    def add_question_type(example):
        example["source"] = dataset_name
        example["question_type"] = "short_form"
        example["messages"] = example["messages"]
        example["dataset"] = "re_search"
        return example

    # Apply the transformation to add the new column
    modified_dataset = original_dataset.map(add_question_type)

    print(f"\nModified dataset columns: {modified_dataset.column_names}")
    print(f"First example with new column:")
    print({k: v for k, v in modified_dataset[0].items() if k != 'ground_truth'})  # Exclude ground_truth for readability


    # upload to new dataset
    modified_dataset.push_to_hub(new_dataset_name, private=True)


Dataset info:
Number of examples: 3610
Features: {'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset']

Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'source', 'question_type']
First example with new column:
{'messages': [{'content': 'when was the last time anyone was on the moon?', 'role': 'user'}], 'dataset': 're_search', 'source': 'nq', 'question_type': 'short_form'}


Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 1576.36ba/s]
Processing Files (1 / 1): 100%|██████████|  216kB /  216kB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.36it/s]


Dataset info:
Number of examples: 17944
Features: {'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset']

Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'source', 'question_type']
First example with new column:
{'messages': [{'content': 'Who was the man behind The Chipmunks?', 'role': 'user'}], 'dataset': 're_search', 'source': 'tqa', 'question_type': 'short_form'}


Creating parquet from Arrow format: 100%|██████████| 18/18 [00:00<00:00, 704.02ba/s]
Processing Files (1 / 1): 100%|██████████| 3.55MB / 3.55MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.30it/s]


Dataset info:
Number of examples: 300
Features: {'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset']

Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'source', 'question_type']
First example with new column:
{'messages': [{'content': 'Who is the mother of the director of film Polish-Russian War (Film)?', 'role': 'user'}], 'dataset': 're_search', 'source': '2wiki', 'question_type': 'short_form'}


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1899.59ba/s]
Processing Files (1 / 1): 100%|██████████| 20.0kB / 20.0kB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.06it/s]


Dataset info:
Number of examples: 7405
Features: {'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset']

Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'source', 'question_type']
First example with new column:
{'messages': [{'content': 'Were Scott Derrickson and Ed Wood of the same nationality?', 'role': 'user'}], 'dataset': 're_search', 'source': 'hotpotqa', 'question_type': 'short_form'}


Creating parquet from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 1457.05ba/s]
Processing Files (1 / 1): 100%|██████████|  625kB /  625kB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.55it/s]


In [None]:
import json
from datasets import load_dataset


original_dataset = load_dataset(f"rulins/webbrowse_rlvr_no_prompt", split="test")
new_dataset_name = f"rl-rag/rl_rag_eval_bc_shortform"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print(f"Features: {original_dataset.features}")
print("\nColumn names:")
print(original_dataset.column_names)


# Add question_type column with value "short_form"
def add_question_type(example):
    example["source"] = "browse_comp"
    example["question_type"] = "short_form"
    example["messages"] = example["messages"]
    example["dataset"] = "re_search"
    return example

# Apply the transformation to add the new column
modified_dataset = original_dataset.map(add_question_type)

print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items() if k != 'ground_truth'})  # Exclude ground_truth for readability


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=True)


Dataset info:
Number of examples: 1266
Features: {'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ground_truth': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None)}

Column names:
['messages', 'ground_truth', 'dataset']

Modified dataset columns: ['messages', 'ground_truth', 'dataset', 'source', 'question_type']
First example with new column:
{'messages': [{'content': "An African author tragically passed away in a tragic road accident. As a child, he'd wanted to be a police officer. He lectured at a private university from 2018 until his death. In 2018, this author spoke about writing stories that have no sell by date in an interview. One of his books was selected to be a compulsory school reading in an African country in 2017. Which years did this author work as a probation officer?", 'role': 'user'}], 'dataset': 're_search', 'source': 'browse_comp', 'question_type': 'short_form'}


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 528.58ba/s]
Processing Files (1 / 1): 100%|██████████|  451kB /  451kB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.64it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/rl-rag/rl_rag_eval_bc_shortform/commit/18e3e3ffa658e7940839896d64d2a74dfce39faa', commit_message='Upload dataset', commit_description='', oid='18e3e3ffa658e7940839896d64d2a74dfce39faa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rl-rag/rl_rag_eval_bc_shortform', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rl-rag/rl_rag_eval_bc_shortform'), pr_revision=None, pr_num=None)

In [17]:
import json
from datasets import load_dataset, Dataset


dataset_name = "bcs_easy"
original_dataset = load_dataset(f"rl-rag/verifiable_synthetic_depth_one_v2_verified", split="train")
new_dataset_name = f"rl-rag/rl_rag_eval_{dataset_name}_shortform"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print("\nColumn names:")
print(original_dataset.column_names)

new_dataset = []
for example in original_dataset:
    new_dataset.append({
        "source": dataset_name,
        "question_type": "short_form",
        "messages": [{"content": example["question"], "role": "user"}],
        "ground_truth": json.dumps([example["answer"]]),
        "dataset": "re_search"
    })

modified_dataset = Dataset.from_list(new_dataset)
print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items()})


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=True, split="test")


Dataset info:
Number of examples: 114

Column names:
['question', 'answer', 'explanation', 'o4_mini_response', 'o4_mini_api_calling', 'o4_mini_parsed_answer', 'o4_mini_is_correct']

Modified dataset columns: ['source', 'question_type', 'messages', 'ground_truth', 'dataset']
First example with new column:
{'source': 'bcs_easy', 'question_type': 'short_form', 'messages': [{'content': 'Which individual, who led troops in a northern campaign during the early era of a major Chinese imperial house, was later the subject of debated accounts in compiled chronicles, with his close family member dying not long afterward and the neighboring rival state significant in these events?', 'role': 'user'}], 'ground_truth': '["Zhao Dezhao"]', 'dataset': 're_search'}


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1305.01ba/s]
Processing Files (1 / 1): 100%|██████████| 30.7kB / 30.7kB,  153kB/s  
New Data Upload: 100%|██████████| 30.7kB / 30.7kB,  153kB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.34it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/rl-rag/rl_rag_eval_bcs_easy_shortform/commit/668dda4eb3b963073c7c5b1fabe9849de06ab99f', commit_message='Upload dataset', commit_description='', oid='668dda4eb3b963073c7c5b1fabe9849de06ab99f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rl-rag/rl_rag_eval_bcs_easy_shortform', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rl-rag/rl_rag_eval_bcs_easy_shortform'), pr_revision=None, pr_num=None)

In [18]:
import json
from datasets import load_dataset, Dataset


dataset_name = "bcs_hard"
original_dataset = load_dataset(f"rl-rag/verifiable_synthetic_varied_depth_o3_verified", split="train")
new_dataset_name = f"rl-rag/rl_rag_eval_{dataset_name}_shortform"

# Examine dataset structure
print("Dataset info:")
print(f"Number of examples: {len(original_dataset)}")
print("\nColumn names:")
print(original_dataset.column_names)

new_dataset = []
for example in original_dataset:
    new_dataset.append({
        "source": dataset_name,
        "question_type": "short_form",
        "messages": [{"content": example["question"], "role": "user"}],
        "ground_truth": json.dumps([example["answer"]]),
        "dataset": "re_search"
    })

modified_dataset = Dataset.from_list(new_dataset)
print(f"\nModified dataset columns: {modified_dataset.column_names}")
print(f"First example with new column:")
print({k: v for k, v in modified_dataset[0].items()})


# upload to new dataset
modified_dataset.push_to_hub(new_dataset_name, private=True, split="test")


Generating train split: 100%|██████████| 101/101 [00:00<00:00, 474.94 examples/s]


Dataset info:
Number of examples: 101

Column names:
['question', 'answer', 'explanation', 'o4_mini_response', 'o4_mini_api_calling', 'o4_mini_parsed_answer', 'o4_mini_is_correct', 'o3_response', 'o3_api_calling', 'o3_parsed_answer', 'o3_is_correct']

Modified dataset columns: ['source', 'question_type', 'messages', 'ground_truth', 'dataset']
First example with new column:
{'source': 'bcs_hard', 'question_type': 'short_form', 'messages': [{'content': "Which historical figure, whose familial and historical connections include an influential historian, a later emperor, a significant court official, a comprehensive chronicle, and a ruler who followed a notable founding emperor, became pivotal during a northern military campaign where accounts in traditional chronicles differ regarding events after the first emperor's reign?", 'role': 'user'}], 'ground_truth': '["Zhao Dezhao"]', 'dataset': 're_search'}


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1430.53ba/s]
Processing Files (1 / 1): 100%|██████████| 35.9kB / 35.9kB,  0.00B/s  
New Data Upload: 100%|██████████| 35.9kB / 35.9kB,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.54it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/rl-rag/rl_rag_eval_bcs_hard_shortform/commit/d11a8104c9e2c1fb91c639fed541ea05474ee0ac', commit_message='Upload dataset', commit_description='', oid='d11a8104c9e2c1fb91c639fed541ea05474ee0ac', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rl-rag/rl_rag_eval_bcs_hard_shortform', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rl-rag/rl_rag_eval_bcs_hard_shortform'), pr_revision=None, pr_num=None)