In [2]:
from datasets import load_dataset

In [4]:
ds = load_dataset("brimmann2/squad-v2-sampled")

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['gem_id', 'id', 'title', 'context', 'question', 'target', 'references', 'answers', 'embeddings', 'generated_text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['gem_id', 'id', 'title', 'context', 'question', 'target', 'references', 'answers', 'embeddings', 'generated_text'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['gem_id', 'id', 'title', 'context', 'question', 'target', 'references', 'answers', 'embeddings', 'generated_text'],
        num_rows: 250
    })
})

In [10]:
# Assuming your DatasetDict is named dataset_dict

print("--- Finding rows with '<xRAG>' ---")
for split in ds:
    rows_with_xrag = ds[split].filter(
        lambda example: '<xRAG>' in example['generated_text']
    )
    print(f"Found {len(rows_with_xrag)} rows with '<xRAG>' in the '{split}' split.")
    
    # You can uncomment the following lines to inspect an example
    # if len(rows_with_xrag) > 0:
    #     print("Example text:")
    #     print(rows_with_xrag[0]['generated_text'])
    #     print("-" * 20)

--- Finding rows with '<xRAG>' ---
Found 41 rows with '<xRAG>' in the 'train' split.
Found 12 rows with '<xRAG>' in the 'test' split.
Found 9 rows with '<xRAG>' in the 'validation' split.


In [11]:
# Define a function to remove the tag
def remove_xrag_tag(example):
    # .replace() will remove all occurrences of the substring
    example['generated_text'] = example['generated_text'].replace('<xRAG>', '')
    return example

# Apply the function to the whole dataset
cleaned_dataset_dict = ds.map(remove_xrag_tag)

print("\n--- Finished removing '<xRAG>' ---")


--- Finished removing '<xRAG>' ---


In [12]:
print("\n--- Verifying the changes ---")
# Check the 'train' split of the new, cleaned dataset
rows_after_cleaning = cleaned_dataset_dict['train'].filter(
    lambda example: '<xRAG>' in example['generated_text']
)

print(f"Found {len(rows_after_cleaning)} rows with '<xRAG>' in the cleaned 'train' split.")
# This should output 0.

# You can also compare an original example with its cleaned version
try:
    # Find an example that originally had the tag
    original_example = ds['train'].filter(lambda ex: '<xRAG>' in ex['generated_text'])[0]
    
    # Find the same example in the cleaned dataset using its 'id'
    example_id = original_example['id']
    cleaned_example = cleaned_dataset_dict['train'].filter(lambda ex: ex['id'] == example_id)[0]

    print("\n--- Before cleaning ---")
    print(original_example['generated_text'])

    print("\n--- After cleaning ---")
    print(cleaned_example['generated_text'])
except IndexError:
    print("\nNo examples with '<xRAG>' found in the original 'train' split to compare.")



--- Verifying the changes ---


Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Found 0 rows with '<xRAG>' in the cleaned 'train' split.


Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


--- Before cleaning ---
- The aircraft is not able to take off from a short runway.
- The aircraft is not able to land on a short runway.
- The aircraft is not able to take off from a runway with a high <xRAG> slope.
- The aircraft is not able to land on a runway with a high slope.
- The aircraft is not able to take off from a runway with a high headwind.
- The aircraft is not able to land on a runway with a high headwind.
- The aircraft is not able to take off from a runway with a high tailwind.
- The aircraft is not able to land on a runway with a high tailwind.
- The aircraft is not able to take off from a runway with a high crosswind.
- The aircraft is not able to land on a runway with a high crosswind.
- The aircraft is not able to take off from a runway with

--- After cleaning ---
- The aircraft is not able to take off from a short runway.
- The aircraft is not able to land on a short runway.
- The aircraft is not able to take off from a runway with a high  slope.
- The aircraf

In [13]:
# Assuming cleaned_dataset_dict is your dataset after running the map function

print("\n--- Verifying the entire cleaned dataset ---")

for split_name, dataset_split in cleaned_dataset_dict.items():
    print(f"Verifying split: '{split_name}'...")
    
    # Filter the split to find any remaining instances
    rows_with_xrag = dataset_split.filter(
        lambda example: '<xRAG>' in example['generated_text']
    )
    
    # Assert that the count of such rows is zero
    count = len(rows_with_xrag)
    assert count == 0, f"Found {count} rows with '<xRAG>' in the '{split_name}' split after cleaning!"
    
    print(f"-> Verification successful for '{split_name}'. No '<xRAG>' tags found.")

print("\n✅ Verification complete. All splits are clean.")


--- Verifying the entire cleaned dataset ---
Verifying split: 'train'...
-> Verification successful for 'train'. No '<xRAG>' tags found.
Verifying split: 'test'...


Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

-> Verification successful for 'test'. No '<xRAG>' tags found.
Verifying split: 'validation'...


Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

-> Verification successful for 'validation'. No '<xRAG>' tags found.

✅ Verification complete. All splits are clean.


In [14]:
cleaned_dataset_dict.push_to_hub("brimmann2/squad-v2-sampled")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/brimmann2/squad-v2-sampled/commit/c18493ce0910dbf81b2925c4a6f5209e0846ad6d', commit_message='Upload dataset', commit_description='', oid='c18493ce0910dbf81b2925c4a6f5209e0846ad6d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/brimmann2/squad-v2-sampled', endpoint='https://huggingface.co', repo_type='dataset', repo_id='brimmann2/squad-v2-sampled'), pr_revision=None, pr_num=None)