In [1]:
from datasets import load_dataset

# Load XSum dataset
xsum = load_dataset("EdinburghNLP/xsum")

# Print dataset structure
print(xsum)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/300M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})


In [2]:
# Check available splits
xsum.keys()


dict_keys(['train', 'validation', 'test'])

In [3]:
# View a single sample
sample = xsum["train"][0]

for key, value in sample.items():
    print(f"{key}:\n{value}\n")


document:
The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.
"It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate that - bu

In [4]:
import pandas as pd

df = pd.DataFrame(xsum["train"])
df.head()


Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984


In [5]:
print("Train:", len(xsum["train"]))
print("Validation:", len(xsum["validation"]))
print("Test:", len(xsum["test"]))


Train: 204045
Validation: 11332
Test: 11334


In [6]:
# Length analysis
df["doc_length"] = df["document"].str.split().apply(len)
df["summary_length"] = df["summary"].str.split().apply(len)

df[["doc_length", "summary_length"]].describe()


Unnamed: 0,doc_length,summary_length
count,204045.0,204045.0
mean,373.864633,21.097645
std,304.632089,5.236819
min,0.0,1.0
25%,176.0,18.0
50%,295.0,21.0
75%,491.0,24.0
max,29189.0,70.0


Preprocessing the 'EdinburghNLP/xsum' dataset by identifying and handling missing values, removing empty or excessively short entries, and eliminating duplicate entries in the 'document' and 'summary' columns across all train, validation, test.

In [7]:
for split_name, dataset_split in xsum.items():
    print(f"\n--- Checking split: {split_name} ---")
    df_split = pd.DataFrame(dataset_split)

    for col in ['document', 'summary']:
        # Count null values
        null_count = df_split[col].isnull().sum()
        print(f"'{col}' column - Null values: {null_count}")

        # Count empty strings (including whitespace-only)
        empty_string_count = df_split[col].apply(lambda x: isinstance(x, str) and not x.strip()).sum()
        print(f"'{col}' column - Empty/Whitespace strings: {empty_string_count}")



--- Checking split: train ---
'document' column - Null values: 0
'document' column - Empty/Whitespace strings: 28
'summary' column - Null values: 0
'summary' column - Empty/Whitespace strings: 0

--- Checking split: validation ---
'document' column - Null values: 0
'document' column - Empty/Whitespace strings: 5
'summary' column - Null values: 0
'summary' column - Empty/Whitespace strings: 0

--- Checking split: test ---
'document' column - Null values: 0
'document' column - Empty/Whitespace strings: 1
'summary' column - Null values: 0
'summary' column - Empty/Whitespace strings: 0


## Remove empty or excessively short entries

In [8]:
min_doc_length = 10
min_summary_length = 3

for split_name in xsum.keys():
    original_count = len(xsum[split_name])

    # Filter function
    def filter_min_length(example):
        doc_len = len(example['document'].strip().split())
        summary_len = len(example['summary'].strip().split())
        return doc_len >= min_doc_length and summary_len >= min_summary_length

    xsum[split_name] = xsum[split_name].filter(filter_min_length)

    removed_count = original_count - len(xsum[split_name])
    print(f"--- Split: {split_name} ---")
    print(f"Original entries: {original_count}")
    print(f"Entries after length filtering: {len(xsum[split_name])}")
    print(f"Entries removed: {removed_count}\n")

print("Dataset after removing empty or excessively short entries:")
print(xsum)

Filter:   0%|          | 0/204045 [00:00<?, ? examples/s]

--- Split: train ---
Original entries: 204045
Entries after length filtering: 203896
Entries removed: 149



Filter:   0%|          | 0/11332 [00:00<?, ? examples/s]

--- Split: validation ---
Original entries: 11332
Entries after length filtering: 11318
Entries removed: 14



Filter:   0%|          | 0/11334 [00:00<?, ? examples/s]

--- Split: test ---
Original entries: 11334
Entries after length filtering: 11329
Entries removed: 5

Dataset after removing empty or excessively short entries:
DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 203896
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11318
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11329
    })
})


## Perform basic text cleaning


In [10]:
for split_name in xsum.keys():
    print(f"Applying cleaning to split: {split_name}")
    xsum[split_name] = xsum[split_name].map(
        lambda example: {
            'document': example['document'].strip(),
            'summary': example['summary'].strip(),
            'id': example['id']
        },
        batched=False  # Apply function to individual examples
    )
print("Finished applying whitespace stripping to 'document' and 'summary' columns.")

Applying cleaning to split: train


Map:   0%|          | 0/203896 [00:00<?, ? examples/s]

Applying cleaning to split: validation


Map:   0%|          | 0/11318 [00:00<?, ? examples/s]

Applying cleaning to split: test


Map:   0%|          | 0/11329 [00:00<?, ? examples/s]

Finished applying whitespace stripping to 'document' and 'summary' columns.



Leading and trailing whitespace were successfully removed from the 'document' and 'summary' columns across all dataset splits (train, validation, and test). The cleaning operation was applied to every example in each split using the `.map()` function, preserving the 'id' column.


## Remove duplicate entries

In [9]:
for split_name in xsum.keys():
    original_count = len(xsum[split_name])

    # Convert to pandas DataFrame to easily handle duplicates
    df_split = pd.DataFrame(xsum[split_name])

    # Remove duplicates based on 'document' and 'summary'
    df_deduplicated = df_split.drop_duplicates(subset=['document', 'summary'])

    # Update the dataset split
    xsum[split_name] = xsum[split_name].filter(lambda example: example['id'] in df_deduplicated['id'].values)

    removed_count = original_count - len(xsum[split_name])
    print(f"--- Split: {split_name} ---")
    print(f"Original entries: {original_count}")
    print(f"Entries after duplicate removal: {len(xsum[split_name])}")
    print(f"Entries removed: {removed_count}\n")

print("Dataset after removing duplicate entries:")
print(xsum)

Filter:   0%|          | 0/203896 [00:00<?, ? examples/s]

KeyboardInterrupt: 