In [1]:
from datasets import load_dataset
# Fetch the two summarization corpora used in this notebook.
# CNN/DailyMail is requested with its "3.0.0" configuration; XSum is loaded
# with trust_remote_code=True so the dataset's own loading code may run.
dataset1 = load_dataset(path="cnn_dailymail", name="3.0.0")
dataset2 = load_dataset(path="xsum", trust_remote_code=True)

# A single newline-separated print emits the same text as two back-to-back prints.
print(dataset1, dataset2, sep="\n")


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})
DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})


In [2]:
# Split names exposed by each DatasetDict (CNN/DailyMail first, then XSum).
print(dataset1.keys(), dataset2.keys(), sep="\n")

# First training record of each corpus, in the same order.
print(dataset1["train"][0], dataset2["train"][0], sep="\n")


dict_keys(['train', 'validation', 'test'])
dict_keys(['train', 'validation', 'test'])
{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hoste

In [3]:
import pandas as pd

# Convert the training splits to pandas via Dataset.to_pandas(), which converts
# the underlying Arrow table directly. Passing the Dataset to pd.DataFrame()
# instead iterates every row as a Python dict, which is very slow for
# hundreds of thousands of long articles.

# CNN/DailyMail: keep only the source text and its reference summary.
df_cnn = dataset1["train"].to_pandas()[["article", "highlights"]]

# XSum: same idea, with that corpus' column names.
df_xsum = dataset2["train"].to_pandas()[["document", "summary"]]

# Display sample data
print(df_cnn.head())
print(df_xsum.head())


                                             article  \
0  LONDON, England (Reuters) -- Harry Potter star...   
1  Editor's note: In our Behind the Scenes series...   
2  MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...   
3  WASHINGTON (CNN) -- Doctors removed five small...   
4  (CNN)  -- The National Football League has ind...   

                                          highlights  
0  Harry Potter star Daniel Radcliffe gets £20M f...  
1  Mentally ill inmates in Miami are housed on th...  
2  NEW: "I thought I was going to die," driver sa...  
3  Five small polyps found during procedure; "non...  
4  NEW: NFL chief, Atlanta Falcons owner critical...  
                                            document  \
0  The full cost of damage in Newton Stewart, one...   
1  A fire alarm went off at the Holiday Inn in Ho...   
2  Ferrari appeared in a position to challenge un...   
3  John Edward Bates, formerly of Spalding, Linco...   
4  Patients and staff were evacuated from Cerahpa... 

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer published under the "facebook/bart-large-cnn" checkpoint
# name; the tokenization functions in later cells read this global.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")


In [5]:
def tokenize_function(examples):
    """Tokenize the ``article`` field of a batch with the global ``tokenizer``.

    Args:
        examples: Mapping with an ``"article"`` entry holding the text(s)
            to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts. Padding is requested
        as ``"max_length"`` with truncation enabled; no explicit
        ``max_length`` is passed, so the target length is left to the
        tokenizer's own configuration.
    """
    articles = examples["article"]
    encoded = tokenizer(articles, truncation=True, padding="max_length")
    return encoded

# Apply tokenize_function across dataset1 in batched mode and keep the result.
# NOTE(review): later cells rebind tokenized_cnn_dataset, so this result is
# only used if those cells are skipped.
tokenized_cnn_dataset = dataset1.map(tokenize_function, batched=True)


In [6]:
from datasets import load_from_disk

import os

TOKENIZED_CNN_PATH = "tokenized_cnn_dataset"

# Restart-safe cache. No other cell ever writes this directory, so an
# unconditional load_from_disk() fails on a fresh checkout. Reuse the cached
# copy when it exists; otherwise persist the dataset tokenized in the previous
# cell so the next run can load it back.
# Delete the directory to force re-tokenization after changing the tokenizer
# or its arguments.
if os.path.isdir(TOKENIZED_CNN_PATH):
    tokenized_cnn_dataset = load_from_disk(TOKENIZED_CNN_PATH)  # Load it back
else:
    tokenized_cnn_dataset.save_to_disk(TOKENIZED_CNN_PATH)


In [7]:
from torch.utils.data import DataLoader

# The tokenized split still carries raw string columns and stores token ids as
# plain Python lists. PyTorch's default collate function would turn each list
# into one tensor per token position instead of a (batch, seq_len) tensor.
# Expose only the model-facing columns, formatted as torch tensors, so every
# batch is a dict of stacked tensors.
_train_split = tokenized_cnn_dataset["train"]
_tensor_columns = [
    name
    for name in ("input_ids", "attention_mask", "labels")
    if name in _train_split.column_names
]

train_dataloader = DataLoader(
    _train_split.with_format("torch", columns=_tensor_columns),
    batch_size=8,
    shuffle=True,
)


In [8]:
from transformers import BartForConditionalGeneration

# Load the pretrained BART summarization weights published under the
# "facebook/bart-large-cnn" checkpoint name (same name the tokenizer cell uses).
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")


In [9]:
from transformers import TrainingArguments

# Hyperparameters and checkpoint policy for fine-tuning.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm which name the installed version accepts.
training_args = TrainingArguments(
    # -- output / checkpointing --
    output_dir="./results",          # checkpoints are written here
    save_total_limit=2,              # retain at most two checkpoints on disk
    # -- schedule --
    num_train_epochs=3,              # full passes over the training split
    evaluation_strategy="epoch",     # run evaluation once per epoch
    # -- batch sizes (same value as the manual DataLoader's batch_size) --
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # -- optimisation --
    learning_rate=2e-5,              # tune if training is unstable or slow
    weight_decay=0.01,               # regularisation against overfitting
)




In [10]:
from transformers import AutoModelForSeq2SeqLM

# Checkpoint name handed to AutoModelForSeq2SeqLM on the next line.
MY_MODEL = "facebook/bart-large-cnn"  # You can also use "t5-small" or another model
# NOTE(review): the tokenizer cells hard-code "facebook/bart-large-cnn", so
# changing MY_MODEL alone would pair this model with a different tokenizer.
# NOTE(review): no visible cell reads MODEL_BART afterwards, and `model` is
# already loaded from the same checkpoint name — confirm this copy is needed.
MODEL_BART = AutoModelForSeq2SeqLM.from_pretrained(MY_MODEL)


In [11]:
from transformers import AutoTokenizer

# Rebinds the global `tokenizer` from the same "facebook/bart-large-cnn"
# checkpoint name that the earlier tokenizer cell already used.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

def preprocess_function(examples):
    """Encode the ``article`` field of a batch to a fixed width of 614 tokens.

    Args:
        examples: Mapping with an ``"article"`` entry holding the text(s)
            to encode.

    Returns:
        The global ``tokenizer``'s output for those texts, requested with
        truncation and ``"max_length"`` padding at ``max_length=614``.
    """
    articles = examples["article"]
    encoded = tokenizer(
        articles,
        max_length=614,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Apply the preprocess_function defined just above (inputs only, no labels)
# across dataset1 in batched mode, rebinding tokenized_cnn_dataset.
tokenized_cnn_dataset = dataset1.map(preprocess_function, batched=True)


Map:   0%|          | 0/13368 [00:02<?, ? examples/s]

In [12]:
def preprocess_function(examples):
    """Tokenize articles as model inputs and highlights as training labels.

    Args:
        examples: Mapping with ``"article"`` and ``"highlights"`` entries,
            each holding a list of strings (one batch).

    Returns:
        The tokenizer output for the articles (padded/truncated to 614
        tokens) with an added ``"labels"`` entry: the highlight token ids
        (padded/truncated to 150 tokens) in which every padding position is
        replaced by ``-100``.
    """
    model_inputs = tokenizer(
        examples["article"],
        max_length=614,          # fixed input width
        padding="max_length",
        truncation=True,
    )

    labels = tokenizer(
        examples["highlights"],
        max_length=150,          # summaries are much shorter than articles
        padding="max_length",
        truncation=True,
    )

    # Padding positions must not contribute to the loss. -100 is the label
    # value the cross-entropy loss ignores; leaving the real pad token id in
    # place would train the model to predict padding.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs

# Apply the preprocess_function defined just above (inputs plus "labels")
# across dataset1 in batched mode, rebinding tokenized_cnn_dataset.
tokenized_cnn_dataset = dataset1.map(preprocess_function, batched=True)


Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

In [None]:
# Sanity check: every tokenized training example must be exactly 614 ids long,
# i.e. the padding/truncation requested during preprocessing took effect.
for row in tokenized_cnn_dataset["train"]:
    n_tokens = len(row["input_ids"])
    assert n_tokens == 614, f"Found length {n_tokens}"


In [None]:
from transformers import Trainer

# No earlier cell creates `trainer`, so calling trainer.train() directly raises
# NameError on a fresh kernel. Build it from the objects defined above.
# evaluation_strategy="epoch" in training_args requires an eval_dataset.
# Trainer's default remove_unused_columns=True drops the raw text columns
# before batches reach the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cnn_dataset["train"],
    eval_dataset=tokenized_cnn_dataset["validation"],
)

trainer.train()
