In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.18-cp311-cp311-win_amd64.whl.metadata (8.0 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.

In [3]:
from datasets import load_dataset
# Fetch the CNN/DailyMail corpus, configuration "3.0.0", via the `datasets` library.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
pip install spacy transformers datasets


In [None]:
# Download the small English spaCy pipeline that is loaded later in this notebook.
# A bare `python -m ...` line is a shell command, not Python, so it raises a
# SyntaxError when executed as a code cell. spaCy's own download helper performs
# the same installation from inside the kernel's environment.
# NOTE(review): a kernel restart may be needed before `spacy.load` can see a
# freshly installed model package - confirm on the target setup.
import spacy.cli

spacy.cli.download("en_core_web_sm")



In [None]:
import re

def preprocess(text):
    """Collapse every run of whitespace in `text` to one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

# Run the whitespace clean-up over both text columns of the dataset.
def _clean_example(example):
    """Return the whitespace-normalised article and highlights of one example."""
    cleaned = {}
    for column in ('article', 'highlights'):
        cleaned[column] = preprocess(example[column])
    return cleaned

dataset = dataset.map(_clean_example)


In [None]:
import spacy

# Small English spaCy pipeline; stored in `nlp` for use by the cells below.
nlp = spacy.load(name="en_core_web_sm")

def extractive_summary(text, num_sentences=3):
    """Build a summary from the leading sentences of `text`.

    The text is run through the module-level `nlp` pipeline and split into
    sentences; the first `num_sentences` of them are joined with single spaces.

    Args:
        text: The document to summarise.
        num_sentences: Upper bound of the slice taken from the sentence list.

    Returns:
        The selected sentences as one space-separated string.
    """
    sentence_texts = [span.text for span in nlp(text).sents]
    lead = sentence_texts[:num_sentences]
    return ' '.join(lead)

# Demo: summarise the first article of the training split.
first_example = dataset['train'][0]
sample_article = first_example['article']
print(extractive_summary(sample_article))


In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments

# The tokenizer and the model weights are loaded from the same pretrained checkpoint.
checkpoint = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)

# Tokenize the dataset
def tokenize_function(example):
    """Tokenize articles and attach the tokenized highlights as labels.

    Args:
        example: Mapping with "article" and "highlights" entries. When used with
            `dataset.map(..., batched=True)` each entry is a list of strings.

    Returns:
        The tokenizer output for the articles (truncated to 1024 tokens) with an
        added "labels" entry holding the highlight token ids (truncated to 128).
    """
    inputs = tokenizer(example["article"], max_length=1024, truncation=True)
    # `text_target=` is the supported way to encode decoder-side text; it replaces
    # the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=example["highlights"], max_length=128, truncation=True)
    inputs["labels"] = labels["input_ids"]
    return inputs

from transformers import DataCollatorForSeq2Seq

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# The tokenized examples are not padded, so sequences in a batch have different
# lengths and cannot be stacked into tensors as-is. This collator pads the inputs
# per batch and pads the labels with -100 so padded positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
)

# Fixed seed so the same random subsets are drawn on every run.
SUBSET_SEED = 42
train_subset = tokenized_dataset["train"].shuffle(seed=SUBSET_SEED).select(range(1000))  # Subset for quick training
eval_subset = tokenized_dataset["validation"].shuffle(seed=SUBSET_SEED).select(range(100))

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    data_collator=data_collator,
)

# Train the model
trainer.train()


In [None]:
from datasets import load_metric

from transformers import pipeline

# Load ROUGE metric
# NOTE(review): `load_metric` appears to have been removed from `datasets` in the
# 3.x series (the install log above shows datasets 3.5.0), in which case the
# import at the top of this cell fails. The maintained replacement lives in the
# separate `evaluate` package (`evaluate.load("rouge")`), which this notebook does
# not install - confirm and migrate.
rouge = load_metric("rouge")

# `summarizer` is not defined anywhere else in this notebook, so build it here
# from the model and tokenizer loaded above.
# NOTE(review): this evaluates the in-memory (fine-tuned) model - confirm that is
# the intended system rather than the untouched pretrained checkpoint.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Generate summaries and compute ROUGE scores
predictions = []
references = []

for example in dataset["test"].select(range(100)):
    # truncation=True keeps articles longer than the model's input limit from
    # overflowing it.
    generated = summarizer(
        example["article"],
        max_length=130,
        min_length=30,
        do_sample=False,
        truncation=True,
    )
    summary = generated[0]['summary_text']
    predictions.append(summary)
    references.append(example["highlights"])

# Compute ROUGE scores
results = rouge.compute(predictions=predictions, references=references)
print(results)


📝 Project Overview: Text Summarization with CNN/Daily Mail Dataset
This project focuses on implementing both extractive and abstractive summarization techniques on the CNN/Daily Mail dataset. The key steps undertaken are:

Data Loading: Utilized the datasets library to load the "cnn_dailymail" dataset (version 3.0.0).

Preprocessing: Cleaned the articles and reference summaries (highlights) by collapsing runs of whitespace into single spaces and trimming leading and trailing whitespace.

Extractive Summarization: Used spaCy's small English model (en_core_web_sm) to split an article into sentences and take the first three as a basic lead-based extractive summary, demonstrated on a sample training article.

Abstractive Summarization: Employed Hugging Face's Transformers library with the pre-trained facebook/bart-large-cnn model to generate abstractive summaries.

Model Fine-Tuning: Fine-tuned the BART model for one epoch on a small subset of the dataset (1,000 training examples, with 100 validation examples for evaluation) to improve summarization quality.

Evaluation: Assessed the performance of the summarization model using ROUGE metrics, comparing the summaries generated for 100 test articles against their reference summaries.

Reporting: Documented the entire process and results within this Jupyter Notebook for reproducibility and further analysis.