In [13]:
# Run on CPU only: hide GPUs from CUDA and silence UserWarnings from the DL frameworks
import os
import warnings

# Hide every GPU from CUDA; set ahead of the framework imports that follow.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Ignore UserWarnings originating from either deep-learning framework.
for framework in ('tensorflow', 'torch'):
    warnings.filterwarnings("ignore", category=UserWarning, module=framework)

import pandas as pd
import re
from transformers import pipeline
from rouge_score import rouge_scorer

# Read the train / validation / test CSV splits from the Kaggle input directory
train_df, val_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/text-summarization/train.csv',
        '/kaggle/input/text-summarization/validation.csv',
        '/kaggle/input/text-summarization/test.csv',
    )
]

# Text cleaning function
# Patterns are compiled once because clean_text is applied to every row.
_BRACKETED_RE = re.compile(r'\[.*?\]', re.DOTALL)
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Return a lowercased, bracket-free, whitespace-normalised copy of *text*.

    Steps, in order:
      1. Missing values (``None`` or float NaN) become the empty string
         instead of the literal text ``"none"`` / ``"nan"``.
      2. The value is converted with ``str()`` and lowercased.
      3. Every ``[...]`` span is removed (shortest match, may span lines).
      4. Runs of whitespace are collapsed to a single space and the ends
         are stripped.

    Whitespace is normalised *after* bracket removal so that deleting a
    span such as ``"a [x] b"`` does not leave a double space behind.

    Args:
        text: Any value; non-strings are stringified.

    Returns:
        The cleaned string (possibly empty).
    """
    # NaN is the only float that is not equal to itself; this avoids needing
    # math/pandas just to detect an empty cell.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    without_brackets = _BRACKETED_RE.sub('', lowered)
    return _WHITESPACE_RE.sub(' ', without_brackets).strip()

# Add cleaned text columns to every split
# ('highlights' is the column used as the reference summary)
for df in (train_df, val_df, test_df):
    for cleaned_col, raw_col in (
        ('clean_article', 'article'),
        ('clean_summary', 'highlights'),
    ):
        df[cleaned_col] = df[raw_col].apply(clean_text)

# Summarization pipeline (BART for CNN/DailyMail); device=-1 keeps it on the CPU
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=-1,
)

# Abstractive summarization function
def abstractive_summary(text, max_len=120, min_len=30, max_words=400):
    """Summarize *text* with the module-level ``summarizer`` pipeline.

    Args:
        text: Article text (a string).
        max_len: Passed to the pipeline as ``max_length``.
        min_len: Passed to the pipeline as ``min_length``.
        max_words: Keep only this many leading whitespace-separated words
            before summarizing. ``None`` disables the word cap.

    Returns:
        The generated summary string, or ``""`` when there is nothing to
        summarize (empty / whitespace-only text, or ``max_words=0``).
    """
    words = text.split()[:max_words]  # Truncate long articles
    if not words:
        # Nothing to summarize; skip the model call entirely.
        return ""

    truncated = ' '.join(words)
    # The word cap only approximates the model's token limit, so also let the
    # tokenizer truncate as a safety net for unusually token-dense input.
    result = summarizer(
        truncated,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        truncation=True,
    )
    return result[0]['summary_text']

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

# Summarize and score the first row of each split
datasets = {
    'Train': train_df,
    'Validation': val_df,
    'Test': test_df,
}

for name, df in datasets.items():
    print(f"\n\n {name} Dataset Example:\n{'=' * 30}")

    first_row = df.iloc[0]
    article = first_row['clean_article']
    reference = first_row['clean_summary']
    generated = abstractive_summary(article)

    # Only the first 500 characters of the article are shown
    print(" Original Article:\n", article[:500], "...\n")
    print(" Generated Summary:\n", generated)
    print(" Reference Summary:\n", reference)

    # ROUGE evaluation: the reference is passed first, the generated text second
    score = scorer.score(reference, generated)
    print(" ROUGE Scores:", score)


Device set to use cpu




 Train Dataset Example:
 Original Article:
 by . associated press . published: . 14:11 est, 25 october 2013 . | . updated: . 15:36 est, 25 october 2013 . the bishop of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a virus in late september and early october. the state health department has issued an advisory of exposure for anyone who attended five churches and took communion. bishop john folda (pictured) of the fargo catholic diocese in n ...

 Generated Summary:
 Bishop john folda of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a virus. The state health department has issued an advisory of exposure for anyone who attended five churches and took communion.
 Reference Summary:
 bishop john folda, of north dakota, is taking time off after being diagnosed . he contracted the infection t