In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q transformers datasets sumy nltk torch sentencepiece

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Building wheel for docopt (setup.py) ... [?25l[?25hdone


In [3]:

import pandas as pd
import nltk
nltk.download('punkt')

from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

from datasets import load_dataset, Dataset

from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    TrainingArguments,
    Trainer
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
DATA_PATH = "/content/drive/MyDrive/news_project/data/cleaned/balanced_10000_records.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,News ID,Category,Topic,Headline,News body,Title entity,Entity content
0,N27021,travel,travelnews,Most Dangerous Vacation Destinations,The following list is based on travel alerts a...,{},{}
1,N105998,music,musicnews,Pharrell Williams Guarantees Internships to 11...,Pharrell Williams surprised the 2019 graduatin...,{'Harlem': 'Harlem'},"{'Harlem': {'type': 'item', 'id': 'Q105676692'..."
2,N83893,foodanddrink,newstrends,Eat up! Grandwich competition starts July 1,"GRAND RAPIDS, Mich. - One of Grand Rapids' tas...",{},{}
3,N62196,news,newsus,DHS predicts up to 25 percent drop in migrant ...,WASHINGTON Border Patrol agents are on track...,{},{}
4,N77148,music,music-celebrity,Katy Perry Steps Out in Lingerie Look for Date...,Katy Perry's latest look works for a night in ...,"{'Orlando Bloom': 'Orlando Bloom', 'London': '...","{'Orlando Bloom': {'type': 'item', 'id': 'Q444..."


In [6]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
sports,1128
news,1037
finance,731
lifestyle,674
foodanddrink,641
autos,639
video,636
travel,636
health,614
weather,611


In [7]:
CANDIDATES = ["News body", "body", "text", "content", "article", "news"]
text_col = None
for c in CANDIDATES:
    if c in df.columns:
        text_col = c
        break

if text_col is None:
    # Fallback: choose the column with the longest average string length
    str_cols = [c for c in df.columns if df[c].dtype == "object"]
    if not str_cols:
        raise ValueError("❌ No string/text columns found in PENS dataset.")
    avg_lengths = {c: df[c].astype(str).str.len().mean() for c in str_cols}
    text_col = max(avg_lengths, key=avg_lengths.get)

print(f"✅ Using column '{text_col}' as article text.")

✅ Using column 'News body' as article text.


In [8]:
# Standardize text column name to 'text'
df["text"] = df[text_col].astype(str)

In [9]:
df.head()

Unnamed: 0,News ID,Category,Topic,Headline,News body,Title entity,Entity content,text
0,N27021,travel,travelnews,Most Dangerous Vacation Destinations,The following list is based on travel alerts a...,{},{},The following list is based on travel alerts a...
1,N105998,music,musicnews,Pharrell Williams Guarantees Internships to 11...,Pharrell Williams surprised the 2019 graduatin...,{'Harlem': 'Harlem'},"{'Harlem': {'type': 'item', 'id': 'Q105676692'...",Pharrell Williams surprised the 2019 graduatin...
2,N83893,foodanddrink,newstrends,Eat up! Grandwich competition starts July 1,"GRAND RAPIDS, Mich. - One of Grand Rapids' tas...",{},{},"GRAND RAPIDS, Mich. - One of Grand Rapids' tas..."
3,N62196,news,newsus,DHS predicts up to 25 percent drop in migrant ...,WASHINGTON Border Patrol agents are on track...,{},{},WASHINGTON Border Patrol agents are on track...
4,N77148,music,music-celebrity,Katy Perry Steps Out in Lingerie Look for Date...,Katy Perry's latest look works for a night in ...,"{'Orlando Bloom': 'Orlando Bloom', 'London': '...","{'Orlando Bloom': {'type': 'item', 'id': 'Q444...",Katy Perry's latest look works for a night in ...


In [10]:
# ==========================================================
# 5. Clean Text for Summarization
# ==========================================================
import re
def clean_text(t: str) -> str:
    t = str(t)
    # Remove email addresses
    t = re.sub(r"\S+@\S+", " ", t)
    # Remove URLs
    t = re.sub(r"http\S+|www\.\S+", " ", t)
    # Remove boilerplate/news artifacts
    t = re.sub(r"RELATED:.*", " ", t)
    t = re.sub(r"Contact .*", " ", t)
    t = re.sub(r"Follow .*", " ", t)
    t = re.sub(r"Credit:.*", " ", t)
    t = re.sub(r"This article originally appeared.*", " ", t)
    # Collapse multiple spaces/newlines
    t = re.sub(r"\s+", " ", t)
    return t.strip()

In [11]:
print("🔹 Cleaning text...")
df["text"] = df["text"].fillna("").astype(str).apply(clean_text)

🔹 Cleaning text...


In [12]:
df['text'][9999]

'Hollywood superstars Elizabeth Taylor and Richard Burton divorced for the first time. (June 26)'

In [13]:
# Drop empty or too short texts
df = df[df["text"].str.len() > 80].reset_index(drop=True)
print("✅ Rows after basic cleaning:", len(df))

✅ Rows after basic cleaning: 9468


In [14]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
sports,1097
news,1010
finance,716
lifestyle,646
travel,614
autos,611
health,604
foodanddrink,598
music,569
tv,558


In [15]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [16]:
# ==========================================================
# 6. Generate Extractive Summaries (TextRank)
# ==========================================================
def auto_summary(text: str, num_sentences: int = 3) -> str:
    if not isinstance(text, str) or len(text.strip()) == 0:
        return ""
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    textrank_summarizer = TextRankSummarizer()
    sentences = textrank_summarizer(parser.document, num_sentences)
    return " ".join(str(s) for s in sentences)

print("🔹 Generating pseudo-reference summaries with TextRank...")
df["summary"] = df["text"].apply(lambda x: auto_summary(x, num_sentences=4))

🔹 Generating pseudo-reference summaries with TextRank...


In [17]:
df['summary'].head(10)

Unnamed: 0,summary
0,The following list is based on travel alerts a...
1,"The Academy in Harlem, N.Y., is a charter scho..."
2,For people with an adventurous palate who love...
3,WASHINGTON Border Patrol agents are on track t...
4,The 33-year-old pop star was spotted out in Lo...
5,Westside residents are upset after someone van...
6,"But the beach, ah, the beach, is, in its total..."
7,"The justices, in a 5-4 decision with the court..."
8,You've never seen the elephants at the zoo lik...
9,"For a start, the new entry deftly scales down ..."


In [18]:
len(df['summary'])

9468

In [19]:
df['summary'][9467]

'Hollywood superstars Elizabeth Taylor and Richard Burton divorced for the first time. (June 26)'

In [20]:
# Filter out any bad summaries
df = df[df["summary"].str.len() > 40].reset_index(drop=True)
print("✅ Rows after removing bad summaries:", len(df))

✅ Rows after removing bad summaries: 9467


In [21]:
# Save cleaned dataset with summaries
CLEAN_CSV_PATH = '/content/drive/MyDrive/news_project/data/cleaned/pens_clean_with_summaries.csv'
df[["text", "summary"]].to_csv(CLEAN_CSV_PATH, index=False)
print("✅ Cleaned dataset with summaries saved to:", CLEAN_CSV_PATH)

✅ Cleaned dataset with summaries saved to: /content/drive/MyDrive/news_project/data/cleaned/pens_clean_with_summaries.csv


In [22]:
# ==========================================================
# 7. Build HuggingFace Dataset & Train/Test Split
# ==========================================================
raw_dataset = Dataset.from_pandas(df[["text", "summary"]])
raw_dataset = raw_dataset.train_test_split(test_size=0.1, seed=42)
print(raw_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 8520
    })
    test: Dataset({
        features: ['text', 'summary'],
        num_rows: 947
    })
})


In [4]:
# ==========================================================
# 8. Load Pretrained BART
# ==========================================================
MODEL_NAME = "facebook/bart-large-cnn"
print("Loading tokenizer & model:", MODEL_NAME)
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)
print("Loading Completed")

Loading tokenizer & model: facebook/bart-large-cnn


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Loading Completed


In [23]:
# ==========================================================
# 9. Tokenization Function
# ==========================================================
def preprocess(batch):
    articles = [str(x) for x in batch["text"]]
    summaries = [str(x) for x in batch["summary"]]

    model_inputs = tokenizer(
        articles,
        max_length=512,
        padding="max_length",
        truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            summaries,
            max_length=160,
            padding="max_length",
            truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("🔹 Tokenizing train and test splits...")
tokenized_dataset = raw_dataset.map(
    preprocess,
    batched=True,
    remove_columns=raw_dataset["train"].column_names,
)

print(tokenized_dataset)

🔹 Tokenizing train and test splits...


Map:   0%|          | 0/8520 [00:00<?, ? examples/s]



Map:   0%|          | 0/947 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8520
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 947
    })
})


In [25]:
# ==========================================================
# 10. Training Configuration
# ==========================================================
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/news_project/Summary_Trans",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    fp16=True,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=500,
    eval_strategy="epoch",
    report_to="none",  # avoids Weights & Biases
)

In [26]:
# ==========================================================
# 11. Trainer Setup & Fine-tuning
# ==========================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

In [27]:
print("🔹 Starting training...")
trainer.train()
print("Training finished.")

🔹 Starting training...


Epoch,Training Loss,Validation Loss
1,0.6302,0.501857
2,0.4542,0.497991




✅ Training finished.


In [28]:
# ==========================================================
# 12. Save Fine-tuned Model for Streamlit Use
# ==========================================================
MODEL_SAVE_PATH="/content/drive/MyDrive/news_project/bart_Summary_finetuned"
trainer.save_model(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)
print("✅ Fine-tuned model saved to:", MODEL_SAVE_PATH)

✅ Fine-tuned model saved to: /content/drive/MyDrive/news_project/bart_Summary_finetuned


# Evaluation

In [31]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [33]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f911ee64a019619f9c68b602067caee4cf257639a53f66473104697fede1ace4
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [38]:
# ==========================================================
# 13. ROUGE Evaluation on Test Set (SAFE VERSION)
# ==========================================================

print("🔹 Evaluating with ROUGE on test set...")

import evaluate
from transformers import pipeline

rouge = evaluate.load("rouge")

# Load fine-tuned summarization pipeline
summarizer = pipeline(
    "summarization",
    model=MODEL_SAVE_PATH,
    tokenizer=MODEL_SAVE_PATH,
    device=0   # GPU
)


# ----------------------------------------------------------
# SAFE summarization wrapper to avoid CUDA crashes
# ----------------------------------------------------------
def safe_summarize(article):

    # Token/word length estimate
    input_len = max(10, len(article.split()))

    # Dynamically choose sensible generation lengths
    max_len = min(180, max(32, int(input_len * 0.75)))
    min_len = min(max_len - 5, max(15, int(max_len * 0.5)))

    summary = summarizer(
        article,
        max_length=max_len,
        min_length=min_len,
        num_beams=5,
        no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        early_stopping=True,
        do_sample=False
    )[0]["summary_text"]

    # Clean termination
    if not summary.strip().endswith("."):
        summary = summary.rsplit(".", 1)[0] + "."

    return summary


# ----------------------------------------------------------
# Generate summaries on test samples
# ----------------------------------------------------------
predictions = []
references = []
texts = []

MAX_EVAL_SAMPLES = min(300, len(raw_dataset["test"]))   # speed-safe

for i in range(MAX_EVAL_SAMPLES):

    sample = raw_dataset["test"][i]

    article = sample["text"]
    ref_sum = sample["summary"]

    pred_sum = safe_summarize(article)

    predictions.append(pred_sum)
    references.append(ref_sum)
    texts.append(article)


# ----------------------------------------------------------
# Compute ROUGE
# ----------------------------------------------------------
rouge_scores = rouge.compute(
    predictions=predictions,
    references=references,
    use_stemmer=True,
)

print("✅ ROUGE scores:")
print(rouge_scores)


# ----------------------------------------------------------
# Save scores and predictions
# ----------------------------------------------------------

# Save overall ROUGE metrics
pd.DataFrame([rouge_scores]).to_csv(ROUGE_CSV_PATH, index=False)
print("✅ ROUGE scores saved to:", ROUGE_CSV_PATH)

# Save per-example evaluation results
pred_df = pd.DataFrame({
    "text": texts,
    "reference_summary": references,
    "predicted_summary": predictions,
})

pred_df.to_csv(PREDICTIONS_CSV_PATH, index=False)
print("✅ Per-example predictions saved to:", PREDICTIONS_CSV_PATH)


🔹 Evaluating with ROUGE on test set...


Device set to use cuda:0


AcceleratorError: CUDA error: device-side assert triggered
Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
