In [None]:
import zipfile, os

# === Archive location and extraction target ===
zip_path = "archive.zip"
extract_dir = "cnn_dailymail_data"

# === Unpack the archive ===
# The target folder is created first; exist_ok makes re-running this cell harmless.
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_dir)

# List what ended up directly inside the extraction folder.
extracted_entries = os.listdir(extract_dir)
print("Extracted files:", extracted_entries)


Extracted files: ['cnn_dailymail']


In [None]:
!pip install -q transformers datasets evaluate rouge_score sentencepiece


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# === Load CSVs ===
def _read_split(split_name):
    """Read the raw CSV of one split ("train", "validation" or "test")."""
    return pd.read_csv(f"{extract_dir}/cnn_dailymail/{split_name}.csv")


def _clean_split(raw_df):
    """Return a new frame holding only non-null `article` / `summary` rows.

    A `highlights` column, when present, is renamed to `summary` (renaming a
    missing column is a no-op, so each split is handled independently).
    The index is reset after `dropna()` so that dropped rows do not leave a
    non-default index behind, which `Dataset.from_pandas` would otherwise
    store as an extra `__index_level_0__` column.
    """
    return (
        raw_df
        .rename(columns={"highlights": "summary"})
        .loc[:, ["article", "summary"]]
        .dropna()
        .reset_index(drop=True)
    )


train_df = _read_split("train")

# Check column names (as found in the raw file, before standardizing)
print(train_df.columns)

# Standardize column names and drop incomplete rows
train_df = _clean_split(train_df)
val_df   = _clean_split(_read_split("validation"))
test_df  = _clean_split(_read_split("test"))

# Convert to Hugging Face Dataset (the pandas index carries no information here)
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df, preserve_index=False)
test_ds  = Dataset.from_pandas(test_df, preserve_index=False)
dataset = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

# === Tokenize ===
model_name = "t5-small"  # or "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token limits applied (with truncation) to the articles and the summaries.
max_input_length = 512
max_target_length = 128

def preprocess(batch):
    """Tokenize one batch of articles and summaries for seq2seq training.

    Each article is prefixed with "summarize: " and truncated to
    `max_input_length` tokens; each summary is truncated to
    `max_target_length` tokens and its token ids are stored under "labels".

    Args:
        batch: Mapping with "article" and "summary" entries, each a list of strings.

    Returns:
        The tokenizer output for the articles with an added "labels" entry.
    """
    prefixed_articles = []
    for doc in batch["article"]:
        prefixed_articles.append("summarize: " + doc)

    encoded = tokenizer(prefixed_articles, max_length=max_input_length, truncation=True)
    target_ids = tokenizer(
        batch["summary"], max_length=max_target_length, truncation=True
    )["input_ids"]
    encoded["labels"] = target_ids
    return encoded

# Tokenize every split in batches; the raw text columns are dropped afterwards.
tokenized = dataset.map(
    preprocess,
    batched=True,
    remove_columns=["article", "summary"],
)
print(tokenized)


Index(['id', 'article', 'highlights'], dtype='object')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})


In [None]:
# ============================================================
# 🚀 T5 Fine-tuning — Backward-Compatible, No Warnings or Errors
# ============================================================

import os

import evaluate
import numpy as np
import torch
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# -------------------------
# Turn off Weights & Biases logging
# -------------------------
os.environ.update({"WANDB_DISABLED": "true"})

# -------------------------
# Base checkpoint: tokenizer + seq2seq model
# -------------------------
model_name = "t5-small"   # swap in another checkpoint name if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# -------------------------
# Batch collation and the ROUGE metric
# -------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE-1 / ROUGE-2 / ROUGE-L for a batch of predictions.

    Args:
        eval_pred: Pair `(preds, labels)` of token-id arrays.

    Returns:
        Dict mapping each ROUGE type to its score multiplied by 100 and
        rounded to 4 decimals.
    """
    generated_ids, reference_ids = eval_pred
    pad_id = tokenizer.pad_token_id

    def _to_text(token_ids):
        # Entries equal to -100 are swapped for the pad token id before decoding
        # (presumably because -100 is a masking value, not a real token id).
        cleaned = np.where(token_ids != -100, token_ids, pad_id)
        return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    scores = rouge.compute(
        predictions=_to_text(generated_ids),
        references=_to_text(reference_ids),
        rouge_types=["rouge1", "rouge2", "rougeL"],
    )

    scaled = {}
    for rouge_name, value in scores.items():
        scaled[rouge_name] = round(value * 100, 4)
    return scaled

# =========================
# Detect Transformers Version
# =========================
version = transformers.__version__
print(f"⚙️ Transformers version: {version}")

# =========================
# Safe TrainingArguments
# =========================
# Every option except the evaluation-schedule keyword is shared between the
# two constructor attempts below, so the fallback can never silently drop
# settings such as report_to, save_total_limit or load_best_model_at_end.
common_training_kwargs = dict(
    output_dir="./t5-finetuned",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
    logging_steps=100,
    report_to="none",                # no external logging integrations (e.g. W&B)
    save_total_limit=2,
    load_best_model_at_end=True,     # needs matching save/eval strategies ("epoch" for both)
)

try:
    # Newer Transformers releases name this option `eval_strategy`.
    args = Seq2SeqTrainingArguments(eval_strategy="epoch", **common_training_kwargs)
except TypeError:
    # Older releases only know the original `evaluation_strategy` keyword.
    args = Seq2SeqTrainingArguments(evaluation_strategy="epoch", **common_training_kwargs)

# =========================
# Trainer Setup
# =========================
# Arguments common to every Transformers version; only the keyword used to
# pass the tokenizer differs between releases.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=tokenized["train"].select(range(2000)),       # demo subset
    eval_dataset=tokenized["validation"].select(range(500)),    # demo subset
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

try:
    # Recent releases take the tokenizer as `processing_class`
    # (replaces the deprecated `tokenizer=` keyword).
    trainer = Seq2SeqTrainer(processing_class=tokenizer, **trainer_kwargs)
except TypeError:
    # Older releases reject `processing_class`; fall back to `tokenizer=`.
    trainer = Seq2SeqTrainer(tokenizer=tokenizer, **trainer_kwargs)

# =========================
# Train and Save
# =========================
trainer.train()

# Save model and tokenizer side by side so the folder can be reloaded on its own.
trainer.save_model("./t5-finetuned-final")
tokenizer.save_pretrained("./t5-finetuned-final")

print("\n✅ Training complete and model saved successfully!")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


⚙️ Transformers version: 4.57.1


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,2.142,1.873861,24.5126,11.9617,20.4327
2,2.0321,1.870476,24.5136,11.958,20.4253



✅ Training complete and model saved successfully!


In [None]:
# Score the trained model on the first 500 test examples.
test_subset = tokenized["test"].select(range(500))
metrics = trainer.evaluate(eval_dataset=test_subset)
print("ROUGE scores:", metrics)


ROUGE scores: {'eval_loss': 1.8457053899765015, 'eval_rouge1': 24.3673, 'eval_rouge2': 11.6414, 'eval_rougeL': 20.196, 'eval_runtime': 44.3394, 'eval_samples_per_second': 11.277, 'eval_steps_per_second': 2.819, 'epoch': 2.0}


In [None]:
def summarize_text(text):
    """Generate a summary of `text` with the in-memory `model` and `tokenizer`.

    The text is prefixed with "summarize: ", truncated to 512 tokens and moved
    to the model's device; decoding uses 4-beam search capped at 128 tokens.

    Args:
        text: The article to summarize.

    Returns:
        The decoded summary with special tokens removed.
    """
    prompt = "summarize: " + text
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(model.device)
    generated = model.generate(**encoded, max_length=128, num_beams=4, early_stopping=True)
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Draw one random test row and show the article next to both summaries.
sample = test_df.sample(n=1).iloc[0]
article_text = sample['article']
reference_summary = sample['summary']

print("Original article:\n", article_text[:800])  # only the first 800 characters
print("\nReference summary:\n", reference_summary)
print("\nPredicted summary:\n", summarize_text(article_text))


Original article:
 Comedian Jenny Eclair travelled with her other half on a Painting In Venus break with Flavours . There comes a time in a woman’s life when beach holidays just don’t cut it any longer, when lying on golden sands (unless you’re buried up to your neck) serves only to remind you how much weight you forgot to lose again this year and how ill-fitting your swimming costume is. Being control freaks, most fifty-something females find ‘doing nothing’ a bit boring – after all, there are only so many hours one can spend on a Kindle, and woman cannot live by fiction alone. This is the time when the ‘alternative holiday experience’ tickles your holiday tastebuds and you find yourself looking at brochures for Nordic cruises. Excellent! Everyone looks fat pointing at a fjord while wearing an Aran jumper. 

Reference summary:
 The comedian stayed with Flavours who offer a Painting In Venice break .
Jenny and her partner Geof stayed at the farmhouse Villa Bianchi .
Days involved sitti

In [None]:
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# =========================================
# Load fine-tuned model (LOCAL)
# =========================================

def load_model():
    """Load the fine-tuned tokenizer and seq2seq model from the local folder.

    Returns:
        A `(tokenizer, model)` pair read from "t5-finetuned-final".
    """
    model_path = "t5-finetuned-final"  # local folder holding the fine-tuned model
    return (
        AutoTokenizer.from_pretrained(model_path),
        AutoModelForSeq2SeqLM.from_pretrained(model_path),
    )


tokenizer, model = load_model()

# =========================================
# Summarization function
# =========================================

def summarize_text(text):
    """Summarize `text` with the fine-tuned T5 model loaded in this cell.

    Args:
        text: Article or paragraph to summarize (e.g. the Gradio textbox value).

    Returns:
        The decoded summary string, or an empty string when `text` is empty
        or contains only whitespace.
    """
    # Nothing to summarize: skip tokenization and generation entirely.
    if not text or not text.strip():
        return ""

    # The model was fine-tuned on inputs prefixed with "summarize: " and
    # truncated to 512 tokens, so inference must build its input the same way.
    inputs = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model.device)  # keep the inputs on the same device as the model

    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            max_length=80,      # Limit summary length
            min_length=15,
            num_beams=4,        # Beam search for better quality
            repetition_penalty=2.5,  # Avoid repetition
            length_penalty=1.0,
            early_stopping=True
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# -----------------------------------------
# Gradio UI: one input box, one output box
# -----------------------------------------

article_box = gr.Textbox(lines=10, placeholder="Paste your article here...")
summary_box = gr.Textbox(lines=5, label="Summary")

iface = gr.Interface(
    fn=summarize_text,  # called with the input textbox content
    inputs=article_box,
    outputs=summary_box,
    title="📰 Text Summarizer (Fine-tuned T5)",
    description="Enter a paragraph or article to generate an abstractive summary using your fine-tuned T5 model.",
)

# -----------------------------------------
# Start the app (only when run as the main module)
# -----------------------------------------

if __name__ == "__main__":
    iface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fe745c7980650a4145.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
