In [3]:
!pip install transformers datasets rouge-score nltk gradio pandas torch accelerate -q

In [24]:
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from rouge_score import rouge_scorer
import nltk
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True) # Added to download punkt_tab

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [5]:
import os
from google.colab import files

# Upload your kaggle.json file
print("Please upload your kaggle.json file")
uploaded = files.upload()

# Create .kaggle directory
!mkdir -p ~/.kaggle

# Move the uploaded file to .kaggle directory
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

print("✅ Kaggle API configured successfully!")

Please upload your kaggle.json file


Saving kaggle.json to kaggle.json
✅ Kaggle API configured successfully!


In [6]:
# Download the dataset from Kaggle
print("Downloading CNN/DailyMail dataset from Kaggle...")
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail

# Unzip the dataset
print("\nExtracting dataset...")
!unzip -q newspaper-text-summarization-cnn-dailymail.zip -d ./cnn_dailymail_data

print("✅ Dataset downloaded and extracted successfully!")

# List the files
print("\nDataset files:")
!ls -lh ./cnn_dailymail_data/

Downloading CNN/DailyMail dataset from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
License(s): CC0-1.0
newspaper-text-summarization-cnn-dailymail.zip: Skipping, found more recently modified local copy (use --force to force download)

Extracting dataset...
replace ./cnn_dailymail_data/cnn_dailymail/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace ./cnn_dailymail_data/cnn_dailymail/train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
y
y
replace ./cnn_dailymail_data/cnn_dailymail/validation.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ✅ Dataset downloaded and extracted successfully!

Dataset files:
total 4.0K
drwxr-xr-x 2 root root 4.0K Oct 28 16:02 cnn_dailymail


In [7]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from rouge_score import rouge_scorer
import nltk
import warnings
import json
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('punkt', quiet=True)

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [8]:
# Load the CSV files from Kaggle dataset
print("Loading CNN/DailyMail dataset from CSV files...")
import os
data_dir = './cnn_dailymail_data/cnn_dailymail' # Updated directory
files_in_dir = os.listdir(data_dir)
print(f"Files in dataset directory: {files_in_dir}")

# Try to load the main CSV file
csv_files = [f for f in files_in_dir if f.endswith('.csv')]
print(f"\nCSV files found: {csv_files}")

# Load the data based on available files
if 'train.csv' in csv_files:
    # Separate files for train, validation, test
    train_df = pd.read_csv(f'{data_dir}/train.csv')
    if 'validation.csv' in csv_files or 'valid.csv' in csv_files:
        val_file = 'validation.csv' if 'validation.csv' in csv_files else 'valid.csv'
        val_df = pd.read_csv(f'{data_dir}/{val_file}')
    else:
        val_df = None

    if 'test.csv' in csv_files:
        test_df = pd.read_csv(f'{data_dir}/test.csv')
    else:
        test_df = None
else:
    # Single CSV file - we'll split it
    if not csv_files:
        raise FileNotFoundError("No CSV files found in the specified directory.")
    main_csv = csv_files[0]
    print(f"\nLoading from single CSV: {main_csv}")
    df = pd.read_csv(f'{data_dir}/{main_csv}')

    # Display columns
    print(f"\nDataset columns: {df.columns.tolist()}")
    print(f"Dataset shape: {df.shape}")

    # Split the data (80% train, 10% validation, 10% test)
    from sklearn.model_selection import train_test_split

    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print("\n" + "="*80)
print("DATASET LOADED SUCCESSFULLY")
print("="*80)
print(f"\nTraining samples: {len(train_df)}")
if val_df is not None:
    print(f"Validation samples: {len(val_df)}")
if test_df is not None:
    print(f"Test samples: {len(test_df)}")

# Display sample data
print("\n" + "="*80)
print("SAMPLE DATA:")
print("="*80)
print(f"\nColumns: {train_df.columns.tolist()}")
print(f"\nFirst row:")
print(train_df.iloc[0])

Loading CNN/DailyMail dataset from CSV files...
Files in dataset directory: ['train.csv', 'validation.csv', 'test.csv']

CSV files found: ['train.csv', 'validation.csv', 'test.csv']

DATASET LOADED SUCCESSFULLY

Training samples: 287113
Validation samples: 13368
Test samples: 11490

SAMPLE DATA:

Columns: ['id', 'article', 'highlights']

First row:
id                     0001d1afc246a7964130f43ae940af6bc6c57f01
article       By . Associated Press . PUBLISHED: . 14:11 EST...
highlights    Bishop John Folda, of North Dakota, is taking ...
Name: 0, dtype: object


In [9]:
# Inspect the dataset structure
print("Inspecting dataset structure...")
print(f"\nColumn names: {train_df.columns.tolist()}")
print(f"\nData types:\n{train_df.dtypes}")
print(f"\nMissing values:\n{train_df.isnull().sum()}")

# Check common column naming patterns
possible_article_cols = ['article', 'text', 'document', 'story', 'content']
possible_summary_cols = ['highlights', 'summary', 'abstract', 'summarization']

article_col = None
summary_col = None

# Find the correct column names
for col in train_df.columns:
    col_lower = col.lower()
    if any(ac in col_lower for ac in possible_article_cols) and article_col is None:
        article_col = col
    if any(sc in col_lower for sc in possible_summary_cols) and summary_col is None:
        summary_col = col

if article_col is None or summary_col is None:
    print("\nCould not automatically detect column names.")
    print("Available columns:", train_df.columns.tolist())
    print("\nPlease specify the column names:")
    article_col = input("Enter the article/text column name: ")
    summary_col = input("Enter the summary/highlights column name: ")

print(f"\n Using columns:")
print(f"   Article column: '{article_col}'")
print(f"   Summary column: '{summary_col}'")

# Standardize column names
train_df = train_df.rename(columns={article_col: 'article', summary_col: 'highlights'})
if val_df is not None:
    val_df = val_df.rename(columns={article_col: 'article', summary_col: 'highlights'})
if test_df is not None:
    test_df = test_df.rename(columns={article_col: 'article', summary_col: 'highlights'})

# Remove rows with missing values
train_df = train_df.dropna(subset=['article', 'highlights'])
if val_df is not None:
    val_df = val_df.dropna(subset=['article', 'highlights'])
if test_df is not None:
    test_df = test_df.dropna(subset=['article', 'highlights'])

# Display sample
print("\n" + "="*80)
print("SAMPLE ARTICLE:")
print("="*80)
print(f"\nArticle:\n{train_df.iloc[0]['article'][:500]}...")
print(f"\nHighlights (Summary):\n{train_df.iloc[0]['highlights']}")
print("="*80)

Inspecting dataset structure...

Column names: ['id', 'article', 'highlights']

Data types:
id            object
article       object
highlights    object
dtype: object

Missing values:
id            0
article       0
highlights    0
dtype: int64

 Using columns:
   Article column: 'article'
   Summary column: 'highlights'

SAMPLE ARTICLE:

Article:
By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in N...

Highlights (Summary):
Bishop John Folda, of North Dakota, is taking time off after being diagnosed .
He contracted the infection through contam

In [10]:
# Configuration
MODEL_NAME = "t5-small"  # Can use 't5-base' for better results but slower
MAX_INPUT_LENGTH = 512    # Maximum length of input article
MAX_TARGET_LENGTH = 128   # Maximum length of summary
BATCH_SIZE = 8           # Adjust based on your GPU memory
NUM_TRAIN_SAMPLES = 10000 # Use subset for faster training
NUM_VAL_SAMPLES = 1000

print(f"Model: {MODEL_NAME}")
print(f"Max Input Length: {MAX_INPUT_LENGTH}")
print(f"Max Target Length: {MAX_TARGET_LENGTH}")
print(f"Batch Size: {BATCH_SIZE}")

Model: t5-small
Max Input Length: 512
Max Target Length: 128
Batch Size: 8


In [11]:
# Load tokenizer
print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

print(f"Tokenizer loaded successfully!")
print(f"Vocabulary size: {len(tokenizer)}")
print(f"Special tokens: {tokenizer.all_special_tokens}")

Loading tokenizer: t5-small


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Tokenizer loaded successfully!
Vocabulary size: 32100
Special tokens: ['</s>', '<unk>', '<pad>', '<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>', '<extra_id_46>', '<extra_id_47>', '<extra_id_48>', '<extra_id_49>', '<extra_id_50>', '<extra_id_51>', '<extra_id_52>', '<extra_id_5

In [13]:
# Load CNN/DailyMail dataset
print("Loading CNN/DailyMail dataset...")
dataset = load_dataset("cnn_dailymail", "3.0.0")

print(f"\nDataset structure:")
print(dataset)

# Display sample data
print("\n" + "="*80)
print("SAMPLE ARTICLE:")
print("="*80)
print(f"\nArticle:\n{dataset['train'][0]['article'][:500]}...")
print(f"\nHighlights (Summary):\n{dataset['train'][0]['highlights']}")
print("="*80)

# Dataset statistics
print(f"\nDataset Statistics:")
print(f"Training samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")

Loading CNN/DailyMail dataset...


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]


Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

SAMPLE ARTICLE:

Article:
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s...

Highlights (Summary):
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says h

In [17]:
def preprocess_function(examples):
    """
    Preprocess the dataset by tokenizing articles and summaries.
    T5 requires a task prefix for the input.
    """
    # Add prefix for T5
    inputs = ["summarize: " + doc for doc in examples['article']]

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Tokenize targets (summaries)
    labels = tokenizer(
        text_target=examples['highlights'],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding='max_length'
    )

    model_inputs['labels'] = labels['input_ids']

    return model_inputs

# Test preprocessing on a single example
print("Testing preprocessing function...")
sample = dataset['train'].select(range(1))
processed = preprocess_function(sample)
print(f"Input IDs shape: {len(processed['input_ids'][0])}")
print(f"Labels shape: {len(processed['labels'][0])}")
print("Preprocessing function works correctly!")

Testing preprocessing function...
Dataset type: <class 'datasets.arrow_dataset.Dataset'>
Sample data type: <class 'dict'>
Sample article type: <class 'datasets.arrow_dataset.Column'>
Sample article: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 milli...

Input IDs shape: 512
Labels shape: 128
✅ Preprocessing function works correctly!

Sample tokenized input (first 20 tokens):
[21603, 10, 301, 24796, 4170, 6, 2789, 41, 18844, 61, 1636, 8929, 16023, 2213, 4173, 6324, 12591, 15, 11391, 592]

Decoded sample: summarize: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won'


In [18]:
# Select subset of data for training
print(f"Preparing training dataset with {NUM_TRAIN_SAMPLES} samples...")

# Make sure we don't exceed dataset size
actual_train_samples = min(NUM_TRAIN_SAMPLES, len(dataset['train']))
actual_val_samples = min(NUM_VAL_SAMPLES, len(dataset['validation']))

print(f"Using {actual_train_samples} training samples")
print(f"Using {actual_val_samples} validation samples")

train_dataset_subset = dataset['train'].select(range(actual_train_samples))
val_dataset_subset = dataset['validation'].select(range(actual_val_samples))

# Apply preprocessing
print("\nTokenizing training data...")
try:
    tokenized_train = train_dataset_subset.map(
        preprocess_function,
        batched=True,
        batch_size=100,  # Process in batches
        remove_columns=train_dataset_subset.column_names,
        desc="Tokenizing training data"
    )
    print(f"✅ Training data tokenized: {len(tokenized_train)} samples")
except Exception as e:
    print(f"❌ Error tokenizing training data: {e}")
    raise

print("\nTokenizing validation data...")
try:
    tokenized_val = val_dataset_subset.map(
        preprocess_function,
        batched=True,
        batch_size=100,
        remove_columns=val_dataset_subset.column_names,
        desc="Tokenizing validation data"
    )
    print(f"✅ Validation data tokenized: {len(tokenized_val)} samples")
except Exception as e:
    print(f"❌ Error tokenizing validation data: {e}")
    raise

print("\n" + "="*80)
print("TOKENIZATION COMPLETE")
print("="*80)
print(f"Tokenized training samples: {len(tokenized_train)}")
print(f"Tokenized validation samples: {len(tokenized_val)}")
print(f"Features: {tokenized_train.features}")

Preparing training dataset with 10000 samples...
Using 10000 training samples
Using 1000 validation samples

Tokenizing training data...


Tokenizing training data:   0%|          | 0/10000 [00:00<?, ? examples/s]

✅ Training data tokenized: 10000 samples

Tokenizing validation data...


Tokenizing validation data:   0%|          | 0/1000 [00:00<?, ? examples/s]

✅ Validation data tokenized: 1000 samples

TOKENIZATION COMPLETE
Tokenized training samples: 10000
Tokenized validation samples: 1000
Features: {'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': List(Value('int64'))}


In [19]:
# Load pre-trained T5 model
print(f"Loading pre-trained model: {MODEL_NAME}")
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Move model to device
model = model.to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nModel loaded successfully!")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

Loading pre-trained model: t5-small


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Model loaded successfully!
Total parameters: 60,506,624
Trainable parameters: 60,506,624


In [20]:
def compute_metrics(eval_pred):
    """
    Compute ROUGE metrics for evaluation.
    """
    predictions, labels = eval_pred

    # Decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in labels (used for padding)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Add newline for ROUGE calculation
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Calculate ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    for pred, label in zip(decoded_preds, decoded_labels):
        scores = scorer.score(label, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    result = {
        'rouge1': np.mean(rouge1_scores),
        'rouge2': np.mean(rouge2_scores),
        'rougeL': np.mean(rougeL_scores)
    }

    return result

print("ROUGE metric computation function defined!")

ROUGE metric computation function defined!


In [21]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-summarization-results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    push_to_hub=False,
    report_to="none"
)

print("Training arguments configured:")
print(f"  - Output directory: {training_args.output_dir}")
print(f"  - Number of epochs: {training_args.num_train_epochs}")
print(f"  - Learning rate: {training_args.learning_rate}")
print(f"  - Batch size: {training_args.per_device_train_batch_size}")
print(f"  - FP16: {training_args.fp16}")

Training arguments configured:
  - Output directory: ./t5-summarization-results
  - Number of epochs: 3
  - Learning rate: 2e-05
  - Batch size: 8
  - FP16: True


In [22]:
# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Trainer initialized successfully!")
print(f"Ready to train on {len(tokenized_train)} samples")
print(f"Validation on {len(tokenized_val)} samples")

Trainer initialized successfully!
Ready to train on 10000 samples
Validation on 1000 samples


In [25]:
# Train the model
print("Starting training...")
print("="*80)

train_result = trainer.train()

print("\n" + "="*80)
print("Training completed!")
print("="*80)
print(f"\nTraining metrics:")
print(f"  - Training loss: {train_result.training_loss:.4f}")
print(f"  - Training runtime: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"  - Training samples/second: {train_result.metrics['train_samples_per_second']:.2f}")

Starting training...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,1.1034,0.84709,0.242566,0.089788,0.195639
2,1.0714,0.84561,0.243567,0.089202,0.195897
3,1.0714,0.845735,0.243059,0.089808,0.196651


Epoch,Training Loss,Validation Loss


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



Training completed!

Training metrics:
  - Training loss: 1.0866
  - Training runtime: 1016.94 seconds
  - Training samples/second: 29.50


In [26]:
# Evaluate on validation set
print("\nEvaluating model on validation set...")
print("="*80)

eval_results = trainer.evaluate()

print("\nEvaluation Results:")
print("="*80)
print(f"ROUGE-1: {eval_results['eval_rouge1']:.4f}")
print(f"ROUGE-2: {eval_results['eval_rouge2']:.4f}")
print(f"ROUGE-L: {eval_results['eval_rougeL']:.4f}")
print(f"Eval Loss: {eval_results['eval_loss']:.4f}")
print("="*80)


Evaluating model on validation set...



Evaluation Results:
ROUGE-1: 0.2436
ROUGE-2: 0.0892
ROUGE-L: 0.1959
Eval Loss: 0.8456


In [27]:
# Save the fine-tuned model
output_dir = "./fine-tuned-t5-summarization"
print(f"Saving fine-tuned model to {output_dir}...")

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model saved successfully!")
print(f"Model location: {output_dir}")

Saving fine-tuned model to ./fine-tuned-t5-summarization...
Model saved successfully!
Model location: ./fine-tuned-t5-summarization


In [28]:
def generate_summary(text, max_length=128, min_length=30, num_beams=4):
    """
    Generate summary for given text using the fine-tuned model.
    """
    # Prepare input
    input_text = "summarize: " + text
    inputs = tokenizer(
        input_text,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Generate summary
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length=max_length,
        min_length=min_length,
        num_beams=num_beams,
        early_stopping=True
    )

    # Decode summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# Test on a sample
print("Testing summarization on a sample article...")
print("="*80)

test_article = dataset['test'][0]['article']
original_summary = dataset['test'][0]['highlights']

generated_summary = generate_summary(test_article)

print("\nORIGINAL ARTICLE (first 500 chars):")
print(test_article[:500] + "...")
print("\n" + "="*80)
print("\nORIGINAL SUMMARY:")
print(original_summary)
print("\n" + "="*80)
print("\nGENERATED SUMMARY:")
print(generated_summary)
print("="*80)

Testing summarization on a sample article...

ORIGINAL ARTICLE (first 500 chars):
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, includin...


ORIGINAL SUMMARY:
Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .


GENERATED SUMMARY:
The Palestinian Authority officially becomes the 123rd member of the International Criminal Court. The Palestinians signed the ICC's foundi

In [29]:
def compare_summaries(num_examples=5):
    """
    Compare original and generated summaries for multiple examples.
    """
    print("COMPARING ORIGINAL VS GENERATED SUMMARIES")
    print("="*100)

    for i in range(num_examples):
        article = dataset['test'][i]['article']
        original = dataset['test'][i]['highlights']
        generated = generate_summary(article)

        print(f"\n{'='*100}")
        print(f"EXAMPLE {i+1}")
        print(f"{'='*100}")
        print(f"\nARTICLE (truncated):\n{article[:300]}...")
        print(f"\n{'-'*100}")
        print(f"\nORIGINAL SUMMARY:\n{original}")
        print(f"\n{'-'*100}")
        print(f"\nGENERATED SUMMARY:\n{generated}")
        print(f"\n{'='*100}")

# Compare summaries
compare_summaries(num_examples=3)

COMPARING ORIGINAL VS GENERATED SUMMARIES

EXAMPLE 1

ARTICLE (truncated):
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the cou...

----------------------------------------------------------------------------------------------------

ORIGINAL SUMMARY:
Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

----------------------------------------------------------------------------------------------------

GENERATED SUMMARY:
The Palestinian Authority officially becomes the 123rd member of the International Criminal Court. The Palestinians signed the ICC's founding Ro

In [30]:
def evaluate_test_set(num_samples=100):
    """
    Evaluate model on test set and calculate ROUGE scores.
    """
    print(f"Evaluating on {num_samples} test samples...")

    test_subset = dataset['test'].select(range(num_samples))

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    for i, example in enumerate(test_subset):
        if (i + 1) % 10 == 0:
            print(f"Processing {i+1}/{num_samples}...")

        article = example['article']
        reference = example['highlights']
        generated = generate_summary(article)

        scores = scorer.score(reference, generated)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    results = {
        'rouge1': np.mean(rouge1_scores),
        'rouge2': np.mean(rouge2_scores),
        'rougeL': np.mean(rougeL_scores)
    }

    print("\n" + "="*80)
    print("TEST SET EVALUATION RESULTS")
    print("="*80)
    print(f"ROUGE-1: {results['rouge1']:.4f}")
    print(f"ROUGE-2: {results['rouge2']:.4f}")
    print(f"ROUGE-L: {results['rougeL']:.4f}")
    print("="*80)

    return results

# Evaluate on test set
test_results = evaluate_test_set(num_samples=100)

Evaluating on 100 test samples...
Processing 10/100...
Processing 20/100...
Processing 30/100...
Processing 40/100...
Processing 50/100...
Processing 60/100...
Processing 70/100...
Processing 80/100...
Processing 90/100...
Processing 100/100...

TEST SET EVALUATION RESULTS
ROUGE-1: 0.3344
ROUGE-2: 0.1400
ROUGE-L: 0.2514


In [31]:
import gradio as gr

def summarize_text(text, max_length, min_length, num_beams):
    """Gradio interface function for summarization"""
    if not text.strip():
        return "Please enter some text to summarize."

    summary = generate_summary(
        text,
        max_length=int(max_length),
        min_length=int(min_length),
        num_beams=int(num_beams)
    )

    # Calculate statistics
    original_words = len(text.split())
    summary_words = len(summary.split())
    reduction = ((1 - summary_words / original_words) * 100)

    stats = f"\n\n📊 Statistics:\n"
    stats += f"Original: {original_words} words\n"
    stats += f"Summary: {summary_words} words\n"
    stats += f"Reduction: {reduction:.1f}%"

    return summary + stats

# Example articles
example_article_1 = dataset['test'][0]['article']
example_article_2 = dataset['test'][1]['article']
example_article_3 = dataset['test'][2]['article']

# Create Gradio interface
demo = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(
            lines=10,
            placeholder="Enter your article here...",
            label="Input Article"
        ),
        gr.Slider(
            minimum=50,
            maximum=200,
            value=128,
            step=10,
            label="Max Summary Length"
        ),
        gr.Slider(
            minimum=10,
            maximum=50,
            value=30,
            step=5,
            label="Min Summary Length"
        ),
        gr.Slider(
            minimum=2,
            maximum=8,
            value=4,
            step=1,
            label="Number of Beams"
        )
    ],
    outputs=gr.Textbox(
        lines=8,
        label="Generated Summary"
    ),
    title="🤖 T5 Text Summarization",
    description="Fine-tuned T5 model for abstractive text summarization on CNN/DailyMail dataset",
    examples=[
        [example_article_1, 128, 30, 4],
        [example_article_2, 128, 30, 4],
        [example_article_3, 128, 30, 4]
    ],
    theme=gr.themes.Soft()
)

# Launch the interface
print("Launching Gradio interface...")
demo.launch(share=True)

Launching Gradio interface...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://051e7e92c3a0eaa25a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


