In [None]:
!pip install transformers[torch] pandas "numpy<2.0" scikit-learn nltk rouge-score tqdm spacy
!python -m spacy download en_core_web_lg

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
INFO: pip is looking at multiple versions of thinc to determine which version is compatible with other requirements. This could take a while.
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.3.0,>=1.2.0 (from thinc<8.4.0,>=8.3.4->spacy)
  Downloading blis-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<2.7,>=2.1->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<2.7,>=2.1->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==

In [2]:
# Cell 2: Imports and Setup
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
from tqdm.auto import tqdm
import os
from pathlib import Path
import spacy

# --- Global Configuration ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 512
MAX_EPOCHS = 6
PATIENCE = 2

# --- Load Global Models ---
print("Loading spaCy model...")
NLP = spacy.load("en_core_web_lg")
print("spaCy model loaded.")



2025-09-05 19:11:10.052966: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757099470.261004      18 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757099470.315512      18 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda
Loading spaCy model...
spaCy model loaded.


In [None]:
# Cell 3: Data Loading and Oracle Labeling
# (This cell is identical to the previous BERTSum script)
# It contains the find_dataset_path, load_bbc_dataset, and create_oracle_labels functions.
def find_dataset_path(start_path="/kaggle/input/"):
    """
    Searches robustly for the correct 'BBC News Summary' directory that
    contains the category subfolders.
    """
    print("--- Searching for dataset directory ---")
    
    for root, dirs, files in os.walk(start_path):
        if "News Articles" in dirs and "Summaries" in dirs:
            articles_path = Path(root) / "News Articles"
            
            if any(p.is_dir() for p in articles_path.iterdir()):
                print(f"Found valid dataset base at: {root}")
                return root
            
    raise FileNotFoundError("Could not automatically locate the 'BBC News Summary' dataset with category subfolders. "
                            "Please check the input directory structure in the Kaggle sidebar.")


def load_bbc_dataset(base_path):
    """Loads and pivots the BBC News Summary dataset from the specified path."""
    print(f"Attempting to load dataset from: {base_path}")
    all_data = []
    articles_path = Path(base_path) / "News Articles"
    summaries_path = Path(base_path) / "Summaries"
    for category_path in articles_path.iterdir():
        if category_path.is_dir():
            category = category_path.name
            for article_file in category_path.glob("*.txt"):
                try:
                    with open(article_file, 'r', encoding='utf-8', errors='ignore') as f: article_content = f.read()
                    summary_file = summaries_path / category / article_file.name
                    with open(summary_file, 'r', encoding='utf-8', errors='ignore') as f: summary_content = f.read()
                    all_data.append({"article": article_content, "reference_summary": summary_content})
                except Exception:
                    continue
    return pd.DataFrame(all_data)


try:
    DATASET_PATH = find_dataset_path()
    df = load_bbc_dataset(DATASET_PATH)
    main_train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    train_df, val_df = train_test_split(main_train_df, test_size=0.1, random_state=42)
    print("\nSuccessfully loaded and split the data.")
except (FileNotFoundError, ValueError) as e:
    print(f"\nERROR: {e}")
    train_df, val_df = pd.DataFrame(), pd.DataFrame()

--- Searching for dataset directory ---
Found valid dataset base at: /kaggle/input/bbc-news-summary/BBC News Summary
Attempting to load dataset from: /kaggle/input/bbc-news-summary/BBC News Summary

Successfully loaded and split the data.


In [4]:
# --- 4. Oracle Label Generation ---
def create_oracle_labels(article_text, reference_summary):
    """Greedily selects sentences to maximize ROUGE-2 F1-score."""
    try:
        article_sentences = sent_tokenize(article_text)
    except:
        return [], []
    
    if not article_sentences or not reference_summary:
        return article_sentences, [0] * len(article_sentences)

    scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
    
    selected_indices = []
    
    # Greedily add sentences
    while True:
        best_candidate_idx = -1
        best_rouge_gain = -1.0
        
        current_summary = " ".join([article_sentences[j] for j in sorted(selected_indices)])
        base_rouge = scorer.score(reference_summary, current_summary)['rouge2'].fmeasure
        
        for i in range(len(article_sentences)):
            if i in selected_indices:
                continue
            
            # Try adding this sentence
            temp_selection = sorted(selected_indices + [i])
            summary_text = " ".join([article_sentences[j] for j in temp_selection])
            scores = scorer.score(reference_summary, summary_text)
            rouge_score = scores['rouge2'].fmeasure
            
            if rouge_score > base_rouge and (rouge_score - base_rouge) > best_rouge_gain:
                best_rouge_gain = rouge_score - base_rouge
                best_candidate_idx = i
                
        if best_candidate_idx != -1:
            selected_indices.append(best_candidate_idx)
        else:
            break
            
    labels = [1 if i in selected_indices else 0 for i in range(len(article_sentences))]
    return article_sentences, labels

# --- Example of Oracle Labeling ---
print("--- Oracle Labeling Example ---")
example_sents, example_labels = create_oracle_labels(train_df.iloc[0].article, train_df.iloc[0].reference_summary)
for sent, label in zip(example_sents, example_labels):
    print(f"LABEL: {label} | SENTENCE: {sent[:80]}...")

--- Oracle Labeling Example ---
LABEL: 0 | SENTENCE: Budget to set scene for election

Gordon Brown will seek to put the economy at t...
LABEL: 0 | SENTENCE: He is expected to stress the importance of continued economic stability, with lo...
LABEL: 1 | SENTENCE: The chancellor is expected to freeze petrol duty and raise the stamp duty thresh...
LABEL: 0 | SENTENCE: But the Conservatives and Lib Dems insist voters face higher taxes and more mean...
LABEL: 0 | SENTENCE: Treasury officials have said there will not be a pre-election giveaway, but Mr B...
LABEL: 1 | SENTENCE: - Increase in the stamp duty threshold from £60,000 
 - A freeze on petrol duty ...
LABEL: 0 | SENTENCE: Ten years ago, buyers had a much greater chance of avoiding stamp duty, with clo...
LABEL: 1 | SENTENCE: Since then, average UK property prices have more than doubled while the starting...
LABEL: 1 | SENTENCE: Tax credits As a result, the number of properties incurring stamp duty has rocke...
LABEL: 1 | SENTENCE: Th

In [5]:
# Cell 4: SVO Extraction and MODIFIED PyTorch Dataset

def extract_svo_triples(doc):
    """Extracts Subject-Verb-Object triples from a spaCy Doc object."""
    triples = []
    for sent in doc.sents:
        for token in sent:
            if token.pos_ == "VERB":
                subjects = [child for child in token.children if child.dep_ in ("nsubj", "nsubjpass")]
                objects = [child for child in token.children if child.dep_ in ("dobj", "pobj", "attr")]
                if subjects and objects:
                    for s in subjects:
                        for o in objects:
                            triples.append((s.lemma_.lower(), token.lemma_.lower(), o.lemma_.lower()))
    return triples

## MODIFIED ##
class SVO_SummarizationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=MAX_LEN):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, item):
        row = self.dataframe.iloc[item]
        article_sentences, labels = create_oracle_labels(row.article, row.reference_summary)
        
        if not article_sentences:
            return {'is_empty': True}

        # --- NEW: SVO ENRICHMENT ---
        text_for_bert = ""
        for sent in article_sentences:
            # 1. Extract SVO triples for this sentence
            sent_doc = NLP(sent)
            triples = extract_svo_triples(sent_doc)
            
            # 2. Linearize the triples into a "fact string"
            fact_string = ""
            if triples:
                facts = [" ".join(triple) for triple in triples]
                fact_string = " Facts: " + " ; ".join(facts) + "."
            
            # 3. Create the enriched input string
            # Format: [CLS] Original Sentence [SEP] Fact String [SEP]
            enriched_input = sent + fact_string
            
            # Append to the document-level text
            text_for_bert += enriched_input + " [SEP] [CLS] "
        # ---------------------------
        
        inputs = self.tokenizer.encode_plus(
            text_for_bert, max_length=self.max_len, padding='max_length',
            truncation=True, return_tensors='pt'
        )
        
        input_ids = inputs['input_ids'].flatten()
        attention_mask = inputs['attention_mask'].flatten()
        
        cls_indices = (input_ids == self.tokenizer.cls_token_id).nonzero().flatten()
        
        num_cls_tokens = len(cls_indices)
        labels = labels[:num_cls_tokens]
        
        padded_labels = np.zeros(self.max_len)
        if len(labels) > 0:
            padded_labels[:len(labels)] = labels
        
        return {
            'is_empty': False,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'cls_indices': cls_indices,
            'labels': torch.tensor(padded_labels, dtype=torch.float)
        }

In [6]:
# --- 6. BERTSum Model ---
class BERTSummarizer(torch.nn.Module):
    def __init__(self, model_name=PRE_TRAINED_MODEL_NAME):
        super(BERTSummarizer, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(self.bert.config.hidden_size, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask, cls_indices):
        input_ids = input_ids.squeeze(0)
        attention_mask = attention_mask.squeeze(0)
        
        outputs = self.bert(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
        last_hidden_state = outputs.last_hidden_state.squeeze(0)

        cls_embeddings = last_hidden_state[cls_indices]
        
        logits = self.classifier(cls_embeddings)
        return torch.sigmoid(logits)

# --- Initialize tokenizer and model ---
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = BERTSummarizer().to(DEVICE)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader, desc="Training"):
        if batch['is_empty'][0]: continue
        input_ids, attention_mask, cls_indices, labels = (
            batch['input_ids'].to(device), batch['attention_mask'].to(device),
            batch['cls_indices'].to(device), batch['labels'].to(device)
        )
        optimizer.zero_grad()
        predictions = model(input_ids, attention_mask, cls_indices.squeeze(0)).squeeze()
        num_predictions = predictions.shape[0]
        true_labels = labels.squeeze(0)[:num_predictions]
        loss = loss_fn(predictions, true_labels)
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
    return total_loss / len(data_loader)

def eval_epoch(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validating"):
            if batch['is_empty'][0]: continue
            input_ids, attention_mask, cls_indices, labels = (
                batch['input_ids'].to(device), batch['attention_mask'].to(device),
                batch['cls_indices'].to(device), batch['labels'].to(device)
            )
            predictions = model(input_ids, attention_mask, cls_indices.squeeze(0)).squeeze()
            num_predictions = predictions.shape[0]
            true_labels = labels.squeeze(0)[:num_predictions]
            loss = loss_fn(predictions, true_labels)
            total_loss += loss.item()
    return total_loss / len(data_loader)

# --- Setup for Training with the NEW Dataset Class ---
train_dataset = SVO_SummarizationDataset(train_df, tokenizer) # Use the new class
val_dataset = SVO_SummarizationDataset(val_df, tokenizer)   # Use the new class
train_loader = DataLoader(train_dataset, batch_size=1)
val_loader = DataLoader(val_dataset, batch_size=1)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_fn = torch.nn.BCELoss()
total_steps = len(train_loader) * MAX_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# --- Main Training Loop with Early Stopping ---
best_validation_loss = float('inf')
epochs_no_improve = 0
for epoch in range(MAX_EPOCHS):
    print(f'--- Epoch {epoch + 1}/{MAX_EPOCHS} ---')
    train_loss = train_epoch(model, train_loader, loss_fn, optimizer, DEVICE, scheduler)
    print(f'Train loss: {train_loss:.4f}')
    
    val_loss = eval_epoch(model, val_loader, loss_fn, DEVICE)
    print(f'Validation loss: {val_loss:.4f}')
    
    if val_loss < best_validation_loss:
        best_validation_loss = val_loss
        torch.save(model.state_dict(), 'bertsum_best_model.bin')
        epochs_no_improve = 0
        print("Validation loss improved. Saving model.")
    else:
        epochs_no_improve += 1
        print(f"Validation loss did not improve. Counter: {epochs_no_improve}/{PATIENCE}")

    if epochs_no_improve >= PATIENCE:
        print("Early stopping triggered.")
        break

--- Epoch 1/6 ---


Training:   0%|          | 0/1602 [00:00<?, ?it/s]

Train loss: 0.6609


Validating:   0%|          | 0/178 [00:00<?, ?it/s]

Validation loss: 0.6305
Validation loss improved. Saving model.
--- Epoch 2/6 ---


Training:   0%|          | 0/1602 [00:00<?, ?it/s]

Train loss: 0.5739


Validating:   0%|          | 0/178 [00:00<?, ?it/s]

Validation loss: 0.5297
Validation loss improved. Saving model.
--- Epoch 3/6 ---


Training:   0%|          | 0/1602 [00:00<?, ?it/s]

Train loss: 0.3961


Validating:   0%|          | 0/178 [00:00<?, ?it/s]

Validation loss: 0.5702
Validation loss did not improve. Counter: 1/2
--- Epoch 4/6 ---


Training:   0%|          | 0/1602 [00:00<?, ?it/s]

Train loss: 0.2676


Validating:   0%|          | 0/178 [00:00<?, ?it/s]

Validation loss: 0.7138
Validation loss did not improve. Counter: 2/2
Early stopping triggered.


In [None]:
# Cell 7: Inference Function (MODIFIED)

## MODIFIED ##
def summarize_with_svo_bertsum(text, model, tokenizer, device, max_sents=3):
    model.eval()
    try:
        article_sentences = sent_tokenize(text)
    except:
        return "Could not process text."
    if not article_sentences: return ""

    # --- NEW: Apply the same SVO enrichment during inference ---
    text_for_bert = ""
    for sent in article_sentences:
        sent_doc = NLP(sent)
        triples = extract_svo_triples(sent_doc)
        fact_string = ""
        if triples:
            facts = [" ".join(triple) for triple in triples]
            fact_string = " Facts: " + " ; ".join(facts) + "."
        enriched_input = sent + fact_string
        text_for_bert += enriched_input + " [SEP] [CLS] "
    # -----------------------------------------------------------
    
    inputs = tokenizer.encode_plus(
        text_for_bert, max_length=MAX_LEN, padding='max_length',
        truncation=True, return_tensors='pt'
    )
    
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    cls_indices = (input_ids.squeeze(0) == tokenizer.cls_token_id).nonzero().flatten()

    with torch.no_grad():
        predictions = model(input_ids, attention_mask, cls_indices).squeeze()
        
    sentence_scores = predictions.cpu().numpy()
    if sentence_scores.ndim == 0:
        sentence_scores = np.array([sentence_scores])

    num_sentences_to_select = min(max_sents, len(sentence_scores))
    top_indices = np.argsort(sentence_scores)[-num_sentences_to_select:]
    top_indices.sort()
    
    summary = " ".join([article_sentences[i] for i in top_indices if i < len(article_sentences)])
    return summary

# --- Example Usage ---
# Load the BEST SVO-BERTSum model's weights
model.load_state_dict(torch.load('bertsum_best_model.bin')) 
model = model.to(DEVICE)

sample_article = val_df.iloc[20]['article']
print("\n--- Summarizing Sample Article with SVO-BERTSum ---")
summary = summarize_with_svo_bertsum(sample_article, model, tokenizer, DEVICE)
print(f"\nGENERATED SUMMARY:\n{summary}")


--- Summarizing Sample Article with SVO-BERTSum ---

GENERATED SUMMARY:
BBC to pour £9m into new comedy

The BBC is to invest £9m in developing new comedy and entertainment programmes outside London. Director of television Jana Bennett said the changes were about "getting the best ideas on screen as efficiently and effectively as we can". Ms Bennett said the new roles would benefit those making programmes within the BBC as well as those making shows for the channel independently.
