In [None]:
# Cell 1: Installations
!pip install pandas "numpy<2.0" scikit-learn nltk rouge-score tqdm spacy xgboost
!python -m spacy download en_core_web_lg

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
INFO: pip is looking at multiple versions of thinc to determine which version is compatible with other requirements. This could take a while.
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.3.0,>=1.2.0 (from thinc<8.4.0,>=8.3.4->spacy)
  Downloading blis-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Downloading thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading blis-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m85.5 MB/s[0m eta [3

In [None]:
# Cell 2: Imports and Setup
import spacy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
from tqdm.auto import tqdm
import os
from pathlib import Path
from collections import Counter # ## ADDED ## - Counter for KG creation
import re
import xgboost as xgb

print("Loading spaCy model... (This may take a moment)")
NLP = spacy.load("en_core_web_lg")
print("spaCy model loaded.")

Loading spaCy model... (This may take a moment)
spaCy model loaded.


In [3]:
# Cell 3: All Helper Function Definitions

def find_dataset_path(start_path="/kaggle/input/"):
    """Searches robustly for the correct dataset directory."""
    print(f"--- Searching for dataset directory starting from '{start_path}' ---")
    for root, dirs, files in os.walk(start_path):
        if "News Articles" in dirs and "Summaries" in dirs:
            articles_path = Path(root) / "News Articles"
            if any(p.is_dir() for p in articles_path.iterdir()):
                print(f"Found valid dataset base at: {root}")
                return root
    local_path = "./BBC News Summary"
    if os.path.exists(local_path) and os.path.exists(os.path.join(local_path, "News Articles")):
         print(f"Found valid dataset base at local path: {local_path}")
         return local_path
    raise FileNotFoundError("Could not automatically locate the 'BBC News Summary' dataset.")

def load_bbc_dataset(base_path):
    """Loads the BBC News Summary dataset from the specified path."""
    print(f"Attempting to load dataset from: {base_path}")
    all_data = []
    articles_path = Path(base_path) / "News Articles"
    summaries_path = Path(base_path) / "Summaries"
    for category_path in articles_path.iterdir():
        if category_path.is_dir():
            category = category_path.name
            for article_file in category_path.glob("*.txt"):
                try:
                    with open(article_file, 'r', encoding='utf-8', errors='ignore') as f: article_content = f.read()
                    summary_file = summaries_path / category / article_file.name
                    with open(summary_file, 'r', encoding='utf-8', errors='ignore') as f: summary_content = f.read()
                    all_data.append({"article": article_content, "reference_summary": summary_content})
                except Exception:
                    continue
    return pd.DataFrame(all_data)

def create_oracle_labels(article_text, reference_summary):
    """Greedily selects sentences to maximize ROUGE-2 F1-score."""
    try:
        article_sentences = sent_tokenize(article_text)
    except:
        return [], []
    if not article_sentences or not reference_summary:
        return article_sentences, [0] * len(article_sentences)
    scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
    selected_indices = []
    while True:
        best_candidate_idx, best_rouge_gain = -1, -1.0
        current_summary = " ".join([article_sentences[j] for j in sorted(selected_indices)])
        base_rouge = scorer.score(reference_summary, current_summary)['rouge2'].fmeasure
        for i in range(len(article_sentences)):
            if i in selected_indices: continue
            temp_selection = sorted(selected_indices + [i])
            summary_text = " ".join([article_sentences[j] for j in temp_selection])
            scores = scorer.score(reference_summary, summary_text)
            rouge_score = scores['rouge2'].fmeasure
            if rouge_score > base_rouge and (rouge_score - base_rouge) > best_rouge_gain:
                best_rouge_gain = rouge_score - base_rouge
                best_candidate_idx = i
        if best_candidate_idx != -1:
            selected_indices.append(best_candidate_idx)
        else:
            break
    labels = [1 if i in selected_indices else 0 for i in range(len(article_sentences))]
    return article_sentences, labels

def extract_svo_triples(doc):
    """Extracts Subject-Verb-Object triples from a spaCy Doc object."""
    triples = []
    for sent in doc.sents:
        for token in sent:
            if token.pos_ == "VERB":
                subjects = [child for child in token.children if child.dep_ in ("nsubj", "nsubjpass")]
                objects = [child for child in token.children if child.dep_ in ("dobj", "pobj", "attr")]
                if subjects and objects:
                    for s in subjects:
                        for o in objects:
                            triples.append((s.lemma_.lower(), token.lemma_.lower(), o.lemma_.lower()))
    return triples

def create_feature_dataset(dataframe):
    """Processes a dataframe of articles to create a feature set for every sentence."""
    feature_list = []
    for row in tqdm(dataframe.itertuples(), total=len(dataframe), desc="Engineering Features"):
        article_text = row.article
        reference_summary = row.reference_summary
        article_sentences, labels = create_oracle_labels(article_text, reference_summary)
        if not article_sentences: continue
        article_doc = NLP(article_text)
        article_kg = Counter(extract_svo_triples(article_doc))
        for i, sentence_text in enumerate(article_sentences):
            sentence_doc = NLP(sentence_text)
            sentence_triples = extract_svo_triples(sentence_doc)
            freqs = [article_kg.get(triple, 0) for triple in sentence_triples]
            feature_list.append({
                'sentence_position': i / len(article_sentences),
                'sentence_length': len([token for token in sentence_doc if not token.is_punct]),
                'numerical_data_count': len(re.findall(r'\d+', sentence_text)),
                'proper_noun_count': len([token for token in sentence_doc if token.pos_ == "PROPN"]),
                'num_triples_in_sentence': len(sentence_triples),
                'avg_triple_frequency': np.mean(freqs) if freqs else 0,
                'max_triple_frequency': np.max(freqs) if freqs else 0,
                'is_summary_sentence': labels[i]
            })
    return pd.DataFrame(feature_list)

print("All helper functions defined successfully.")

All helper functions defined successfully.


In [4]:
# Cell 4: Data Loading and Feature Engineering Execution

try:
    DATASET_PATH = find_dataset_path()
    df = load_bbc_dataset(DATASET_PATH)

    if df.empty:
        raise ValueError("The loaded DataFrame is empty.")

    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
    print(f"\nSuccessfully loaded and split the data.")
    print(f"Training set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    
    # --- Create the training and validation feature sets ---
    print("\nCreating training features...")
    train_features_df = create_feature_dataset(train_df)
    print("\nCreating validation features...")
    val_features_df = create_feature_dataset(val_df)

    print("\nTraining features shape:", train_features_df.shape)
    print("Sample of training data:")
    print(train_features_df.head())

except (FileNotFoundError, ValueError) as e:
    print(f"\nERROR: {e}")
    train_features_df, val_features_df = pd.DataFrame(), pd.DataFrame()

--- Searching for dataset directory starting from '/kaggle/input/' ---
Found valid dataset base at: /kaggle/input/bbc-news-summary/BBC News Summary
Attempting to load dataset from: /kaggle/input/bbc-news-summary/BBC News Summary

Successfully loaded and split the data.
Training set size: 2002
Validation set size: 223

Creating training features...


Engineering Features:   0%|          | 0/2002 [00:00<?, ?it/s]


Creating validation features...


Engineering Features:   0%|          | 0/223 [00:00<?, ?it/s]


Training features shape: (37499, 8)
Sample of training data:
   sentence_position  sentence_length  numerical_data_count  \
0           0.000000               30                     0   
1           0.028571               26                     0   
2           0.057143               20                     0   
3           0.085714               25                     1   
4           0.114286               25                     0   

   proper_noun_count  num_triples_in_sentence  avg_triple_frequency  \
0                  4                        2                   1.0   
1                  1                        0                   0.0   
2                  1                        2                   1.0   
3                  7                        1                   1.0   
4                  1                        0                   0.0   

   max_triple_frequency  is_summary_sentence  
0                     1                    0  
1                     0               

In [5]:
# Cell 6: XGBoost Model Training

# Separate features (X) from labels (y)
feature_columns = [col for col in train_features_df.columns if col != 'is_summary_sentence']
X_train = train_features_df[feature_columns]
y_train = train_features_df['is_summary_sentence']

X_val = val_features_df[feature_columns]
y_val = val_features_df['is_summary_sentence']

# Initialize and train the XGBoost Classifier
print("\n--- Training XGBoost Classifier ---")
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=-1,
    early_stopping_rounds=50
)

xgb_classifier.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)

# Save the trained model
xgb_classifier.save_model("svo_xgb_summarizer.json")
print("Model training complete and saved to 'svo_xgb_summarizer.json'")


--- Training XGBoost Classifier ---
[0]	validation_0-logloss:0.67745
[100]	validation_0-logloss:0.64060
[200]	validation_0-logloss:0.63936
[300]	validation_0-logloss:0.63894
[318]	validation_0-logloss:0.63905
Model training complete and saved to 'svo_xgb_summarizer.json'


In [None]:
# Cell 7: Inference and Evaluation

def summarize_with_svo_xgb(text, model, max_sents=3):
    """
    Summarizes a new article using the trained SVO-XGBoost model.
    """
    try:
        article_sentences = sent_tokenize(text)
    except:
        return "Could not process text."
    if not article_sentences: return ""

    # This part mirrors the feature engineering process
    article_doc = NLP(text)
    article_kg = Counter(extract_svo_triples(article_doc))
    
    sentence_features = []
    for i, sentence_text in enumerate(article_sentences):
        sentence_doc = NLP(sentence_text)
        sentence_triples = extract_svo_triples(sentence_doc)
        freqs = [article_kg.get(triple, 0) for triple in sentence_triples]
        
        features = {
            'sentence_position': i / len(article_sentences),
            'sentence_length': len([token for token in sentence_doc if not token.is_punct]),
            'numerical_data_count': len(re.findall(r'\d+', sentence_text)),
            'proper_noun_count': len([token for token in sentence_doc if token.pos_ == "PROPN"]),
            'num_triples_in_sentence': len(sentence_triples),
            'avg_triple_frequency': np.mean(freqs) if freqs else 0,
            'max_triple_frequency': np.max(freqs) if freqs else 0,
        }
        sentence_features.append(features)
        
    features_df = pd.DataFrame(sentence_features)
    
    predictions = model.predict_proba(features_df)[:, 1]
    
    num_to_select = min(max_sents, len(article_sentences))
    top_indices = np.argsort(predictions)[-num_to_select:]
    top_indices.sort()
    
    summary = " ".join([article_sentences[i] for i in top_indices])
    return summary

# --- Load the model and test on a sample ---
print("\n--- Loading trained model for inference ---")
loaded_model = xgb.XGBClassifier()
loaded_model.load_model("svo_xgb_summarizer.json")

# Select a random article from the validation set
sample_article = val_df.iloc[15]['article']
reference_summary = val_df.iloc[15]['reference_summary']

print("\n--- Summarizing Sample Article ---")
print(f"REFERENCE SUMMARY:\n{reference_summary}")

# Determine summary length based on reference for a fair comparison
num_sents = len(sent_tokenize(reference_summary))
summary = summarize_with_svo_xgb(sample_article, loaded_model, max_sents=num_sents)
print(f"\nGENERATED SUMMARY (SVO-XGBoost):\n{summary}")


--- Loading trained model for inference ---

--- Summarizing Sample Article ---
REFERENCE SUMMARY:
He was out of order but he knows that," said Redknapp.Prutton has apologised publicly for his actions and to Arsenal's Robert Pires, who was injured in a wild tackle by the Saints' midfield man."He's a decent lad."David has made a big mistake and he knows it.Paolo di Canio was given a seven-match suspension when he pushed referee Paul Alcock over in a Premiership game between Sheffield Wednesday and Arsenal in 1998.Prutton will be joined at Wednesday's hearing by Saints boss Harry Redknapp, who believes that the FA will throw the book at his player.Redknapp himself sprinted along the touchline to help physio Jim Joyce and coach Denis Rofe shepherd the enraged Prutton away from referee's assistant Norman.Southampton's David Prutton faces a possible seven-match ban when he goes before the Football Association.I apologise to the ref and linesman, who were only doing their job.

GENERATED SU