## ML Model

In [1]:
# ---------------------------------------------------------
# IMPORTS
# ---------------------------------------------------------
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import joblib

# # Download NLTK data
# nltk.download("punkt")



In [2]:
# ---------------------------------------------------------
# TEXT CLEANING FUNCTION
# ---------------------------------------------------------
import string
def clean_text(text):
    """Normalize raw article text before sentence splitting and TF-IDF.

    Steps, in order: strip HTML tags, URLs and emoji; replace every character
    other than ASCII letters, digits, whitespace and ``. , ! ?`` with a space;
    lowercase; collapse every whitespace run into a single space.

    Args:
        text: Raw text. Any non-``str`` value (e.g. NaN) is treated as empty.

    Returns:
        The cleaned, lowercased, single-spaced string ("" for non-str input).
    """
    if not isinstance(text, str):
        return ""

    # Remove HTML
    text = re.sub(r"<.*?>", " ", text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)

    # Remove emojis
    text = re.sub(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        "]+",
        "",
        text
    )

    # Replace special symbols with a space. Whitespace (\s) is kept here and
    # normalized below. A former character-by-character "allowed" filter was
    # removed: it deleted newlines and tabs outright, which glued the words on
    # either side together ("hello\nworld" -> "helloworld").
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", " ", text)

    # Lowercase
    text = text.lower()

    # Normalize whitespace: newlines, tabs and repeated spaces -> one space
    return " ".join(text.split())

In [3]:
# ---------------------------------------------------------
# LOAD DATA
# ---------------------------------------------------------
# Read the tab-separated news dump.
df = pd.read_csv("../data/news.tsv", sep="\t")

# Build the raw input as "<headline> <body>" (missing pieces become ""),
# expose the category column as `label`, then drop rows with any missing value.
df = (
    df.assign(text=df["Headline"].fillna("") + " " + df["News body"].fillna(""))
    .rename(columns={"Category": "label"})
    .dropna()
)

# `clean_text` holds the normalized text; `full_text` keeps the raw text,
# which is the column the evaluation loop reads.
df = df.assign(
    clean_text=df["text"].apply(clean_text),
    full_text=df["text"],
)



In [14]:
# Show the raw text of the row whose index label is 12.
df.loc[12, "text"]

'The ideal landing spots for the top 30 NBA free agents The madness of NBA free agency is almost upon us. Scores of free agents, hundreds of millions in cap space and no clear title favorite means that it should be an energetic evening when free agency officially opens on June 30. But where are the top free agents headed? Only God and Woj know such things, but we\'ll present our best arguments for the top 30. It\'s not a prediction of where the players end up but rather a determination of where they deserve to end up when the music stops and the mad dash for chairs is over. The madness of NBA free agency is almost upon us. Scores of free agents, hundreds of millions in cap space and no clear title favorite means that it should be an energetic evening when free agency officially opens on June 30. But where are the top free agents headed? Only God and Woj know such things, but we\'ll present our best arguments for the top 30. It\'s not a prediction of where the players end up but rather 

In [5]:
# ---------------------------------------------------------
# TEXT RANK SUMMARIZER
# ---------------------------------------------------------
def textrank_summarize(text, top_n=3):
    """Extractive summary via PageRank over TF-IDF sentence similarity.

    The text is cleaned with ``clean_text`` and split into sentences. If there
    are at most ``top_n`` sentences they are all returned, joined by spaces.
    Otherwise each sentence is scored by PageRank on the graph whose edge
    weights are the pairwise cosine similarities of the sentences' TF-IDF
    vectors, and the ``top_n`` best sentences are joined by spaces in
    descending score order (not in document order).

    Args:
        text: Raw article text.
        top_n: Number of sentences to keep.

    Returns:
        The summary string.
    """
    sentences = sent_tokenize(clean_text(text))
    if len(sentences) <= top_n:
        return " ".join(sentences)

    tfidf_vectors = TfidfVectorizer().fit_transform(sentences).toarray()
    similarity = cosine_similarity(tfidf_vectors)

    # Graph nodes are sentence positions; edges carry the similarity weights.
    pagerank = nx.pagerank(nx.from_numpy_array(similarity))

    # Highest score first; the sentence text itself breaks score ties.
    by_rank = sorted(
        range(len(sentences)),
        key=lambda idx: (pagerank[idx], sentences[idx]),
        reverse=True,
    )
    return " ".join(sentences[idx] for idx in by_rank[:top_n])



In [6]:
# ---------------------------------------------------------
# TF-IDF SENTENCE SCORING
# ---------------------------------------------------------
def tfidf_summarize(text, top_n=3):
    """Extractive summary that keeps the sentences with the highest mean TF-IDF.

    The text is cleaned with ``clean_text`` and split into sentences. If there
    are at most ``top_n`` sentences they are all returned, joined by spaces.
    Otherwise a TF-IDF matrix is fitted on the sentences, each sentence is
    scored by the mean of its row, and the ``top_n`` best sentences are joined
    by spaces in descending score order (not in document order).

    Args:
        text: Raw article text.
        top_n: Number of sentences to keep.

    Returns:
        The summary string.
    """
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)

    if len(sentences) <= top_n:
        return " ".join(sentences)

    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)

    # One score per sentence: the mean weight across the vocabulary axis.
    # np.asarray(...).ravel() accepts both the np.matrix returned by sparse
    # *matrices* and the plain ndarray returned by sparse *arrays*; the former
    # `.A` attribute exists only on np.matrix.
    scores = np.asarray(tfidf_matrix.mean(axis=1)).ravel()

    ranked_idx = np.argsort(scores)[::-1]
    return " ".join(sentences[i] for i in ranked_idx[:top_n])

# ---------------------------------------------------------
# REFERENCE SUMMARY (WEAK BASELINE)
# ---------------------------------------------------------
def reference_summary(text):
    """Weak-baseline reference: the first two sentences of the cleaned text."""
    leading_sentences = sent_tokenize(clean_text(text))[:2]
    return " ".join(leading_sentences)



In [7]:
# ---------------------------------------------------------
# ROUGE EVALUATION
# ---------------------------------------------------------
# Module-level scorer used by evaluate_model: computes ROUGE-1, ROUGE-2 and
# ROUGE-L, with stemming enabled (use_stemmer=True).
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

def evaluate_model(summarizer_fn, df, samples=50):
    """Average ROUGE F-measures of a summarizer over the first rows of ``df``.

    For each of the first ``min(samples, len(df))`` rows, the ``full_text``
    value is summarized by ``summarizer_fn`` and scored against
    ``reference_summary`` of the same text using the module-level ``scorer``.

    Args:
        summarizer_fn: Callable taking the raw text and returning a summary.
        df: DataFrame with a ``full_text`` column.
        samples: Maximum number of rows to evaluate.

    Returns:
        Dict with keys ``rouge1``, ``rouge2`` and ``rougeL``, each the mean
        F-measure over the evaluated rows.
    """
    metric_names = ("rouge1", "rouge2", "rougeL")
    collected = {name: [] for name in metric_names}

    n_rows = min(samples, len(df))
    for row_pos in range(n_rows):
        text = df.iloc[row_pos]["full_text"]
        result = scorer.score(reference_summary(text), summarizer_fn(text))
        for name in metric_names:
            collected[name].append(result[name].fmeasure)

    return {name: np.mean(values) for name, values in collected.items()}



In [8]:
# ---------------------------------------------------------
# RUN EVALUATION
# ---------------------------------------------------------
print("Evaluating TextRank...")
textrank_scores = evaluate_model(textrank_summarize, df)

print("Evaluating TF-IDF...")
tfidf_scores = evaluate_model(tfidf_summarize, df)


# ---------------------------------------------------------
# SAVE RESULTS TO CSV
# ---------------------------------------------------------
OUTPUT_CSV = "rouge_eval_results.csv"  # change if needed
ROUGE_METRICS = ("rouge1", "rouge2", "rougeL")


def _result_row(model_name, scores):
    """Build one results-table row from an evaluate_model() score dict."""
    row = {"Model": model_name}
    row.update({metric: scores[metric] for metric in ROUGE_METRICS})
    # evaluate_model does not compute rougeLsum; the column is kept for
    # compatibility with the existing results file and mirrors rougeL.
    row["rougeLsum"] = scores["rougeL"]
    row["Average Score"] = np.mean([scores[metric] for metric in ROUGE_METRICS])
    return row


new_results = pd.DataFrame([
    _result_row("TextRank", textrank_scores),
    _result_row("TF-IDF", tfidf_scores),
]).round(4)

# If the CSV exists, merge into it; otherwise create it. Rows for the models
# evaluated above replace any earlier rows with the same Model name, so
# re-running this cell does not pile up duplicate entries. Rows for other
# models are left untouched.
try:
    old_df = pd.read_csv(OUTPUT_CSV)
except FileNotFoundError:
    final_df = new_results
else:
    if "Model" in old_df.columns:
        old_df = old_df[~old_df["Model"].isin(new_results["Model"])]
    final_df = pd.concat([old_df, new_results], ignore_index=True)

final_df.to_csv(OUTPUT_CSV, index=False)

print("\nEvaluation Completed.\nSaved to:", OUTPUT_CSV)


Evaluating TextRank...
Evaluating TF-IDF...

Evaluation Completed.
Saved to: rouge_eval_results.csv


In [3]:
import pandas as pd
# Load the accumulated ROUGE results and display them.
# NOTE(review): this reads from the parent directory, whereas the evaluation
# cell writes OUTPUT_CSV = "rouge_eval_results.csv" relative to the working
# directory — confirm both paths refer to the same file.
df1=pd.read_csv("../rouge_eval_results.csv")
df1

Unnamed: 0,Model,rouge1,rouge2,rougeL,rougeLsum,Average Score
0,bert,0.748193,0.686271,0.688875,0.688424,0.7077
1,lstm_bahdanau,0.685955,0.300094,0.513951,0.225389,0.5433
2,TextRank,0.5028,0.3924,0.4402,0.4402,0.4451
3,TF-IDF,0.4836,0.3779,0.4144,0.4144,0.4253


In [14]:
# Qualitative check: TextRank summary of the row whose index label is 0.
sample_text = df.loc[0, "full_text"]
sam = textrank_summarize(sample_text)
print(sam)

pereira is also an international, and if de boer wants to include ezequiel barco on the bench, he will need to shed one of his internationals from the last squad. well, aside from the obvious, we still don t have a ton of data points from frank de boer in how he prefers to rotate his team for let s be honest an inferior competition. with three internationals in the starting lineup, i think eric remedi and ezequiel barco will round out the five maximum allowed.


In [15]:
# Pick the better extractive model from the scores computed in the evaluation
# cell, using the same mean of rouge1/rouge2/rougeL that is reported as
# "Average Score". Defining the name here removes the dependency on a variable
# that no cell in this notebook creates. The keys match the model names that
# summarize() in the exported module dispatches on.
_ROUGE_KEYS = ("rouge1", "rouge2", "rougeL")
_scores_by_model = {"TextRank": textrank_scores, "TF-IDF": tfidf_scores}
best_model_name = max(
    _scores_by_model,
    key=lambda name: np.mean([_scores_by_model[name][k] for k in _ROUGE_KEYS]),
)

# Single output location for both the config pickle and the helper module.
SUMMARIZER_DIR = "../Summarization"

best_model = {
    "name": best_model_name,
    "top_n": 3
}

joblib.dump(best_model, f"{SUMMARIZER_DIR}/best_summarizer_{best_model_name}.pkl")

# Source of the standalone module written below. Backslashes are doubled so the
# written file contains the intended single-backslash regex/unicode escapes.
summarizer_code = """
import re
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# ---------------------------------------------------------
# CLEANING FUNCTION
# ---------------------------------------------------------
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"http\\S+|www\\S+|https\\S+", " ", text)
    text = re.sub("[" 
                  u"\\U0001F600-\\U0001F64F"
                  u"\\U0001F300-\\U0001F5FF"
                  u"\\U0001F680-\\U0001F6FF"
                  u"\\U0001F1E0-\\U0001F1FF"
                  "]+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\\s.,!?]", " ", text)
    text = text.lower()
    text = " ".join(text.split())
    return text

# ---------------------------------------------------------
# TEXT RANK SUMMARY
# ---------------------------------------------------------
def textrank_summarize(text, top_n=3):
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)
    if len(sentences) <= top_n:
        return " ".join(sentences)
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(sentences).toarray()
    sim_matrix = cosine_similarity(vectors)
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    return " ".join([s for _, s in ranked[:top_n]])

# ---------------------------------------------------------
# TF-IDF SCORING SUMMARY
# ---------------------------------------------------------
def tfidf_summarize(text, top_n=3):
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)
    if len(sentences) <= top_n:
        return " ".join(sentences)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    scores = tfidf_matrix.mean(axis=1).A.flatten()
    ranked_idx = scores.argsort()[::-1]
    selected = [sentences[i] for i in ranked_idx[:top_n]]
    return " ".join(selected)

# ---------------------------------------------------------
# UNIFIED SUMMARIZER FUNCTION
# ---------------------------------------------------------
def summarize(text, model_name, top_n=3):
    if model_name == "TextRank":
        return textrank_summarize(text, top_n)
    elif model_name == "TF-IDF":
        return tfidf_summarize(text, top_n)
    else:
        raise ValueError("Unknown summarization model: " + model_name)

"""

# Write the module next to the pickle. Previously it was written to the working
# directory while the message below reported ../Summarization/.
summarizer_path = f"{SUMMARIZER_DIR}/summarizer_functions.py"
with open(summarizer_path, "w", encoding="utf-8") as f:
    f.write(summarizer_code)

print("Saved summarizer functions →", summarizer_path)


Saved summarizer functions → ../Summarization/summarizer_functions.py


#### DL Model

In [1]:
import pandas as pd
import numpy as np
import re, string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# ---------------------------------------------------------
# TEXT CLEANING FUNCTION
# ---------------------------------------------------------
def clean_text(text):
    """Normalize raw text before tokenization.

    Steps, in order: strip HTML tags, URLs and emoji; replace every character
    other than ASCII letters, digits, whitespace and ``. , ! ?`` with a space;
    lowercase; collapse every whitespace run into a single space.

    Args:
        text: Raw text. Any non-``str`` value (e.g. NaN) is treated as empty.

    Returns:
        The cleaned, lowercased, single-spaced string ("" for non-str input).
    """
    if not isinstance(text, str):
        return ""

    text = re.sub(r"<.*?>", " ", text)                      # Remove HTML
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)   # Remove URLs

    # Remove emojis
    text = re.sub("["
                  u"\U0001F600-\U0001F64F"
                  u"\U0001F300-\U0001F5FF"
                  u"\U0001F680-\U0001F6FF"
                  u"\U0001F1E0-\U0001F1FF"
                  "]+", "", text)

    # Special chars -> space. Whitespace (\s) is kept here and normalized
    # below. A former character-by-character "allowed" filter was removed: it
    # deleted newlines and tabs outright, gluing the neighbouring words
    # together ("hello\nworld" -> "helloworld").
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", " ", text)

    text = text.lower()
    return " ".join(text.split())


In [3]:
# ---------------------------------------------------------
# LOAD DATA (PENS)
# ---------------------------------------------------------
df = pd.read_csv("../data/news.tsv", sep="\t")

# Source sequence = cleaned news body; target sequence = cleaned headline.
df = df.assign(
    article=df["News body"].fillna("").apply(clean_text),
    summary=df["Headline"].fillna("").apply(clean_text),
)

# Keep only rows where both the article and the summary are non-empty.
has_article = df["article"].str.len() > 0
has_summary = df["summary"].str.len() > 0
df = df[has_article & has_summary]

# Add special tokens: decoder input is prefixed with <sos>, decoder target is
# suffixed with <eos>.
df = df.assign(
    summary_in="<sos> " + df["summary"],
    summary_out=df["summary"] + " <eos>",
)

In [4]:
# ---------------------------------------------------------
# TOKENIZATION
# ---------------------------------------------------------
MAX_ART_LEN = 400    # articles are padded/truncated to this many tokens
MAX_SUM_LEN = 30     # summaries are padded/truncated to this many tokens
SRC_VOCAB = 20000    # num_words cap for the article tokenizer
TGT_VOCAB = 10000    # num_words cap for the summary tokenizer

# Keras' default Tokenizer filter set with "<" and ">" removed. With the
# default filters the markers "<sos>" / "<eos>" are stripped down to the plain
# words "sos" / "eos", so they are indistinguishable from real words and cannot
# be looked up under their own names. Other punctuation is still filtered out
# of the targets exactly as before.
TGT_FILTERS = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

src_tok = Tokenizer(num_words=SRC_VOCAB, oov_token="<unk>", filters="")
src_tok.fit_on_texts(df["article"])

tgt_tok = Tokenizer(num_words=TGT_VOCAB, oov_token="<unk>", filters=TGT_FILTERS)
tgt_tok.fit_on_texts(df["summary_in"].tolist() + df["summary_out"].tolist())


def _to_padded(tokenizer, texts, maxlen):
    """Convert texts to id sequences, post-padded/post-truncated to maxlen."""
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=maxlen,
        padding="post",
        truncating="post"
    )


# Convert to sequences
enc_seq = _to_padded(src_tok, df["article"], MAX_ART_LEN)
dec_in_seq = _to_padded(tgt_tok, df["summary_in"], MAX_SUM_LEN)
dec_out_seq = _to_padded(tgt_tok, df["summary_out"], MAX_SUM_LEN)

# Size of the embedding / output layers: the num_words cap, or the number of
# distinct words + 1 when the corpus has fewer words than the cap.
src_vocab = min(SRC_VOCAB, len(src_tok.word_index)+1)
tgt_vocab = min(TGT_VOCAB, len(tgt_tok.word_index)+1)

In [15]:
import joblib
# Persist the fitted source and target tokenizers as pickles in the current
# working directory (presumably for reuse outside this notebook — confirm).
joblib.dump(src_tok, "src_tokenizer.pkl")
joblib.dump(tgt_tok, "tgt_tokenizer.pkl")

['tgt_tokenizer.pkl']

In [5]:
# ---------------------------------------------------------
# CALLBACKS
# ---------------------------------------------------------


from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Every callback watches the validation loss and logs what it does.
_WATCH_VAL_LOSS = {"monitor": "val_loss", "verbose": 1}

# Stop after 3 epochs without improvement and roll back to the best weights.
early_stop = EarlyStopping(patience=3, restore_best_weights=True, **_WATCH_VAL_LOSS)

# Halve the learning rate after 2 stagnant epochs, never going below 1e-5.
reduce_lr = ReduceLROnPlateau(factor=0.5, patience=2, min_lr=1e-5, **_WATCH_VAL_LOSS)

# One best-only checkpoint file per architecture.
ckpt_lstm = ModelCheckpoint(
    "../Summarization/lstm_best_model.h5", save_best_only=True, **_WATCH_VAL_LOSS
)
ckpt_gru = ModelCheckpoint(
    "../Summarization/gru_best_model.h5", save_best_only=True, **_WATCH_VAL_LOSS
)

# Optimizer; clipnorm=1.0 protects against exploding gradients.
optimizer = Adam(learning_rate=0.001, clipnorm=1.0)


In [6]:
EMB_DIM = 96          # embedding width for both Embedding layers; was 128 (faster)
LATENT_DIM = 192     # units of the LSTM/GRU layers; was 256 (faster)
BATCH_SIZE = 64      # batch_size passed to both fit() calls

In [7]:
# ---------------------------------------------------------
# LSTM MODEL
# ---------------------------------------------------------
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, Concatenate
from tensorflow.keras.models import Model



# ----- Encoder -----
# Article token ids of length MAX_ART_LEN are embedded (mask_zero=True) and
# fed to an LSTM that returns the full output sequence plus its final h/c states.
encoder_inputs = Input(shape=(MAX_ART_LEN,))
enc_emb = Embedding(src_vocab, EMB_DIM, mask_zero=True)(encoder_inputs)

encoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# ----- Decoder -----
# Summary token ids of length MAX_SUM_LEN are embedded and decoded by an LSTM
# initialised with the encoder's final states; its own final states are unused.
decoder_inputs = Input(shape=(MAX_SUM_LEN,))
dec_emb = Embedding(tgt_vocab, EMB_DIM, mask_zero=True)(decoder_inputs)

decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# ----- Attention -----
# Decoder outputs are the query and encoder outputs the value of the
# Attention layer.
attn = Attention()
attn_out = attn([decoder_outputs, encoder_outputs])

# Each decoder step is concatenated with its attention context on the last axis.
decoder_concat = Concatenate(axis=-1)([decoder_outputs, attn_out])

# ----- Output -----
# Per-step softmax over the tgt_vocab token ids.
decoder_dense = Dense(tgt_vocab, activation="softmax")
outputs = decoder_dense(decoder_concat)

# ----- Final Model -----
lstm_model = Model([encoder_inputs, decoder_inputs], outputs)

# Sparse cross-entropy: the targets are integer token ids, not one-hot vectors.
lstm_model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

lstm_model.summary()

In [8]:
# ---------------------------------------------------------
# GRU MODEL
# ---------------------------------------------------------
from tensorflow.keras.layers import GRU, Embedding
from tensorflow.keras.optimizers import Adam

# The GRU model reuses the Input tensors of the LSTM model but gets its OWN
# embedding layers. Feeding it the LSTM model's `enc_emb` / `dec_emb` tensors
# would make both models share (and keep overwriting) the same embedding
# weights, so the two architectures would not be trained independently.

# ----- Encoder -----
enc_emb_gru = Embedding(src_vocab, EMB_DIM, mask_zero=True)(encoder_inputs)

encoder_gru = GRU(LATENT_DIM, return_sequences=True, return_state=True)
encoder_outputs_gru, state_h_gru = encoder_gru(enc_emb_gru)

# ----- Decoder -----
dec_emb_gru = Embedding(tgt_vocab, EMB_DIM, mask_zero=True)(decoder_inputs)

# Initialised with the encoder's final state; its own final state is unused.
decoder_gru = GRU(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs_gru, _ = decoder_gru(dec_emb_gru, initial_state=[state_h_gru])

# ----- Attention -----
# Decoder outputs are the query, encoder outputs the value.
attn_gru = Attention()
attn_out_gru = attn_gru([decoder_outputs_gru, encoder_outputs_gru])

decoder_concat_gru = Concatenate(axis=-1)([decoder_outputs_gru, attn_out_gru])

# ----- Output -----
outputs_gru = Dense(tgt_vocab, activation="softmax")(decoder_concat_gru)

# ----- Final Model -----
gru_model = Model([encoder_inputs, decoder_inputs], outputs_gru)

# Compile with a fresh optimizer carrying the same hyper-parameters as
# `optimizer`. An optimizer instance is built against the variables of the
# first model it updates, so reusing the LSTM model's instance for a second
# model fails in recent Keras versions.
gru_model.compile(
    optimizer=Adam.from_config(optimizer.get_config()),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

gru_model.summary()

In [10]:

from sklearn.model_selection import train_test_split

# Hold out 10% of the examples for validation; the three arrays are split with
# the same shuffle (fixed by random_state) so rows stay aligned.
X_train, X_val, Y_train_in, Y_val_in, Y_train_out, Y_val_out = train_test_split(
    enc_seq,
    dec_in_seq,
    dec_out_seq,
    test_size=0.1,
    random_state=42
)

# ----- Train LSTM -----
lstm_model.fit(
    [X_train, Y_train_in],
    Y_train_out,
    batch_size=BATCH_SIZE,
    epochs=5,
    validation_data=([X_val, Y_val_in], Y_val_out),
    callbacks=[early_stop, reduce_lr, ckpt_lstm]
)

# ----- Train GRU -----
# Uses its own checkpoint callback: passing ckpt_lstm here made the GRU run
# overwrite the LSTM checkpoint file and left gru_best_model.h5 unwritten.
gru_model.fit(
    [X_train, Y_train_in],
    Y_train_out,
    batch_size=BATCH_SIZE,
    epochs=5,
    validation_data=([X_val, Y_val_in], Y_val_out),
    callbacks=[early_stop, reduce_lr, ckpt_gru]
)

Epoch 1/5
[1m 108/1599[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m28:10[0m 1s/step - accuracy: 0.1250 - loss: 7.1320

KeyboardInterrupt: 