# Imports

In [47]:
%load_ext autoreload
%autoreload 2

import sys
import os
import math
import json
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import joblib
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
import datetime as dt

sys.path.append('..')
from scripts import classifier
from services import embeddings, filter, crawler
from db.models import Article
from api import ingest

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data
Load the synthetic training data, generated with ChatGPT from recent real-world articles.

In [4]:
# Read the synthetic training articles. The encoding is pinned to UTF-8 so that
# non-ASCII characters in the data are decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open("../data/train.json", "r", encoding="utf-8") as train_file:
    articles = json.load(train_file)

print(f"There are {len(articles)} generated articles.")

There are 200 generated articles.


# Classifier

The goal is to leverage an LLM's capacity for **zero-shot semantic understanding** to assign four independent labels to every headline/body pair:

* **Relevance** – tag each article as *relevant* or *irrelevant* for enterprise IT teams.  
* **Severity** – rate the technical danger (*severe* vs. *non-severe*; e.g. "zero-day under active exploit" vs. "patch released").  
* **Scope** – flag the breadth of the incident (*wide* vs. *narrow* scope; e.g. a high-tier vendor incident usually hits more organisations).  
* **User impact** – gauge how many users / records are affected (*high* vs. *low* impact; e.g. "millions of user records" vs. "internal test").

The resulting label scores become the training targets for the filter and ranking models that will be promoted to production.

In [15]:
# Run the classifier on every training article and keep each article
# together with its classification result
processed_articles = [
    {"article": article, "classification": classifier.classify_article(article)}
    for article in articles
]

In [20]:
# Write the article/classification pairs to disk as JSON
with open("../data/train_labelled.json", "w") as labelled_file:
    json.dump(processed_articles, fp=labelled_file)

In [5]:
# Reload the labelled articles from disk
with open("../data/train_labelled.json", "r") as labelled_file:
    processed_articles = json.load(labelled_file)

# Keep the entries whose classification carries a truthy 'relevant' flag
relevant_articles = []
for entry in processed_articles:
    if entry['classification'].get('relevant'):
        relevant_articles.append(entry)
print(f"Found {len(relevant_articles)} relevant articles.")

Found 100 relevant articles.


In [6]:
# Show the third relevant article as a sample
sample_article = relevant_articles[2]['article']
print(sample_article)

{'title': 'Elastic confirms a data breach exposing 488K customer records; incident response is underw', 'body': 'Elastic confirms a data breach exposing 488K customer records; incident response is underway.', 'published_at': '2025-07-12T05:18:14Z', 'id': 'https://example.com/75919e72-8cd3-48aa-a104-65de43370ef7', 'created_at': '2025-07-12T05:20:14Z', 'source': 'Elastic', 'relevant': True}


# Training

### Prepare data for training

In [38]:
# Model input: title and body of each article, concatenated as-is.
# NOTE(review): no separator is inserted between title and body — confirm this
# matches how texts are built wherever the saved models are used.
X = [entry['article']['title'] + entry['article']['body'] for entry in processed_articles]

# Binary targets, one list per label: 1 when the flag is truthy in the
# article's classification, 0 otherwise (including when the key is absent)
y = {
    label: [int(bool(entry['classification'].get(label))) for entry in processed_articles]
    for label in ("relevant", "severe", "wide_scope", "high_impact")
}


### Set up model parameters

In [42]:
MODEL_NAME   = "all-MiniLM-L6-v2"  # Model identifier passed to SentenceTransformer
BATCH_SIZE   = 32  # Batch size used when encoding texts
SEED = 42  # random_state for the train/test split and LogisticRegression

### Embedding

In [43]:
# Instantiate the sentence-embedding model named by MODEL_NAME
embedder = SentenceTransformer(MODEL_NAME)

def encode_texts(texts):
    """Embed `texts` with the notebook-level `embedder`.

    Encoding runs in batches of BATCH_SIZE with a progress bar; embeddings are
    requested normalized and as a NumPy array.
    """
    encode_options = {
        "batch_size": BATCH_SIZE,
        "normalize_embeddings": True,
        "show_progress_bar": True,
        "convert_to_numpy": True,
    }
    return embedder.encode(texts, **encode_options)

# Embed every training text; X_encoded is the feature input split for training below
X_encoded = encode_texts(X)

Batches: 100%|███████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.08it/s]


### Train

In [44]:
params = ["relevant", "severe", "wide_scope", "high_impact"]
param_grid = {'C': [0.5, 1.0, 2.0]}
X_test_dict = {}
y_test_dict = {}

# joblib.dump does not create missing directories, so make sure the output
# folder exists before the first model is written.
os.makedirs("../models", exist_ok=True)

for param in params:
    # Train / test split: 20% held out, stratified on the label, seeded
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y[param], test_size=0.20, stratify=y[param], random_state=SEED
    )
    # Keep the held-out split so the evaluation cell can score the saved model
    X_test_dict[param] = X_test
    y_test_dict[param] = y_test

    # Class weights: the "relevant" classifier is biased towards higher recall
    # (positive class weighted 2x) so that important articles are not missed;
    # the other classifiers use equal weights.
    class_weight = {0:1, 1:2} if param == "relevant" else {0:1, 1:1}

    # Model
    lr = LogisticRegression(
        solver="lbfgs",
        max_iter=1000,
        random_state=SEED,
        class_weight=class_weight
    )
    # 5-fold grid search over the regularisation strength C, selected on F1
    grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='f1', n_jobs=1)

    # Fit and keep the best estimator found by the search
    grid_search.fit(X_train, y_train)
    clf = grid_search.best_estimator_

    # Save model weights
    joblib.dump(clf, f"../models/{param}_model.joblib")

### Evaluate

In [45]:
def evaluate(param):
    """Print a classification report for the saved `param` model.

    The model is loaded from ../models/{param}_model.joblib and scored on the
    held-out split stored under `param` in X_test_dict / y_test_dict.
    """
    model = joblib.load(f"../models/{param}_model.joblib")
    predictions = model.predict(X_test_dict[param])
    report = classification_report(y_test_dict[param], predictions, digits=3)
    print(report)

In [46]:
# Print a classification report for each of the trained label models
for param in params:
    print(f"Evaluating performance for '{param}' parameter...")
    evaluate(param)

Evaluating performance for 'relevant' parameter...
              precision    recall  f1-score   support

           0      1.000     1.000     1.000        20
           1      1.000     1.000     1.000        20

    accuracy                          1.000        40
   macro avg      1.000     1.000     1.000        40
weighted avg      1.000     1.000     1.000        40

Evaluating performance for 'severe' parameter...
              precision    recall  f1-score   support

           0      1.000     1.000     1.000        26
           1      1.000     1.000     1.000        14

    accuracy                          1.000        40
   macro avg      1.000     1.000     1.000        40
weighted avg      1.000     1.000     1.000        40

Evaluating performance for 'wide_scope' parameter...
              precision    recall  f1-score   support

           0      1.000     0.870     0.930        23
           1      0.850     1.000     0.919        17

    accuracy                 

# Ranking
For filtering articles, we will use the "relevant" classifier that simply states if an article is relevant for an IT professional or not.

For ranking articles by importance we will use a mix of the three other classifiers to build an importance score together with the freshness of the articles (published_at).
The weights of the three parameters were picked arbitrarily, based on a personal judgement of what should be emphasised:
* Severity : 0.5
* Wide scope : 0.3
* High impact : 0.2

The **final importance score** is calculated as a weighted sum of the three dimensions:

```
Importance Score = (Severity × 0.5) + (Wide Scope × 0.3) + (High Impact × 0.2)
```
On top of this, the score is further adjusted for freshness (how many hours ago the article was published): the more recent the article, the higher its score. Freshness is computed as `exp(-age_hours / 72)`. Again, the weights are picked arbitrarily:
* Parameters : 0.7
* Freshness : 0.3

```
Final Score = (Importance Score × 0.7) + (Freshness × 0.3)
```

In [74]:
def importance_score(article):
    """Return the weighted importance score of a single article.

    The article's title plus the part of its body before the first "." is
    embedded, each of the three saved classifiers gives the probability of its
    positive class, and the probabilities are combined as
    0.5 * severe + 0.3 * wide_scope + 0.2 * high_impact.

    NOTE(review): the training cell embeds title + full body, whereas this
    embeds title + first sentence only — confirm the difference is intended.

    Args:
        article: mapping with 'title' and 'body' string entries.

    Returns:
        The weighted sum of the three positive-class probabilities.
    """
    # Text to embed: the title followed directly by the first sentence of the body
    title = article['title']
    first_sentence = article['body'].split(".")[0]
    embedding = embedder.encode(
        title + first_sentence,
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    features = embedding.reshape(1, -1)  # single-row 2-D input for predict_proba

    weights = {"severe" : 0.5, "wide_scope" : 0.3, "high_impact" : 0.2}

    def positive_probability(label):
        # Load the saved model for this label and read the class-1 probability
        model = joblib.load(f"../models/{label}_model.joblib")
        return model.predict_proba(features)[0][1]

    return sum(weight * positive_probability(label) for label, weight in weights.items())

In [97]:
def freshness_score(article, tau_hours=72, now=None):
    """Return an exponentially decaying freshness score in (0, 1].

    The score is exp(-age_hours / tau_hours), where age_hours is the time
    elapsed between the article's publication and `now`.

    Args:
        article: mapping with a 'published_at' string. Both ISO 8601
            timestamps (e.g. '2025-07-12T05:18:14Z') and RFC 2822 dates
            (e.g. 'Wed, 09 Jul 2025 11:20:24 +0000') are accepted.
        tau_hours: decay constant in hours; defaults to 72.
        now: timezone-aware reference time; defaults to the current UTC time.

    Returns:
        The freshness score as a float. Articles dated in the future are
        treated as just published and score 1.0.

    Raises:
        KeyError: if the article has no 'published_at' entry.
        ValueError: if the timestamp matches neither supported format.
    """
    # Local import keeps this cell self-contained; only needed for RFC 2822 dates
    from email.utils import parsedate_to_datetime

    raw = article['published_at']

    # fromisoformat() only accepts a trailing "Z" on recent Python versions,
    # so rewrite it as an explicit UTC offset first.
    iso_text = raw[:-1] + "+00:00" if raw.endswith("Z") else raw
    try:
        published = dt.datetime.fromisoformat(iso_text)
    except ValueError:
        # Not ISO 8601: fall back to the RFC 2822 format used by RSS feeds
        try:
            published = parsedate_to_datetime(raw)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"Unrecognised published_at timestamp: {raw!r}") from exc

    # A timestamp without an offset is assumed to be UTC; subtracting a naive
    # datetime from an aware one would raise TypeError.
    if published.tzinfo is None:
        published = published.replace(tzinfo=dt.timezone.utc)

    if now is None:
        now = dt.datetime.now(dt.timezone.utc)

    # Clamp at zero so clock skew or future-dated articles cannot score above 1
    age_hours = max((now - published).total_seconds() / 3600, 0.0)

    return math.exp(-age_hours / tau_hours)

### Rank the first 10 relevant articles

In [105]:
# Final score per title for the first ten relevant articles:
# 70% model-based importance, 30% freshness
articles_scored = {
    entry['article']['title']: (
        importance_score(entry['article'])*0.7 + freshness_score(entry['article'])*0.3
    )
    for entry in relevant_articles[:10]
}

In [106]:
# Display the title -> final score mapping
articles_scored

{'Researchers discovered a new exploit kit targeting SSL‑VPN, weaponizing CVE-2025-1166': 0.6480853023311397,
 'Researchers discovered a new exploit kit targeting ASA, weaponizing CVE-2024-7924': 0.6304578930678473,
 'Elastic confirms a data breach exposing 488K customer records; incident response is underw': 0.3978406457143619,
 'Elastic is investigating a critical vulnerability (CVE-2025-2638) allowing remote code exe': 0.6238727570306012,
 'Researchers discovered a new exploit kit targeting ESXi, weaponizing CVE-2025-8484': 0.6076048163854597,
 'Cloudflare confirms a data breach exposing 383K customer records; incident response is und': 0.4142366318348899,
 'Cisco is investigating a major vulnerability (CVE-2024-5335) allowing remote code executio': 0.6000597649246994,
 'Atlassian reports a critical outage in us-west-2, affecting services Cloud Functions, Conf': 0.6014066016557043,
 'Researchers discovered a new exploit kit targeting ESXi, weaponizing CVE-2024-5681': 0.6177824458355

In [109]:
# Order the (title, score) pairs from highest to lowest score and display them
ranked = sorted(articles_scored.items(), key=lambda pair: pair[1], reverse=True)
ranked

[('AWS is investigating a sev‑1 vulnerability (CVE-2025-4150) allowing remote code execution',
  0.649494431769017),
 ('Researchers discovered a new exploit kit targeting SSL‑VPN, weaponizing CVE-2025-1166',
  0.6480853023311397),
 ('Researchers discovered a new exploit kit targeting ASA, weaponizing CVE-2024-7924',
  0.6304578930678473),
 ('Elastic is investigating a critical vulnerability (CVE-2025-2638) allowing remote code exe',
  0.6238727570306012),
 ('Researchers discovered a new exploit kit targeting ESXi, weaponizing CVE-2024-5681',
  0.6177824458355199),
 ('Researchers discovered a new exploit kit targeting ESXi, weaponizing CVE-2025-8484',
  0.6076048163854597),
 ('Atlassian reports a critical outage in us-west-2, affecting services Cloud Functions, Conf',
  0.6014066016557043),
 ('Cisco is investigating a major vulnerability (CVE-2024-5335) allowing remote code executio',
  0.6000597649246994),
 ('Cloudflare confirms a data breach exposing 383K customer records; incident re

# Tests

In [48]:
# Fetch articles via the project's crawler; the cells below read them as dict-like records
new_articles = crawler.crawl_all_sources()

In [50]:
# Wrap each crawled record in an Article; a missing body defaults to ""
article_objs = [
    Article(
        id=raw.get("id"),
        source=raw.get("source"),
        title=raw.get("title"),
        body=raw.get("body", ""),
        published_at=raw.get("published_at"),
    )
    for raw in new_articles
]
# Both names are bound to the same list object
articles_clean = article_objs

In [64]:
# Run the relevance filter; it returns one label and one embedding per article
# (the cells below zip both against articles_clean).
# NOTE(review): `embeddings` rebinds the name imported from `services` in the
# imports cell, and `filter` here is the `services` import, not the builtin.
labels, embeddings = filter.relevant_articles(articles_clean)

In [65]:
# Number of crawled articles labelled relevant by the filter
sum(1 for flag in labels if flag)

31

In [56]:
# Classify every crawled article and keep each one together with its result
processed_articles_new = [
    {"article": article, "classification": classifier.classify_article(article)}
    for article in new_articles
]

In [57]:
# Crawled articles whose classification marks them as relevant
relevant_llm = [
    entry
    for entry in processed_articles_new
    if entry["classification"]["relevant"]
]

In [58]:
# Number of crawled articles the classifier marked relevant
len(relevant_llm)

24

In [59]:
# Total number of crawled articles that were classified
len(processed_articles_new)

110

In [66]:
# Keep only the articles labelled relevant, and the embeddings at the same positions
relevant_articles_new = [
    candidate for candidate, keep in zip(articles_clean, labels) if keep
]
relevant_articles_embedded_new = [
    vector for vector, keep in zip(embeddings, labels) if keep
]

In [73]:
# Score the relevant articles from their embeddings with the project's filter module;
# the result is zipped one-to-one with relevant_articles_new below
importance_score_new = filter.importance_score(relevant_articles_embedded_new)

In [75]:
# Number of importance scores returned
len(importance_score_new)

31

In [80]:
# Attach each score to its Article, converted to a plain Python float
for art, score in zip(relevant_articles_new, importance_score_new):
    art.importance_score = float(score)

In [81]:
# Inspect the relevant articles together with their assigned importance scores
relevant_articles_new

[Article(id='https://arstechnica.com/security/2025/07/critical-citrixbleed-2-vulnerability-has-been-under-active-exploit-for-weeks/', title='Critical CitrixBleed 2 vulnerability has been under active exploit for weeks', body='Exploits allow hackers to bypass 2FA and commandeer vulnerable devices.', published_at='Wed, 09 Jul 2025 11:20:24 +0000', url='', source='arstechnica', created_at='', importance_score=0.48878554937545104),
 Article(id='https://arstechnica.com/security/2025/07/no-honor-among-thieves-ms-hacking-group-starts-turf-war/', title='“No honor among thieves”: M&S hacking group starts turf war', body='A clash between criminal ransomware groups could result in victims being extorted twice.', published_at='Mon, 07 Jul 2025 18:12:10 +0000', url='', source='arstechnica', created_at='', importance_score=0.2844433227600779),
 Article(id='https://arstechnica.com/security/2025/06/mexican-drug-cartel-hacked-fbi-officials-phone-to-track-informant-report-says/', title='Drug cartel hack