# Imports

In [74]:
%load_ext autoreload
%autoreload 2

import sys
import os
import math
import json
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score
import numpy as np
import joblib
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
import datetime as dt
from tqdm import tqdm

sys.path.append('..')
from scripts import classifier
from services import embeddings, filter, crawler
from db.models import Article
from api import ingest, retrieve

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data
Load synthetic training data generated using ChatGPT based on real world recent articles.

In [3]:
data_types = ["train", "test"]
articles = {}

for data_type in data_types:
    with open(f"../data/{data_type}.json", "r") as f:
        articles[data_type] = json.load(f)
    
    print(f"There are {len(articles[data_type])} generated articles for {data_type}ing.")

There are 200 generated articles for training.
There are 50 generated articles for testing.


# Classifier

The goal is to leverage LLMs capacity for **zero-shot semantic understanding**, then use it to assign four independent labels to every headline/body pair:

* **Relevance** – tag each article as *relevant* or *irrelevant* for enterprise IT teams.  
* **Severity** – rate the technical danger (*severe* vs. *non-severe*; i.e. "zero-day under active exploit" vs "patch released").  
* **Scope** – flag the breadth of the incident (*wide* vs. *narrow* scope; i.e. a high-tier vendor incident usually hits more organisations).  
* **User impact** – gauge how many users / records are affected (*high* vs. *low* impact; i.e. "millions of user records" vs "internal test").

The resulting label scores become the training targets for the filter and ranking models that will be promoted to production.

In [4]:
# Label data with the LLM
for data_type in data_types:
    processed_articles = []
    for article in tqdm(articles[data_type], desc=f"Labeling {data_type} articles"):
        result = classifier.classify_article(article)
        processed_article = {
            "article": article,
            "classification": result
        }
        processed_articles.append(processed_article)
    
    # Save labelled data
    with open(f'../data/{data_type}_labelled.json', 'w') as f:
        json.dump(processed_articles, f)

Labeling train articles: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [05:00<00:00,  1.50s/it]
Labeling test articles: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:06<00:00,  1.33s/it]


In [6]:
labelled_articles = {}
for data_type in data_types:
    with open(f"../data/{data_type}_labelled.json", "r") as f:
        processed_articles = json.load(f)

    labelled_articles[data_type] = processed_articles
    relevant_articles = [article for article in processed_articles if article['classification'].get('relevant')]
    print(f"Found {len(relevant_articles)} relevant articles in {data_type}ing dataset.")

Found 100 relevant articles in training dataset.
Found 26 relevant articles in testing dataset.


In [7]:
# Example of relevant article
print(relevant_articles[2]['article'])

{'title': 'Okta confirms a breach exposing 73 K session tokens; password resets underway', 'body': 'Okta acknowledged that threat actors exfiltrated 73 000 session tokens via a compromised support account. The company has forced global password resets.', 'published_at': '2025-07-13T22:48:09Z', 'id': 'https://example.com/388c8aa1-5aae-4799-adc0-8191f4397e0e', 'created_at': '2025-07-13T22:50:09Z', 'source': 'Okta', 'relevant': True}


# Training

### Prepare data for training

In [8]:
X = [text['article']['title'] + text['article']['body'] for text in labelled_articles["train"]] # Only keep main text from articles for embedding
y = {}
y["relevant"] = [1 if article['classification'].get('relevant') else 0 for article in labelled_articles["train"] ] # Label for relevant parameter
y["severe"]  = [1 if article['classification'].get('severe') else 0 for article in labelled_articles["train"] ] # Label for severe parameter
y["wide_scope"]  = [1 if article['classification'].get('wide_scope') else 0 for article in labelled_articles["train"] ] # Label for wide_scope parameter
y["high_impact"]  = [1 if article['classification'].get('high_impact') else 0 for article in labelled_articles["train"] ] # Label for high_impact parameter


### Set model up parameters

In [9]:
MODEL_NAME   = "all-MiniLM-L6-v2"
BATCH_SIZE   = 32
SEED = 42

### Embedding

In [10]:
embedder = SentenceTransformer(MODEL_NAME)

def encode_texts(texts):
    return embedder.encode(
        texts,
        batch_size=BATCH_SIZE,
        normalize_embeddings=True,
        show_progress_bar=True,
        convert_to_numpy=True
    )

# Produce embeddings 
X_encoded = encode_texts(X)

Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  4.43it/s]


### Train

In [11]:
params = ["relevant", "severe", "wide_scope", "high_impact"]
param_grid = {'C': [0.25, 0.5, 1.0, 2.0, 4.0]}
X_test_dict = {}
y_test_dict = {}

for param in params:
    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y[param], test_size=0.20, stratify=y[param], random_state=SEED
    )
    X_test_dict[param] = X_test
    y_test_dict[param] = y_test

    # Class weights
    """
    We balance classes since the main class is usually less
    represented in the dataset.
    """
    class_weight = 'balanced'

    # Model
    lr = LogisticRegression(
        solver="lbfgs",
        max_iter=1000,
        random_state=SEED,
        class_weight=class_weight
    )
    grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='f1', n_jobs=1)

    # Fit
    grid_search.fit(X_train, y_train)
    clf = grid_search.best_estimator_

    # Save model weights
    joblib.dump(clf, f"../models/{param}_model.joblib")

### Evaluate


In [13]:
# Process test data
X_test_embedded = encode_texts(labelled_articles["test"])

Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.46it/s]


In [22]:
y_relevant = [a["classification"]["relevant"] for a in labelled_articles["test"]]
relevant_mask = np.asarray(y_relevant, dtype=bool)
articles_arr = np.array(labelled_articles["test"], dtype=object)
relevant_articles = articles_arr[relevant_mask].tolist()
X_relevant = X_test_embedded[relevant_mask]

In [23]:
def evaluate(param):
    if param == "relevant":
        X = X_test_embedded
        y_gt = y_relevant
    else:
        X = X_relevant
        y_gt = [a["classification"][param] for a in relevant_articles]
        
    clf = joblib.load(f"../models/{param}_model.joblib")
    y_proba = clf.predict_proba(X)[:, 1]
    threshold = 0.55 if param == "relevant" else 0.5
    y_pred = (y_proba >= threshold)
    print(classification_report(y_gt, y_pred, digits=3))

In [24]:
for param in params:
    print(f"Evaluating performance for '{param}' parameter...")
    evaluate(param)

Evaluating performance for 'relevant' parameter...
              precision    recall  f1-score   support

       False      0.958     0.958     0.958        24
        True      0.962     0.962     0.962        26

    accuracy                          0.960        50
   macro avg      0.960     0.960     0.960        50
weighted avg      0.960     0.960     0.960        50

Evaluating performance for 'severe' parameter...
              precision    recall  f1-score   support

       False      0.583     0.636     0.609        11
        True      0.714     0.667     0.690        15

    accuracy                          0.654        26
   macro avg      0.649     0.652     0.649        26
weighted avg      0.659     0.654     0.655        26

Evaluating performance for 'wide_scope' parameter...
              precision    recall  f1-score   support

       False      1.000     0.143     0.250         7
        True      0.760     1.000     0.864        19

    accuracy                 

# Ranking
For filtering articles, we will use the "relevant" classifier that simply states if an article is relevant for an IT professional or not.

For ranking articles by importance we will use a mix of the three other classifiers to build an importance score together with the freshness of the articles (published_at).
Then, arbitrary weights were picked for the different parameters, based on a personal evaluation of what should be put forward :
* Severity : 0.5
* Wide scope : 0.3
* High impact : 0.2

The **final importance score** is calculated as a weighted sum of the three dimensions:

```
Importance Score = (Severity × 0.5) + (Wide Scope × 0.3) + (High Impact × 0.2)
```
On top of this, score is further biased by its freshness (how many hours ago was the article published), the hottest the higher the importance. Again, we arbitrarly pick the wieghts to be :
* Parameters : 0.7
* Freshness : 0.3

```
Final Score = (Importance Score × 0.7) + (Freshness × 0.3)
```

### Rank the test dataset

In [95]:
# Compute importance scores
simple_relevant_articles = [a["article"] for a in relevant_articles]
scored_articles = filter.importance_score(simple_relevant_articles, X_relevant)

In [96]:
ranked_articles, final_score = retrieve.rank(scored_articles)

In [97]:
ranked_articles[:3]

[({'title': 'Researchers spotted a zero-day exploit kit abusing CVE-2025-9911 in SSL-VPN gateways',
   'body': 'Researchers spotted an active zero-day exploit kit abusing CVE-2025-9911 in several SSL-VPN gateways. Administrators are urged to apply interim mitigations.',
   'published_at': '2025-07-14T06:17:12Z',
   'id': 'https://example.com/1d1a4e62-fa2b-47cd-8f6a-1f3af741e601',
   'created_at': '2025-07-14T06:19:12Z',
   'source': 'Cisco Talos',
   'relevant': True,
   'severity_score': 0.6780227720644709,
   'wide_scope_score': 0.657861322159996,
   'high_impact_score': 0.5298952333226373},
  0.6555247439191663),
 ({'title': 'AWS is triaging a sev-1 outage in eu-south-1 affecting S3 and SNS',
   'body': 'AWS reports a sev-1 outage in eu-south-1, impacting S3 and SNS APIs. Engineers are rolling back a faulty networking update.',
   'published_at': '2025-07-14T04:02:45Z',
   'id': 'https://example.com/24b7c7ce-0b3c-4dd2-9b9e-5a64afae0f02',
   'created_at': '2025-07-14T04:04:45Z',
   '

### Evaluate

In [98]:
gt_articles = []

for art in relevant_articles:
    new_art = {
        'id' : art['article']['id'],
        'title' : art['article']['title'],
        'body' : art['article']['body'],
        'published_at' : art['article']['published_at'],
        'severity_score' : 1 if art['classification']['severe'] else 0,
        'wide_scope_score' : 1 if art['classification']['wide_scope'] else 0,
        'high_impact_score' : 1 if art['classification']['high_impact'] else 0
    }
    gt_articles.append(new_art)

In [99]:
ranked_gt, final_score_gt = retrieve.rank(gt_articles)

In [102]:
pred_dict = {art['id'] : score for art, score in ranked_articles}
gt_dict = {art['id'] : score for art, score in ranked_gt}

# Align articles by id
k_vals = [5, 10, 15]
thresh = 0.5 # treat gt > 0.5 as "relevant"
ids = sorted(set(pred_dict) & set(gt_dict))
y_pred = np.array([pred_dict[i] for i in common_ids])
y_true = (np.array([gt_dict[i] for i in ids]) > thresh).astype(int)

In [103]:
total_relevant = y_true.sum()

for k in k_vals:
    topk_idx = np.argsort(y_pred)[-k:][::-1]     # indices of k highest scores
    rel_in_topk = y_true[topk_idx].sum()

    prec_k = rel_in_topk / k
    rec_k  = rel_in_topk / total_relevant if total_relevant else 0

    print(f"P@{k}: {prec_k:0.3f}   R@{k}: {rec_k:0.3f}")

P@5: 0.800   R@5: 0.235
P@10: 0.700   R@10: 0.412
P@15: 0.667   R@15: 0.588
