# Imports

In [51]:
%load_ext autoreload
%autoreload 2

import sys
import os
import json
import numpy as np

sys.path.append('..')
from backend.services.crawler import crawl_all_sources
from backend.services.classifier import classify_article
from backend.services.embeddings import embed_text
from backend.db.models import Article, Filter
from backend.db.crud import add_filter
from backend.services.filter import filter_article

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Crawler
Crawl the latest articles from the data sources defined in the database.

In [67]:
# Crawl all sources in one call; `articles` is the input of the classification step below.
articles = crawl_all_sources()
print(f"Found {len(articles)} articles to classify.")

Found 110 articles to classify.


# Classifier
Classify the crawled articles as relevant or irrelevant.

In [68]:
# Pair every crawled article with the classifier's verdict so later cells
# can reach both through a single entry.
results = [
    {"article": article, "classification": classify_article(article)}
    for article in articles
]

In [69]:
# Keep only the entries whose classification carries a truthy 'relevant' flag;
# entries where the key is absent are treated as not relevant.
relevant_articles = [
    entry
    for entry in results
    if entry['classification'].get('relevant')
]
print(f"Found {len(relevant_articles)} relevant articles.")

Found 28 relevant articles.


In [70]:
# Example of relevant article
# The number of relevant articles changes with every crawl, so a fixed index
# is not guaranteed to exist. Prefer index 2, fall back to the last available
# entry, and report explicitly when there is nothing to show.
example_index = min(2, len(relevant_articles) - 1)
if example_index >= 0:
    print(relevant_articles[example_index]['article']['title'])
else:
    print("No relevant articles found.")

Provider of covert surveillance app spills passwords for 62,000 users


# Embedding
Embed each article summary, which will be used as a filter, and upload it to the database.

In [71]:
# Turn each classified article into a Filter (url, summary embedding,
# relevance flag) and pass it to add_filter.
skipped = 0
for entry in results:
    classification = entry['classification']

    # 'relevant' is already read defensively with .get(); read 'summary' the
    # same way so that one incomplete classification does not abort the loop
    # with a KeyError after earlier filters have already been uploaded.
    summary = classification.get('summary')
    if not summary:
        skipped += 1
        continue

    # Build object
    # NOTE(review): the article's 'id' is stored in the Filter's `url` field;
    # presumably the id is the article URL. Confirm against the crawler.
    filter_obj = Filter(
        url=entry['article']['id'],
        embedding=embed_text(summary),
        relevant=classification.get('relevant', False),
    )

    # Upload to database
    add_filter(filter_obj)

# Make skipped entries visible instead of dropping them silently.
if skipped:
    print(f"Skipped {skipped} articles without a summary.")


# Test filter

In [21]:
# Load the labelled sample articles used to evaluate the filter.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's locale default, which open() uses when no encoding is given.
with open("../backend/tests/dummy_articles.json", "r", encoding="utf-8") as f:
    dummy_articles = json.load(f)

In [22]:
# Show the first fixture record to see which fields each sample article carries.
dummy_articles[0]

{'title': 'Critical Zero-Day Vulnerability Found in Windows Kernel',
 'body': 'Microsoft has disclosed a zero-day vulnerability (CVE-2025-1234) in the Windows Kernel that allows privilege escalation. A patch is expected to be released in the next security update.',
 'published_at': '2025-07-08T14:20:00Z',
 'url': 'https://www.tomshardware.com/news/windows-kernel-zero-day',
 'created_at': '2025-07-08T14:25:00Z',
 'source': "Tom's Hardware",
 'relevant': True}

In [79]:
# Fields copied from each fixture record into the Article object, in the
# order they are passed as keyword arguments.
article_fields = ('title', 'body', 'url', 'source', 'published_at')

# One boolean per fixture article: did the filter agree with the label?
score = []
for article in dummy_articles:
    # Create Article object
    article_obj = Article(**{field: article[field] for field in article_fields})

    # Filter the article and look up the expected answer
    relevant = filter_article(article_obj)
    relevant_gt = article['relevant']

    print(f"Article: {article['title']}, Relevant: {relevant}, Ground Truth: {relevant_gt}")

    score.append(relevant == relevant_gt)

# The mean of the per-article matches is the fraction of correct decisions.
print(f"Filtering accuracy: {np.mean(score)}")

Article: Critical Zero-Day Vulnerability Found in Windows Kernel, Relevant: True, Ground Truth: True
Article: Reddit User Shares Favorite Mechanical Keyboard of 2025, Relevant: False, Ground Truth: False
Article: Outage Hits AWS US-East-1 Region, Affecting Major Services, Relevant: True, Ground Truth: True
Article: Linux 6.8 Released with New Filesystem Features, Relevant: False, Ground Truth: False
Article: Google Chrome Patches Critical V8 Vulnerability Exploited in the Wild, Relevant: True, Ground Truth: True
Filtering accuracy: 1.0
