# Imports

In [43]:
%load_ext autoreload
%autoreload 2

import sys
import os

sys.path.append('..')
from backend.services.crawler import crawl_all_sources
from backend.services.classifier import classify_article
from backend.services.embeddings import embed_text
from backend.db.models import Article, Filter
from backend.db.crud import add_filter

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Crawler
Crawl the latest articles from the data sources defined in the database.

In [17]:
# Fetch the articles via the crawler service and report how many came back.
articles = crawl_all_sources()
article_count = len(articles)
print(f"Found {article_count} articles to classify.")

Found 30 articles to classify.


# Classifier
Classify the crawled articles as relevant or irrelevant.

In [33]:
# Pair every crawled article with its classification so later cells can
# reach both through a single record.
results = [
    {"article": item, "classification": classify_article(item)}
    for item in articles
]

In [37]:
# Keep only the records whose classification has a truthy 'relevant' value.
relevant_articles = [
    entry
    for entry in results
    if entry['classification'].get('relevant')
]
print(f"Found {len(relevant_articles)} relevant articles.")

Found 11 relevant articles.


In [40]:
# Example of relevant article: show the title of the third relevant record.
sample = relevant_articles[2]
print(sample['article']['title'])

Critical CitrixBleed 2 vulnerability has been under active exploit for weeks


# Embedding
Embed each article summary to use as a filter, then upload it to the database.

In [None]:
# Build one Filter per classified article and store it in the database.
for entry in results:
    classification = entry['classification']

    # Read the summary defensively, the same way the 'relevant' flag is read:
    # a classification without a summary would otherwise raise KeyError in
    # the middle of the loop, after some filters were already uploaded.
    summary = classification.get('summary')
    if not summary:
        print(f"Skipping article {entry['article']['id']!r}: no summary to embed.")
        continue

    # Build object
    relevant = classification.get('relevant', False)
    embedding = embed_text(summary)

    filter_obj = Filter(
        # NOTE(review): the article 'id' is stored in the 'url' field;
        # presumably the crawler uses the URL as the id — confirm.
        url=entry['article']['id'],
        embedding=embedding,
        relevant=relevant,
    )

    # Upload to database
    add_filter(filter_obj)
