In [2]:
import importlib
import json
import re
import os
import sys
import pprint
import torch
import pandas as pd
import pickle
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
from transformers import AutoTokenizer
sys.path.append(os.path.abspath('..'))
from src.utils import build_metadata_blurb, search_articles
from src.mem import check_memory

# Choose the compute backend in priority order: CUDA, then MPS, then plain CPU.
# The MPS check is only evaluated when CUDA is unavailable.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

# torch.set_float32_matmul_precision('high')  # optional: allow TF32 / bfloat16-based float32 matmuls

In [3]:
# load data
# Explicit UTF-8 so the JSON parses the same way regardless of the platform's default encoding.
with open('../data/articles.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# instantiate model + tokenizer
# ModernBERT has a longer context window, so whole articles can be embedded without
# chunking (the longest article measured ~6.7k tokens).
MODEL_NAME = "joe32140/ModernBERT-base-msmarco"
# Pass the backend picked earlier in the notebook (`device`) so that selection actually
# takes effect instead of relying on the library's own default.
model = SentenceTransformer(MODEL_NAME, device=device)
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

# config
max_length = model.max_seq_length  # 8192 toks

In [None]:
# # check if truncation will be a problem (we're good!)
# import statistics as s 
# tok_len = [len(tok.encode(rec['bodyText'])) for rec in data]
# print(f"min: {min(tok_len)}\nmax: {max(tok_len)}\nmean: {s.mean(tok_len)}\nmedian: {s.median(tok_len)}")

min: 38
max: 6676
mean: 1343.1263636363637
median: 1165.5


# Data processing 
Here, I pre-process the input data, creating separate embeddings for key article metadata and for the body text. I then take a weighted average of the two tensors, slightly upweighting the key, "high-signal" information (e.g. hed, tags, summary), to project each article into representation space.

I keep these separate initially to **better preserve structure/signal from metadata.**

In [4]:
# Metadata: build the blurbs with build_metadata_blurb, then embed them.
meta = build_metadata_blurb(data)
meta_embeddings = model.encode(meta, convert_to_tensor=True)

# Body text: embedded separately from the metadata so the two can be weighted later.
body_text = [article['bodyText'] for article in data]
body_text_embeddings = model.encode(body_text, convert_to_tensor=True)



In [5]:
# Blend the metadata and body-text embeddings into one vector per article.
# A weighted average is lossier than concatenation, but it keeps the dimensionality
# unchanged and lets `alpha` control how much the metadata dominates.
alpha = 0.7  # weight on the metadata embedding; chosen by hand, not tuned
combined_embeddings = (
    meta_embeddings * alpha
    + body_text_embeddings * (1 - alpha)
)  # dim: 1100 x 768


In [6]:
# NOTE: this rebinds `meta` (previously the output of build_metadata_blurb) to a
# DataFrame of the raw article records; later cells pass this DataFrame to search_articles.
meta = pd.DataFrame(data)

# Persist both artifacts to disk. That is fine for a dataset this small; in production,
# or once the complexity tradeoff makes sense (10,000+ docs), a faiss-based index is an
# option: https://github.com/facebookresearch/faiss
torch.save(obj=combined_embeddings, f='../data/article_embeddings.pt')
meta.to_parquet('../data/article_metadata.parquet')

# Initial test

Basic test of search to check embedding quality and such. The search has filters to handle some basic failure modes (e.g. few relevant results).

To improve this, you could normalize scores globally to improve differentiation across documents. In production, some ways to handle this include: 
<ol>
<li><b>multi-stage retrieval:</b> fast retrieval then re-ranking (cross-encoders, folding in info about popularity/user prefs)</li>
<li><b>hybrid approach:</b> weighted combo of keyword + semantic search scores</li>
</ol>

You could also implement something like [query expansion](https://research.google/pubs/learning-for-efficient-supervised-query-expansion-via-two-stage-feature-selection/), adding related tokens/phrases to cast a wider net. (I'd have done this w/ qwen or something, but given the dataset size I thought it'd be overkill and/or tough to dial in meaningfully.)

Less important, but maybe add a check for out-of-distribution (OOD) user queries (e.g. "what's 2+2") – whether that's needed depends on the final intended use-case.

In [7]:
# Pick up any edits made to src/utils.py without restarting the kernel:
# reload the module and rebind search_articles to the freshly loaded definition.
import src
search_articles = importlib.reload(src.utils).search_articles

In [8]:
# On-topic query: should surface climate-related articles.
# The positional 5 and 0.3 are presumably the maximum number of hits and the minimum
# relevance score -- confirm against src.utils.search_articles.
clim = search_articles(
    model,
    "climate change",
    combined_embeddings,
    meta,
    5,
    0.3,
    sort_key='relevance_score',
)

clim

Unnamed: 0,relevance_score,headline,summary,timesTags,firstPublished,url,tone,typeOfMaterials,bylines,bodyText
1060,0.439156,"Environmental Changes Are Fueling Human, Anima...","Biodiversity loss, global warming, pollution a...","[your-feed-science, your-feed-health, your-fee...",2024-05-08T15:00:24.000Z,https://www.nytimes.com/2024/05/08/health/envi...,NEWS,[News],By Emily Anthes,"Several large-scale, human-driven changes to t..."
1045,0.374678,"Alarmed by Climate Change, Astronomers Train T...",A growing number of researchers in the field a...,"[Global Warming, Space and Astronomy, Earth, A...",2024-05-14T07:00:24.000Z,https://www.nytimes.com/2024/05/14/science/ast...,NEWS,[News],By Katrina Miller and Delger Erdenesanaa,"On the morning of Jan. 18, 2003, Penny Sackett..."


In [118]:
# Off-topic query: exercises the path where nothing passes the relevance filter.
# Same positional settings (5, 0.3) as the on-topic query; their exact meaning is
# defined by src.utils.search_articles -- confirm there.
beans = search_articles(
    model,
    "I like to eat beans",
    combined_embeddings,
    meta,
    5,
    0.3,
    sort_key='relevance_score',
)

beans

'no relevant results!'