In [None]:
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
from typing import List, Literal, Dict
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer
import hashlib
import os
import logging
from functools import lru_cache
from fastapi.testclient import TestClient

# Setup logging first so the working-directory message below is emitted
# through the configured logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Relative paths used later (e.g. "models/svm.joblib") are resolved against the
# project directory. Set the SCORE_HEADLINES_DIR environment variable to point
# the notebook at a different checkout; the original location stays the default.
PROJECT_DIR = os.environ.get(
    "SCORE_HEADLINES_DIR",
    "/Users/ajohnca/Documents/Applied Data Science/Summer 2025/Python for ML Engineering/assignments/score_headlines",
)
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
else:
    # Do not crash on machines where the default path does not exist: the
    # notebook may already have been started from inside the project directory.
    logger.warning(
        "Project directory %r not found; staying in %s",
        PROJECT_DIR,
        os.getcwd(),
    )

# Define models (stub local models for notebook testing)
@lru_cache()
def get_embedding_model() -> SentenceTransformer:
    """Return the shared sentence-embedding model, loading it on first use.

    ``lru_cache`` memoizes this zero-argument call, so repeated calls hand
    back the same ``SentenceTransformer`` instance.
    """
    model_name = "all-MiniLM-L6-v2"  # Load from HF directly in notebook
    return SentenceTransformer(model_name)

@lru_cache()
def get_svm_model():
    """Return the model stored at ``models/svm.joblib``, loading it only once.

    The path is relative to the current working directory. ``joblib.load``
    unpickles the file, so only trusted artifacts should be placed there.
    """
    model_path = "models/svm.joblib"  # Update path if needed
    return joblib.load(model_path)

# Load both models once when this cell runs; the /score_headlines handler
# reads these module-level names on every request.
embedding_model = get_embedding_model()
svm_model = get_svm_model()

# Helper function
def generate_headline_id(text: str) -> str:
    """Derive a stable hexadecimal ID from a headline.

    The text is lower-cased and stripped of surrounding whitespace before
    hashing, so headlines that differ only in case or outer whitespace share
    an ID. A 10-byte BLAKE2b digest is used, giving a 20-character hex string.
    """
    canonical_bytes = text.lower().strip().encode("utf-8")
    digest = hashlib.blake2b(canonical_bytes, digest_size=10)
    return digest.hexdigest()

# Define FastAPI app: the route decorators below (@app.get / @app.post)
# register their handlers on this instance, and the test client wraps it.
app = FastAPI()

@app.get("/status")
def status():
    # Health-check endpoint: always answers with the same fixed payload.
    health = {"status": "OK"}
    return health

# Request body schema for POST /score_headlines.
class HeadlineRequest(BaseModel):
    # Headline texts to score; the handler rejects an empty list with HTTP 400.
    headlines: List[str]
    # When True the response pairs each label with a hash-based headline ID;
    # when False (the default) only the list of labels is returned.
    return_ids: bool = False

@app.post("/score_headlines")
def score_headlines(request: Request, payload: HeadlineRequest) -> Dict[str, List]:
    """Score a batch of headlines with the embedding model and the SVM.

    Args:
        request: Incoming request; used only to log the caller's address.
        payload: Validated body holding the headlines and the ``return_ids`` flag.

    Returns:
        ``{"results": [{"id": ..., "label": ...}, ...]}`` when ``return_ids`` is
        true, otherwise ``{"labels": [...]}``.

    Raises:
        HTTPException: 400 when no headlines are supplied; 500 when embedding
            or prediction fails.
    """
    headlines = payload.headlines

    # ``request.client`` may be None (Starlette exposes it as optional), so it
    # must not be dereferenced blindly just to write a log line.
    client_host = request.client.host if request.client else "unknown"
    logger.info(
        "Received request from %s with %d headline(s).", client_host, len(headlines)
    )

    if not headlines:
        raise HTTPException(status_code=400, detail="No headlines provided.")

    try:
        embeddings = embedding_model.encode(headlines)
        # Convert to plain Python values once so both response shapes carry the
        # same JSON-native label types instead of raw numpy scalars.
        labels = svm_model.predict(embeddings).tolist()
    except Exception as e:
        logger.exception("Prediction error")
        # Chain the original exception so the traceback keeps its cause.
        raise HTTPException(status_code=500, detail=str(e)) from e

    if payload.return_ids:
        results = [
            {"id": generate_headline_id(text), "label": label}
            for text, label in zip(headlines, labels)
        ]
        return {"results": results}
    return {"labels": labels}

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# TestClient is imported in the first cell (which also defines `app`), so it
# is not re-imported here.
client = TestClient(app)

# Test status
response = client.get("/status")
print("Status:", response.status_code, response.json())
# Fail loudly on "Run All" instead of relying on someone reading the printout.
assert response.status_code == 200, response.text
assert response.json() == {"status": "OK"}

# Test scoring endpoint
payload = {
    "headlines": ["Stocks rally after inflation slows", "War tensions increase in Middle East"],
    "return_ids": True
}
response = client.post("/score_headlines", json=payload)
print("Score Headlines:", response.status_code, response.json())
assert response.status_code == 200, response.text
# return_ids=True selects the "results" shape: one entry per submitted headline.
assert len(response.json()["results"]) == len(payload["headlines"])

INFO:httpx:HTTP Request: GET http://testserver/status "HTTP/1.1 200 OK"
INFO:__main__:Received request from testclient with 2 headline(s).


Status: 200 {'status': 'OK'}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://testserver/score_headlines "HTTP/1.1 200 OK"


Score Headlines: 200 {'results': [{'id': 'eac3c48b39894338157b', 'label': 'Neutral'}, {'id': '182d3ee7fdae04e43bd2', 'label': 'Neutral'}]}


In [10]:
# Inspect the embeddings of the headlines sent to the API above. Logged at INFO
# because logging is configured at INFO level, where DEBUG records are dropped
# and this cell would otherwise show nothing.
embeddings = embedding_model.encode(payload["headlines"])
logger.info("Embeddings shape: %s", embeddings.shape)
logger.info("Sample embedding (first headline): %s", embeddings[0][:5])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# Show how the embedding model's tokenizer splits a sample headline.
tokenizer = embedding_model.tokenizer
tokens = tokenizer("Markets are crashing", return_tensors="pt")

# Pull the ID tensor out once and reuse it for both printouts.
input_ids = tokens["input_ids"]
print("Input IDs:", input_ids)
print("Tokens:", tokenizer.convert_ids_to_tokens(input_ids[0]))


Input IDs: tensor([[  101,  6089,  2024, 12894,   102]])
Tokens: ['[CLS]', 'markets', 'are', 'crashing', '[SEP]']


In [15]:
# Fetch the (cached) models again and run a quick embed-then-predict check on
# two hand-written headlines.
embedding_model = get_embedding_model()
svm_model = get_svm_model()

test_headlines = [
    "7 Sublime Cookies for a Joyous Holiday Season. Lemony turmeric, gingery cheesecake, boozy almond: This year’s Cookie Week has something for everyone.",
    "Markets are soaring",
]

embeddings = embedding_model.encode(test_headlines)
preds = svm_model.predict(embeddings)

# Pair each headline with its predicted label before logging.
headline_label_pairs = list(zip(test_headlines, preds))
logger.info(f"Test predictions: {headline_label_pairs}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Test predictions: [('7 Sublime Cookies for a Joyous Holiday Season. Lemony turmeric, gingery cheesecake, boozy almond: This year’s Cookie Week has something for everyone.', 'Optimistic'), ('Markets are soaring', 'Neutral')]
