In [2]:
# Load the filtered LLMOps case-study table into a DataFrame and preview
# its first five rows as the cell output.
import pandas as pd

df = pd.read_csv(filepath_or_buffer="filtered_llmops_database.csv")
df.head(n=5)

Unnamed: 0,created_at,title,industry,year,source_url,company,application_tags,tools_tags,extra_tags,techniques_tags,short_summary,full_summary,year_month
0,2024-07-10 14:21:00+00:00,LLM Validation and Testing at Scale: GitLab's ...,Tech,2024.0,https://about.gitlab.com/blog/2024/05/09/devel...,gitlab,"code_generation,high_stakes_application,regula...","monitoring,cicd,devops,continuous_deployment,c...","llm,testing,evaluation,prompt engineering,metr...","prompt_engineering,error_handling,latency_opti...",GitLab developed a robust framework for valida...,# Gitlab: LLM Validation and Testing at Scale:...,2024-07-08/2024-07-14
1,2024-07-10 14:21:00+00:00,Building a Scalable Retriever-Ranker Architect...,Tech,2024.0,https://blog.malt.engineering/super-powering-o...,malt,"structured_output,realtime_application","kubernetes,monitoring,scaling,devops,orchestra...","recommenders,rerankers,vector database,qdrant,...","embeddings,semantic_search,vector_search,model...",Malt's implementation of a retriever-ranker ar...,# Malt: Building a Scalable Retriever-Ranker A...,2024-07-08/2024-07-14
2,2024-07-10 14:38:00+00:00,Building Secure and Private Enterprise LLM Inf...,Tech,2024.0,https://slack.engineering/how-we-built-slack-a...,slack,"regulatory_compliance,legacy_system_integration","security,compliance,guardrails,reliability,sca...","rag,aws,sagemaker,security,privacy,compliance,...","rag,semantic_search,error_handling,latency_opt...",Slack implemented AI features by developing a ...,# Slack: Building Secure and Private Enterpris...,2024-07-08/2024-07-14
3,2024-07-10 14:39:00+00:00,Building and Scaling LLM Applications at Discord,Tech,2024.0,https://discord.com/blog/developing-rapidly-wi...,discord,"chatbot,content_moderation,structured_output,r...","monitoring,scaling,devops,security,compliance,...","prompt engineering,evaluation,deployment,safet...","prompt_engineering,fine_tuning,model_optimizat...",Discord shares their comprehensive approach to...,# Discord: Building and Scaling LLM Applicatio...,2024-07-08/2024-07-14
4,2024-07-31 13:30:00+00:00,Optimizing Text-to-SQL Pipeline Using Agent Ex...,Tech,2024.0,https://idinsight.github.io/tech-blog/blog/aam...,idinsight,"question_answering,data_analysis","fastapi,crewai,databases,scalability,reliability","text to sql,llm,crewai,fastapi,rag,prompt engi...","rag,prompt_engineering,multi_agent_systems,mod...",Ask-a-Metric developed a WhatsApp-based AI dat...,# IDInsight: Optimizing Text-to-SQL Pipeline U...,2024-07-29/2024-08-04


In [None]:
import os, json, math, datetime, yaml, random
from typing import List, Dict, Any

# --- Load taxonomy ---
# Parse the taxonomy file once. The context manager closes the file handle,
# which a bare open() passed straight to safe_load would leave open.
with open("taxonomy.yaml", "r") as taxonomy_file:
    TAX = yaml.safe_load(taxonomy_file)
CATALOG = TAX["categories"]                   # category entries; each one is indexed by its "key" below
CAT_BY_KEY = {c["key"]: c for c in CATALOG}   # category key -> category entry (read by to_markdown)
ORDER = TAX["sections_order"]                 # iterated by group_for_newsletter and to_markdown

# --- Model client wrapper (replace with your provider) ---
# main.py
import yaml, json
from llm_openai import OpenAILLM
# NOTE(review): categorize_articles, to_markdown, SYSTEM_PROMPT and
# build_user_prompt are all defined again further down in this cell, so the
# names imported from your_router_module end up shadowed. Confirm whether the
# module import is still wanted.
from your_router_module import categorize_articles, to_markdown  # from the code I shared

# Re-read the taxonomy with a context manager so the file handle is closed
# (the previous bare open() was never closed).
with open("taxonomy.yaml") as taxonomy_file:
    TAX = yaml.safe_load(taxonomy_file)
CATALOG = TAX["categories"]
CATEGORY_KEYS = [c["key"] for c in CATALOG]  # passed to llm.chat_json by llm_adapter

llm = OpenAILLM(model="gpt-4.1-mini", temperature=0.3, seed=17)

# Build prompts you already prepared:
from your_router_module import SYSTEM_PROMPT, build_user_prompt

def llm_adapter(article):
    """Send one article to the module-level ``llm`` client.

    The user prompt is built with ``build_user_prompt`` and forwarded,
    together with ``SYSTEM_PROMPT`` and ``CATEGORY_KEYS``, to
    ``llm.chat_json``. Whatever that call returns is handed back unchanged.
    """
    prompt_for_article = build_user_prompt(article)
    reply = llm.chat_json(SYSTEM_PROMPT, prompt_for_article, CATEGORY_KEYS)
    return reply

# Drop-in: replace the old LLM class usage with this adapter inside your pipeline.
# Example single article:
# result = llm_adapter(example_article)
# print(result)

# --- Helpers ---
# URL tokens that earn the full authority score; consumed by authority_boost.
# Entries ending in "." ("engineering.", "research.") are name prefixes
# rather than complete domains.
REPUTABLE = [
    "arxiv.org",
    "nature.com",
    "engineering.",
    "research.",
    "openai.com",
    "deepmind.com",
    "nvidia.com",
    "techcrunch.com",
    "bloomberg.com",
    "ft.com",
    "wsj.com",
]

def days_since(dt_iso: str) -> int:
    """Return the whole number of days between an ISO-8601 timestamp and now (UTC).

    Args:
        dt_iso: ISO-8601 timestamp. A "Z" suffix is accepted. A timestamp
            that carries no UTC offset is interpreted as UTC.

    Returns:
        Elapsed days, never negative (future timestamps yield 0). When the
        value cannot be parsed (including non-string input) the fallback
        value 30 is returned.
    """
    try:
        dt = datetime.datetime.fromisoformat(dt_iso.replace("Z", "+00:00"))
    except Exception:
        # Deliberate best-effort: an unusable timestamp is scored as 30 days old.
        return 30
    if dt.tzinfo is None or dt.utcoffset() is None:
        # Subtracting a naive datetime from the aware "now" below would raise
        # TypeError, so pin offset-less timestamps to UTC first.
        dt = dt.replace(tzinfo=datetime.timezone.utc)
    now = datetime.datetime.now(datetime.timezone.utc)
    return max(0, (now - dt).days)

def recency_boost(days:int)->float:
    """Map an article age in days to a freshness score within [0.0, 1.0].

    Ages below one day are counted as one day, after which the score is
    1 / (1 + 0.05 * days), clamped to the unit interval.
    """
    effective_days = max(days, 1)
    raw = 1.0 / (1 + 0.05 * effective_days)
    capped = min(1.0, raw)
    return max(0.0, capped)

def authority_boost(url:str)->float:
    """Score a source URL as 1.0 when its host is on the REPUTABLE list, else 0.6.

    Matching is done on the URL's hostname, not on the whole URL string, so
    that look-alike substrings do not count (for example "microsoft.com"
    contains "ft.com", and a reputable domain may appear in a path or query).

    * A REPUTABLE entry without a trailing dot matches the host itself or any
      of its subdomains.
    * An entry ending in "." (e.g. "engineering.") matches when it begins the
      host or begins one of the host's dot-separated labels.

    Args:
        url: Source URL; may be empty/None or lack a scheme.

    Returns:
        1.0 for a reputable host, otherwise 0.6.
    """
    from urllib.parse import urlsplit  # local import keeps this block self-contained

    raw = str(url).strip().lower() if url else ""
    if not raw:
        return 0.6
    # urlsplit only recognises the host after "//"; tolerate scheme-less URLs.
    if "://" not in raw and not raw.startswith("//"):
        raw = "//" + raw
    try:
        host = urlsplit(raw).hostname or ""
    except ValueError:
        # Malformed authority section (e.g. a broken IPv6 literal).
        host = ""
    for tok in REPUTABLE:
        if tok.endswith("."):
            if host.startswith(tok) or ("." + tok) in host:
                return 1.0
        elif host == tok or host.endswith("." + tok):
            return 1.0
    return 0.6

def novelty_boost(article:Dict[str,Any])->float:
    """Score an article by how many distinct tags it carries.

    Tags are read from the comma-separated fields ``application_tags``,
    ``tools_tags``, ``techniques_tags`` and ``extra_tags``; they are trimmed,
    lower-cased and de-duplicated across fields. Missing values (``None`` or
    a float NaN) contribute no tags rather than being stringified into
    bogus "none"/"nan" tags.

    Returns:
        0.7 + 0.03 per unique tag, capped at 1.0.
    """
    unique_tags = set()
    for field in ("application_tags", "tools_tags", "techniques_tags", "extra_tags"):
        raw = article.get(field, "")
        if raw is None or (isinstance(raw, float) and math.isnan(raw)):
            continue  # missing cell: nothing to count
        for piece in str(raw).split(","):
            tag = piece.strip().lower()
            if tag:
                unique_tags.add(tag)
    return min(1.0, 0.7 + 0.03*len(unique_tags))

def build_user_prompt(article:Dict[str,Any]) -> str:
    """Assemble the user prompt that asks the model to route one article.

    The prompt contains the task list, the scoring formula, the JSON schema
    the reply must follow, the article serialised as JSON and the category
    catalog (module-level ``CATALOG``) serialised as JSON.

    The reputable-domain list in the instructions names the same domains as
    the module-level ``REPUTABLE`` list (including deepmind.com and wsj.com),
    so the model is told the same rule the client-side scoring applies.

    Args:
        article: Article fields. Values json cannot encode natively (for
            example datetime objects) are serialised with ``str``.

    Returns:
        The complete prompt text.
    """
    return (
        "You will receive:\n"
        "1) Article fields (title, summaries, tags, url, company, created_at).\n"
        "2) Category catalog (key, definition, example_signals).\n\n"
        "Tasks:\n"
        "A) Score every category from 0.0 to 1.0 (two decimals) based on evidence in the article.\n"
        "B) Pick the best category. Provide up to 2 secondaries if close.\n"
        "C) Give a short rationale citing which exact phrases/fields influenced the decision.\n"
        "D) Draft a crisp 1-sentence blurb for the newsletter (<=28 words).\n"
        "E) Compute a newsletter_score in [0,1] using: "
        "0.45*category_score(best) + 0.25*recency_boost + 0.15*authority_boost + 0.15*novelty_boost.\n"
        "- recency_boost: 1/(1+0.05*days_since_created), clipped [0,1].\n"
        "- authority_boost: 1.0 for reputable domains (arxiv.org, nature.com, engineering.*.com, research.*.com, openai.com, deepmind.com, nvidia.com, techcrunch.com, bloomberg.com, ft.com, wsj.com); else 0.6.\n"
        "- novelty_boost: 0.7 + 0.03*(unique_tags_count), clipped to 1.0.\n\n"
        "JSON schema (return exactly this):\n"
        "{\n"
        '  "primary_category": "string",\n'
        '  "secondary_categories": ["string", ...],\n'
        '  "scores": {"category_key": 0.00, "...": 0.00},\n'
        '  "rationale": "string",\n'
        '  "blurb": "string",\n'
        '  "newsletter_score": 0.00\n'
        "}\n\n"
        # default=str only kicks in for values json cannot encode on its own.
        "Article:\n" + json.dumps(article, ensure_ascii=False, default=str) + "\n\n"
        "Categories:\n" + json.dumps(CATALOG, ensure_ascii=False)
    )

# System message sent with every routing request. The trailing stray token
# that used to follow the last literal was a syntax error and has been removed.
SYSTEM_PROMPT = (
    "You are an editor routing tech news into newsletter sections. "
    "Be decisive, concise, and consistent with the provided category definitions. "
    "When unsure, still score all categories in [0.0, 1.0]. Favor the most specific category. "
    "Return STRICT JSON that matches the JSON schema and nothing else."
)

def llm_route_article(llm: "LLM", article: Dict[str,Any]) -> Dict[str,Any]:
    """Ask the model to categorise one article and validate the reply's shape.

    A shallow copy of ``article`` is extended with a ``_precomputed`` entry
    (days since creation, authority boost, novelty boost) before the prompt
    is built, so the caller's dict is not modified.

    Args:
        llm: Client exposing ``chat_json(system_prompt, user_prompt)``. The
            annotation is a string because no ``LLM`` class is defined in
            this notebook; an unquoted name would fail when the function is
            defined.
        article: Article fields; ``created_at`` and ``source_url`` are read.

    Returns:
        The model's reply: a dict with at least ``primary_category`` and a
        dict-valued ``scores``.

    Raises:
        ValueError: If the reply is not a dict, lacks either required key,
            or ``scores`` is not a dict.
    """
    # NOTE(review): llm_adapter passes CATEGORY_KEYS as a third argument to
    # chat_json while this call passes two - confirm the client's signature.
    article = dict(article)  # shallow copy
    article["_precomputed"] = {
        "days_since": days_since(article.get("created_at","")),
        "authority_boost": authority_boost(article.get("source_url","")),
        "novelty_boost": novelty_boost(article)
    }
    result = llm.chat_json(SYSTEM_PROMPT, build_user_prompt(article))
    # Basic sanity checks. The isinstance guard matters: on a raw JSON string
    # the `in` operator would do a substring search and wrongly pass.
    if (
        not isinstance(result, dict)
        or "primary_category" not in result
        or not isinstance(result.get("scores"), dict)
    ):
        raise ValueError("Model returned invalid JSON.")
    return result

def self_consistent_route(llm: "LLM", article: Dict[str,Any], votes:int=3) -> Dict[str,Any]:
    """Route an article ``votes`` times and merge the runs into one verdict.

    Merge rules:
      * primary category: most votes; ties broken by the higher mean score,
        then by whichever category was voted for first;
      * scores: per-category mean over all runs, rounded to two decimals;
      * secondaries: the two best mean scores other than the primary;
      * blurb: the most frequent blurb; ties go to the earliest run, which
        keeps the result reproducible across interpreter runs;
      * rationale: taken from the first run that voted for the winning
        primary (so it justifies the category actually chosen), cut to 200
        characters;
      * newsletter_score: recomputed locally from the merged primary score
        and the recency/authority/novelty boosts, clamped to [0, 1] and
        rounded to three decimals.

    Args:
        llm: Client forwarded to ``llm_route_article``. The annotation is a
            string because no ``LLM`` class is defined in this notebook.
        article: Article fields; ``created_at`` and ``source_url`` are read.
        votes: Number of routing runs; must be at least 1.

    Returns:
        Dict with ``primary_category``, ``secondary_categories``, ``scores``,
        ``rationale``, ``blurb`` and ``newsletter_score``.

    Raises:
        ValueError: If ``votes`` is smaller than 1, or propagated from
            ``llm_route_article`` on an invalid model reply.
    """
    if votes < 1:
        raise ValueError("votes must be >= 1")

    tallies = {}               # primary category -> number of votes
    scores_agg = {}            # category key -> scores collected across runs
    rationale_by_primary = {}  # primary category -> rationale of its first vote
    blurb_counts = {}          # blurb -> occurrences (insertion-ordered)

    for _ in range(votes):
        out = llm_route_article(llm, article)
        pc = out["primary_category"]
        tallies[pc] = tallies.get(pc,0)+1
        # aggregate per-category scores
        for k,v in out["scores"].items():
            scores_agg.setdefault(k, []).append(float(v))
        rationale_by_primary.setdefault(pc, str(out.get("rationale") or ""))
        blurb_text = out.get("blurb") or ""
        blurb_counts[blurb_text] = blurb_counts.get(blurb_text, 0) + 1

    def _mean_score(key):
        """Unrounded mean score of a category; 0.0 when it was never scored."""
        vals = scores_agg.get(key, [])
        return sum(vals)/len(vals) if vals else 0.0

    # pick primary by majority then by highest mean score
    primary = max(tallies, key=lambda key: (tallies[key], _mean_score(key)))
    # compute averaged per-category scores
    mean_scores = {k: round(sum(v)/len(v), 2) for k,v in scores_agg.items()}
    # secondaries: top two aside from primary
    ranked_others = sorted(
        ((k,v) for k,v in mean_scores.items() if k!=primary),
        key=lambda kv: kv[1],
        reverse=True,
    )
    secondaries = [k for k,_ in ranked_others[:2]]
    # max() over an insertion-ordered dict returns the earliest key on ties
    blurb = max(blurb_counts, key=blurb_counts.get)
    rationale = rationale_by_primary[primary][:200]
    # recompute newsletter_score deterministically client-side as a guard
    days = days_since(article.get("created_at",""))
    nscore = (
        0.45*mean_scores.get(primary,0.0) +
        0.25*recency_boost(days) +
        0.15*authority_boost(article.get("source_url","")) +
        0.15*novelty_boost(article)
    )
    return {
        "primary_category": primary,
        "secondary_categories": secondaries,
        "scores": mean_scores,
        "rationale": rationale,
        "blurb": blurb,
        "newsletter_score": round(min(1.0, max(0.0, nscore)), 3)
    }

# --- End-to-end ---
def categorize_articles(articles: List[Dict[str,Any]], llm: "LLM", votes=3) -> List[Dict[str,Any]]:
    """Label every article via ``self_consistent_route``.

    Args:
        articles: Article dicts to label; they are not modified.
        llm: Client forwarded to ``self_consistent_route``. The annotation is
            a string because no ``LLM`` class is defined in this notebook; an
            unquoted name would fail when the function is defined.
        votes: Number of routing runs per article.

    Returns:
        One new dict per article, in input order, holding the article's
        fields overlaid with the routing result (result keys win on clashes).
    """
    labeled = []
    for article in articles:
        verdict = self_consistent_route(llm, article, votes=votes)
        labeled.append({**article, **verdict})
    return labeled

def group_for_newsletter(labeled: List[Dict[str,Any]]) -> Dict[str, List[Dict[str,Any]]]:
    """Bucket labeled articles by section and rank each bucket.

    One bucket is created up front for every key in ``ORDER``. An item whose
    ``primary_category`` is not one of those keys is filed under
    "opinion_analysis" (created on demand). Each bucket is then ordered by
    ``newsletter_score`` and, for equal scores, by ``created_at``, both
    descending.
    """
    def _rank(entry):
        # Sort key: score first, creation timestamp as the tie-breaker.
        return (entry["newsletter_score"], entry.get("created_at",""))

    sections = {section_key: [] for section_key in ORDER}
    for entry in labeled:
        category = entry["primary_category"]
        # unseen category fallback
        target = category if category in sections else "opinion_analysis"
        sections.setdefault(target, []).append(entry)

    # sort within sections
    for section_items in sections.values():
        section_items.sort(key=_rank, reverse=True)
    return sections

def to_markdown(buckets: Dict[str,List[Dict[str,Any]]]) -> str:
    """Render the grouped articles as a Markdown newsletter body.

    Sections are emitted in ``ORDER``; sections with no items are skipped.
    Each section is a "## <name>" heading (name looked up in ``CAT_BY_KEY``),
    one bullet per article and a blank line. Buckets whose key is not in
    ``ORDER`` are not rendered.
    """
    def _bullet(entry):
        # Only the first 10 characters of created_at are shown.
        date = entry.get("created_at","")[:10]
        title = entry.get("title","").strip()
        url = entry.get("source_url","")
        blurb = entry.get("blurb","")
        return f"- [{title}]({url}) • {date} — {blurb}"

    rendered = []
    for section_key in ORDER:
        section_items = buckets.get(section_key, [])
        if section_items:
            rendered.append(f"## {CAT_BY_KEY[section_key]['name']}")
            rendered.extend(_bullet(entry) for entry in section_items)
            rendered.append("")
    return "\n".join(rendered)