In [None]:
# [1] Load Demo Dataset (Missouri Landlord–Tenant Law) from GitHub (JSONL)
# Note: This is a small demo dataset with only 5 clauses to keep the hands-on lightweight.
# In a production system, this would scale to hundreds+ clauses across relevant statutes.

import json
import requests

url = "https://raw.githubusercontent.com/ailingnan/cs5588-week2-RAG/main/data/mo_landlord_tenant_clauses.jsonl"

# A timeout keeps the cell from hanging forever on a stalled connection.
resp = requests.get(url, timeout=30)
print("status:", resp.status_code)
print("first 80 chars:", resp.text[:80])
# Fail here on 4xx/5xx instead of trying to JSON-parse an error page below.
resp.raise_for_status()

# Parse JSONL: one JSON object per non-blank line.
clauses = []
for line_no, line in enumerate(resp.text.splitlines(), start=1):
    line = line.strip()
    if not line:
        continue
    try:
        clauses.append(json.loads(line))
    except json.JSONDecodeError as exc:
        # Re-raise the same exception type, but say which line was malformed.
        raise json.JSONDecodeError(
            f"line {line_no} of downloaded JSONL: {exc.msg}", exc.doc, exc.pos
        ) from exc

print("num clauses:", len(clauses))
if not clauses:
    # Every later cell indexes into `clauses`; stop with a clear message now.
    raise ValueError(f"No clauses were parsed from {url}")
print("first clause:", clauses[0])


In [None]:
# [2] Keyword Retrieval Baseline (BM25) — Retrieve Top-K Clauses
# Goal: show how keyword-based search can over-weight overlapping terms (e.g., "terminate", "notice").

!pip -q install rank-bm25

from rank_bm25 import BM25Okapi
import re

def tokenize(text: str):
    """Lower-case `text` and return its runs of ASCII letters/digits as a list of tokens."""
    lowered = text.lower()
    return re.findall(r"[a-z0-9]+", lowered)

# Tokenize every clause text and build the BM25 index from the token lists.
corpus = [clause["text"] for clause in clauses]
tokenized_corpus = list(map(tokenize, corpus))

bm25 = BM25Okapi(tokenized_corpus)

def bm25_search(query: str, k: int = 5):
    """Score every clause against `query` with BM25 and return the best `k`.

    Returns a list of dicts ordered best-first. Each dict carries the clause's
    id, section, title, text and source plus its raw BM25 `score` as a float.
    """
    scores = bm25.get_scores(tokenize(query))
    # Document positions sorted by descending score; sorted() keeps ties in
    # their original document order.
    order = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
    return [
        {
            "id": clauses[pos]["id"],
            "section": clauses[pos]["section"],
            "title": clauses[pos]["title"],
            "score": float(scores[pos]),
            "text": clauses[pos]["text"],
            "source": clauses[pos]["source"],
        }
        for pos in order[:k]
    ]

query = "How much notice must a landlord give to terminate a tenancy in Missouri?"
results = bm25_search(query, k=5)

# One summary line per hit, then the clause text followed by a blank line.
for r in results:
    print(f'{r["id"]} {r["section"]} | {r["title"]} | score= {round(r["score"], 3)}')
    print("  ", r["text"], end="\n\n")


In [None]:
# [3] Vector Retrieval (Sentence Embeddings + FAISS)
# Goal: retrieve semantically similar clauses even when wording differs from the statute text.

!pip -q install sentence-transformers faiss-cpu
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same model encodes the clauses here and the
# queries inside the search functions.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every clause text into a float32 matrix, one row per clause.
texts = [clause["text"] for clause in clauses]
emb = np.array(
    embed_model.encode(texts, normalize_embeddings=True),  # (N, d)
    dtype="float32",
)

# Flat inner-product FAISS index over the (normalized) clause embeddings.
d = emb.shape[1]
index = faiss.IndexFlatIP(d)
index.add(emb)

def vector_search(query: str, k: int = 5):
    """Return up to `k` clauses ranked by embedding similarity to `query`.

    The query is encoded with `embed_model` (normalized) and searched against
    the inner-product FAISS `index`.

    Returns a list of dicts ordered best-first, each with: rank (1-based),
    id, section, title, score (float inner product), text, source.
    Returns an empty list when `k` <= 0 or the index is empty.
    """
    # FAISS pads its output with label -1 when fewer than k vectors are
    # available; clauses[-1] would then silently return the LAST clause as a
    # bogus hit. Clamp k to the index size so no padding is requested.
    k = min(int(k), index.ntotal)
    if k <= 0:
        return []

    q_emb = embed_model.encode([query], normalize_embeddings=True)
    q_emb = np.array(q_emb, dtype="float32")
    scores, idx = index.search(q_emb, k)  # scores shape (1,k), idx shape (1,k)

    results = []
    for score, i in zip(scores[0], idx[0]):
        i = int(i)
        if i < 0:
            # Defensive: skip any padding entry rather than wrap around.
            continue
        c = clauses[i]
        results.append({
            "rank": len(results) + 1,
            "id": c["id"],
            "section": c["section"],
            "title": c["title"],
            "score": float(score),
            "text": c["text"],
            "source": c["source"],
        })
    return results

query = "How much notice must a landlord give to terminate a tenancy in Missouri?"
v_results = vector_search(query, k=5)

# Ranked summary line, then the clause text followed by a blank line.
for r in v_results:
    print("#{}: {} {} | {} | score={:.3f}".format(
        r["rank"], r["id"], r["section"], r["title"], r["score"]
    ))
    print(" ", r["text"], end="\n\n")


In [None]:
# [4] Hybrid Retrieval (BM25 + Vector) with Min-Max Normalization and Alpha Fusion
# Score = alpha * normalized_bm25 + (1 - alpha) * normalized_vector
# Goal: balance exact keyword matching and semantic similarity.

import numpy as np
import re
from rank_bm25 import BM25Okapi

def tokenize(text: str):
    """Lower-case `text` and return its runs of ASCII letters/digits as a list of tokens.

    Same definition as the tokenizer in the BM25 baseline cell; repeating it
    here rebinds the name to identical behavior.
    """
    lowered = text.lower()
    return re.findall(r"[a-z0-9]+", lowered)

# --- rebuild the BM25 index over the clause texts (always runs; there is no "already built" check) ---
tokenized_corpus = list(map(tokenize, (c["text"] for c in clauses)))
bm25 = BM25Okapi(tokenized_corpus)

def minmax_norm(x: np.ndarray):
    """Linearly rescale `x` to the [0, 1] range as a float32 array.

    Args:
        x: numeric array (any array-like is accepted and converted).

    Returns:
        A new float32 array of the same shape. If all elements are equal
        (range below 1e-12) the result is all zeros; an empty input yields an
        empty array instead of raising.
    """
    x = np.asarray(x, dtype="float32")
    if x.size == 0:
        # min()/max() raise ValueError on zero-size arrays.
        return np.zeros_like(x)
    xmin, xmax = float(x.min()), float(x.max())
    span = xmax - xmin
    if abs(span) < 1e-12:
        return np.zeros_like(x)  # all same -> 0
    return (x - xmin) / span

def hybrid_search(query: str, k: int = 5, alpha: float = 0.5):
    """Rank clauses by a weighted blend of BM25 and embedding similarity.

    Both score vectors are min-max normalized over all clauses and fused as
    ``alpha * bm25 + (1 - alpha) * vector``.

    Args:
        query: free-text question.
        k: number of results to return.
        alpha: weight of the BM25 component; (1 - alpha) goes to the vector score.

    Returns:
        A list of dicts ordered best-first, each with: rank (1-based), id,
        section, title, hybrid_score, bm25_score (raw), vec_score (raw),
        text, source.
    """
    n_docs = len(clauses)

    # BM25 scores
    bm25_scores = np.array(bm25.get_scores(tokenize(query)), dtype="float32")
    bm25_n = minmax_norm(bm25_scores)

    # Vector scores (cosine via normalized inner product)
    q_emb = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    vec_scores, vec_idx = index.search(q_emb, n_docs)  # get all for fusion
    vec_scores = vec_scores[0]  # shape (N,)
    vec_idx = vec_idx[0]        # indices of docs in that order

    # Scatter the vector scores back to doc-order [0..N-1].
    # FAISS pads with label -1 when the index holds fewer than n_docs vectors;
    # writing those to position -1 would overwrite the last document's score
    # and distort the min-max range, so padding entries are dropped here.
    vec_scores_doc = np.zeros(n_docs, dtype="float32")
    found = vec_idx >= 0
    vec_scores_doc[vec_idx[found]] = vec_scores[found]

    vec_n = minmax_norm(vec_scores_doc)

    # Hybrid fusion
    hybrid = alpha * bm25_n + (1 - alpha) * vec_n
    ranked = np.argsort(-hybrid)[:k]

    results = []
    for rank, i in enumerate(ranked, start=1):
        c = clauses[int(i)]
        results.append({
            "rank": rank,
            "id": c["id"],
            "section": c["section"],
            "title": c["title"],
            "hybrid_score": float(hybrid[i]),
            "bm25_score": float(bm25_scores[i]),
            "vec_score": float(vec_scores_doc[i]),
            "text": c["text"],
            "source": c["source"],
        })
    return results

# ---- demo: run the same query through BM25, vector and hybrid retrieval ----
query = "How much notice must a landlord give to terminate a tenancy in Missouri?"
alpha = 0.5

print("=== BM25 top-5 ===")
bm25_res = bm25_search(query, k=5)
for r in bm25_res:
    print(f'{r["id"]} {r["section"]} | {r["title"]} | bm25= {round(r["score"], 3)}')

print("\n=== Vector top-5 ===")
v_res = vector_search(query, k=5)
for r in v_res:
    print(f'{r["id"]} {r["section"]} | {r["title"]} | vec= {round(r["score"], 3)}')

print(f"\n=== Hybrid top-5 (alpha={alpha}) ===")
h_res = hybrid_search(query, k=5, alpha=alpha)
for r in h_res:
    # Show the fused score next to the two raw component scores.
    print(
        f'{r["id"]} {r["section"]} | {r["title"]} '
        f'| hybrid= {round(r["hybrid_score"], 3)} '
        f'| bm25= {round(r["bm25_score"], 3)} '
        f'| vec= {round(r["vec_score"], 3)}'
    )


In [None]:
# [5] Governance Layer: Cross-Encoder Re-ranking (Top-K Candidates)
# Goal: reduce retrieval noise and misalignment by re-ranking with a stronger relevance model.

!pip -q install sentence-transformers
from sentence_transformers import CrossEncoder

# Cross-encoder loaded from the "cross-encoder/ms-marco-MiniLM-L-6-v2" checkpoint;
# rerank() below calls reranker.predict() on (query, clause text) pairs.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query: str, candidates, top_k: int = 5):
    """
    Re-score candidates with the cross-encoder and return the best `top_k`.

    candidates: iterable of dicts, each has keys: id, text, section, title, source, etc.
                Only "text" is read here; all other keys are carried through.
    returns: reranked list (best first) of shallow copies of the candidates,
             each with an added float 'rerank_score'. Empty input -> [].
    """
    # Materialize once: a generator would otherwise be exhausted while
    # building `pairs`, leaving nothing for the zip() below.
    candidates = list(candidates)
    if not candidates:
        # Nothing to score; skip the model call entirely.
        return []

    pairs = [(query, c["text"]) for c in candidates]
    scores = reranker.predict(pairs)
    # Copy each candidate so the caller's dicts are not mutated.
    scored = [{**c, "rerank_score": float(s)} for c, s in zip(candidates, scores)]
    scored.sort(key=lambda x: x["rerank_score"], reverse=True)
    return scored[:top_k]

query = "How much notice must a landlord give to terminate a tenancy in Missouri?"
cands = hybrid_search(query, k=5, alpha=0.5)

# Copy only these fields from each hybrid hit (the hit's "rank" is left out).
cands_for_rerank = [
    {
        field: r[field]
        for field in (
            "id", "section", "title",
            "text", "source",
            "hybrid_score", "bm25_score", "vec_score",
        )
    }
    for r in cands
]

reranked = rerank(query, cands_for_rerank, top_k=5)

print("=== Reranked top-5 ===")
for i, r in enumerate(reranked, 1):
    print("#{}: {} {} | {} | rerank={:.3f} | hybrid={:.3f}".format(
        i, r["id"], r["section"], r["title"], r["rerank_score"], r["hybrid_score"]
    ))


In [None]:
# [6] Baseline vs RAG-Grounded Output (with Evidence Citations)
# Baseline: no retrieval (no citations) — demonstrates hallucination / ungrounded risk.
# RAG: retrieval + re-ranking + grounded synthesis with citations.

def format_citations(passages):
    """Render each passage's id as a bracketed tag, space-separated.

    Example result: "[mo-441.020] [mo-441.030]". An empty input gives "".
    """
    tags = ("[{}]".format(passage["id"]) for passage in passages)
    return " ".join(tags)

def rag_answer(query: str, alpha=0.5, k_retrieve=5, k_rerank=3):
    """Retrieve, rerank, and assemble a quoted answer with citations.

    Returns a tuple (top, answer): `top` is the reranked passage list and
    `answer` is a text block that lists each passage's section, title and
    text verbatim, followed by a "Citations:" line.
    """
    kept_fields = (
        "id", "section", "title",
        "text", "source",
        "hybrid_score", "bm25_score", "vec_score",
    )

    # 1) hybrid retrieval, copying only the fields listed above
    retrieved = hybrid_search(query, k=k_retrieve, alpha=alpha)
    cands = [{name: hit[name] for name in kept_fields} for hit in retrieved]

    # 2) cross-encoder rerank (governance step)
    top = rerank(query, cands, top_k=k_rerank)

    # 3) synthesis: the answer only quotes the retrieved clause text
    header = "Based on Missouri landlord–tenant statutes, the most relevant clauses suggest:\n"
    bullets = "\n".join(f"- {p['section']} ({p['title']}): {p['text']}" for p in top)
    answer = header + bullets + f"\n\nCitations: {format_citations(top)}"
    return top, answer

def baseline_answer(query: str):
    """Return the fixed, uncited demo answer; `query` is not used."""
    # Baseline: deliberately not backed by any retrieved evidence (demo only)
    claim = "Baseline (no retrieval): A landlord typically must give around 30 days notice, "
    caveat = "but requirements can vary depending on circumstances."
    disclaimer = "\n\n(No citations — not evidence-grounded)"
    return claim + caveat + disclaimer

query = "How much notice must a landlord give to terminate a tenancy in Missouri?"

# Ungrounded baseline first, then the retrieval-backed answer for the same query.
print("=== Baseline ===", baseline_answer(query), sep="\n")
print("\n=== RAG grounded ===")
top, ans = rag_answer(query, alpha=0.5, k_retrieve=5, k_rerank=3)
print(ans)


In [None]:
# [7] Abstention Policy ("Not enough evidence") for Risky / Uncovered Queries
# Goal: when evidence is insufficient or irrelevant, refuse to answer and recommend official sources.

def should_abstain(reranked_passages, threshold=0.2):
    """Decide whether to refuse to answer.

    True when there are no passages, or when the first passage's
    'rerank_score' is strictly below `threshold`; otherwise False.
    """
    # The first element is treated as the best passage (rerank() sorts best-first).
    return (not reranked_passages) or reranked_passages[0]["rerank_score"] < threshold

def rag_answer_with_abstain(query: str, alpha=0.5, k_retrieve=5, k_rerank=3, abstain_threshold=0.2):
    """Retrieve, rerank, then either answer with quoted clauses or abstain.

    Returns a tuple (top, text). When should_abstain() says the evidence is
    too weak, `text` is a refusal that still lists the candidate citations
    (or "(none)"); otherwise it quotes each reranked clause with citations.
    """
    kept_fields = (
        "id", "section", "title",
        "text", "source",
        "hybrid_score", "bm25_score", "vec_score",
    )
    cands = [
        {name: hit[name] for name in kept_fields}
        for hit in hybrid_search(query, k=k_retrieve, alpha=alpha)
    ]
    top = rerank(query, cands, top_k=k_rerank)

    if not should_abstain(top, threshold=abstain_threshold):
        bullets = "\n".join(f"- {p['section']} ({p['title']}): {p['text']}" for p in top)
        answer = (
            "Evidence-grounded answer (Missouri statutes):\n"
            + bullets
            + f"\n\nCitations: {format_citations(top)}"
        )
        return top, answer

    # Abstain: refuse, but still show which passages were considered.
    cited = format_citations(top) if top else '(none)'
    refusal = (
        "Not enough evidence in the provided Missouri landlord–tenant statute dataset to answer this safely. "
        "Please consult an official Missouri legal source or a qualified legal professional.\n\n"
        f"Citations: {cited}"
    )
    return top, refusal


risky_query = "Can my landlord increase rent in the middle of a lease in Missouri?"
top2, ans2 = rag_answer_with_abstain(
    risky_query,
    alpha=0.5,
    k_retrieve=5,
    k_rerank=3,
    abstain_threshold=0.2,
)

# Echo the query, the system's response, then the passages it was based on.
print(
    "=== Risky query ===",
    risky_query,
    "\n=== System output ===",
    ans2,
    "\nTop passages used:",
    sep="\n",
)
for p in top2:
    print(f'{p["id"]} {p["section"]} rerank= {round(p["rerank_score"], 3)}')


In [None]:
# [8] Evaluation: Precision@5 and Recall@10 (Small, Manually-Labeled Test Set)
# Note: We use 3 user-story queries and manually specify relevant clause IDs due to the small demo dataset.

def precision_at_k(retrieved_ids, relevant_ids, k):
    """Precision@k: relevant ids among the first `k` retrieved, divided by `k`.

    The denominator is always `k`, even if fewer than `k` ids were retrieved.
    Returns 0.0 when `k` is 0.
    """
    window = retrieved_ids[:k]
    if k == 0:
        return 0.0
    hits = 0
    for rid in window:
        if rid in relevant_ids:
            hits += 1
    return hits / k

def recall_at_k(retrieved_ids, relevant_ids, k):
    """Recall@k: share of the distinct relevant ids found in the first `k` retrieved.

    Args:
        retrieved_ids: ranked sequence of retrieved ids (best first).
        relevant_ids: iterable of gold ids; duplicates are counted once.
        k: cutoff applied to `retrieved_ids`.

    Returns:
        A float in [0, 1]; 0.0 when there are no relevant ids.
    """
    # De-duplicate the gold ids so numerator and denominator both count
    # distinct ids; a repeated gold id would otherwise cap recall below 1.0.
    relevant = set(relevant_ids)
    if not relevant:
        return 0.0
    found = relevant.intersection(retrieved_ids[:k])
    return len(found) / len(relevant)

# Manually labeled evaluation cases: (story id, question, relevant clause ids).
_EVAL_CASES = [
    (
        "U1",
        "What is the maximum security deposit a landlord can charge in Missouri?",
        ["mo-441.060", "mo-441.065"],
    ),
    (
        "U2",
        "How much notice must a landlord give to terminate a tenancy in Missouri?",
        ["mo-441.020"],
    ),
    (
        "U3",
        "Can my landlord raise rent in the middle of a lease in Missouri?",  # likely abstain / not covered
        [],
    ),
]

# story id -> question text
queries = {story: question for story, question, _ in _EVAL_CASES}

# story id -> list of relevant clause ids (empty when nothing is labeled relevant)
relevant = {story: gold_ids for story, _, gold_ids in _EVAL_CASES}

def run_eval(method="bm25", k=10, alpha=0.5):
    """Evaluate one retrieval method on every entry in `queries`.

    For each (story, question) the chosen retriever is asked for
    min(k, len(clauses)) results. Precision@5 and Recall@10 are computed
    against `relevant[story]`, with both cutoffs capped at the number of ids
    actually returned.

    Returns a list of (story, method, p_at_5, r_at_10, retrieved_ids) tuples.
    Raises ValueError while processing a query if `method` is not one of
    "bm25", "vector" or "hybrid".
    """
    rows = []
    for story, q in queries.items():
        depth = min(k, len(clauses))  # never ask for more results than clauses
        if method == "bm25":
            hits = bm25_search(q, k=depth)
        elif method == "vector":
            hits = vector_search(q, k=depth)
        elif method == "hybrid":
            hits = hybrid_search(q, k=depth, alpha=alpha)
        else:
            raise ValueError("method must be bm25/vector/hybrid")
        ids = [hit["id"] for hit in hits]

        gold = relevant[story]
        p5 = precision_at_k(ids, gold, k=min(5, len(ids)))
        r10 = recall_at_k(ids, gold, k=min(10, len(ids)))
        rows.append((story, method, p5, r10, ids))
    return rows

# Run the evaluation once per retrieval method and print one line per story.
for method in ("bm25", "vector", "hybrid"):
    print(f"\n=== Eval: {method} ===")
    rows = run_eval(method=method, k=10, alpha=0.5)
    for story, m, p5, r10, ids in rows:
        print(f"{story} | P@5= {round(p5, 3)} | R@10= {round(r10, 3)} | top= {ids[:5]}")


In [None]:
# [9] Failure Case Demonstration: Keyword Bias vs Governance Fix
# Observed: BM25/Hybrid may rank an incorrect clause higher due to keyword overlap.
# Fix: Cross-encoder re-ranking promotes the truly relevant statute.

q_fail = "How much notice must a landlord give to terminate a tenancy in Missouri?"

bm = [r["id"] for r in bm25_search(q_fail, k=5)]
vec = [r["id"] for r in vector_search(q_fail, k=5)]

# Run hybrid retrieval once and reuse the hits for both the hybrid ranking
# and the reranker input, so the query is not embedded and searched twice.
hyb_res = hybrid_search(q_fail, k=5, alpha=0.5)
hyb = [r["id"] for r in hyb_res]

# Reranker input: the hybrid hits with only these fields copied over.
rer = [r["id"] for r in rerank(q_fail, [{
    "id": r["id"], "section": r["section"], "title": r["title"],
    "text": r["text"], "source": r["source"],
    "hybrid_score": r["hybrid_score"], "bm25_score": r["bm25_score"], "vec_score": r["vec_score"]
} for r in hyb_res], top_k=5)]

print("Query:", q_fail)
print("BM25 top1:", bm[0])
print("Vector top1:", vec[0])
print("Hybrid top1:", hyb[0])
print("Rerank top1:", rer[0])
# NOTE: the interpretation below is fixed text; it is not derived from the
# top-1 ids printed above, so check that it still matches them.
print("\nInterpretation:")
print("- Observed failure: keyword-heavy retrieval favored mo-441.030 over the more relevant mo-441.020.")
print("- Proposed fix: governance reranking + adjust alpha + expand clause coverage within Missouri landlord–tenant domain.")


In [None]:
# [10] User-Facing Evidence of Grounding (Show Full Clause Text + Citations)
# This is the product-facing view: show the actual statute language inline (not just IDs).

def user_facing_rag_answer(query, alpha=0.5, k_retrieve=5, k_rerank=3):
    """Build the product-facing answer text for `query`.

    Runs hybrid retrieval and cross-encoder reranking, then returns a single
    string: a short preamble, each top clause as "section — title" followed
    by its quoted text, and a final "Citations:" line of bracketed ids.
    """
    kept_fields = (
        "id", "section", "title",
        "text", "source",
        "hybrid_score", "bm25_score", "vec_score",
    )

    # 1) retrieve, copying only the fields listed above
    retrieved = hybrid_search(query, k=k_retrieve, alpha=alpha)
    cands = [{name: hit[name] for name in kept_fields} for hit in retrieved]

    # 2) governance rerank
    top = rerank(query, cands, top_k=k_rerank)

    # 3) user-facing display, assembled as pieces and joined once
    pieces = [
        "Answer:\n",
        "Based on Missouri landlord–tenant statutes, the following clauses are most relevant:\n\n",
    ]
    pieces.extend(
        f"{p['section']} — {p['title']}\n"
        f"\"{p['text']}\"\n\n"
        for p in top
    )
    pieces.append("Citations: " + " ".join(f"[{p['id']}]" for p in top))
    return "".join(pieces)

# Demo (this is what you screenshot / paste into README Section 7)
query = "How much notice must a landlord give to terminate a tenancy in Missouri?"
# Show only the single best clause (k_rerank=1) out of five retrieved.
print(user_facing_rag_answer(query, alpha=0.5, k_retrieve=5, k_rerank=1))

