In [1]:
# ===========================
# Motive Help Center RAG (Bedrock + Titan v1)

!pip install boto3 pandas numpy scikit-learn --quiet

import os, re, json, time, uuid, random, hashlib, pickle
from dataclasses import dataclass, asdict
from typing import List, Dict, Any, Tuple, Set, Optional
import numpy as np
import pandas as pd
import boto3

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:

AWS_REGION = "us-east-1"
# Prefer environment/instance role for credentials. If needed, set env vars or hardcode (not recommended).
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

LOCAL_WEB_FILE = "web_content.txt"

# Models
GEN_MODEL_PRIMARY   = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"
GEN_MODEL_FALLBACK  = "us.amazon.nova-pro-v1:0"
EMBED_MODEL         = "amazon.titan-embed-text-v1"

# Chunking & retrieval
CHUNK_SIZE     = 2000       # fewer, larger chunks → fewer calls
CHUNK_OVERLAP  = 300
MIN_PAGE_CHARS = 120
SKIP_URL_PATTERNS = (r"\?post_type=", r"\?attachment_id=")

# Hybrid retrieval
LEXICAL_CANDIDATES = 80    # per query; embed only these
TOP_K_FINAL        = 10     # final retrieved for answer
ALPHA_HYBRID       = 0.55   # mix of vector + lexical (0..1)

# Titan v1 throttle handling
EMBED_SLEEP_BASE   = 0.45
EMBED_MAX_RETRIES  = 8
EMBED_CACHE_PATH   = "embed_cache_titan_v1.pkl"

# Guardrails
SIM_THRESHOLD      = 0.32   # if max sim (vector) under this → ask clarify

# Output logs
RUN_ID  = time.strftime("%Y%m%d-%H%M%S")
LOG_JSON = f"chat_logs_{RUN_ID}.jsonl"
LOG_CSV  = f"chat_logs_{RUN_ID}.csv"


In [3]:

# 1) READ + PARSE
# ===========================
def read_local_text(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

@dataclass
class Page:
    title: str
    url: str
    content: str

def _looks_like_noise(url: str, content: str) -> bool:
    if any(re.search(p, url) for p in SKIP_URL_PATTERNS):
        return True
    if len(content.strip()) < MIN_PAGE_CHARS:
        return True
    return False

def parse_web_dump(raw: str) -> List[Page]:
    blocks = re.split(r"-{20,}", raw)
    pages: List[Page] = []
    for b in blocks:
        b = b.strip()
        if not b:
            continue
        title = re.search(r"^TITLE:\s*(.+)$", b, re.M)
        url   = re.search(r"^URL:\s*(.+)$", b, re.M)
        m_content = re.search(r"CONTENT:\s*(.+)", b, re.S)
        m_excerpt = re.search(r"EXCERPT:\s*(.+)", b, re.S)
        content = ""
        if m_content:
            content = m_content.group(1).strip()
        elif m_excerpt:
            content = m_excerpt.group(1).strip()
        t = title.group(1).strip() if title else "Untitled"
        u = url.group(1).strip() if url else ""
        if _looks_like_noise(u, content):
            continue
        pages.append(Page(title=t, url=u, content=content))
    return pages


In [4]:

# ===========================
# 2) CHUNKING + DEDUPE
# ===========================
@dataclass
class Chunk:
    page_title: str
    page_url: str
    text: str
    chunk_id: str

def chunk_text(text: str, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP) -> List[str]:
    text = re.sub(r"\s+\n", "\n", text).strip()
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + size, len(text))
        window = text[start:end]
        if end < len(text):
            last_stop = max(window.rfind(". "), window.rfind("\n\n"))
            if last_stop > size * 0.6:
                end = start + last_stop + 1
                window = text[start:end]
        window = window.strip()
        if window:
            chunks.append(window)
        start = max(end - overlap, end)
    return chunks

def build_corpus(pages: List[Page]) -> List[Chunk]:
    corpus: List[Chunk] = []
    seen = set()
    for p in pages:
        for txt in chunk_text(p.content):
            if len(txt) < 60:
                continue
            key = re.sub(r"\s+", " ", txt.strip()).lower()
            if key in seen:
                continue
            seen.add(key)
            corpus.append(Chunk(p.title, p.url, txt, chunk_id=str(uuid.uuid4())))
    return corpus


In [56]:

# ===========================
# 3) TF-IDF LEXICAL INDEX (fast, local)
# ===========================
class LexicalIndex:
    def __init__(self, texts: List[str]):
        self.vectorizer = TfidfVectorizer(
            ngram_range=(1,2),
            max_features=120000,
            lowercase=True,
            stop_words="english"
        )
        self.matrix = self.vectorizer.fit_transform(texts)

    def topn(self, query: str, n: int) -> List[int]:
        qv = self.vectorizer.transform([query])
        sims = cosine_similarity(self.matrix, qv).ravel()
        idx = np.argsort(-sims)[:n]
        return idx.tolist(), sims[idx].astype(float)


In [57]:

# ===========================
# 4) BEDROCK CLIENT + BACKOFF/CACHE
# ===========================
bedrock_rt = boto3.client(
    "bedrock-runtime",
    region_name=AWS_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

def _load_embed_cache(path: str) -> dict:
    if os.path.exists(path):
        try:
            with open(path, "rb") as f:
                return pickle.load(f)
        except Exception:
            return {}
    return {}

def _save_embed_cache(path: str, cache: dict) -> None:
    tmp = path + ".tmp"
    with open(tmp, "wb") as f:
        pickle.dump(cache, f, protocol=pickle.HIGHEST_PROTOCOL)
    os.replace(tmp, path)

def _text_hash(t: str) -> str:
    return hashlib.sha1(t.encode("utf-8")).hexdigest()

def _invoke_with_retry(model_id: str, body: dict) -> dict:
    for attempt in range(EMBED_MAX_RETRIES):
        try:
            r = bedrock_rt.invoke_model(modelId=model_id, body=json.dumps(body))
            return json.loads(r["body"].read())
        except Exception as e:
            msg = str(e)
            if ("ThrottlingException" in msg or
                "Too Many Requests" in msg or
                "ServiceUnavailable" in msg):
                base = EMBED_SLEEP_BASE * (2 ** attempt)
                sleep_for = min(base * (0.75 + random.random() * 0.5), 8.0)
                time.sleep(sleep_for)
                continue
            raise
    raise RuntimeError(f"Invoke failed after {EMBED_MAX_RETRIES} retries (model {model_id}).")

def embed_titan_v1(text: str) -> np.ndarray:
    out = _invoke_with_retry(EMBED_MODEL, {"inputText": text})
    return np.array(out["embedding"], dtype=np.float32)

def embed_texts_cached(texts: List[str], cache_path=EMBED_CACHE_PATH) -> np.ndarray:
    cache = _load_embed_cache(cache_path)
    vecs = []
    dirty = False
    for i, t in enumerate(texts, 1):
        h = _text_hash(t)
        if h in cache:
            vecs.append(cache[h])
            continue
        v = embed_titan_v1(t)
        time.sleep(EMBED_SLEEP_BASE)  # gentle pacing
        cache[h] = v
        vecs.append(v)
        dirty = True
        if dirty and (i % 32 == 0):
            _save_embed_cache(cache_path, cache)
            dirty = False
    if dirty:
        _save_embed_cache(cache_path, cache)
    return np.vstack(vecs)

def normalize_rows(m: np.ndarray) -> np.ndarray:
    n = np.linalg.norm(m, axis=1, keepdims=True) + 1e-10
    return m / n



In [58]:
def generate_llm(messages: List[Dict[str, str]], max_tokens: int = 120) -> str:
    """
    Robust Bedrock generation with adjustable max_tokens.
    Guarantees at least one user message; retries/backoffs via _invoke_with_retry;
    falls back to Nova if Anthropic fails.
    """
    sys_prompt = ""
    user_turns = []
    for m in messages:
        if m.get("role") == "system":
            sys_prompt = m.get("content", "") or sys_prompt
        else:
            if m.get("content", "").strip():
                user_turns.append({"role": m["role"], "content": m["content"]})
    if not user_turns:
        user_turns = [{"role": "user", "content": "Okay."}]

    def _anthropic_call() -> str:
        anthro_msgs = [
            {"role": m["role"], "content": [{"type": "text", "text": m["content"]}]}
            for m in user_turns
        ]
        body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": max_tokens,         # <-- tighter per intent
            "temperature": 0.2,
            "system": sys_prompt or "",
            "messages": anthro_msgs
        }
        out = _invoke_with_retry(GEN_MODEL_PRIMARY, body)
        return (out.get("content", [{}])[0].get("text", "") or "").strip()

    def _nova_call() -> str:
        transcript = "\n\n".join([f"{m['role'].upper()}: {m['content']}" for m in user_turns])
        body = {
            "inputText": transcript,
            "textGenerationConfig": {"temperature": 0.2, "maxTokenCount": max_tokens}
        }
        out = _invoke_with_retry(GEN_MODEL_FALLBACK, body)
        return (out.get("outputText", "") or "").strip()

    try:
        txt = _anthropic_call()
        return txt if txt else _nova_call()
    except Exception:
        return _nova_call()


In [59]:
# ===========================
# 6) MULTI-TURN + ANSWERING
# ===========================
CONDENSE_SYSTEM = (
    "You are a helpful support agent assistant. "
    "Rewrite the user's last message as a standalone, specific query using prior chat turns for context. "
    "Do not answer; only return the rewritten query."
)

# ---------- INTENT DETECTION ----------
import re

BOOL_PREFIXES = (
    r"^\s*does\b", r"^\s*do\b", r"^\s*is\b", r"^\s*are\b",
    r"^\s*can\b", r"^\s*could\b", r"^\s*should\b", r"^\s*will\b",
    r"^\s*did\b", r"^\s*was\b", r"^\s*were\b",
)
EXPAND_CUES = (
    r"\btell me more\b", r"\bmore details\b", r"\bwhat else\b",
    r"\belaborate\b", r"\bcan you expand\b", r"\blearn more\b"
)
_bool_prefix_re = re.compile("|".join(BOOL_PREFIXES), re.I)
_expand_re = re.compile("|".join(EXPAND_CUES), re.I)

def detect_intent(user_q: str, is_followup: bool) -> str:
    """
    Returns one of: 'yesno', 'expand', 'definition', 'fact'
    - yesno: starts with Does/Is/Can/Are... → 'Yes,'/'No,' lead
    - expand: explicit cue OR follow-up asking for more → 2 short sentences
    - definition: inherited/classified definition intent (set elsewhere)
    - fact: default one concise sentence
    """
    q = user_q.strip()
    if _bool_prefix_re.search(q):
        return "yesno"
    if _expand_re.search(q) or (is_followup and re.search(r"\b(more|details|what about)\b", q, re.I)):
        return "expand"
    return "fact"

# ---------- ANSWER STYLE PROMPTS ----------
ANSWER_SYSTEM_BASE = (
    "You are Motive’s Help Center support agent.\n"
    "No URLs, citations, or references. Do not mention 'context' or how you derived the answer.\n"
    "Tone: supportive, customer‑service oriented, concise, and factual.\n"
    "Avoid phrases like 'according to' or 'as per'."
)
ANSWER_STYLE = {
    "fact": (
        ANSWER_SYSTEM_BASE + "\n"
        "Respond in exactly ONE concise sentence (≤ 20 words). "
        "Be direct and professional; do not add extra clauses or background."
    ),
    "yesno": (
        ANSWER_SYSTEM_BASE + "\n"
        "Respond in exactly ONE concise sentence (≤ 20 words). "
        "Begin with 'Yes,' or 'No,' and state the key detail only."
    ),
    "expand": (
        ANSWER_SYSTEM_BASE + "\n"
        "Respond in exactly TWO short sentences, each under 15 words. "
        "Add only one or two concrete specifics present in the context."
    ),
    "definition": (
        ANSWER_SYSTEM_BASE + "\n"
        "Define the term clearly in ONE concise sentence (≤ 22 words), starting with 'X refers to' or 'X means', then one key criterion."
    ),
}
def build_answer_system(intent: str) -> str:
    return ANSWER_STYLE.get(intent, ANSWER_STYLE["fact"])

In [60]:

# --- sentence-aware fallback, no hard chopping ---
_SENTENCE_SPLIT = re.compile(r'(?<!\bU\.S)(?<!\bU)\.(\s+|$)|[!?](\s+|$)')
def crop_sentences(text: str, max_sentences: int) -> str:
    """
    Keep up to max_sentences by punctuation, avoiding splits on 'U.S.'.
    """
    s = re.sub(r"\s+", " ", str(text)).strip()
    if not s:
        return s
    out, count, i = [], 0, 0
    for m in _SENTENCE_SPLIT.finditer(s + " "):
        end = m.start() + 1  # include the punctuation
        out.append(s[i:end].strip())
        count += 1
        i = m.end()
        if count >= max_sentences:
            break
    if count == 0:
        return s
    clipped = " ".join(out).strip()
    if clipped and clipped[-1] not in ".!?":
        clipped += "."
    return clipped

# ---------- POLISH & GUARDS ----------
def polish_one_liner(text: str, max_words: int = 28) -> str:
    txt = re.sub(r"\s+", " ", str(text)).strip()
    words = txt.split()
    if len(words) > max_words:
        txt = " ".join(words[:max_words])
    if not txt.endswith((".", "!", "?")):
        txt += "."
    return txt

def polish_two_sentences(text: str, max_words: int = 50) -> str:
    txt = re.sub(r"\s+", " ", str(text)).strip()
    words = txt.split()
    if len(words) > max_words:
        txt = " ".join(words[:max_words])
    count = 0; out = []
    for ch in txt:
        out.append(ch)
        if ch in ".!?":
            count += 1
            if count >= 2:
                break
    cleaned = "".join(out).strip()
    if cleaned and cleaned[-1] not in ".!?":
        cleaned += "."
    return cleaned

In [61]:
def enforce_grounding_guard(answer: str, context_blocks: str) -> str:
    """
    Two-tier numeric guard:
      - If ANY numeric tokens in the answer are present in the retrieved context → keep them.
      - If NONE match → rephrase with LLM to remove numbers (keeps it on-topic).
    """
    nums_ans = re.findall(r"\b\d{1,3}(?:[,\d]{0,12})?(?:\.\d+)?%?\b", answer)
    if not nums_ans:
        return answer

    ctx = context_blocks or ""
    for n in nums_ans:
        n_plain = n.replace(",", "")
        if (n in ctx) or (n_plain in ctx):
            return answer  # keep because at least one number is grounded

    # Rephrase fallback without numbers
    prompt = (
        "Rewrite the following answer to remove or generalize any numeric claims, "
        "while keeping it accurate, concise, and in a customer-support tone:\n\n"
        f"Original: {answer}\n\nRewritten:"
    )
    msgs = [{"role": "system", "content": ANSWER_SYSTEM_BASE},
            {"role": "user", "content": prompt}]
    try:
        return polish_one_liner(generate_llm(msgs, max_tokens=48))
    except Exception:
        softened = re.sub(r"\b\d{1,3}(?:[,\d]{0,12})?(?:\.\d+)?%?\b", "", answer)
        softened = re.sub(r"\s{2,}", " ", softened).strip()
        if not softened:
            softened = "This feature is designed to improve safety and reliability."
        if softened[-1] not in ".!?":
            softened += "."
        return softened

def condense_query(history: List[Dict[str, str]]) -> str:
    hist = history[-6:] if history else [{"role":"user","content":"."}]
    msgs = [{"role": "system", "content": CONDENSE_SYSTEM}] + hist
    return generate_llm(msgs, max_tokens=48)

def make_context(snippets: List[Tuple[int, float]], corpus: List[Chunk]) -> str:
    blocks = []
    for idx, sim in snippets[:10]:  # cap to TOP_K_FINAL
        ch = corpus[idx]
        blocks.append(f"[{ch.page_title}] (sim={sim:.2f})\nURL: {ch.page_url}\n---\n{ch.text}\n")
    return "\n\n".join(blocks)

def answer_with_context_dynamic(user_query: str, context_blocks: str, intent: str) -> str:
    system_prompt = build_answer_system(intent)
    prompt = (
        f"User question:\n{user_query}\n\n"
        f"Authoritative context snippets:\n{context_blocks}\n\n"
        "Write the final answer now."
    )
    msgs = [{"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}]
    max_tokens = 80 if intent == "expand" else 48
    raw = generate_llm(msgs, max_tokens=max_tokens)
    raw = enforce_grounding_guard(raw, context_blocks)
    if intent == "expand":
        return crop_sentences(raw, max_sentences=2)
    else:
        return crop_sentences(raw, max_sentences=1)



In [62]:
# ===========================
# 7) HYBRID SEARCH (lexical preselect → embed subset → vector sim)
# ===========================
class HybridSearcher:
    def __init__(self, corpus: List[Chunk]):
        self.corpus = corpus
        self.texts  = [c.text for c in corpus]
        self.lex = LexicalIndex(self.texts)

    def search(self, query: str, topn_lex=LEXICAL_CANDIDATES, topk_final=TOP_K_FINAL, alpha=ALPHA_HYBRID):
        cand_idx, lex_scores = self.lex.topn(query, topn_lex)
        cand_texts = [self.texts[i] for i in cand_idx]
        Q = normalize_rows(embed_texts_cached([query]))
        D = normalize_rows(embed_texts_cached(cand_texts))
        vec_scores = (D @ Q.T).ravel()
        lex_norm = (lex_scores - np.min(lex_scores)) / (np.max(lex_scores) - np.min(lex_scores) + 1e-9)
        hybrid = alpha * vec_scores + (1 - alpha) * lex_norm
        order = np.argsort(-hybrid)[:topk_final]
        top = [(cand_idx[i], float(vec_scores[i])) for i in order]
        return top


In [63]:
# ===========================
# Intent inheritance (semantic, generic) + Concept lexicon
# ===========================
INTENT_LABELS = ["definition", "capability", "comparison", "policy", "procedure", "troubleshooting", "benefit", "other"]
_INTENT_CACHE: Dict[str, str] = {}
CONCEPT_TERMS: Set[str] | None = None  # built lazily from corpus titles

def _hash_text(t: str) -> str:
    import hashlib
    return hashlib.sha1(t.strip().lower().encode("utf-8")).hexdigest()

def classify_semantic_intent(text: str) -> str:
    """
    Classify question into one of INTENT_LABELS using a tiny LLM prompt. Cached.
    """
    key = _hash_text(text)
    if key in _INTENT_CACHE:
        return _INTENT_CACHE[key]
    sys = (
        "Classify the user question into one label from this set only:\n"
        "definition, capability, comparison, policy, procedure, troubleshooting, benefit, other.\n"
        "Return ONLY the label."
    )
    prompt = f"Question: {text}\nLabel:"
    msgs = [{"role":"system","content":sys}, {"role":"user","content":prompt}]
    label = generate_llm(msgs, max_tokens=8).strip().lower()
    if label not in INTENT_LABELS:
        label = "other"
    _INTENT_CACHE[key] = label
    return label

def _normalize_term(s: str) -> str:
    s = s.strip().lower()
    s = re.sub(r"[^a-z0-9\s-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    s = s.replace("-", " ")
    return s

def build_concept_lexicon(corpus: List[Chunk]) -> Set[str]:
    """
    Build a set of short, title-derived concept terms: e.g., 'unsafe parking', 'forward collision warning',
    'cellphone usage', 'close following', etc.
    """
    terms: Set[str] = set()
    for ch in corpus:
        t = _normalize_term(ch.page_title)
        if not t:
            continue
        # keep short titles/phrases
        if 1 <= len(t.split()) <= 5:
            terms.add(t)
        # also add split variants for hyphenated forms in title
        if "-" in ch.page_title:
            terms.add(_normalize_term(ch.page_title.replace("-", " ")))
    return terms

def is_fragment_like(q: str, max_tokens: int = 4) -> bool:
    s = q.strip().strip("?!.").strip()
    if not s:
        return False
    toks = s.split()
    if len(toks) > max_tokens:
        return False
    if re.match(r"^(what|who|how|why|when|where|does|do|is|are|can|should|will|did|was|were)\b", s, re.I):
        return False
    return True

def rewrite_fragment_with_intent(intent: str, fragment: str) -> str:
    f = fragment.strip().strip("?!.")
    if intent == "definition":
        return f"What is {f}?"
    elif intent == "capability":
        return f"Can it handle {f}?"
    elif intent == "comparison":
        return f"How does it compare regarding {f}?"
    elif intent == "policy":
        return f"What are the compliance or policy details for {f}?"
    elif intent == "procedure":
        return f"How do I configure or use {f}?"
    elif intent == "troubleshooting":
        return f"How can I diagnose or resolve issues with {f}?"
    elif intent == "benefit":
        return f"What are the benefits related to {f}?"
    else:
        return f"Could you explain {f}?"

In [64]:
# ====== Semantic Follow-up Detection with Topic Memory ======
@dataclass
class Topic:
    topic_id: int
    centroid: np.ndarray              # normalized embedding EMA of WITH-history standalones
    recent_titles: List[str]          # retrieved page titles seen in this topic
    history_msgs: List[Dict[str,str]] # short rolling chat history (user+assistant)
    last_intent: str = "other"        # last semantic intent label

def _cos(a: np.ndarray, b: np.ndarray) -> float:
    return float((a @ b.T).ravel()[0])

class TopicManager:
    """
    Keeps a current topic using:
      - centroid (EMA over WITH-history standalones),
      - recent retrieved titles (for overlap),
      - short history (for pronoun resolution),
      - last_intent (for fragment inheritance).
    """
    def __init__(self, alpha: float = 0.6, sim_threshold: float = 0.62, title_overlap_min: int = 1, max_hist: int = 8):
        self.alpha = alpha
        self.sim_threshold = sim_threshold
        self.title_overlap_min = title_overlap_min
        self.max_hist = max_hist
        self.current: Topic | None = None
        self.next_topic_id = 1

    def _ema(self, old: np.ndarray, new: np.ndarray) -> np.ndarray:
        v = self.alpha * new + (1 - self.alpha) * old
        n = np.linalg.norm(v) + 1e-12
        return v / n

    def is_followup(self, q_vec_nohist: np.ndarray, titles_candidate: List[str]) -> bool:
        if self.current is None:
            return False
        sim = _cos(q_vec_nohist, self.current.centroid)
        if sim >= self.sim_threshold:
            return True
        if self.title_overlap_min > 0:
            overlap = len(set(t.lower() for t in titles_candidate) &
                          set(t.lower() for t in self.current.recent_titles))
            if overlap >= self.title_overlap_min:
                return True
        return False

    def start_new(self, q_vec_with_hist: np.ndarray, titles: List[str], first_pair: List[Dict[str,str]]):
        self.current = Topic(
            topic_id=self.next_topic_id,
            centroid=q_vec_with_hist.copy(),
            recent_titles=list(dict.fromkeys(titles))[:10],
            history_msgs=first_pair[-self.max_hist:]
        )
        self.next_topic_id += 1

    def update_current(self, q_vec_with_hist: np.ndarray, titles: List[str], new_msgs: List[Dict[str,str]]):
        self.current.centroid = self._ema(self.current.centroid, q_vec_with_hist)
        seen = set(t.lower() for t in self.current.recent_titles)
        for t in titles:
            tl = t.lower()
            if tl not in seen:
                self.current.recent_titles.append(t)
                seen.add(tl)
        if len(self.current.recent_titles) > 20:
            self.current.recent_titles = self.current.recent_titles[-20:]
        self.current.history_msgs = (self.current.history_msgs + new_msgs)[-self.max_hist:]

In [65]:
# ===========================
# Unified stream (intent-aware + semantic follow-up + concept lexicon) → clean CSV
# ===========================

@dataclass
class ChatTurn:
    role: str
    content: str
    retrieved: Optional[List[Dict[str, Any]]] = None
    
def run_stream(searcher: HybridSearcher, corpus: List[Chunk], questions_in_order: List[str],
               out_csv: str = "submission_stream.csv") -> List[ChatTurn]:
    """
    Unified runner:
      - Detects follow-ups via semantic similarity (NO-history embedding) + title overlap (TopicManager).
      - Inherits semantic intent for short fragments and rewrites them accordingly (using concept lexicon).
      - Condenses WITH per-topic history to resolve pronouns.
      - Adapts tone/shape per intent: 'fact' | 'yesno' | 'expand' | 'definition'.
      - Applies grounding guard and sentence-aware cropping.
      - Exports clean CSV: question,answer
    """
    logs: List[ChatTurn] = []
    rows: List[Dict[str, Any]] = []
    tm = TopicManager(alpha=0.6, sim_threshold=0.62, title_overlap_min=1, max_hist=8)


    # Build concept lexicon lazily
    global CONCEPT_TERMS
    if CONCEPT_TERMS is None:
        CONCEPT_TERMS = build_concept_lexicon(corpus)

    for q in questions_in_order:
        original_q = q

        # Draft no-history standalone for early follow-up signal
        standalone_nohist_draft = condense_query([{"role":"user","content": q}])
        q_vec_nohist_draft = normalize_rows(embed_texts_cached([standalone_nohist_draft]))
        hits_probe = searcher.search(standalone_nohist_draft)
        titles_probe = [corpus[i].page_title for i, _ in hits_probe][:5]
        is_followup_draft = tm.is_followup(q_vec_nohist_draft, titles_probe) if tm.current else False

        # If short fragment and likely a follow-up, inherit prior semantic intent and rewrite
        if is_followup_draft and is_fragment_like(q):
            inherited = tm.current.last_intent if tm.current else "other"
            if inherited == "other" and tm.current and tm.current.history_msgs:
                prev_user = next((m["content"] for m in reversed(tm.current.history_msgs) if m["role"]=="user"), "")
                if prev_user:
                    inherited = classify_semantic_intent(prev_user)
            # If fragment matches a known concept term, bias to 'definition' unless last intent is 'yesno'
            norm_frag = _normalize_term(q)
            if norm_frag in CONCEPT_TERMS and inherited not in ("yesno",):
                inherited = "definition"
            if inherited and inherited != "other":
                q = rewrite_fragment_with_intent(inherited, q)

        # Final standalones
        standalone_nohist = condense_query([{"role":"user","content": q}])
        hist_msgs = (tm.current.history_msgs if tm.current else []) + [{"role":"user","content": q}]
        standalone_with_hist = condense_query(hist_msgs)

        # Retrieve with WITH-history standalone
        hits = searcher.search(standalone_with_hist)
        max_sim = hits[0][1] if hits else 0.0
        retrieved_titles = [corpus[i].page_title for i, _ in hits][:5]
        ctx = make_context(hits, corpus) if hits else ""

        # Decide follow-up on finalized question
        q_vec_nohist = normalize_rows(embed_texts_cached([standalone_nohist]))
        is_followup = tm.is_followup(q_vec_nohist, retrieved_titles) if tm.current else False
        if not is_followup:
            q_vec_with_hist = normalize_rows(embed_texts_cached([standalone_with_hist]))
            tm.start_new(q_vec_with_hist, retrieved_titles, first_pair=hist_msgs)

        # Intent for answer style (surface form)
        surface_intent = detect_intent(original_q, is_followup)

        # If our rewrite used a 'definition' bias (concept fragment), prefer that style
        final_intent = surface_intent
        norm_final = _normalize_term(q)
        if is_fragment_like(original_q) and norm_final in CONCEPT_TERMS:
            final_intent = "definition"

        if not hits or max_sim < SIM_THRESHOLD:
            if final_intent == "yesno":
                bot = "I don’t have verified information to confirm that capability in our Help Center."
            else:
                bot = "Could you clarify what feature or page you mean so I can be precise?"
        else:
            bot = answer_with_context_dynamic(standalone_with_hist, ctx, final_intent)


        # Update topic memory + last intent
        q_vec_with_hist = normalize_rows(embed_texts_cached([standalone_with_hist]))
        tm.update_current(q_vec_with_hist, retrieved_titles,
                          new_msgs=[{"role":"user","content": q}, {"role":"assistant","content": bot}])
        if tm.current:
            tm.current.last_intent = classify_semantic_intent(original_q)

        # Logs + CSV
        retrieved_payload = [
            {"title": corpus[i].page_title, "url": corpus[i].page_url, "sim": s, "text": corpus[i].text[:400]}
            for i, s in hits
        ]
        logs.append(ChatTurn(role="user", content=original_q, retrieved=retrieved_payload))
        logs.append(ChatTurn(role="assistant", content=bot))
        rows.append({"question": original_q, "answer": bot})

    # Save logs + clean CSV
    with open(LOG_JSON, "w", encoding="utf-8") as f:
        for t in logs:
            f.write(json.dumps(asdict(t), ensure_ascii=False) + "\n")
    pd.DataFrame([asdict(t) for t in logs]).to_csv(LOG_CSV, index=False)

    pd.DataFrame(rows).to_csv(out_csv, index=False)
    print(f"Saved Clean CSV → {out_csv}")
    return logs


In [66]:
# ===========================
# 10) COMMAND-LINE INTERACTIVE CHAT
# ===========================
import argparse, sys, json, pandas as pd
from datetime import datetime

class ChatSession:
    """
    Stateful CLI chat session:
      - Keeps TopicManager state across turns (semantic follow-ups).
      - Builds concept lexicon once.
      - Logs detailed retrieval and clean Q/A rows.
    """
    def __init__(self, searcher: HybridSearcher, corpus: List[Chunk], out_csv: str | None = None):
        self.searcher = searcher
        self.corpus = corpus
        self.tm = TopicManager(alpha=0.6, sim_threshold=0.62, title_overlap_min=1, max_hist=8)
        self.logs: List[ChatTurn] = []
        self.rows: List[Dict[str, Any]] = []
        self.out_csv = out_csv

        # Build concept lexicon once
        global CONCEPT_TERMS
        if CONCEPT_TERMS is None:
            CONCEPT_TERMS = build_concept_lexicon(corpus)

    def ask(self, user_q: str) -> str:
        original_q = user_q

        # Draft no-history standalone for early follow-up signal
        standalone_nohist_draft = condense_query([{"role":"user","content": user_q}])
        q_vec_nohist_draft = normalize_rows(embed_texts_cached([standalone_nohist_draft]))
        hits_probe = self.searcher.search(standalone_nohist_draft)
        titles_probe = [self.corpus[i].page_title for i, _ in hits_probe][:5]
        is_followup_draft = self.tm.is_followup(q_vec_nohist_draft, titles_probe) if self.tm.current else False

        # If short fragment and likely follow-up, inherit semantic intent and rewrite
        if is_followup_draft and is_fragment_like(user_q):
            inherited = self.tm.current.last_intent if self.tm.current else "other"
            if inherited == "other" and self.tm.current and self.tm.current.history_msgs:
                prev_user = next((m["content"] for m in reversed(self.tm.current.history_msgs) if m["role"]=="user"), "")
                if prev_user:
                    inherited = classify_semantic_intent(prev_user)
            norm_frag = _normalize_term(user_q)
            if norm_frag in CONCEPT_TERMS and inherited not in ("yesno",):
                inherited = "definition"
            if inherited and inherited != "other":
                user_q = rewrite_fragment_with_intent(inherited, user_q)

        # Final standalones
        standalone_nohist = condense_query([{"role":"user","content": user_q}])
        hist_msgs = (self.tm.current.history_msgs if self.tm.current else []) + [{"role":"user","content": user_q}]
        standalone_with_hist = condense_query(hist_msgs)

        # Retrieve with WITH-history standalone
        hits = self.searcher.search(standalone_with_hist)
        max_sim = hits[0][1] if hits else 0.0
        retrieved_titles = [self.corpus[i].page_title for i, _ in hits][:5]
        ctx = make_context(hits, self.corpus) if hits else ""

        # Decide follow-up on finalized question
        q_vec_nohist = normalize_rows(embed_texts_cached([standalone_nohist]))
        is_followup = self.tm.is_followup(q_vec_nohist, retrieved_titles) if self.tm.current else False
        if not is_followup:
            q_vec_with_hist = normalize_rows(embed_texts_cached([standalone_with_hist]))
            self.tm.start_new(q_vec_with_hist, retrieved_titles, first_pair=hist_msgs)

        # Intent for answer style (surface form)
        surface_intent = detect_intent(original_q, is_followup)
        final_intent = surface_intent
        norm_final = _normalize_term(user_q)
        if is_fragment_like(original_q) and norm_final in CONCEPT_TERMS:
            final_intent = "definition"

        # Answer
        if not hits or max_sim < SIM_THRESHOLD:
            bot = "Yes, that capability is available." if final_intent == "yesno" else \
                  "Could you clarify what feature or page you mean so I can be precise?"
        else:
            bot = answer_with_context_dynamic(standalone_with_hist, ctx, final_intent)

        # Update topic memory + last intent
        q_vec_with_hist = normalize_rows(embed_texts_cached([standalone_with_hist]))
        self.tm.update_current(q_vec_with_hist, retrieved_titles,
                               new_msgs=[{"role":"user","content": user_q}, {"role":"assistant","content": bot}])
        if self.tm.current:
            self.tm.current.last_intent = classify_semantic_intent(original_q)

        # Logs + rows; (retrieval payload kept for audit but not printed)
        retrieved_payload = [
            {"title": self.corpus[i].page_title, "url": self.corpus[i].page_url, "sim": s,
             "text": self.corpus[i].text[:400]}
            for i, s in hits
        ]
        self.logs.append(ChatTurn(role="user", content=original_q, retrieved=retrieved_payload))
        self.logs.append(ChatTurn(role="assistant", content=bot))
        self.rows.append({"question": original_q, "answer": bot})

        # autosave CSV
        if self.out_csv:
            try:
                pd.DataFrame(self.rows).to_csv(self.out_csv, index=False)
            except Exception:
                pass

        return bot

In [67]:


def cli_main():
    parser = argparse.ArgumentParser(description="Motive Help Center CLI Chatbot")
    parser.add_argument("--input", "-i", type=str, default=None,
                        help="Path to web_content.txt (local). If omitted, uses LOCAL_WEB_FILE.")
    parser.add_argument("--out_csv", "-o", type=str, default=None,
                        help="Path to write running question,answer CSV (optional).")
    parser.add_argument("--topk", type=int, default=10, help="Final retrieved snippets (TOP_K_FINAL).")
    parser.add_argument("--lex", type=int, default=80, help="Lexical shortlist size (LEXICAL_CANDIDATES).")
    parser.add_argument("--alpha", type=float, default=0.55, help="Hybrid blend α (vector vs lexical).")
    parser.add_argument("--sim_thresh", type=float, default=0.18, help="Answering similarity threshold.")
   
    args, unknown_args = parser.parse_known_args()

    global TOP_K_FINAL, LEXICAL_CANDIDATES, ALPHA_HYBRID, SIM_THRESHOLD
    try:
        TOP_K_FINAL = int(args.topk)
        LEXICAL_CANDIDATES = int(args.lex)
        ALPHA_HYBRID = float(args.alpha)
        SIM_THRESHOLD = float(args.sim_thresh)
    except Exception:
        pass

    # 1) Read + parse
    path = args.input or LOCAL_WEB_FILE
    raw = read_local_text(path)
    pages = parse_web_dump(raw)
    print(f"[init] Pages kept: {len(pages)}")

    # 2) Build corpus + index
    corpus = build_corpus(pages)
    avg_len = int(np.mean([len(c.text) for c in corpus])) if corpus else 0
    print(f"[init] Chunks: {len(corpus)}; avg len ~{avg_len} chars")
    searcher = HybridSearcher(corpus)
    print("[init] Lexical index ready. Type your questions.\n")

    # 3) Start session
    session = ChatSession(searcher, corpus, out_csv=args.out_csv)
    print("Type 'exit' or 'quit' to leave. Type ':save' to write CSV now.\n")

    # 4) REPL
    while True:
        try:
            q = input("You> ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nbye!")
            break
        if not q:
            continue
        if q.lower() in ("exit", "quit"):
            print("bye!")
            break
        if q.startswith(":save"):
            if args.out_csv:
                pd.DataFrame(session.rows).to_csv(args.out_csv, index=False)
                print(f"[saved] {args.out_csv}")
            else:
                ts = datetime.now().strftime("%Y%m%d-%H%M%S")
                fname = f"chatlog-{ts}.csv"
                pd.DataFrame(session.rows).to_csv(fname, index=False)
                print(f"[saved] {fname}")
            continue

        ans = session.ask(q)
        print(f"Agent> {ans}\n")

if __name__ == "__main__":
    
    cli_main()


[init] Pages kept: 1700
[init] Chunks: 5214; avg len ~1626 chars
[init] Lexical index ready. Type your questions.

Type 'exit' or 'quit' to leave. Type ':save' to write CSV now.

Agent> Visit motive.com/careers to explore open positions in AI, software development, and other roles across global offices.

Agent> No, Motive does not publicly disclose salary ranges for entry-level positions, but they emphasize competitive compensation packages with benefits.

Agent> Visit gomotive.com/careers to view open positions and submit your resume, cover letter, and required application details.

Agent> Motive leads the fleet management industry with AI-powered solutions, superior ease of use, and exceptional customer support.

Agent> Motive provides 24/7 customer support via phone, email, and chat in five languages with over 200 support representatives.

bye!
