# Case Study 6 — HR Q&A Assistant (Corrected & Safe)

This notebook includes:
- LangChain 1.0 `ChatOpenAI` usage
- Safe `extract_template` escaping to avoid KeyError
- Robust LLM invocation and JSON parsing
- Mock mode when `OPENAI_API_KEY` is not present


In [20]:
#!pip install --quiet langchain openai faiss-cpu tiktoken || true
print('Dependencies install step (no-op if already installed)')

Dependencies install step (no-op if already installed)


In [21]:
import os, json, re, logging
from typing import List, Dict, Any
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('hr_qa')

OPENAI_KEY = os.getenv('OPENAI_API_KEY')
if OPENAI_KEY:
    logger.info('OPENAI_API_KEY found — real API calls enabled.')
else:
    logger.warning('OPENAI_API_KEY not found — running in MOCK mode.')


INFO:hr_qa:OPENAI_API_KEY found — real API calls enabled.


In [22]:
# --- Safe extract_template creation to avoid KeyError from literal JSON braces ---
orig = globals().get('extract_template', None)
if orig is None:
    orig = """Extract 1-3 important HR policy clauses from the text below.

Return ONLY a JSON array with objects of the form:
[
  { "id": <int>, "title": "<short title>", "summary": "<one-sentence summary>", "keywords": ["<kw1>", "<kw2>"], "contact": "<email>" }
]

Constraints:
- Return ONLY the VALID JSON array. No extra commentary, no markdown.
- id must be an integer.
- summary must be one concise sentence.
- keywords must be short lowercase tokens.

Text:
{chunk}
"""

def make_safe_template(template_str: str, placeholder: str = '__CHUNK_PLACEHOLDER__'):
    template_str = template_str.replace(placeholder, placeholder + '_TMP')
    template_with_token = template_str.replace('{chunk}', placeholder)
    escaped = template_with_token.replace('{', '{{').replace('}', '}}')
    final = escaped.replace(placeholder, '{chunk}')
    final = final.replace(placeholder + '_TMP', placeholder)
    return final

safe_extract_template = make_safe_template(orig)
extract_template = safe_extract_template

# Sanity-check formatting
prompt = PromptTemplate(input_variables=['chunk'], template=extract_template)
test_chunk = 'Employees accrue 12 sick days per year; medical certificate required for >3 days.'
formatted_text = prompt.format(chunk=test_chunk)
print('Safe extract_template formatted (preview):\n', formatted_text[:600])


Safe extract_template formatted (preview):
 Extract 1-3 important HR policy clauses from the text below.

Return ONLY a JSON array with objects of the form:
[
  {{ "id": <int>, "title": "<short title>", "summary": "<one-sentence summary>", "keywords": ["<kw1>", "<kw2>"], "contact": "<email>" }}
]

Constraints:
- Return ONLY the VALID JSON array. No extra commentary, no markdown.
- id must be an integer.
- summary must be one concise sentence.
- keywords must be short lowercase tokens.

Text:
Employees accrue 12 sick days per year; medical certificate required for >3 days.



In [23]:
def robust_parse_clauses(raw: str) -> List[Dict[str,Any]]:
    try:
        parsed = json.loads(raw)
    except Exception:
        m = re.search(r'(\[\s*\{[\s\S]*?\}\s*\])', raw)
        if not m:
            m = re.search(r'(\{[\s\S]*?\})', raw)
        if not m:
            raise ValueError('No JSON detected in model output. Raw output:' + raw[:1000])
        json_text = m.group(1)
        parsed = json.loads(json_text)
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        raise ValueError('Parsed JSON is not a list.')
    normalized = []
    for i, obj in enumerate(parsed):
        if not isinstance(obj, dict):
            raise ValueError(f'Clause {i} is not an object: {obj}')
        id_val = obj.get('id') or obj.get('ID') or obj.get('Id') or (i+1)
        try:
            id_val = int(id_val)
        except Exception:
            id_val = i+1
        title = (obj.get('title') or obj.get('name') or f'Clause {id_val}').strip()
        summary = (obj.get('summary') or obj.get('desc') or '').strip()
        keywords = obj.get('keywords') or obj.get('tags') or []
        if isinstance(keywords, str):
            keywords = [k.strip().lower() for k in re.split(r'[;,|]', keywords) if k.strip()]
        keywords = [str(k).lower() for k in keywords]
        contact = obj.get('contact') or obj.get('contact_email') or 'hr@company.com'
        normalized.append({'id': id_val, 'title': title, 'summary': summary, 'keywords': keywords, 'contact': contact})
    return normalized


In [24]:
def invoke_extraction_for(chunk_text: str, model_name: str='gpt-4o-mini', temperature: float=0.0, max_tokens: int=600) -> str:
    # Mock mode if no API key
    if not OPENAI_KEY:
        mock = [
            { "id": 1, "title": "Parental Leave", "summary": "Employees may take up to 12 weeks parental leave after 12 months of service.", "keywords": ["parental","leave","12 weeks"], "contact":"hr@company.com" },
            { "id": 2, "title": "Sick Leave", "summary": "Employees accrue up to 12 days sick leave per year; medical note required for >3 days.", "keywords": ["sick","leave"], "contact":"hr@company.com" }
        ]
        return json.dumps(mock)
    # Real mode: call ChatOpenAI from langchain
    llm = ChatOpenAI(temperature=temperature, model_name=model_name)
    prompt = PromptTemplate(input_variables=['chunk'], template=extract_template)
    formatted = prompt.format(chunk=chunk_text)
    try:
        resp = llm(messages=[HumanMessage(content=formatted)])
        if hasattr(resp, 'content'):
            return resp.content
        if hasattr(resp, 'generations'):
            return resp.generations[0][0].text
        return str(resp)
    except Exception:
        try:
            gen = llm.generate([formatted])
            return gen.generations[0][0].text
        except Exception:
            resp2 = llm(formatted)
            if hasattr(resp2, 'content'):
                return resp2.content
            if hasattr(resp2, 'generations'):
                return resp2.generations[0][0].text
            return str(resp2)


In [25]:
sample_texts = [
    "Employees are eligible for 12 weeks parental leave following the birth or adoption of a child. Eligibility requires at least 12 months of continuous employment.",
    "Employees accrue 1 day of sick leave per month, up to a maximum of 12 days per year. A medical certificate is required for absences longer than 3 consecutive days.",
    "Employees must provide at least 30 days notice for voluntary resignation. Remote work up to 2 days per week pending manager approval."
]

all_clauses = []
for i, chunk in enumerate(sample_texts):
    print(f"Processing chunk {i} (len={len(chunk)} chars)")
    raw = invoke_extraction_for(chunk_text=chunk)
    print("Raw model output (truncated):", raw[:300])
    try:
        clauses = robust_parse_clauses(raw)
        print(f"Parsed {len(clauses)} clauses from chunk {i}")
        all_clauses.extend(clauses)
    except Exception as e:
        print(f"Error parsing chunk {i}:", e)

print('\nAll extracted clauses:')
print(json.dumps(all_clauses, indent=2))


Processing chunk 0 (len=160 chars)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw model output (truncated): [
  {
    "id": 1,
    "title": "Parental Leave Policy",
    "summary": "Employees are entitled to 12 weeks of parental leave after the birth or adoption of a child, contingent on 12 months of continuous employment.",
    "keywords": ["parental leave", "employment", "adoption"],
    "contact": "hr@e
Parsed 1 clauses from chunk 0
Processing chunk 1 (len=163 chars)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw model output (truncated): [
  {
    "id": 1,
    "title": "Sick Leave Policy",
    "summary": "Employees earn 1 day of sick leave each month, capped at 12 days annually, with a medical certificate needed for absences over 3 days.",
    "keywords": ["sick leave", "medical certificate"],
    "contact": "hr@example.com"
  }
]
Parsed 1 clauses from chunk 1
Processing chunk 2 (len=133 chars)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw model output (truncated): [
  { "id": 1, "title": "Voluntary Resignation Notice", "summary": "Employees are required to give a minimum of 30 days notice before resigning voluntarily.", "keywords": ["resignation", "notice"], "contact": "hr@example.com" },
  { "id": 2, "title": "Remote Work Policy", "summary": "Employees may w
Parsed 2 clauses from chunk 2

All extracted clauses:
[
  {
    "id": 1,
    "title": "Parental Leave Policy",
    "summary": "Employees are entitled to 12 weeks of parental leave after the birth or adoption of a child, contingent on 12 months of continuous employment.",
    "keywords": [
      "parental leave",
      "employment",
      "adoption"
    ],
    "contact": "hr@example.com"
  },
  {
    "id": 1,
    "title": "Sick Leave Policy",
    "summary": "Employees earn 1 day of sick leave each month, capped at 12 days annually, with a medical certificate needed for absences over 3 days.",
    "keywords": [
      "sick leave",
      "medical certificate"
    

In [26]:
out_path = 'extracted_clauses_safe.json'
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(all_clauses, f, indent=2)
print('Saved extracted clauses to', out_path)


Saved extracted clauses to extracted_clauses_safe.json


In [27]:
import os, json
data_path = 'extracted_clauses_safe.json'
if not os.path.exists(data_path):
    # Try alternative path
    data_path = 'extracted_clauses_safe.json'
if not os.path.exists(data_path):
    # Create sample clauses as fallback
    sample_clauses = [
        {'id':1,'title':'Parental Leave','summary':'Employees may take up to 12 weeks parental leave after 12 months of service.','keywords':['parental','leave','12 weeks'],'contact':'hr@company.com'},
        {'id':2,'title':'Sick Leave','summary':'Employees accrue up to 12 days sick leave per year; medical note required for >3 days.','keywords':['sick','leave'],'contact':'hr@company.com'}
    ]
    clauses = sample_clauses
    print('No extracted_clauses file found — using sample_clauses.')
else:
    with open(data_path,'r',encoding='utf-8') as f:
        clauses = json.load(f)
    print(f'Loaded {len(clauses)} clauses from', data_path)

# Display first clause
clauses[:2]


Loaded 4 clauses from extracted_clauses_safe.json


[{'id': 1,
  'title': 'Parental Leave Policy',
  'summary': 'Employees are entitled to 12 weeks of parental leave after the birth or adoption of a child, contingent on 12 months of continuous employment.',
  'keywords': ['parental leave', 'employment', 'adoption'],
  'contact': 'hr@example.com'},
 {'id': 1,
  'title': 'Sick Leave Policy',
  'summary': 'Employees earn 1 day of sick leave each month, capped at 12 days annually, with a medical certificate needed for absences over 3 days.',
  'keywords': ['sick leave', 'medical certificate'],
  'contact': 'hr@example.com'}]

In [28]:
# Prepare texts to embed (we'll use 'summary' + title as the document text)
texts = []
metadatas = []
for c in clauses:
    text = (c.get('title','') + ': ' + c.get('summary','')).strip()
    texts.append(text)
    metadatas.append({'id': c['id'], 'title': c.get('title'), 'contact': c.get('contact')})

print('Prepared', len(texts), 'texts for embedding. Sample:') 
print(texts[0][:200])

Prepared 4 texts for embedding. Sample:
Parental Leave Policy: Employees are entitled to 12 weeks of parental leave after the birth or adoption of a child, contingent on 12 months of continuous employment.


In [29]:
# Generate embeddings (real via OpenAIEmbeddings if key present; otherwise mock random vectors)
import random
EMBED_DIM = 1536  # typical dimension for many OpenAI embedding models; adjust if using a different model
if OPENAI_KEY:
    emb = OpenAIEmbeddings()  # uses env key
    vectors = emb.embed_documents(texts)
    # OpenAIEmbeddings returns list of vectors; ensure numpy array
    vectors = [list(map(float, v)) for v in vectors]
    print('Generated real embeddings for', len(vectors), 'documents. Vector dim:', len(vectors[0]))
else:
    # mock deterministic random vectors (seed for reproducibility)
    rng = random.Random(42)
    vectors = []
    for t in texts:
        # generate EMBED_DIM floats between -1 and 1
        vec = np.array([rng.random()*2-1 for _ in range(EMBED_DIM)], dtype=float).tolist()
        vectors.append(vec)
    print('Generated mock embeddings for', len(vectors), 'documents. Vector dim:', EMBED_DIM)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generated real embeddings for 4 documents. Vector dim: 1536


In [30]:
# Ensure faiss-cpu is installed
%pip install faiss-cpu

# Create a FAISS index from vectors and save it (with fallback for missing dependencies)
import numpy as np
index_path = 'faiss_hr_clauses_index'

try:
    # Try using FAISS if available
    import faiss
    dim = len(vectors[0])
    xb = np.array(vectors).astype('float32')
    index = faiss.IndexFlatL2(dim)
    index.add(xb)
    faiss.write_index(index, index_path + '.index')
    # Save metadata and texts
    with open(index_path + '_metadatas.json', 'w', encoding='utf-8') as f:
        json.dump({'metadatas': metadatas, 'texts': texts}, f, indent=2)
    print(f'✓ FAISS index created and saved to {index_path}.index')
    print(f'  - Indexed {len(vectors)} documents with dimension {dim}')
except ImportError:
    print('⚠ FAISS library not available. Creating alternative vector store fallback...')
    try:
        # Alternative: Save vectors and metadata as JSON for later retrieval
        vector_data = {
            'vectors': vectors,
            'texts': texts,
            'metadatas': metadatas,
            'dimension': len(vectors[0]) if vectors else 0
        }
        with open(index_path + '_vectors.json', 'w', encoding='utf-8') as f:
            json.dump(vector_data, f, indent=2)
        print(f'✓ Vector store saved as JSON to {index_path}_vectors.json')
        print(f'  - Stored {len(vectors)} documents')
    except Exception as e3:
        print(f'✗ Vector store fallback failed: {e3}')
except Exception as e:
    print(f'✗ FAISS creation failed: {e}')

Note: you may need to restart the kernel to use updated packages.
✓ FAISS index created and saved to faiss_hr_clauses_index.index
  - Indexed 4 documents with dimension 1536


In [33]:
# Retrieval demo: given a query, find top-k relevant clauses
def retrieve(query: str, k: int = 3):
    # Generate embedding for query
    if OPENAI_KEY:
        q_emb = OpenAIEmbeddings().embed_query(query)
    else:
        # mock embedding deterministic from query (hash-based)
        import hashlib, numpy as _np
        h = hashlib.sha256(query.encode()).digest()
        rng = random.Random(int.from_bytes(h[:4], 'big'))
        q_emb = _np.array([rng.random()*2-1 for _ in range(EMBED_DIM)], dtype=float).tolist()
    # Load FAISS index
    try:
        vs = FAISS.load_local('faiss_hr_clauses_index', OpenAIEmbeddings() if OPENAI_KEY else None)
        docs = vs.similarity_search_by_vector(q_emb, k=k)
        return docs
    except Exception as e:
        # Fallback to manual nearest neighbor with saved index
        try:
            import faiss, numpy as _np
            idx = faiss.read_index('faiss_hr_clauses_index.index')
            xb = _np.array(vectors).astype('float32')
            qv = _np.array(q_emb).astype('float32').reshape(1, -1)
            D, I = idx.search(qv, k)
            results = []
            for i in I[0]:
                results.append({'score': float(D[0][list(I[0]).index(i)]), 'metadata': metadatas[i], 'text': texts[i]})
            return results
        except Exception as e2:
            print('Retrieval failed:', e, e2)
            return []

# Try a few queries
queries = ['parental leave eligibility', 'sick leave medical certificate', 'remote work policy']
for q in queries:
    print('\nQuery:', q)
    docs = retrieve(q, k=2)
    for d in docs:
        # LangChain docs may be Document objects with page_content and metadata
        if isinstance(d, dict):
            print(' -', d.get('metadata'), '-', d.get('text')[:200])
        else:
            # attempt Document object interface
            try:
                print(' -', d.metadata, '-', d.page_content[:200])
            except Exception:
                print(' -', str(d)[:200])



Query: parental leave eligibility


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


 - {'id': 1, 'title': 'Parental Leave Policy', 'contact': 'hr@example.com'} - Parental Leave Policy: Employees are entitled to 12 weeks of parental leave after the birth or adoption of a child, contingent on 12 months of continuous employment.
 - {'id': 1, 'title': 'Sick Leave Policy', 'contact': 'hr@example.com'} - Sick Leave Policy: Employees earn 1 day of sick leave each month, capped at 12 days annually, with a medical certificate needed for absences over 3 days.

Query: sick leave medical certificate


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


 - {'id': 1, 'title': 'Sick Leave Policy', 'contact': 'hr@example.com'} - Sick Leave Policy: Employees earn 1 day of sick leave each month, capped at 12 days annually, with a medical certificate needed for absences over 3 days.
 - {'id': 1, 'title': 'Parental Leave Policy', 'contact': 'hr@example.com'} - Parental Leave Policy: Employees are entitled to 12 weeks of parental leave after the birth or adoption of a child, contingent on 12 months of continuous employment.

Query: remote work policy


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


 - {'id': 2, 'title': 'Remote Work Policy', 'contact': 'hr@example.com'} - Remote Work Policy: Employees may work remotely up to 2 days per week with prior approval from their manager.
 - {'id': 1, 'title': 'Parental Leave Policy', 'contact': 'hr@example.com'} - Parental Leave Policy: Employees are entitled to 12 weeks of parental leave after the birth or adoption of a child, contingent on 12 months of continuous employment.
