In [None]:
# Notebook Cell: Analyze Data and Generate Answerable Questions
# Run this BEFORE notebook 3 to create data-driven questions

import os
import re
import json
import random
import pandas as pd
from pathlib import Path
from collections import Counter
from typing import List, Dict
import openai

# ====== API KEY ======
# Prefer a key that is already exported in the environment so a real secret
# never has to be pasted into (and saved with) this notebook. The placeholder
# is only a fallback for users who edit this cell by hand; previously it
# unconditionally overwrote whatever OPENAI_API_KEY the environment provided.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY") or "your key here"
os.environ["OPENAI_API_KEY"] = OPEN_API_KEY
# Module-level attribute on the openai package; the v1 client created below is
# the object this cell actually uses for requests.
openai.api_key = OPEN_API_KEY

# Correct v1 client
from openai import OpenAI
client = OpenAI(api_key=OPEN_API_KEY)

# Paths
ROOT_DIR = Path('..').resolve()
RAW_DATA_DIR = ROOT_DIR.joinpath('data', 'raw')
PROCESSED_DATA_DIR = ROOT_DIR.joinpath('data', 'processed')

# ============================================================================
# Step 1: Load and Analyze Your Corpus
# ============================================================================

print("=" * 80, "ANALYZING YOUR MEDICAL CORPUS", "=" * 80, sep="\n")

# Read the processed document list (each entry is used below as a dict with a
# 'metadata' mapping and an optional 'content' value).
with (PROCESSED_DATA_DIR / 'processed_documents.json').open('r', encoding='utf-8') as f:
    docs_data = json.load(f)

print(f"\nTotal documents: {len(docs_data)}")

# Document count per source label
sources = Counter(doc['metadata']['source'] for doc in docs_data)
print("\nDocuments by source:")
for source, count in sources.items():
    print(f"  {source}: {count} documents")

# Chunk count per FDA file; report the ten largest
fda_files = Counter(
    doc['metadata'].get('file', 'unknown')
    for doc in docs_data
    if doc['metadata']['source'] == 'FDA'
)
print("\nFDA files:")
for file, count in fda_files.most_common(10):
    print(f"  {file}: {count} chunks")

# Preview the first document of every source (first 500 characters)
print("\n" + "=" * 80, "SAMPLE CONTENT FROM EACH SOURCE", "=" * 80, sep="\n")

for source in sources:
    source_docs = [doc for doc in docs_data if doc['metadata']['source'] == source]
    if source_docs:
        print(f"\n{source} - Sample content:")
        print((source_docs[0].get('content') or "")[:500] + "...")

# ============================================================================
# Step 2: Extract Medical Topics from Your Data
# ============================================================================

def extract_medical_topics(docs_data: List[Dict], max_per_category: int = 20) -> Dict[str, List[str]]:
    """Collect medication and condition mentions found in the corpus text.

    Matching is a simple regex heuristic. Medication patterns are applied
    case-insensitively, so the leading-capital requirement in the pattern is
    effectively relaxed and ordinary words that happen to end in one of the
    listed suffixes are captured as well. Matches are lower-cased before being
    stored so that "Naproxen tablets" and "naproxen tablets" count as a single
    topic instead of taking two of the limited slots.

    Args:
        docs_data: Documents as dicts; only the ``'content'`` value is read and
            a missing or ``None`` content is treated as empty text.
        max_per_category: Maximum number of entries returned per category.

    Returns:
        A dict with the keys ``'medications'``, ``'conditions'``,
        ``'procedures'`` and ``'side_effects'``. Each value is an
        alphabetically sorted list truncated to ``max_per_category`` entries
        (an alphabetical cut, not a frequency ranking). No extraction rules
        exist for procedures or side effects, so those lists are empty.
    """
    # Compile once instead of re-resolving every pattern for every document.
    medication_patterns = [
        # Drug-ish suffixes
        re.compile(r'\b([A-Z][a-z]+(?:ine|ol|pril|sartan|ide|mycin|cillin))\b', re.IGNORECASE),
        re.compile(r'\b([A-Z][a-z]+ (?:tablets|capsules|injection))\b', re.IGNORECASE),
    ]
    condition_patterns = [
        re.compile(r'\b(diabetes|hypertension|heart disease|high blood pressure|cholesterol)\b'),
        re.compile(r'\b(type 2 diabetes|cardiovascular disease)\b'),
    ]

    topics = {
        'medications': set(),
        'conditions': set(),
        'procedures': set(),
        'side_effects': set(),
    }

    for doc in docs_data:
        content = doc.get('content') or ""
        if not content:
            continue
        content_lower = content.lower()

        # Medications: normalise case so spelling variants collapse together.
        for pattern in medication_patterns:
            topics['medications'].update(match.lower() for match in pattern.findall(content))

        # Conditions: patterns are lowercase, so search the lowercased text.
        for pattern in condition_patterns:
            topics['conditions'].update(pattern.findall(content_lower))

        # You could add more extraction rules here for procedures / side effects

    return {name: sorted(found)[:max_per_category] for name, found in topics.items()}

topics_found = extract_medical_topics(docs_data)

print("\n" + "=" * 80, "MEDICAL TOPICS FOUND IN YOUR CORPUS", "=" * 80, sep="\n")

# One heading per category (upper-cased name plus entry count), followed by at
# most ten of its entries.
for category, items in topics_found.items():
    heading = f"{category.upper()} ({len(items)}):"
    print("\n" + heading)
    for item in items[:10]:
        print(f"  - {item}")

# ============================================================================
# Step 3: Generate Questions Based on YOUR Data (robust)
# ============================================================================

# Seed the shared random generator before any corpus sampling happens.
random.seed(42)

def _sample_corpus(docs_data, per_source=3, max_sources=4, clip_chars=600):
    """Small, diverse, token-safe corpus sample."""
    uniq_sources = list({doc["metadata"]["source"] for doc in docs_data})
    random.shuffle(uniq_sources)
    uniq_sources = uniq_sources[:max_sources]

    samples = []
    for src in uniq_sources:
        source_docs = [d for d in docs_data if d["metadata"]["source"] == src]
        if not source_docs:
            continue
        random.shuffle(source_docs)
        take = min(per_source, len(source_docs))
        for d in source_docs[:take]:
            content = (d.get("content") or "")[:clip_chars]
            samples.append({"source": src, "content": content})
    return samples

def _normalize_category(cat: str) -> str:
    c = (cat or "").strip().lower()
    if c in {"medication","condition","treatment","side_effect"}:
        return c
    if "side" in c: return "side_effect"
    if "treat" in c: return "treatment"
    if "cond" in c: return "condition"
    return "medication"

def _normalize_source(src: str) -> str:
    s = (src or "").lower()
    if "cdc" in s: return "CDC"
    if "medline" in s or "nih" in s: return "MedlinePlus (NIH)"
    return "FDA"

def _request_completion(system_message: str, prompt: str, **extra) -> str:
    """Send one chat request via the module-level client and return the reply text."""
    resp = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
        temperature=0.5,
        **extra,
    )
    return resp.choices[0].message.content


def _extract_question_list(raw) -> list:
    """Parse raw model output into a list of candidate question items.

    Accepts a JSON object with a "questions" key or a bare JSON array, with or
    without surrounding text such as code fences. Raises ``ValueError`` when
    no JSON value can be recovered.
    """
    text = (raw or "").strip()  # the reply text may be None or empty
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Look for the outermost {...} and [...] spans and try whichever opens
        # first, so a top-level array is not truncated to its inner objects.
        spans = []
        for opener, closer in (("{", "}"), ("[", "]")):
            start, end = text.find(opener), text.rfind(closer)
            if 0 <= start < end:
                spans.append((start, end + 1))
        for start, stop in sorted(spans):
            try:
                data = json.loads(text[start:stop])
                break
            except json.JSONDecodeError:
                continue
        else:
            raise ValueError("no JSON object or array found in model output")

    if isinstance(data, dict):
        data = data.get("questions", [])
    return data if isinstance(data, list) else []


def _clean_question(q):
    """Return a normalized question dict, or None when the item is unusable."""
    if not isinstance(q, dict):
        return None

    def as_text(value) -> str:
        # Model output is untyped; tolerate numbers/lists where text is expected.
        return str(value).strip() if value else ""

    hi = as_text(q.get("hindi"))
    en = as_text(q.get("english"))
    if not hi or not en:
        return None

    complexity = as_text(q.get("complexity")).lower()
    if complexity not in {"simple", "moderate", "complex"}:
        complexity = "simple"

    return {
        "hindi": hi,
        "english": en,
        "category": _normalize_category(as_text(q.get("category"))),
        "complexity": complexity,
        "expected_source": _normalize_source(as_text(q.get("expected_source"))),
        "answerable": True,
    }


def generate_questions_from_corpus(docs_data: List[Dict], num_questions: int = 30) -> List[Dict]:
    """Ask GPT-4o for Hindi/English questions grounded in a corpus sample.

    A small sample of the corpus is embedded in the prompt. The first request
    enforces a JSON-object response. If that request or its parsing fails, a
    second request is made without the enforced format and the JSON is
    extracted from the reply text.

    Args:
        docs_data: Corpus documents passed on to ``_sample_corpus``.
        num_questions: Number of questions requested and the maximum returned.

    Returns:
        Up to ``num_questions`` dicts with the keys ``hindi``, ``english``,
        ``category``, ``complexity``, ``expected_source`` and ``answerable``.
        Items lacking either language version are dropped. An empty list is
        returned when nothing could be sampled or both attempts fail; errors
        are printed rather than raised, so a failed generation does not abort
        the notebook run.
    """
    samples = _sample_corpus(docs_data, per_source=3, max_sources=4, clip_chars=600)
    if not samples:
        print("[Question gen] No samples found; returning empty list.")
        return []

    corpus_context = "\n\n---SAMPLE FROM CORPUS---\n\n".join(
        [f"Source: {s['source']}\n{s['content']}" for s in samples]
    )

    prompt = f"""You are a medical question generator. Based ONLY on the corpus samples below, generate EXACTLY {num_questions} Hindi medical questions that CAN BE ANSWERED from these samples.

CORPUS SAMPLES:
{corpus_context}

RULES:
1) Every question must be answerable using the samples above (no outside knowledge).
2) Cover medications, conditions, treatments, and side effects actually present.
3) Provide both Hindi and English versions for each question.
4) category ∈ ["medication","condition","treatment","side_effect"].
5) complexity ∈ ["simple","moderate","complex"].
6) expected_source ∈ ["FDA","CDC","MedlinePlus (NIH)"] chosen from the samples’ sources.
7) Respond ONLY as a JSON OBJECT with a top-level key "questions" whose value is a JSON ARRAY of exactly {num_questions} items.
8) No extra keys or commentary outside "questions".

JSON SHAPE:
{{
  "questions": [
    {{
      "hindi": "string",
      "english": "string",
      "category": "medication|condition|treatment|side_effect",
      "complexity": "simple|moderate|complex",
      "expected_source": "FDA|CDC|MedlinePlus (NIH)",
      "answerable": true
    }}
  ]
}}
"""

    raw = None
    try:
        # Enforce a JSON object response
        raw = _request_completion(
            "You generate answerable medical questions strictly grounded in the provided corpus.",
            prompt,
            response_format={"type": "json_object"},
        )
        qs = _extract_question_list(raw)
    except Exception as e:  # best-effort boundary: report and try the fallback
        print(f"[Question gen] Error parsing json_object: {e}")
        # Fallback: try without enforced format and extract JSON
        try:
            raw = _request_completion("Output ONLY valid JSON per the user instruction.", prompt)
            qs = _extract_question_list(raw)
        except Exception as e2:
            print(f"[Question gen] Fallback parse failed: {e2}")
            if raw:
                print("[Assistant raw output]:\n", raw[:2000])
            return []

    cleaned = (_clean_question(q) for q in qs)
    out = [item for item in cleaned if item is not None]
    return out[:num_questions]

print("\n" + "=" * 80, "GENERATING DATA-DRIVEN QUESTIONS", "=" * 80, sep="\n")

generated_questions = generate_questions_from_corpus(docs_data, num_questions=30)
print(f"\n✓ Generated {len(generated_questions)} questions")

# Preview the first five: Hindi text, then English text with its labels
print("\nSample generated questions:")
for i, q in enumerate(generated_questions[:5], 1):
    labels = "  •  ".join((q['category'], q['complexity'], q['expected_source']))
    print(f"\n{i}. {q['hindi']}")
    print(f"   ({q['english']})  |  {labels}")

# ============================================================================
# Step 4: Validate Questions Against Corpus
# ============================================================================

def validate_question_answerability(question_english: str, docs_data: List[Dict],
                                    min_matches: int = 2, top_k: int = 3) -> Dict:
    """Check whether a question can likely be answered from the corpus.

    This is a keyword heuristic: the distinct words of the question that are
    longer than four characters (minus a few common question words) are looked
    up as substrings in each document's lowercased content.

    Args:
        question_english: English text of the question; ``None`` or empty
            yields a "not answerable" result.
        docs_data: Documents as dicts with an optional ``'content'`` string
            and a ``metadata['source']`` label.
        min_matches: Number of distinct key terms a document must contain.
            When the question has fewer key terms than this, all of its key
            terms are required instead, so short questions such as
            "What is asthma?" can still be matched.
        top_k: Number of best-matching documents reported.

    Returns:
        A dict with ``'answerable'`` (bool), ``'relevant_docs_count'`` (int)
        and ``'top_docs'``: up to ``top_k`` dicts holding the first 300
        characters of ``'content'``, the ``'source'`` and the
        ``'match_count'``, ordered by match count, highest first.
    """
    # Long but uninformative words that would otherwise count as key terms.
    stopwords = {
        "about", "after", "before", "being", "between", "could", "during",
        "other", "should", "their", "there", "these", "those", "where",
        "which", "while", "would",
    }

    words = re.findall(r'\b\w+\b', (question_english or "").lower())
    # dict.fromkeys de-duplicates while keeping order, so a word repeated in
    # the question cannot satisfy the threshold on its own.
    key_terms = list(dict.fromkeys(w for w in words if len(w) > 4 and w not in stopwords))
    if not key_terms:
        return {'answerable': False, 'relevant_docs_count': 0, 'top_docs': []}

    required = max(1, min(min_matches, len(key_terms)))

    # Keep only (score, doc) references while scanning; the content excerpts
    # are built afterwards for the few documents that are actually reported.
    scored = []
    for doc in docs_data:
        content_lower = (doc.get('content') or "").lower()
        matches = sum(1 for term in key_terms if term in content_lower)
        if matches >= required:
            scored.append((matches, doc))

    best = sorted(scored, key=lambda pair: pair[0], reverse=True)[:top_k]
    top_docs = [
        {
            'content': (doc.get('content') or "")[:300],
            'source': doc['metadata']['source'],
            'match_count': matches,
        }
        for matches, doc in best
    ]

    return {
        'answerable': bool(scored),
        'relevant_docs_count': len(scored),
        'top_docs': top_docs,
    }

print("\n" + "=" * 80, "VALIDATING QUESTION ANSWERABILITY", "=" * 80, sep="\n")

# Attach the validation result to every generated question and keep the ones
# flagged as answerable.
validated_questions = []
for q in generated_questions:
    validation = q['validation'] = validate_question_answerability(q['english'], docs_data)
    if not validation['answerable']:
        continue
    validated_questions.append(q)

print(f"\n✓ Answerable questions: {len(validated_questions)}/{len(generated_questions)}")

# Show validation results for the first three kept questions
print("\nValidation samples:")
for i, q in enumerate(validated_questions[:3], 1):
    report = q['validation']
    print(f"\n{i}. {q['english']}")
    print(f"   Answerable: {report['answerable']}")
    print(f"   Relevant docs: {report['relevant_docs_count']}")
    best_docs = report['top_docs']
    if best_docs:
        print(f"   Top match source: {best_docs[0]['source']}")

# ============================================================================
# Step 5: Produce a copy-pasteable Python dict for Notebook 3
# ============================================================================

from collections import defaultdict
from pprint import pformat

print("\n" + "=" * 80, "COPY THIS INTO NOTEBOOK 3:", "=" * 80, sep="\n")

# Bucket the validated questions by category, copying only the four fields
# that are exported.
grouped = defaultdict(list)
for q in validated_questions:
    exported = {field: q[field] for field in ('hindi', 'english', 'category', 'complexity')}
    grouped[q['category']].append(exported)

# pformat emits a valid Python literal, so no manual quoting or escaping is needed
formatted = pformat(dict(grouped), width=120, compact=False)
print(f"HINDI_TEST_QUESTIONS = {formatted}")


ANALYZING YOUR MEDICAL CORPUS

Total documents: 16036

Documents by source:
  FDA: 15942 documents
  MedlinePlus (NIH): 94 documents

FDA files:
  hypertension_drugs.json: 4365 chunks
  diabetes_drugs.json: 3373 chunks
  asthma_drugs.json: 2847 chunks
  high_blood_pressure_drugs.json: 2683 chunks
  heart_disease_drugs.json: 2674 chunks

SAMPLE CONTENT FROM EACH SOURCE

FDA - Sample content:
INDICATIONS AND USAGE: 1 INDICATIONS AND USAGE Naproxen tablets and naproxen sodium tablets are indicated for: the relief of the signs and symptoms of: • rheumatoid arthritis • osteoarthritis • ankylosing spondylitis • Polyarticular Juvenile Idiopathic Arthritis Naproxen tablets and naproxen sodium tablets are also indicated for: the relief of signs and symptoms of: • tendonitis • bursitis • acute gout the management of: • pain • primary dysmenorrhea Naproxen tablets and naproxen sodium tablets ...

MedlinePlus (NIH) - Sample content:
SUMMARY: What is asthma?Asthma is a chronic (long-term) lung dise