Live Policy Intelligence Agent (RAG + Live Data)
Overview

This project implements a live, self-updating policy agent that connects directly to an authoritative public health website, detects content changes, refreshes its internal knowledge base automatically, and answers user queries using retrieval-augmented generation (RAG).

Unlike static document-based systems, this agent ensures that responses are always grounded in the latest available guidelines.

In [None]:
pip install groq


Project

In [None]:
import requests

url = "https://www.mohfw.gov.in/"

response = requests.get(url, timeout=10)

print("Status Code:", response.status_code)
print("Response length:", len(response.text))
print("\nFirst 500 characters:\n")
print(response.text[:500])


Status Code: 200
Response length: 122394

First 500 characters:

﻿

<!-- THEME DEBUG -->
<!-- CALL: theme('html') -->
<!-- FILE NAME SUGGESTIONS:
   * html--front.tpl.php
   * html--node.tpl.php
   x html.tpl.php
-->
<!-- BEGIN OUTPUT from 'sites/all/themes/dhfw/templates/html.tpl.php' -->
<!DOCTYPE html>
<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->
<!--[if lte IE 6]><html class="lt-ie9 lt-ie8 lt-ie7"  lang="en" dir="ltr"><![endif]-->
<!--[if (IE 7)&(!IEMobile)]><html class="lt-ie9 lt-ie8"  lang="en" dir="ltr"><![endif]-->
<!--[if 


In [None]:
from bs4 import BeautifulSoup

html = response.text
soup = BeautifulSoup(html, "html.parser")

text = soup.get_text(separator=" ", strip=True)

print("Clean text length:", len(text))
print("\nFirst 1000 characters of clean text:\n")
print(text[:1000])


In [None]:
import hashlib

def get_hash(text):
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

current_hash = get_hash(text)
print("Document hash:", current_hash)


Document hash: 906bdbf9d4c2fd5de829c0e6eb8ed0bd26cde47d1bf7434f84188153547a6a92


In [None]:
try:
  with open("last_hash.txt", "r") as f:
    last_hash = f.read().strip()
except FileNotFoundError:
  last_hash=None
if last_hash is None:
  print("first run")
elif last_hash!=current_hash:
  print("document changed")
else:
  print("document not changed")

first run


In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.5.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.5.0-py3-none-any.whl (329 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.6/329.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.5.0


In [None]:
from pypdf import PdfReader
pdf="/content/WHO-2019-nCoV-clinical-2023.2-eng.pdf"
reader=PdfReader(pdf)
all_text=""
for page in reader.pages:
  text=page.extract_text()
  if text:
    all_text+=text+"/n"
    print("/n total page length",len(all_text))
    print("/n first 1000 line")
    print(all_text[:1000])

In [None]:
# for who guidlines
import hashlib

def get_hash(text):
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

current_hash = get_hash(all_text)
print("Document hash:", current_hash)

Document hash: aaf31319690b3df43187a09acf48ad064b1e152234bf80cb57963d1f58a99017


In [None]:
#guidlines specific
try:
  with open("last_hash.txt", "r") as f:
    last_hash = f.read().strip()
except FileNotFoundError:
  last_hash=None
if last_hash is None:
  print("first run")
elif last_hash!=current_hash:
  print("document changed")
else:
  print("document not changed")

first run


Sample chunking

In [None]:
def chunking(text,chunk_size=500,overlap=100):
  chunks=[]
  start=0
  text_length=len(text)
  while start < text_length:
        end = min(start + chunk_size, text_length)
        chunk = text[start:end]
        chunks.append(chunk)
        start += (chunk_size - overlap)  # always move forward

  return chunks
chunks=(chunking(all_text))
print(len(chunks))
print(chunks[0])

1590
Clinical management of COVID-19 
Living guideline 
18 August 2023 
 
 /nWHO/2019-nCoV/clinical/2023.2 
© World Health Organization 2023 
Some rights reserved. This work is available under the Creative Commons Attribution-NonCommercial-
ShareAlike 3.0 IGO licence (CC  BY-NC-SA 3.0 IGO; https://creativecommons.org/licenses/by-nc-sa/
3.0/igo).  
Under the terms of this licence, you may copy, redistribute and adapt the work for non- commercial 
purposes, provided the work is appropriately cited, as 


In [None]:
!pip install -U sentence-transformers




In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
model=SentenceTransformer("all-MiniLM-L6-v2")
max_chunks=1000
safe_chunks=chunks[:max_chunks]
embeddings=model.encode(
    safe_chunks,
    show_progress_bar=True,
    batch_size=32
)
embeddings=np.array(embeddings)
print(embeddings.shape)



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

(1000, 384)


Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def search_db(query,model,chunks,embeddings,top_k=3):
  query_embeddings=model.encode([query])
  similarities=cosine_similarity(query_embeddings,embeddings)
  top_indices=np.argsort(similarities[0])[-top_k:][::-1]
  results=[]
  for idx in top_indices:
    results.append({
        "score":similarities[0, idx],
        "text":chunks[idx]
    })
  return results

In [None]:
query = "What are the isolation rules for COVID-19?"

results = search_db(
    query=query,
    model=model,
    chunks=safe_chunks,
    embeddings=embeddings,
    top_k=3
)

for i, r in enumerate(results, 1):
    print(f"\nResult {i} | Score: {r['score']:.4f}")
    print(r["text"][:500])



Result 1 | Score: 0.7093
n the 
period of isolation. 
Research Needs 
Uncertainties, emerging evidence, and future research 
Despite the guidance for discontinuation of transmission-based precautions (including isolation) and release from the COVID-19 
care pathway, there remain uncertain outcomes associated with the onward transmission of SARS-CoV-2 infection, as well as 
implications for the duration of isolation required for patients. Future research could be influenced by these uncertainties, i.e. 
the generation of

Result 2 | Score: 0.7047
s 
Clinical management of COVID-19: living guideline - World Health Organization (WHO)
21 of 186/nJustification 
There is very low certainty evidence that using rapid antigen tests to decrease duration of isolation will have trivial effects on 
transmission and subsequent hospitalization. 
With values and preferences of the GDG preferring shorter periods of isolation, given the uncertainty of the data, incorporating 
rapid antigen tests into a

In [None]:
def build_rag_prompt(query, retrieved_chunks):
    context = "\n\n".join(
        [f"Source {i+1}:\n{c['text']}" for i, c in enumerate(retrieved_chunks)]
    )

    prompt = f"""
You are a health information assistant.

Use ONLY the information provided in the sources below.
Do NOT add medical advice, diagnosis, or treatment.
If the answer is not found, say so clearly.

User question:
{query}

Sources:
{context}

Answer in clear, simple language.
Add this disclaimer at the end:
"This is informational only. Please consult a healthcare professional."
"""
    return prompt


In [None]:
prompt=build_rag_prompt(query,results)
print(prompt)

what chnaged


In [None]:
old_text=all_text.replace("isolation","quarantine",5)
new_text=all_text
print(len(old_text))
print(len(new_text))

635681
635676


In [None]:
import difflib



In [None]:
old_lines=old_text.splitlines()
new_lines=new_text.splitlines()
diff=list(
    difflib.unified_diff(
        old_lines,
        new_lines,
        fromfile="old",
        tofile="new",
        lineterm=""
    )
)

In [None]:
print("Total diff lines:", len(diff))
print("\nSample diff output:\n")
for line in diff[:50]:
    print(line)


In [None]:
added=[]
removed=[]
for line in diff:
  if line.startswith("+"):
    added.append(line[1:].strip())
  elif line.startswith("-"):
    removed.append(line[1:].strip())
print(len(added))
print(len(removed))

5
5


In [None]:
print("\nSample ADDED lines:")
for l in added[:5]:
    print("-", l)

print("\nSample REMOVED lines:")
for l in removed[:5]:
    print("-", l)



Sample ADDED lines:
- ++ new
- Infectious period and de-isolation ..............................................................................................................................................................................18
- discussion, infectious period and de-isolation.
- a scoping review (3). PICOs were: 1) isolation for 5 days; and 2) removal of isolation based on negative rapid antigen test as
- symptomatic patients, and the use of antigen testing for de-isolation and estimation of number for hospitalization and/or death by

Sample REMOVED lines:
- -- old
- Infectious period and de-quarantine ..............................................................................................................................................................................18
- discussion, infectious period and de-quarantine.
- a scoping review (3). PICOs were: 1) quarantine for 5 days; and 2) removal of quarantine based on negative rapid antigen test as
- symptomatic p

In [None]:
changed_summary={
    "added":added,
    "removed":removed

}

In [None]:
def decide_response_mode(query,changed_summary):
  q=query.lower()
  has_change=bool(changed_summary["added"]or changed_summary["removed"])
  ask_about_change=any(
      phrase in q for phrase in[
          "what changed",
          "updated",
          "new",
          "difference",
          "latest update"

      ]

  )
  if ask_about_change and has_change:
    return "changes"
  elif has_change:
    return 'both'
  else:
    return "normal"

In [None]:
test_queries = [
    "What are the isolation rules for COVID-19?",
    "What changed in the COVID-19 isolation rules?",
    "Is there any update in the guideline?"
]

for tq in test_queries:
    mode = decide_response_mode(tq, changed_summary)
    print(f"Query: {tq}")
    print("Mode:", mode)
    print("-" * 40)


Query: What are the isolation rules for COVID-19?
Mode: both
----------------------------------------
Query: What changed in the COVID-19 isolation rules?
Mode: changes
----------------------------------------
Query: Is there any update in the guideline?
Mode: both
----------------------------------------


Agent Logic

In [None]:
def agent_answer(query):
    """
    Main agent entry point.
    Decides how to answer based on guideline updates.
    """
    # 1. Decide response mode
    mode = decide_response_mode(query, changed_summary)

    # 2. Retrieve relevant chunks
    retrieved = search_db(
        query=query,
        model=model,
        chunks=safe_chunks,
        embeddings=embeddings,
        top_k=3
    )

    # 3. Build response payload
    response = {
        "mode": mode,
        "retrieved_chunks": retrieved,
        "changes": changed_summary if mode in ["CHANGES", "BOTH"] else None
    }

    return response


In [None]:
test_queries = [
    "What are the isolation rules for COVID-19?",
    "What changed in the COVID-19 isolation rules?"
]

for q in test_queries:
    output = agent_answer(q)
    print("\nQUERY:", q)
    print("MODE:", output["mode"])
    print("Retrieved chunks:", len(output["retrieved_chunks"]))
    if output["changes"]:
        print("Changes detected:", len(output["changes"]["added"]), "added")



QUERY: What are the isolation rules for COVID-19?
MODE: both
Retrieved chunks: 3

QUERY: What changed in the COVID-19 isolation rules?
MODE: changes
Retrieved chunks: 3


render response

In [None]:
def render_response(answer, mode, change_summary):
    response_str = ""

    # The LLM's answer already contains the RAG response and the disclaimer from build_rag_prompt
    response_str += answer

    # If the mode indicates changes should be shown and there are actual changes
    if mode in ["changes", "both"] and (change_summary["added"] or change_summary["removed"]):
        response_str += "\n\n--- Detected Changes in Guideline ---\n"
        if change_summary["removed"]:
            response_str += "\nRemoved lines (first 5):\n"
            for r in change_summary["removed"][:5]:
                response_str += f"- {r}\n"
        if change_summary["added"]:
            response_str += "\nAdded lines (first 5):\n"
            for a in change_summary["added"][:5]:
                response_str += f"- {a}\n"
        else:
            response_str += "\nNo explicit text changes to summarize from the diff."
    elif mode in ["changes", "both"] and not (change_summary["added"] or change_summary["removed"]):
        response_str += "\n\nNo significant text changes detected to summarize (perhaps first run or no textual changes in diff).\n"

    return response_str


In [None]:
import requests
from bs4 import BeautifulSoup

def refresh_guideline_text(url):
    """
    Fetches latest guideline text from a live health webpage.
    Returns clean extracted text.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove scripts/styles
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    text = soup.get_text(separator=" ", strip=True)

    return text


In [None]:
live_url = "https://www.who.int/emergencies/diseases/novel-coronavirus-2019"

new_text = refresh_guideline_text(live_url)




In [None]:
import requests
from bs4 import BeautifulSoup
import hashlib
import numpy as np

def refresh_guideline_data(url):
    global all_text, chunks, safe_chunks, embeddings

    # 1. Fetch live page
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # 2. Extract clean text
    soup = BeautifulSoup(response.text, "html.parser")
    new_text = soup.get_text(separator=" ", strip=True)

    # 3. Detect change
    def get_hash(text):
        return hashlib.sha256(text.encode("utf-8")).hexdigest()

    new_hash = get_hash(new_text)

    try:
        with open("guideline_hash.txt", "r") as f:
            old_hash = f.read().strip()
    except FileNotFoundError:
        old_hash = None

    if old_hash == new_hash:
        print("No guideline update detected.")
        return False

    print("Guideline update detected. Re-indexing...")

    # 4. Save new hash
    with open("guideline_hash.txt", "w") as f:
        f.write(new_hash)

    # 5. Update global text
    all_text = new_text

    # 6. Re-chunk
    chunks = chunking(all_text)
    safe_chunks = chunks[:1000]

    # 7. Re-embed
    embeddings = model.encode(
        safe_chunks,
        batch_size=32,
        show_progress_bar=True
    )
    embeddings = np.array(embeddings)

    print("Guideline refreshed successfully.")
    return True


In [None]:
def llm_generate(prompt, client):
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": "You are a safe health information assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content


In [None]:
def retrieve_chunks(query, top_k=3):
    return search_db(
        query=query,
        model=model,
        chunks=safe_chunks,
        embeddings=embeddings,
        top_k=top_k
    )


In [None]:
def agent_answer(query):
    refresh_guideline_data(url="https://www.who.int/emergencies/diseases/novel-coronavirus-2019")   # updates chunks if changed
    return llm_generate(query)


In [None]:
def LLM_generate(query):
    retrieved = retrieve_chunks(query)
    prompt = build_rag_prompt(query, retrieved)
    response = llm_generate.generate(prompt)

    return response


Observe → Decide → Act → Respond loop:

Observe: Fetch live data and compute content hash

Decide: Determine response mode based on detected changes

Act: Refresh knowledge base and retrieve relevant chunks

Respond: Generate and format the final answer




In [None]:
#live agent

import os
from groq import Groq
import difflib

SOURCE_URL = "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/advice-for-public"

agent_state = {
    "last_hash": None,
    "previous_text": ""
}

def live_guideline_agent(query):
    global all_text, chunks, safe_chunks, embeddings, model, max_chunks
    # ---- FETCH ----
    new_live_text = refresh_guideline_text(SOURCE_URL)

    # ---- HASH ----
    current_hash = get_hash(new_live_text)
    data_changed = agent_state["last_hash"] != current_hash
    change_summary = {"added": [], "removed": []}
    if data_changed:
        print("Guideline update detected. Re-indexing...")
        if agent_state["previous_text"]:
            old_lines = agent_state["previous_text"].splitlines()
            new_lines = new_live_text.splitlines()

            diff = difflib.unified_diff(
                old_lines,
                new_lines,
                fromfile="previous",
                tofile="current",
                lineterm=""
            )

            for line in diff:
                if line.startswith("+") and not line.startswith("+++"):
                    change_summary["added"].append(line[1:].strip())
                elif line.startswith("-") and not line.startswith("---"):
                    change_summary["removed"].append(line[1:].strip())

        all_text = new_live_text
        chunks = chunking(all_text)
        safe_chunks = chunks[:max_chunks]
        embeddings = model.encode(safe_chunks, batch_size=32, show_progress_bar=True)
        embeddings = np.array(embeddings)
        agent_state["previous_text"] = all_text
        agent_state["last_hash"] = current_hash
        print("Guideline refreshed successfully.")
    else:
        if not ('model' in globals() and hasattr(model, 'encode') and 'all_text' in globals() and all_text != "" and 'embeddings' in globals() and len(embeddings) > 0):
            print("Initializing RAG components (no update detected, but globals might be empty)...")
            # Load the model if not already loaded
            if 'model' not in globals():
                model = SentenceTransformer("all-MiniLM-L6-v2") # Assuming SentenceTransformer is imported and model is meant to be global

            all_text = new_live_text
            chunks = chunking(all_text)
            safe_chunks = chunks[:max_chunks]
            embeddings = model.encode(safe_chunks, batch_size=32, show_progress_bar=True)
            embeddings = np.array(embeddings)
            agent_state["previous_text"] = all_text
            agent_state["last_hash"] = current_hash
            print("RAG components initialized.")
        else:
            print("No guideline update detected. Using existing RAG components.")


    # ---- DECIDE ----
    mode = decide_response_mode(query, change_summary)
    retrieved_chunks = retrieve_chunks(query, top_k=3)
    prompt = build_rag_prompt(
        query=query,
        retrieved_chunks=retrieved_chunks,
    )

    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    llm_response = llm_generate(prompt, client)
    return render_response(llm_response, mode, change_summary)
# WRAPPER FUNCTIONS
def agent_answer(query):
    return live_guideline_agent(query)

def ask_agent(query):
    return agent_answer(query)
ask_agent("What are the current isolation and quarantine guidelines?")


Guideline update detected. Re-indexing...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Guideline refreshed successfully.


'Based on the provided sources, here are the current isolation and quarantine guidelines:\n\nThere is no specific mention of isolation or quarantine guidelines in the provided sources. According to Source 3, it\'s recommended to get information from trusted sources such as WHO or local and national health authorities for guidance on protecting oneself from COVID-19, including quarantine or isolation.\n\nHowever, here are some general guidelines that can be inferred from the provided sources:\n\n- Avoid crowded and close contact areas (Source 1)\n- Maintain a physical distance of at least 1 meter from others (Source 2)\n- Wear a properly fitted mask when physical distancing is not possible and in poorly ventilated settings (Source 2)\n\nFor specific guidance on isolation and quarantine, it\'s best to consult local and national health authorities.\n\n"This is informational only. Please consult a healthcare professional."'