In [None]:
# Install dependencies (run once in Colab)
!pip install ibm-watsonx-ai beautifulsoup4 requests

Collecting ibm-watsonx-ai
  Downloading ibm_watsonx_ai-1.4.4-py3-none-any.whl.metadata (3.3 kB)
Collecting lomond (from ibm-watsonx-ai)
  Downloading lomond-0.3.3-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting ibm-cos-sdk<2.15.0,>=2.12.0 (from ibm-watsonx-ai)
  Downloading ibm_cos_sdk-2.14.3.tar.gz (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ibm-cos-sdk-core==2.14.3 (from ibm-cos-sdk<2.15.0,>=2.12.0->ibm-watsonx-ai)
  Downloading ibm_cos_sdk_core-2.14.3.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ibm-cos-sdk-s3transfer==2.14.3 (from ibm-cos-sdk<2.15.0,>=2.12.0->ibm-watsonx-ai)
  Downloading ibm_cos_sdk_s3transfer-2.14.3.tar.gz (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [19]:
from google.colab import userdata

api_key = userdata.get('WATSONX_API_KEY')
project_id = userdata.get('WATSONX_PROJECT_ID')

In [None]:
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import DecodingMethods

instance_url = "https://us-south.ml.cloud.ibm.com"

credentials = {"url": instance_url, "apikey": api_key}

gen_params = {
    GenParams.DECODING_METHOD: DecodingMethods.SAMPLE,
    GenParams.MAX_NEW_TOKENS: 400,
    "temperature": 0.7
}

model = ModelInference(
    model_id="ibm/granite-3-8b-instruct",
    credentials=credentials,
    project_id=project_id,
    params=gen_params,
    verify=False
)


In [None]:
model.generate_text(prompt="What is the meaning of life?")

'\n\nThe meaning of life is a philosophical and metaphysical question related to the purpose or significance of life or existence in general. This question has been asked for centuries and does not have a definitive answer. Some people find meaning through personal growth, relationships, love, or through contributing to the betterment of humanity. Ultimately, the meaning of life may be a personal and subjective concept.'

In [None]:
import json
import requests
from bs4 import BeautifulSoup
from getpass import getpass
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import DecodingMethods
import re

# -----------------------
#  Watsonx credentials
# -----------------------
instance_url = "https://us-south.ml.cloud.ibm.com"

credentials = {"url": instance_url, "apikey": api_key}

# -----------------------
# Granite model setup
# -----------------------
gen_params = {
    GenParams.DECODING_METHOD: DecodingMethods.SAMPLE,
    GenParams.MAX_NEW_TOKENS: 400,
    "temperature": 0.7
}

model = ModelInference(
    model_id="ibm/granite-3-8b-instruct",
    credentials=credentials,
    project_id=project_id,
    params=gen_params,
    verify=False
)

# -----------------------
# Scraping public sources
# -----------------------
def scrape_text(url):
    print(f"Scraping {url}")
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    paragraphs = soup.find_all('p')
    # return paragraphs as list of strings
    print(soup)

    return [p.get_text().strip() for p in paragraphs if p.get_text().strip()]

# -----------------------
# Health paragraph filter
# -----------------------
health_keywords = ["health", "exercise", "diet", "nutrition", "disease",
                   "prevention", "well-being", "diabetes", "fitness", "cardiovascular", "obesity"]

def is_health_paragraph(text):
    text_lower = text.lower()
    return any(keyword in text_lower for keyword in health_keywords)

# -----------------------
# Generate QA pairs (text parsing)
# -----------------------
qa_list = []

def generate_qa_text(paragraph, source, topic):
    prompt = (
        "Create 3-5 question-answer pairs from the following paragraph. "
        "Output in plain text only with the format:\n"
        "Q: <question>\nA: <answer>\n\n"
        "Do NOT output JSON, numbering, or extra text.\n\n"
        f"{paragraph}"
    )
    response = model.generate_text(prompt=prompt)

    # Regex to extract Q/A pairs
    qa_matches = re.findall(r"Q:\s*(.+?)\nA:\s*(.+?)(?:\n\n|$)", response, flags=re.DOTALL)

    for q, a in qa_matches:
        # difficulty = "Basic" if len(a.split()) <= 20 else "Intermediate" if len(a.split()) <= 40 else "Advanced"
        qa_list.append({
            "question": q.strip(),
            "answer": a.strip(),
            "source": source,
            "topic": topic
        })
# -----------------------
# Load URLs from JSON and process
# -----------------------
with open("/content/first-aid.json") as f:
    urls = json.load(f)


for entry in urls:
    source = entry.get("url")
    topic = entry.get("Topic")
    url = entry["url"]
    paragraphs = scrape_text(url)
    for para in paragraphs:
        if is_health_paragraph(para):
            generate_qa_text(para, source, topic)

# -----------------------
# Save JSON
# -----------------------
with open("preventive_health_faq.json", "w") as f:
    json.dump(qa_list, f, indent=2)

print(f"Saved {len(qa_list)} QA pairs to preventive_health_faq.json")

# Optional: download in Colab
from google.colab import files
files.download("/content/preventive_health_faq.json")


Scraping https://www.globalfirstaidcentre.org/general-approach
<html><head><title>Request Rejected</title></head>
<body>The requested URL was rejected. Please consult with your administrator.<br/><br/>
Your support ID is abf1ac6c-93fa-44e0-b63b-24cfff2ee15b<br/><br/><a href="javascript:history.back();">[Go Back]</a></body></html>

Scraping https://www.globalfirstaidcentre.org/hand-hygiene
<html><head><title>Request Rejected</title></head>
<body>The requested URL was rejected. Please consult with your administrator.<br/><br/>
Your support ID is ad8dd7bf-9198-4db0-aa40-e09011a95982<br/><br/><a href="javascript:history.back();">[Go Back]</a></body></html>

Scraping https://www.globalfirstaidcentre.org/psychological-first-aid
<html><head><title>Request Rejected</title></head>
<body>The requested URL was rejected. Please consult with your administrator.<br/><br/>
Your support ID is 304dca9e-9dec-40ff-80f7-c32f7d450210<br/><br/><a href="javascript:history.back();">[Go Back]</a></body></html>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
!pip install docling

Collecting docling
  Downloading docling-2.60.0-py3-none-any.whl.metadata (11 kB)
Collecting docling-core<3.0.0,>=2.48.2 (from docling-core[chunking]<3.0.0,>=2.48.2->docling)
  Downloading docling_core-2.50.0-py3-none-any.whl.metadata (6.7 kB)
Collecting docling-parse<5.0.0,>=4.7.0 (from docling)
  Downloading docling_parse-4.7.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting docling-ibm-models<4,>=3.9.1 (from docling)
  Downloading docling_ibm_models-3.10.2-py3-none-any.whl.metadata (7.3 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting rapidocr<4.0.0,>=3.3 (from docling)
  Downloading

In [20]:
import json
import requests
from docling.document_converter import DocumentConverter
from getpass import getpass
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import DecodingMethods
import re

# -----------------------
# Configuration
# -----------------------
INSTANCE_URL = "https://us-south.ml.cloud.ibm.com"
HEALTH_KEYWORDS = [
    "health", "exercise", "diet", "nutrition", "disease",
    "prevention", "well-being", "diabetes", "fitness",
    "cardiovascular", "obesity"
]

# -----------------------
# Watsonx Model Setup
# -----------------------
def setup_watsonx_model(api_key, project_id):
    """Initialize Watsonx Granite model with credentials."""
    credentials = {"url": INSTANCE_URL, "apikey": api_key}

    gen_params = {
        GenParams.DECODING_METHOD: DecodingMethods.SAMPLE,
        GenParams.MAX_NEW_TOKENS: 400,
        "temperature": 0.7
    }

    return ModelInference(
        model_id="ibm/granite-3-8b-instruct",
        credentials=credentials,
        project_id=project_id,
        params=gen_params,
        verify=False
    )

# -----------------------
# Document Processing with Docling
# -----------------------
def extract_text_from_url(url):
    """Extract text from URL using Docling."""
    print(f"Processing {url} with Docling...")

    try:
        converter = DocumentConverter()
        doc = converter.convert(url).document
        markdown_content = doc.export_to_markdown()

        # Split into paragraphs (simple approach)
        paragraphs = [
            p.strip() for p in markdown_content.split('\n\n')
            if p.strip() and len(p.strip()) > 50
        ]

        print(f"Extracted {len(paragraphs)} paragraphs from {url}")
        return paragraphs

    except Exception as e:
        print(f"Error processing {url}: {e}")
        return []

# -----------------------
# Content Filtering
# -----------------------
def is_health_paragraph(text, keywords=HEALTH_KEYWORDS):
    """Check if paragraph contains health-related keywords."""
    text_lower = text.lower()
    return any(keyword in text_lower for keyword in keywords)

# -----------------------
# QA Generation
# -----------------------
def generate_qa_pairs(paragraph, model, source, topic):
    """Generate Q&A pairs from a paragraph using Granite model."""
    prompt = (
        "Create 3-5 question-answer pairs from the following paragraph. "
        "Output in plain text only with the format:\n"
        "Q: <question>\nA: <answer>\n\n"
        "Do NOT output JSON, numbering, or extra text.\n\n"
        f"{paragraph}"
    )

    try:
        response = model.generate_text(prompt=prompt)
        qa_matches = re.findall(
            r"Q:\s*(.+?)\nA:\s*(.+?)(?:\n\n|$)",
            response,
            flags=re.DOTALL
        )

        qa_pairs = []
        for q, a in qa_matches:
            qa_pairs.append({
                "question": q.strip(),
                "answer": a.strip(),
                "source": source,
                "topic": topic
            })

        return qa_pairs

    except Exception as e:
        print(f"Error generating QA pairs: {e}")
        return []

# -----------------------
# Main Processing Pipeline
# -----------------------
def process_urls(url_file, model, output_file="preventive_health_faq.json"):
    """Process URLs and generate QA pairs."""
    # Load URLs
    with open(url_file) as f:
        urls = json.load(f)

    qa_list = []

    # Process each URL
    for entry in urls:
        url = entry["url"]
        topic = entry.get("Topic", "General Health")

        # Extract paragraphs using Docling
        paragraphs = extract_text_from_url(url)

        # Filter and generate QA pairs
        for para in paragraphs:
            if is_health_paragraph(para):
                qa_pairs = generate_qa_pairs(para, model, url, topic)
                qa_list.extend(qa_pairs)

    # Save results
    with open(output_file, "w") as f:
        json.dump(qa_list, f, indent=2)

    print(f"\n✅ Saved {len(qa_list)} QA pairs to {output_file}")
    return qa_list

# -----------------------
# Main Execution
# -----------------------
if __name__ == "__main__":
    # Get credentials
    # api_key = getpass("Enter Watsonx API key: ")
    # project_id = getpass("Enter project ID: ")

    # Setup model
    model = setup_watsonx_model(api_key, project_id)

    # Process URLs and generate QA pairs
    qa_list = process_urls("/content/first-aid.json", model)

    # Optional: Download in Colab
    try:
        from google.colab import files
        files.download("/content/preventive_health_faq.json")
    except ImportError:
        print("Not in Colab environment - file saved locally")

Processing https://www.globalfirstaidcentre.org/general-approach with Docling...
Extracted 81 paragraphs from https://www.globalfirstaidcentre.org/general-approach
Processing https://www.globalfirstaidcentre.org/hand-hygiene with Docling...
Extracted 54 paragraphs from https://www.globalfirstaidcentre.org/hand-hygiene
Processing https://www.globalfirstaidcentre.org/psychological-first-aid with Docling...
Extracted 68 paragraphs from https://www.globalfirstaidcentre.org/psychological-first-aid
Processing https://www.globalfirstaidcentre.org/de-escalation-techniques-for-violent-behaviour with Docling...
Extracted 22 paragraphs from https://www.globalfirstaidcentre.org/de-escalation-techniques-for-violent-behaviour
Processing https://www.globalfirstaidcentre.org/medication-administration with Docling...
Extracted 19 paragraphs from https://www.globalfirstaidcentre.org/medication-administration
Processing https://www.globalfirstaidcentre.org/oxygen-administration with Docling...
Extracted 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>