<a href="https://colab.research.google.com/github/cahide/ibm-skillsbuild-preventive-healthcare-chatbot/blob/main/IBM_SkillsBuild_HealthCare_Chatbot_Data_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install dependencies (run once in Colab).
# %pip installs into the environment of the running kernel, and -q keeps the
# install log from flooding the notebook output.
# ibm-watsonx-ai is pinned to the release this notebook was last run with.
%pip install -q ibm-watsonx-ai==1.4.4 beautifulsoup4 requests

Collecting ibm-watsonx-ai
  Downloading ibm_watsonx_ai-1.4.4-py3-none-any.whl.metadata (3.3 kB)
Collecting lomond (from ibm-watsonx-ai)
  Downloading lomond-0.3.3-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting ibm-cos-sdk<2.15.0,>=2.12.0 (from ibm-watsonx-ai)
  Downloading ibm_cos_sdk-2.14.3.tar.gz (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ibm-cos-sdk-core==2.14.3 (from ibm-cos-sdk<2.15.0,>=2.12.0->ibm-watsonx-ai)
  Downloading ibm_cos_sdk_core-2.14.3.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ibm-cos-sdk-s3transfer==2.14.3 (from ibm-cos-sdk<2.15.0,>=2.12.0->ibm-watsonx-ai)
  Downloading ibm_cos_sdk_s3transfer-2.14.3.tar.gz (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [17]:
from google.colab import userdata

# Read both watsonx.ai secrets from Colab's `userdata` store so that no
# credential is written into the notebook itself.
api_key, project_id = (
    userdata.get("WATSONX_API_KEY"),
    userdata.get("WATSONX_PROJECT_ID"),
)

In [20]:
import json
import requests
from bs4 import BeautifulSoup
from getpass import getpass
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import DecodingMethods
import re

# -----------------------
#  Watsonx credentials
# -----------------------
# watsonx.ai endpoint that every inference request is sent to.
instance_url = "https://us-south.ml.cloud.ibm.com"

# `api_key` comes from the secret-loading cell; it is never hard-coded here.
credentials = {"url": instance_url, "apikey": api_key}

# -----------------------
# Granite model setup
# -----------------------
# Sampling-based decoding: answers are capped at 400 new tokens.
# All keys use the GenParams constants so a typo fails loudly instead of being
# sent as an unknown parameter, and the seed makes sampled output repeatable
# across notebook runs.
gen_params = {
    GenParams.DECODING_METHOD: DecodingMethods.SAMPLE,
    GenParams.MAX_NEW_TOKENS: 400,
    GenParams.TEMPERATURE: 0.7,
    GenParams.RANDOM_SEED: 42,
}

# Keep TLS certificate verification enabled: the API key travels with every
# request, so an unverified connection would expose it to interception.
model = ModelInference(
    model_id="ibm/granite-3-8b-instruct",
    credentials=credentials,
    project_id=project_id,
    params=gen_params,
    verify=True,
)

# -----------------------
# Scraping public sources
# -----------------------
def scrape_text(url, timeout=30):
    """Download a web page and return the text of its non-empty ``<p>`` tags.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        A list of whitespace-stripped paragraph strings in document order.
        Paragraphs that are empty after stripping are omitted.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx).
    """
    res = requests.get(url, timeout=timeout)
    # Reject error responses so that "404 Not Found" style pages are never
    # parsed and handed on as if they were real content.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # Strip each paragraph once, then drop the ones that turned out empty.
    stripped_texts = (p.get_text().strip() for p in soup.find_all('p'))
    return [text for text in stripped_texts if text]

# -----------------------
# Health paragraph filter
# -----------------------
# Terms that mark a paragraph as health-related. They are matched as
# case-insensitive substrings, so "diet" also matches "dietary".
health_keywords = [
    "health",
    "exercise",
    "diet",
    "nutrition",
    "disease",
    "prevention",
    "well-being",
    "diabetes",
    "fitness",
    "cardiovascular",
    "obesity",
]


def is_health_paragraph(text):
    """Return True if ``text`` contains at least one entry of ``health_keywords``.

    The comparison is done on the lower-cased text, so matching ignores case.
    """
    lowered = text.lower()
    for keyword in health_keywords:
        if keyword in lowered:
            return True
    return False

# -----------------------
# Generate QA pairs (text parsing)
# -----------------------
# Accumulates every generated QA record; generate_qa_text() appends to it.
qa_list = []

# Matches one "Q: ...\nA: ..." pair. The answer ends at a blank line, at the
# start of the next "Q:" line, or at the end of the text. The terminator is a
# lookahead so that a following "Q:" is left in place for the next match;
# without the "\nQ:" alternative, pairs separated by a single newline would
# be swallowed into the first answer.
_QA_PATTERN = re.compile(r"Q:\s*(.+?)\nA:\s*(.+?)(?=\n\n|\nQ:|$)", flags=re.DOTALL)


def _difficulty_for(answer):
    """Label an answer by word count: <=20 Basic, <=40 Intermediate, else Advanced."""
    word_count = len(answer.split())
    if word_count <= 20:
        return "Basic"
    if word_count <= 40:
        return "Intermediate"
    return "Advanced"


def generate_qa_text(paragraph, source, topic):
    """Ask the model for QA pairs about ``paragraph`` and append them to ``qa_list``.

    Args:
        paragraph: Text the questions and answers are generated from.
        source: Value stored under the "source" key of every record.
        topic: Value stored under the "topic" key of every record.

    Returns:
        None. Each parsed pair is appended to the module-level ``qa_list`` as
        a dict with the keys "question", "answer", "source", "topic" and
        "difficulty". Text in the response that does not match the
        ``Q:``/``A:`` layout is ignored.
    """
    prompt = (
        "Create 3-5 question-answer pairs from the following paragraph. "
        "Output in plain text only with the format:\n"
        "Q: <question>\nA: <answer>\n\n"
        "Do NOT output JSON, numbering, or extra text.\n\n"
        f"{paragraph}"
    )
    response = model.generate_text(prompt=prompt)

    for q, a in _QA_PATTERN.findall(response):
        answer = a.strip()
        qa_list.append({
            "question": q.strip(),
            "answer": answer,
            "source": source,
            "topic": topic,
            # The difficulty label used to be computed and then thrown away;
            # it is now kept alongside the pair it describes.
            "difficulty": _difficulty_for(answer),
        })
# -----------------------
# Load URLs from JSON and process
# -----------------------
# scrape_url.json holds a list of entries. Each entry must provide "url";
# "source" and "topic" are optional and fall back to "Health".
with open("scrape_url.json", encoding="utf-8") as f:
    urls = json.load(f)


for entry in urls:
    source = entry.get("source", "Health")
    topic = entry.get("topic", "Health")
    url = entry["url"]
    try:
        paragraphs = scrape_text(url)
    except requests.RequestException as exc:
        # One unreachable page must not throw away the QA pairs already
        # generated from earlier URLs: report the failure and carry on.
        print(f"Skipping {url}: {exc}")
        continue
    # Only paragraphs that mention a health keyword are sent to the model.
    for para in paragraphs:
        if is_health_paragraph(para):
            generate_qa_text(para, source, topic)

# -----------------------
# Save JSON
# -----------------------
# Single source of truth for the output location, so the file that is written
# and the file that is offered for download can never drift apart.
output_path = "preventive_health_faq.json"

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(qa_list, f, indent=2)

print(f"Saved {len(qa_list)} QA pairs to {output_path}")

# Optional: download in Colab
from google.colab import files
files.download(output_path)


Saved 1620 QA pairs to preventive_health_faq.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>