In [1]:
!pip install langchain langchain-chroma langchain_google_genai langchain_community faiss-cpu

Collecting langchain-chroma
  Downloading langchain_chroma-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-2.0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.12-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting chromadb!=0.5.4,!=0.5.5,<0.6.0,>=0.4.0 (from langchain-chroma)
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting fastapi<1,>=0.95.2 (from langchain-chroma)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain_google_genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse

In [2]:
import getpass
import os

# Prompt only when no key is configured yet, so re-running this cell neither
# asks again nor overwrites a key already supplied through the environment.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

Enter your Google AI API key: ··········


In [10]:
import os
import time
import csv
import requests
import json
from bs4 import BeautifulSoup
from pydantic import BaseModel
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain import PromptTemplate
from langchain.schema import Document



# Initialize the Google Generative AI models shared by the functions below.
# Both clients presumably authenticate through the GOOGLE_API_KEY environment
# variable set in the earlier cell — confirm against the langchain_google_genai docs.
# `embeddings` vectorizes the text chunks indexed by FAISS in
# create_medical_advices_from_text.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
# `llm` is the chat model at the end of the prompt chain in
# create_medical_advices_from_text.
llm = ChatGoogleGenerativeAI(model="models/gemini-2.0-flash-exp")

# Locations of the CSV files this notebook reads and writes
VISITED_CATEGORIES_FILE = "visited_categories.csv"
VISITED_LINKS_FILE = "visited_links.csv"
OUTPUT_CSV_FILE = "medical_advices.csv"

# Header row for each CSV file, keyed by its path
_CSV_HEADERS = {
    VISITED_CATEGORIES_FILE: ["category"],
    VISITED_LINKS_FILE: ["link"],
    OUTPUT_CSV_FILE: ["q_type", "question", "answer"],
}

# Create every missing CSV file with just its header row; files that already
# exist are left untouched so earlier progress is kept.
for _csv_path, _header_row in _CSV_HEADERS.items():
    if os.path.exists(_csv_path):
        continue
    with open(_csv_path, "w", newline="") as _csv_file:
        csv.writer(_csv_file).writerow(_header_row)

def get_category_links(main_url, timeout=30):
    """
    Scrape the main page to extract all category links.

    Args:
        main_url: URL of the index page that lists the categories.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A list of absolute (http/https) category URLs in page order, with
        repeated URLs listed only once. The list is empty when the request
        or the parsing fails; the error is printed rather than raised.
    """
    try:
        response = requests.get(main_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract category links
        category_links = []
        seen = set()  # a URL linked twice would otherwise be processed twice
        for category in soup.find_all('li', class_='link-list-items__deprecated'):
            link_tag = category.find('a', class_='link-list-link__deprecated')
            if not link_tag:
                continue
            link = link_tag.get('href')
            # Only absolute URLs can be fetched later without a base URL.
            if link and link.startswith('http') and link not in seen:
                seen.add(link)
                category_links.append(link)

        return category_links
    except Exception as e:
        # Best effort: report the failure and let the caller see "no links".
        print(f"Error fetching category links: {e}")
        return []

def get_card_links(category_url, timeout=30):
    """
    Scrape a category page to extract all card links.

    Args:
        category_url: URL of the category page to read.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A list of absolute (http/https) card URLs in page order, with
        repeated URLs listed only once. The list is empty when the request
        or the parsing fails; the error is printed rather than raised.
    """
    try:
        response = requests.get(category_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract card links from the category page
        card_links = []
        seen = set()  # the same article listed twice should be scraped once
        for card in soup.find_all('a', class_='mntl-card-list-items'):
            link = card.get('href')
            if link and link.startswith('http') and link not in seen:
                seen.add(link)
                card_links.append(link)

        return card_links
    except Exception as e:
        # Best effort: report the failure and let the caller skip the category.
        print(f"Error fetching card links from {category_url}: {e}")
        return []

def scrape_card_content(card_url, timeout=30):
    """
    Visit a card link and scrape the content from the specified <p> tag onward.

    Text is collected starting with the paragraph whose id is
    'mntl-sc-block_1-0' and continues through the elements that follow it
    until an <h2> heading containing one of the stop phrases is reached.

    Args:
        card_url: URL of the article page to scrape.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The collected text with one element per line, or None when the start
        paragraph is missing or the request/parsing fails (the problem is
        printed rather than raised).
    """
    try:
        response = requests.get(card_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Locate the starting <p> tag
        start_paragraph = soup.find('p', id='mntl-sc-block_1-0')
        if start_paragraph is None:
            print(f"Start paragraph not found in {card_url}")
            return None

        stop_phrases = ["Summary", "A Word From Verywell"]

        # find_all_next() does not yield the start tag itself, so its text is
        # added explicitly to honor "from the specified <p> tag onward".
        parts = [start_paragraph.get_text()]
        last_kept = start_paragraph

        # find_all_next() walks every following tag in document order,
        # including tags nested inside ones already visited. get_text() of a
        # kept tag already contains its descendants' text, so descendants are
        # skipped to avoid emitting the same text several times.
        for element in start_paragraph.find_all_next():
            if element.name == "h2" and any(phrase in element.get_text() for phrase in stop_phrases):
                break
            # Identity check: in document order, descendants of the last kept
            # tag follow it directly, so comparing against that tag suffices.
            if any(ancestor is last_kept for ancestor in element.parents):
                continue
            parts.append(element.get_text())
            last_kept = element

        return "\n".join(parts).strip()
    except Exception as e:
        # Best effort: report the failure and let the caller skip this card.
        print(f"Error scraping {card_url}: {e}")
        return None

def create_medical_advices_from_text(
    text,
    query="medical conditions, symptoms, causes, diagnosis, treatment and prevention",
):
    """
    Generate meaningful data in the format q_type, question, answer.

    The text is split into overlapping chunks, indexed in an in-memory FAISS
    store, and the chunks retrieved for `query` are passed to the chat model
    as the prompt context.

    Args:
        text: Scraped article text to turn into question/answer entries.
        query: Search text used to pick the chunks sent to the model. It is
            non-empty by default because an empty query gives the retriever
            nothing meaningful to rank the chunks against.

    Returns:
        The model's reply as a string; the prompt asks for a JSON list of
        objects with "q_type", "question" and "answer" keys. For empty or
        whitespace-only text the string "[]" is returned without calling
        the model, since there is nothing to index.
    """
    # Nothing to index: building a vector store from zero chunks would fail.
    if not text or not text.strip():
        return "[]"

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    documents = text_splitter.split_documents([Document(page_content=text)])
    if not documents:
        return "[]"

    db = FAISS.from_documents(documents, embeddings)
    # NOTE(review): the default retriever returns only a few top-ranked chunks,
    # so long inputs are only partially seen by the model — confirm this is intended.
    retriever = db.as_retriever()

    def format_docs(docs):
        # Give the model the plain chunk text instead of the Document repr.
        return "\n\n".join(doc.page_content for doc in docs)

    prompt = PromptTemplate(
        template="""
        Generate meaningful data from the extracted data in the json format:
        [
            {{
                "q_type": "treatment",
                "question": "What are some discharge instructions following an asthma attack?",
                "answer": "Avoid allergy triggers and follow up with their provider to adjust the asthma plan as needed."
            }},
            {{
                "q_type": "trigger",
                "question": "What are some less common triggers for asthma attacks?",
                "answer": "Cold and flu, sinus issues, strenuous exercise, weather changes, stress and emotional distress."
            }}
            etc..
        ]

        Remove * from the output.

        Generate the data related only to the medical based only on this context: \n\n{context}\n
        """
    )

    rag_chain = (
        {"context": retriever | format_docs}
        | prompt
        | llm
        | StrOutputParser()
    )

    response = rag_chain.invoke(query)
    return response

def append_to_csv(file_path, data, mode="a", encoding="utf-8"):
    """
    Append data to a CSV file.

    Args:
        file_path: Path of the CSV file to write to.
        data: Iterable of rows, each row an iterable of cell values.
        mode: File mode passed to open(); "a" appends, "w" overwrites.
        encoding: Text encoding of the file. UTF-8 by default so scraped or
            generated text with non-ASCII characters is written the same way
            on every platform instead of depending on the locale.
    """
    # newline="" lets the csv module control line endings itself.
    with open(file_path, mode, newline="", encoding=encoding) as f:
        csv.writer(f).writerows(data)

def _extract_json_text(raw_response):
    """Return the JSON payload of a model reply, dropping an optional Markdown code fence."""
    text = raw_response.strip()
    # Remove the fence as a prefix/suffix; str.lstrip("```json") would strip
    # any leading run of the characters `, j, s, o, n instead.
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _process_batch(batch, model_delay_seconds):
    """Generate Q&A rows for (url, content) pairs; save rows and mark links visited only on success."""
    combined_text = "\n".join(content for _, content in batch)
    print("Generating meaningful data using AI model...")
    time.sleep(model_delay_seconds)  # Pause before invoking the model
    raw_response = create_medical_advices_from_text(combined_text)

    try:
        medical_data = json.loads(_extract_json_text(raw_response))
        advice_rows = [
            [entry["q_type"], entry["question"], entry["answer"]]
            for entry in medical_data
        ]
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        # Malformed reply (invalid JSON, missing keys, wrong shape): report it
        # and leave the links unrecorded so a later run retries them.
        print(f"Failed to parse AI response: {e}\nResponse: {raw_response}")
        return False

    append_to_csv(OUTPUT_CSV_FILE, advice_rows)
    # Record the links only after their data is on disk, so an interrupt or
    # a failed generation never marks an article as done without output.
    append_to_csv(VISITED_LINKS_FILE, [[url] for url, _ in batch])
    print("Data generated and saved successfully.")
    return True


def main(batch_size=2, model_delay_seconds=15):
    """
    Scrape every category of the health index page and save generated Q&A rows.

    Category pages are read one by one; each not-yet-visited card is scraped
    and the texts are sent to the model in batches. A partial batch left at
    the end of a category is processed too, so no scraped article is dropped.

    Args:
        batch_size: Number of scraped articles combined into one model call.
        model_delay_seconds: Pause before each model call.
    """
    main_url = "https://www.verywellhealth.com/health-a-z-4014770"
    print("Fetching category links from the main page...")

    category_links = get_category_links(main_url)
    if not category_links:
        print("No category links found.")
        return

    # Load visited links
    with open(VISITED_LINKS_FILE, "r", newline="") as f:
        visited_links = set(row[0] for row in csv.reader(f) if row)

    print(f"Found {len(category_links)} category links. Starting to scrape each category...\n")

    for category_idx, category_url in enumerate(category_links, start=1):
        print(f"Processing category {category_idx}/{len(category_links)}: {category_url}")

        # Fetch card links within the category
        card_links = get_card_links(category_url)
        if not card_links:
            print(f"No card links found in category: {category_url}")
            continue

        batch = []  # (url, content) pairs waiting for the next model call
        for idx, card_url in enumerate(card_links, start=1):
            if card_url in visited_links:
                print(f"Skipping already visited link: {card_url}")
                continue

            print(f"Scraping card {idx}/{len(card_links)}: {card_url}")
            content = scrape_card_content(card_url)
            if content:
                batch.append((card_url, content))
                # Remember the link for the rest of this run so a card that
                # appears in several categories is scraped only once.
                visited_links.add(card_url)

            if len(batch) >= batch_size:
                _process_batch(batch, model_delay_seconds)
                batch = []

        # Flush the leftover articles of this category.
        if batch:
            _process_batch(batch, model_delay_seconds)

    print("Scraping and data generation completed.")

# Start the scrape when this code is executed directly. The recorded output
# shows that running this notebook cell satisfies the check and launches main().
if __name__ == "__main__":
    main()


Fetching category links from the main page...
Found 334 category links. Starting to scrape each category...

Processing category 1/334: https://www.dotdashmeredith.com/brands-privacy
No card links found in category: https://www.dotdashmeredith.com/brands-privacy
Processing category 2/334: https://www.verywellhealth.com/how-to-take-the-abortion-pill-6363005
Skipping already visited link: https://www.verywellhealth.com/abortion-rights-measures-election-2024-8741895
Skipping already visited link: https://www.verywellhealth.com/uspstf-draft-recommendation-vitamin-d-calcium-supplements-8762509
Skipping already visited link: https://www.verywellhealth.com/semaglutide-eligibility-8759889
Skipping already visited link: https://www.verywellhealth.com/sunset-anxiety-8760547
Skipping already visited link: https://www.verywellhealth.com/adhd-meds-and-tums-8762002
Skipping already visited link: https://www.verywellhealth.com/blowing-your-nose-make-congestion-worse-8760833
Skipping already visited l

KeyboardInterrupt: 