In [5]:
import sys
from dotenv import load_dotenv
load_dotenv('../../.env')

sys.path.append('../../')
sys.path.append('../../server')
sys.path.append('../../server/app')

from server.app.SIWeaviateClient import SIWeaviateClient
from server.app.schemas import Document, WebsiteQAChunk, WebsiteQAMetadata

from playwright.sync_api import sync_playwright, Page, Playwright, Browser

import os
import uuid
from typing import List, Tuple
from weaviate import WeaviateClient
from server.app.processing.BaseDocumentProcessor import BaseDocumentProcessor

from tenacity import retry, stop_after_attempt, wait_exponential
import logging
import traceback

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CiteHealthQA(BaseDocumentProcessor):
    def __init__(self, client: WeaviateClient, browser: Browser, qa_url: str):
        super().__init__(client)
        self.browser = browser
        self.qa_url = qa_url

    def extract_document(self) -> Tuple[Document, List[WebsiteQAChunk]]:
        page = self.browser.new_page()
        document, chunk = self.get_qa_from_url(self.qa_url, page)
        page.close()
        return document, [chunk]

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    def get_qa_from_url(self, url: str, page: Page):
        try:
            page.goto(url)
            page.wait_for_selector("article>h1")
            title = page.query_selector("article>h1").text_content()

            post_parts = page.query_selector_all(".post-question")
            question_elem = post_parts[0].query_selector('p')
            question = question_elem.inner_text()
            
            answer_elem = post_parts[1].query_selector(">div")
            
            children_info = answer_elem.evaluate("""
                el => Array.from(el.children).map(child => ({
                    tagName: child.tagName.toLowerCase(),
                    className: child.className,
                    textContent: child.textContent
                }))
            """)
            answer_parts = [f"```{child['textContent']}```" if child['tagName'] == 'blockquote' else child['textContent'] for child in children_info]
            answer = "\n".join(answer_parts)
            
            document = Document(
                document_id=self.get_random_uuid(),
                public_path=url,
                original_public_path=url,
                media_name=title,
            )
            chunk = WebsiteQAChunk(
                chunk_id=self.id,
                document_id=document.document_id,
                title=title,
                text=f"Question: {question}\nAnswer: {answer}",
                metadata=WebsiteQAMetadata(
                    question=question,
                    answer=answer,
                    url=url,
                )
            )
            return document, chunk
        except TimeoutError:
            print(f"Timeout occurred for URL: {url}. Retrying...")
            raise

    def process_document(self):
        document, chunks = self.extract_document()
        self.save_document(document, chunks)

def get_qas_urls(base_url, browser: Browser):
    from urllib.parse import urljoin
    page = browser.new_page()
    page.goto(base_url)
    qas_urls = []
    while True:
        page.wait_for_selector(".card-list")
        questions_elems = page.query_selector_all(".card-list>li")
        for elem in questions_elems:
            relative_url = elem.get_attribute('data-document-url')
            if relative_url:
                absolute_url = urljoin(base_url, relative_url)
                qas_urls.append(absolute_url)
        
        next_page_button = page.query_selector(".page-item.page-item-arrow .icon-arrow-right")
        if next_page_button:
            next_page = page.query_selector(".page-item.page-item-arrow:has(.icon-arrow-right) > a")
            next_page_link = next_page.get_attribute('href')
            next_page_full_link = urljoin(base_url, next_page_link)
            print("LEN URL", len(qas_urls))
            print("GOING NEXT PAGE", next_page_full_link)
            page.goto(next_page_full_link)
            page.wait_for_selector(".card-list")
        else:
            break
        
    print("last LEN URL", len(qas_urls))
    page.close()
    return qas_urls

def main():
    with SIWeaviateClient() as client:
        with sync_playwright() as playwright:
            browser = playwright.chromium.launch()
            base_url = "https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions?tx_questionssante_search%5Bpage%5D=208#results-list"
            urls = get_qas_urls(base_url, browser)
            processor = CiteHealthQA(client, browser, urls[0])
            processor.process_document()

main()


INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"


Error: It looks like you are using Playwright Sync API inside the asyncio loop.
Please use the Async API instead.