In [1]:
import sys
from dotenv import load_dotenv
load_dotenv('../../.env')

sys.path.append('../../')
sys.path.append('../../server')
sys.path.append('../../server/app')


from server.app.SIWeaviateClient import SIWeaviateClient
from server.app.schemas import Document, WebsiteQAChunk, WebsiteQAMetadata

import asyncio
from playwright.async_api import async_playwright, Page, Playwright

import os
import uuid
from typing import Awaitable, List, Tuple
from weaviate import WeaviateClient
from server.app.processing.BaseDocumentProcessor import BaseDocumentProcessor

from tenacity import retry, stop_after_attempt, wait_exponential
import nest_asyncio
import logging
import traceback
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CiteHealthQA(BaseDocumentProcessor):
    def __init__(self, client: WeaviateClient, base_url: str):
        self.base_url = base_url
        super().__init__(client)

    async def extract_document(self) -> Tuple[Document, List[WebsiteQAChunk]]:
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch()
            page = await browser.new_page()
            
            qas_urls = await self.get_qas_urls(self.base_url, page)
            results = await self.process_urls_in_batches(qas_urls, playwright, batch_size=10)
            
            await browser.close()

        # Convert results to Document and WebsiteQAChunks
        document = Document(
            document_id=self.get_random_uuid(),
            public_path=self.base_url,
            original_public_path=self.base_url,
            media_name="Cité de la santé Q&A"
        )
        chunks = [self.create_document_chunk(doc, chunk) for doc, chunk in results]
        
        return document, chunks

    def create_document_chunk(self, document, chunk):
        return WebsiteQAChunk(
            chunk_id=self.get_random_uuid(),
            document_id=document,
            text=chunk.text,
            metadata=chunk.metadata.dict()
        )

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def get_qa_from_url(self, url: str, page: Page):
        try:
            await page.goto(url)
            await page.wait_for_selector("article>h1")
            title = await page.query_selector("article>h1")
            title = await title.text_content()

            post_parts = await page.query_selector_all(".post-question")
            question_elem = await post_parts[0].query_selector('p')
            question = await question_elem.inner_text()
            
            answer_elem = await post_parts[1].query_selector(">div")
            
            children_info = await answer_elem.evaluate("""
                el => Array.from(el.children).map(child => ({
                    tagName: child.tagName.toLowerCase(),
                    className: child.className,
                    textContent: child.textContent
                }))
            """)
            answer_parts = [f"```{child['textContent']}```" if child['tagName'] == 'blockquote' else child['textContent'] for child in children_info]
            answer = "\n".join(answer_parts)
            
            document = Document(
                document_id=self.get_random_uuid(),
                public_path=url,
                original_public_path=url,
                media_name=title,
            )
            chunk = WebsiteQAChunk(
                chunk_id=self.id,
                document_id=document.document_id,
                title=title,
                text=f"Question: {question}\nAnswer: {answer}",
                metadata=WebsiteQAMetadata(
                    question=question,
                    answer=answer,
                    url=url,
                )
            )
            return document, chunk
        except TimeoutError:
            print(f"Timeout occurred for URL: {url}. Retrying...")
            raise


    def process_document(self):
        nest_asyncio.apply()
        loop = asyncio.get_event_loop()
        document, chunks = loop.run_until_complete(self.extract_document())
        self.save_document(document, chunks)


    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def get_qa_from_url(self, url: str, page: Page):
        try:
            await page.goto(url)
            await page.wait_for_selector("article>h1")
            title = await page.query_selector("article>h1")
            title = await title.text_content()

            post_parts = await page.query_selector_all(".post-question")
            question_elem = await post_parts[0].query_selector('p')
            question = await question_elem.inner_text()
            
            answer_elem = await post_parts[1].query_selector(">div")
            
            children_info = await answer_elem.evaluate("""
                el => Array.from(el.children).map(child => ({
                    tagName: child.tagName.toLowerCase(),
                    className: child.className,
                    textContent: child.textContent
                }))
            """)
            answer_parts = [f"```{child['textContent']}```" if child['tagName'] == 'blockquote' else child['textContent'] for child in children_info]
            answer = "\n".join(answer_parts)
            
            document = Document(
                document_id=self.get_random_uuid(),
                public_path=url,
                original_public_path=url,
                media_name=title,
            )
            chunk = WebsiteQAChunk(
                chunk_id=self.id,
                document_id=document.document_id,
                title=title,
                text=f"Question: {question}\nAnswer: {answer}",
                metadata=WebsiteQAMetadata(
                    question=question,
                    answer=answer,
                    url=url,
                )
            )
            return document, chunk
        except TimeoutError:
            print(f"Timeout occurred for URL: {url}. Retrying...")
            raise










async def process_url_batch(urls, playwright: Playwright):
    browser = await playwright.chromium.launch()
    
    async def process_single_url(url):
        page = await browser.new_page()
        try:
            return await get_qa_from_url(url, page)
        except Exception as e:
            logger.error(f"Failed to process URL {url} after retries: {str(e)}")
            logger.error(traceback.format_exc())
            return None
        finally:
            await page.close()

    tasks = [process_single_url(url) for url in urls]
    results = await asyncio.gather(*tasks)
    await browser.close()
    return [result for result in results if result is not None]


async def process_urls_in_batches(urls, playwright: Playwright, batch_size=10) -> List[Tuple[Document, WebsiteQAChunk]]:
    all_results = []
    for i in range(0, len(urls), batch_size):
        batch = urls[i:i+batch_size]
        print(f"Processing batch {i//batch_size + 1} ({len(batch)} URLs)")
        results = await process_url_batch(batch, playwright)
        all_results.extend(results)
    return all_results

async def get_qas_urls(self, base_url, page: Page):
    from urllib.parse import urljoin
    await page.goto(base_url)
    qas_urls = []
    while True:
        await page.wait_for_selector(".card-list")
        questions_elems = await page.query_selector_all(".card-list>li")
        for elem in questions_elems:
            relative_url = await elem.get_attribute('data-document-url')
            if relative_url:
                absolute_url = urljoin(base_url, relative_url)
                qas_urls.append(absolute_url)
        
        next_page_button = await page.query_selector(".page-item.page-item-arrow .icon-arrow-right")
        if next_page_button:
            next_page = await page.query_selector(".page-item.page-item-arrow:has(.icon-arrow-right) > a")
            next_page_link = await next_page.get_attribute('href')
            next_page_full_link = urljoin(base_url, next_page_link)
            print("LEN URL", len(qas_urls))
            print("GOING NEXT PAGE", next_page_full_link)
            await page.goto(next_page_full_link)
            await page.wait_for_selector(".card-list")
        else:
            break
        
    print("last LEN URL", len(qas_urls))
    return qas_urls


async def main():
    with SIWeaviateClient() as client:
        base_url = "https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions?tx_questionssante_search%5Bpage%5D=21#results-list"        
        processor = CiteHealthQA(client, base_url)
        processor.process_document()

await main()


  class ChunkWithScore(Generic[ChunkType], BaseDocumentChunk):
  class ChunkWithScore(Generic[ChunkType], BaseDocumentChunk):
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"


LEN URL 10
GOING NEXT PAGE https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions?tx_questionssante_search%5Bpage%5D=22#results-list
LEN URL 20
GOING NEXT PAGE https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions?tx_questionssante_search%5Bpage%5D=23#results-list
LEN URL 30
GOING NEXT PAGE https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions?tx_questionssante_search%5Bpage%5D=24#results-list
last LEN URL 35
Processing batch 1 (10 URLs)


ERROR:__main__:Failed to process URL https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions/2024/04/ulcere-gastrique-metaplasie-intestinale after retries: RetryError[<Future at 0x7fba946beb20 state=finished raised ValidationError>]
ERROR:__main__:Traceback (most recent call last):
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/tenacity/asyncio/__init__.py", line 114, in __call__
    result = await fn(*args, **kwargs)
  File "/tmp/ipykernel_639875/3371637067.py", line 94, in get_qa_from_url
    chunk = WebsiteQAChunk(
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/pydantic/main.py", line 176, in __init__
    self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for WebsiteQAChunk
document
  Field required [type=missing, input_value={'chunk_id': '188fe617-ca...etaplas

Processing batch 2 (10 URLs)


ERROR:__main__:Failed to process URL https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions/2024/03/covid-long-je-ne-suis-pas-vaccinee after retries: RetryError[<Future at 0x7fba94540f10 state=finished raised ValidationError>]
ERROR:__main__:Traceback (most recent call last):
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/tenacity/asyncio/__init__.py", line 114, in __call__
    result = await fn(*args, **kwargs)
  File "/tmp/ipykernel_639875/3371637067.py", line 94, in get_qa_from_url
    chunk = WebsiteQAChunk(
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/pydantic/main.py", line 176, in __init__
    self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for WebsiteQAChunk
document
  Field required [type=missing, input_value={'chunk_id': '188fe617-ca...-ne-suis-pas

Processing batch 3 (10 URLs)


ERROR:__main__:Failed to process URL https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions/2024/03/recherche-goujerot-sjrogen after retries: RetryError[<Future at 0x7fba940c2c40 state=finished raised ValidationError>]
ERROR:__main__:Traceback (most recent call last):
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/tenacity/asyncio/__init__.py", line 114, in __call__
    result = await fn(*args, **kwargs)
  File "/tmp/ipykernel_639875/3371637067.py", line 94, in get_qa_from_url
    chunk = WebsiteQAChunk(
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/pydantic/main.py", line 176, in __init__
    self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for WebsiteQAChunk
document
  Field required [type=missing, input_value={'chunk_id': '188fe617-ca...rche-goujerot-sjroge

Processing batch 4 (5 URLs)


ERROR:__main__:Failed to process URL https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions/2023/07/fausse-couche after retries: RetryError[<Future at 0x7fba8ff34d30 state=finished raised ValidationError>]
ERROR:__main__:Traceback (most recent call last):
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/tenacity/asyncio/__init__.py", line 114, in __call__
    result = await fn(*args, **kwargs)
  File "/tmp/ipykernel_639875/3371637067.py", line 94, in get_qa_from_url
    chunk = WebsiteQAChunk(
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/pydantic/main.py", line 176, in __init__
    self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for WebsiteQAChunk
document
  Field required [type=missing, input_value={'chunk_id': '188fe617-ca...2023/07/fausse-couche')}, input_t

LEN URL 10
GOING NEXT PAGE https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions?tx_questionssante_search%5Bpage%5D=22#results-list
LEN URL 20
GOING NEXT PAGE https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions?tx_questionssante_search%5Bpage%5D=23#results-list
LEN URL 30
GOING NEXT PAGE https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions?tx_questionssante_search%5Bpage%5D=24#results-list
last LEN URL 35
Processing batch 1 (10 URLs)


ERROR:__main__:Failed to process URL https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions/2024/04/post-operatoire-du-pudendal after retries: RetryError[<Future at 0x7fba940c5ee0 state=finished raised ValidationError>]
ERROR:__main__:Traceback (most recent call last):
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/tenacity/asyncio/__init__.py", line 114, in __call__
    result = await fn(*args, **kwargs)
  File "/tmp/ipykernel_639875/3371637067.py", line 94, in get_qa_from_url
    chunk = WebsiteQAChunk(
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/pydantic/main.py", line 176, in __init__
    self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for WebsiteQAChunk
document
  Field required [type=missing, input_value={'chunk_id': '188fe617-ca...peratoire-du-pudend

Processing batch 2 (10 URLs)


ERROR:__main__:Failed to process URL https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions/2024/03/y-a-til-un-delai-pour-reouvrir-un-dossi after retries: RetryError[<Future at 0x7fba941cd5e0 state=finished raised ValidationError>]
ERROR:__main__:Traceback (most recent call last):
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/tenacity/asyncio/__init__.py", line 114, in __call__
    result = await fn(*args, **kwargs)
  File "/tmp/ipykernel_639875/3371637067.py", line 94, in get_qa_from_url
    chunk = WebsiteQAChunk(
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/pydantic/main.py", line 176, in __init__
    self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for WebsiteQAChunk
document
  Field required [type=missing, input_value={'chunk_id': '188fe617-ca...our-reo

Processing batch 3 (10 URLs)


ERROR:__main__:Failed to process URL https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions/2024/03/sexualite after retries: RetryError[<Future at 0x7fba943200d0 state=finished raised ValidationError>]
ERROR:__main__:Traceback (most recent call last):
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/tenacity/asyncio/__init__.py", line 114, in __call__
    result = await fn(*args, **kwargs)
  File "/tmp/ipykernel_639875/3371637067.py", line 94, in get_qa_from_url
    chunk = WebsiteQAChunk(
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/pydantic/main.py", line 176, in __init__
    self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for WebsiteQAChunk
document
  Field required [type=missing, input_value={'chunk_id': '188fe617-ca...ons/2024/03/sexualite')}, input_type=

Processing batch 4 (5 URLs)


ERROR:__main__:Failed to process URL https://www.cite-sciences.fr/fr/au-programme/lieux-ressources/cite-de-la-sante/une-question-en-sante/questions-sante/toutes-les-questions/2024/03/avortement-1 after retries: RetryError[<Future at 0x7fba94545160 state=finished raised ValidationError>]
ERROR:__main__:Traceback (most recent call last):
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/tenacity/asyncio/__init__.py", line 114, in __call__
    result = await fn(*args, **kwargs)
  File "/tmp/ipykernel_639875/3371637067.py", line 94, in get_qa_from_url
    chunk = WebsiteQAChunk(
  File "/home/erwan/anaconda3/envs/scienceinfuse/lib/python3.9/site-packages/pydantic/main.py", line 176, in __init__
    self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for WebsiteQAChunk
document
  Field required [type=missing, input_value={'chunk_id': '188fe617-ca.../2024/03/avortement-1')}, input_ty