In [None]:
import json
import random
import os
import asyncio
from typing import List, Dict, Any, Optional
from playwright.async_api import async_playwright, Page, Browser, Playwright

# Playwright is essential for handling modern, dynamic websites,
# security obstacles, and providing a robust interaction environment.
from playwright.sync_api import sync_playwright, Page, Browser

# --- CONFIGURATION & SETUP ---
PAGES = {'doctoralia' : 'https://www.doctoralia.co', 
         'topdoctors' : 'https://www.topdoctors.co',
         'compensar' : 'https://www.topdoctors.com.co/seguro-medico/compensar-cuadro-medico/especialidad/',
         'colsanitas' : 'https://www.colsanitas.com/directorio-medico', 
         'nuevaeps' : ' https://directoriopac.nuevaeps.com.co/#/home',
         'famisanar' : 'https://www.famisanar.com.co/afiliados/pbs/directorio-medico', 
         'clinica de la sabana' : 'https://clinicadelasabanas.org/directorio-medico',
         'sociedad de cardiologia' : 'https://scc.org.co/miembros'
}



In [None]:
import json
import random
import os
import asyncio
from typing import List, Dict, Any, Optional

# Playwright is essential for handling modern, dynamic websites,
# security obstacles, and providing a robust interaction environment.
# SWITCHED TO ASYNC API to resolve "inside the asyncio loop" error.
from playwright.async_api import async_playwright, Page, Browser, Playwright

# --- CONFIGURATION & SETUP ---

# Fields loaded from the config file as a scraper's attribute
TARGET_FIELDS = ["doctor_name", "specialty", "contact_info", "address"]

#Initializes a name for the web page to crawl into

CACHE_FILE = "successful_paths_cache.json"

class RLScraperAgent:
    """
    A conceptual Reinforcement Learning Scraper Agent.
    It manages the web environment, the policy cache, and the core interaction loop.
    All core methods are now asynchronous to be compatible with Python event loops.
    """
    def __init__(self):
        """Initializes the cache and sets up placeholders for async Playwright objects."""
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.log_term_memory = None #self._load_cache()
        self.short_term_meory = None
        self.current_user_id = "user_123" 

    def _load_memory(self) -> Dict[str, Any]:
        """Loads successful paths from the cache file."""
        if os.path.exists(CACHE_FILE):
            print(f"Loading cached paths from {CACHE_FILE}...")
            with open(CACHE_FILE, 'r') as f:
                return json.load(f)
        return {}

    def _save_cache(self):
        """Saves the current successful paths to the cache file."""
        print(f"Saving successful paths to {CACHE_FILE}...")
        with open(CACHE_FILE, 'w') as f:
            json.dump(self.cache, f, indent=4)

    async def start(self):
        """Initializes the asynchronous Playwright instance and launches the browser."""
        if not self.browser:
            self.playwright = await async_playwright().start()
            #modificar configurar segun la configuracion slow_mo, headless <- False permite visualizar navagacion
            self.browser = await self.playwright.chromium.launch(
                headless=False, 
                slow_mo=500 
            )
            print("Asynchronous browser launched in headful (visible) mode.")
    
    
    async def _handle_initial_obstacles(self, page: Page):
        """
        NEW FUNCTION: Attempts to click common consent or cookie banners that might 
        block subsequent interaction on the initial page load.
        """
        consent_selectors = [
            'button:has-text("Aceptar")',
            'button:has-text("Accept")',
            'button:has-text("Allow")',
            '[aria-label*="consent"] button',
            '[id*="cookie"] button',
            '.cc-btn:has-text("Accept")', # experiemntar ir a'nadiendo variaciones
        ]
        
        print("Checking for initial obstacles (cookie/consent banners)...")
        
        for selector in consent_selectors:
            try:
                # Use a short timeout, as the banner is usually instant
                await page.click(selector, timeout=3000)
                print(f"Successfully clicked consent button using selector: {selector}")
                # Wait for the DOM to settle after clicking
                await page.wait_for_load_state("domcontentloaded", timeout=5000)
                return True
            except:
                continue # Try the next selector if this one fails
        
        print("No blocking consent banner found or clicked.")
        return False      
    # JavaScript snippet to find the nearest ancestor with a usable ID or class for clustering
    JS_GET_CONTAINER = """
    (element) => {
        let current = element;
        // Iterate up to 5 levels to find a meaningful container (id or class)
        for (let i = 0; i < 5 && current; i++) {
            if (current.id) {
                return `#${current.id}`;
            }
            if (current.className) {
                // Find a common, descriptive class name for structural clustering
                const classes = current.className.split(' ').filter(c => c.length > 0);
                const descriptive_classes = classes.filter(c => 
                    c.toLowerCase().includes('nav') || 
                    c.toLowerCase().includes('menu') || 
                    c.toLowerCase().includes('bar') || 
                    c.toLowerCase().includes('footer') ||
                    c.toLowerCase().includes('filter') ||
                    c.toLowerCase().includes('main')
                );
                if (descriptive_classes.length > 0) {
                    // Use the first descriptive class found
                    return `.${descriptive_classes[0]}`;
                }
            }
            current = current.parentElement;
        }
        return 'root_body'; // Default if no defining container found
    }
    """

    async def _get_state(self, page: Page) -> Dict[str, Any]:
        """
        [RL COMPONENT: STATE]
        Translates the current web page into a structured state representation
        for the RL model, now featuring structural clustering.
        """
        elements = []
        try:
            # MODIFICATION: Filter for ONLY visible click-based and selection-based interactive elements: 
            # links (a), buttons (button), and selection boxes (select). Input/text areas are excluded.
            locators = page.locator('a:visible, button:visible, [role="button"]:visible, [role="link"]:visible, select:visible')
            count = await locators.count()

            for i in range(min(count, 50)): # Limit to 50 elements for speed
                element = locators.nth(i)
                
                # Await the evaluation result, then use .lower() (Python native)
                tag_name = (await element.evaluate("e => e.tagName")).lower()
                
                # 1. Start with inner text (best for links/buttons)
                text = await element.text_content()
                descriptive_text = text.strip() if text and text.strip() else ""

                # 2. Get the containing cluster ID
                container_id = await element.evaluate(self.JS_GET_CONTAINER)

                # 3. Fallback to tag name if no other text is found
                if not descriptive_text or descriptive_text.startswith('<'):
                    descriptive_text = f"<{tag_name.upper()}> (No Text/Label)"
                
                # --- FILTERING LOGIC ---
                # Only include the element if it has descriptive text.
                if "(No Text/Label)" in descriptive_text:
                    continue # Skip to the next element if it lacks meaningful text
                # --- END FILTERING LOGIC ---
                
                # Create a specific selector that the agent can use for the action
                specific_selector = f'{tag_name}:visible:nth-of-type({i + 1})'
                
                elements.append({
                    "id": i,
                    "text": descriptive_text,
                    "selector": specific_selector, # The precise locator for the action (reference)
                    "tag": tag_name, # Useful for determining if the action should be 'click' or 'select'
                    "container_id": container_id # New field for clustering
                })
        except Exception as e:
            # Print the error for debugging, but prevent crash
            print(f"Error getting state: {e}")

        # --- CLUSTERING LOGIC (Structural/Container Based) ---
        clustered_elements: Dict[str, List[Dict[str, Any]]] = {}
        for element in elements:
            # Use and remove the container_id as the cluster key
            container = element.pop('container_id') 
            if container not in clustered_elements:
                clustered_elements[container] = []
            clustered_elements[container].append(element)
        
        # --- NEW LLM SUMMARY GENERATION (Cluster-focused) ---
        llm_summary_parts = []
        cluster_id_counter = 0
        
        # Format the summary by cluster
        for container_name, cluster in clustered_elements.items():
            llm_summary_parts.append(f"\n--- CLUSTER {cluster_id_counter}: {container_name} ---")
            for e in cluster:
                # Format: [Element ID | TAG]: Descriptive Text
                llm_summary_parts.append(f"[{e['id']} | {e['tag'].upper()}]: {e['text']}")
            cluster_id_counter += 1

        llm_summary = (
            "No active interactive elements with meaningful text found."
            if not llm_summary_parts
            else "\n".join(llm_summary_parts)
        )
        # -----------------------------------
        
        return {
            "url": page.url,
            "title": await page.title(),
            "interactive_elements": elements, # Elements without the container_id
            "clustered_actions": clustered_elements, # The new clustered structure
            "llm_summary": llm_summary, # Added for LLM consumption
            "target_found": await self._check_success(page)
        }

    async def _take_action(self, page: Page, action: Dict[str, Any]) -> bool:
        """
        [RL COMPONENT: ACTION & TRANSITION]
        Executes an action on the page and waits for navigation/changes.
        
        Now supports 'click' (for A/BUTTON) and a placeholder for 'select'.
        """
        action_type = action.get('type')
        target_selector = action.get('selector')
        target_tag = action.get('tag') 

        try:
            if action_type == 'click' and target_selector:
                await page.click(target_selector, timeout=5000) 
            elif action_type == 'select' and target_selector:
                # Placeholder for selecting an option in a drop-down/scroll box
                # For real implementation, the LLM must choose the value to select
                print(f"ACTION: Attempting to select value '1' from {target_selector}")
                await page.select_option(target_selector, value='1') # Dummy selection
            elif action_type == 'navigate':
                await page.goto(action.get('url'))
            else:
                return False # Unknown action
            
            # Wait for the page to stabilize after the action
            await page.wait_for_load_state("networkidle", timeout=5000)
            return True

        except Exception as e:
            # Check for common Playwright errors (like element not interactable)
            print(f"Action failed ({action_type} on '{target_selector}'): {e}")
            return False

    async def _get_reward(self, prev_state: Dict[str, Any], current_state: Dict[str, Any]) -> float:
        """
        [RL COMPONENT: REWARD]
        Calculates the reward based on the transition between states.
        """
        if current_state.get('target_found'):
            print("*** SUCCESS: TARGET FIELDS FOUND! (Reward +100) ***")
            return 100.0
        
        # Check for non-terminal rewards/punishments
        if current_state['url'] != prev_state['url'] or len(current_state['interactive_elements']) != len(prev_state['interactive_elements']):
            # Positive change, suggesting progress (e.g., navigated to a result page)
            return 1.0 
        
        # Default low punishment for wasting an action
        return -0.1

    async def _check_success(self, page: Page) -> bool:
        """Checks if all target fields are present on the page."""
        try:
            # Simplified check: look for common selectors that might contain the data
            name_count = await page.locator('h1:has-text("Dr."), h2:has-text("Dr.")').count()
            contact_count = await page.locator('a[href^="tel"], a[href^="mailto"]').count()

            return name_count > 0 and contact_count > 0
        except:
            return False

    async def _learn_path(self, page: Page, max_steps: int = 15) -> Optional[List[Dict[str, Any]]]:
        """
        [RL CORE LOOP]
        Uses a simple Random Walk to simulate exploration and find a successful path.
        """
        current_path: List[Dict[str, Any]] = []
        
        for step in range(max_steps):
            prev_state = await self._get_state(page)

            if prev_state['target_found']:
                print(f"Path learned successfully in {step} steps.")
                return current_path

            print(f"\n--- Step {step+1}/{max_steps} | Current URL: {prev_state['url']} ---")
            
            # --- LLM STATE SUMMARY (Human/LLM-Readable Representation) ---
            print("--- Current LLM State Summary (Elements Clustered by DOM Container) ---")
            print(prev_state['llm_summary'])
            print("-------------------------------------------------------------")

            available_elements = prev_state['interactive_elements']
            if not available_elements:
                 print("No visible interactive elements found. Exploration terminated.")
                 break
            
            # 1. Choose an Action (RL decision point - currently Random Walk)
            # This is where your future LLM integration will use prev_state['llm_summary']
            chosen_element = random.choice(available_elements)
            
            # Determine action type based on tag
            action_type = 'select' if chosen_element['tag'] == 'select' else 'click'
            
            action = {
                'type': action_type, 
                'selector': chosen_element['selector'],
                'clicked_text': chosen_element['text'], # Store for logging
                'tag': chosen_element['tag']
            }
            
            cleaned_text = action['clicked_text'].replace('\n', ' ').strip()
            print(f"\n>> AGENT CHOOSES TO: {action_type.upper()} on element: '{cleaned_text[:60]}...' (<{action['tag'].upper()}>) [ID: {chosen_element['id']}]")

            # 2. Take the Action
            success = await self._take_action(page, action)
            
            if success:
                current_path.append(action)

            # 3. Observe the Next State and Reward
            current_state = await self._get_state(page)
            reward = await self._get_reward(prev_state, current_state)
            
            # LOGGING THE RESULTING STATE
            print(f"| URL After Action: {current_state['url']}")
            print(f"| Reward Received: {reward}")
            
            # (RL Model update logic goes here)
            
            if reward < -5.0: # Significant punishment (e.g., hitting a captcha wall)
                print("Encountered significant punishment. Terminating path exploration.")
                break

        print(f"Exploration finished after {max_steps} steps without finding all targets.")
        return None

    async def scrape_page(self, directory_url: str) -> Optional[Dict[str, Any]]:
        """
        Main function to scrape a medical directory page using cached paths or RL exploration.
        """
        if not self.browser:
            await self.start()
            
        page: Page = await self.browser.new_page()
        page_key = directory_url # Unique key for the cache

        try:
            print(f"Starting scrape for URL: {directory_url}")
            # MODIFICATION: Increased timeout and changed wait_until for reliable page load
            await page.goto(directory_url, wait_until="networkidle", timeout=60000)
            
            self._handle_initial_obstacles(page)
            ans = await self._get_state(page)
            print("Current state :", ans)
            return ans

            # --- PHASE 1: Try Cached Path (Exploitation) ---
            # NOTE: This will likely fail on the first run as no path is cached yet.
            if self.cache and page_key in self.cache:
                print("Cached path found. Attempting fast-track exploitation...")
                successful_path = self.cache[page_key]
                
                for step, action in enumerate(successful_path):
                    if not await self._take_action(page, action):
                        print(f"Cached step {step} failed. Reverting to exploration.")
                        break # Go to exploration if cached path fails

                    if await self._check_success(page):
                        print("Cached path executed successfully and data found.")
                        return await self._extract_data(page)

            # --- PHASE 2: RL Exploration (Learning) ---
            print("Starting RL exploration to learn a new path...")
            
            
            
            print("Current state :", ans)

            
        except Exception as e:
            print(f"An unexpected error occurred during scraping: {e}")
            return None
        finally:
            await page.close()


    async def _extract_data(self, page: Page) -> Dict[str, Any]:
        """
        Placeholder for the final extraction step.
        """
        print("\n--- FINAL DATA EXTRACTION ---")
        data = {}
        try:
            # Example extraction logic based on reaching a successful state
            data['doctor_name'] = await page.locator('h1').first.text_content() if await page.locator('h1').count() else "N/A"
            specialty_locator = page.locator('p:has-text("Specialty")').first
            
            # Check if locator exists before trying to extract text
            if await specialty_locator.count() > 0:
                specialty_text = await specialty_locator.text_content()
                data['specialty'] = specialty_text.replace("Specialty:", "").strip()
            else:
                data['specialty'] = "N/A"
                
            data['url_found'] = page.url
            data['raw_html_size'] = len(await page.content())
        except Exception as e:
            print(f"Error during final extraction: {e}")
            data['error'] = 'Extraction failed'
        
        return data

    async def close(self):
        """Closes the browser and stops Playwright."""
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
        print("Browser and Playwright stopped.")

# --- ASYNC EXAMPLE USAGE ---
async def main():
    """Wrapper function to run the async scraping process."""
    # Example list of target pages with different structures
    TARGET_PAGES = [
        # To test the TopDoctors site:
        "https://corporativo.compensar.com/salud/plan-complementario/red-especialistas",
        # # Example URLs:
        # "https://playwright.dev/python/docs/intro",
        # "https://playwright.dev/python/docs/api-getting-started", 
        # "https://playwright.dev/python/docs/intro"    
    ]

    agent = RLScraperAgent()
    await agent.start()
    
    # 1. Scrape the first page (will explore and cache a path)
    results1 = await agent.scrape_page(TARGET_PAGES[0])
    print(f"\nResult for Page 1 (Exploration): {json.dumps(results1, indent=2)}")
    

    await agent.close()


In [4]:

await main()

Asynchronous browser launched in headful (visible) mode.
Starting scrape for URL: https://corporativo.compensar.com/salud/plan-complementario/red-especialistas


  self._handle_initial_obstacles(page)


Current state : {'url': 'https://corporativo.compensar.com/salud/plan-complementario/red-especialistas', 'title': 'Red de especialistas| Plan Complementario Especial', 'interactive_elements': [{'id': 0, 'text': 'Activar el modo de accesibilidad', 'selector': 'a:visible:nth-of-type(1)', 'tag': 'a'}, {'id': 6, 'text': 'SaludEPS y Planes Adicionales', 'selector': 'a:visible:nth-of-type(7)', 'tag': 'a'}, {'id': 7, 'text': 'PersonaAfiliación y Servicios de Caja', 'selector': 'a:visible:nth-of-type(8)', 'tag': 'a'}, {'id': 8, 'text': 'EmpresaTodo sobre empleadores', 'selector': 'a:visible:nth-of-type(9)', 'tag': 'a'}, {'id': 9, 'text': 'Sobre CompensarInformación Institucional', 'selector': 'a:visible:nth-of-type(10)', 'tag': 'a'}, {'id': 10, 'text': 'AfíliateAfiliación a Caja y Salud', 'selector': 'a:visible:nth-of-type(11)', 'tag': 'a'}, {'id': 17, 'text': 'A+', 'selector': 'button:visible:nth-of-type(18)', 'tag': 'button'}, {'id': 18, 'text': 'A−', 'selector': 'button:visible:nth-of-type(

In [10]:
[{'id': 1, 'text': 'Show details', 'selector': 'a:visible:nth-of-type(2)', 'tag': 'a'}, 
 {'id': 2, 'text': 'Allow all', 'selector': 'button:visible:nth-of-type(3)', 'tag': 'button'}, 
 {'id': 3, 'text': 'Customize', 'selector': 'button:visible:nth-of-type(4)', 'tag': 'button'}, 
 {'id': 4, 'text': 'Deny', 'selector': 'button:visible:nth-of-type(5)', 'tag': 'button'}, 
 {'id': 7, 'text': 'Doctores y Centros', 'selector': 'a:visible:nth-of-type(8)', 'tag': 'a'}, 
 {'id': 8, 'text': 'Especialidades', 'selector': 'a:visible:nth-of-type(9)', 'tag': 'a'}, 
 {'id': 9, 'text': 'Enfermedades', 'selector': 'a:visible:nth-of-type(10)', 'tag': 'a'}, 
 {'id': 10, 'text': 'Dentistas', 'selector': 'a:visible:nth-of-type(11)', 'tag': 'a'}, 
 {'id': 11, 'text': 'Zona para profesionales', 'selector': 'a:visible:nth-of-type(12)', 'tag': 'a'}, 
 {'id': 12, 'text': 'Pide una cita', 'selector': 'a:visible:nth-of-type(13)', 'tag': 'a'}, 
 {'id': 13, 'text': 'Iniciar sesión', 'selector': 'a:visible:nth-of-type(14)', 'tag': 'a'}, 
 {'id': 14, 'text': 'Cómo pedir cita médica online', 'selector': 'a:visible:nth-of-type(15)', 'tag': 'a'}, 
 {'id': 15, 'text': 'Opiniones verificadas de pacientes', 'selector': 'a:visible:nth-of-type(16)', 'tag': 'a'}, 
 {'id': 16, 'text': 'Doctores y Centros', 'selector': 'a:visible:nth-of-type(17)', 'tag': 'a'}, 
 {'id': 17, 'text': 'Doctores', 'selector': 'a:visible:nth-of-type(18)', 'tag': 'a'}, 
 {'id': 18, 'text': 'Centros', 'selector': 'a:visible:nth-of-type(19)', 'tag': 'a'}, 
 {'id': 19, 'text': 'Dentistas', 'selector': 'a:visible:nth-of-type(20)', 'tag': 'a'}, 
 {'id': 20, 'text': 'Prepagadas', 'selector': 'a:visible:nth-of-type(21)', 'tag': 'a'}, 
 {'id': 21, 'text': 'Artículos médicos', 'selector': 'a:visible:nth-of-type(22)', 'tag': 'a'}, 
 {'id': 22, 'text': 'Diccionario médico', 'selector': 'a:visible:nth-of-type(23)', 'tag': 'a'}, 
 {'id': 23, 'text': 'Telemedicina', 'selector': 'a:visible:nth-of-type(24)', 'tag': 'a'}, 
 {'id': 24, 'text': 'Proceso de selección', 'selector': 'a:visible:nth-of-type(25)', 'tag': 'a'}, 
 {'id': 25, 'text': 'Telemedicina', 'selector': 'a:visible:nth-of-type(26)', 'tag': 'a'}, 
 {'id': 26, 'text': 'Quiero ser un doctor de alto nivel', 'selector': 'a:visible:nth-of-type(27)', 'tag': 'a'},
 {'id': 27, 'text': 'Nomine doctores', 'selector': 'a:visible:nth-of-type(28)', 'tag': 'a'}, 
 {'id': 28, 'text': 'Zona para profesionales', 'selector': 'a:visible:nth-of-type(29)', 'tag': 'a'}, 
 {'id': 29, 'text': '¿Quiénes somos?', 'selector': 'a:visible:nth-of-type(30)', 'tag': 'a'}, 
 {'id': 30, 'text': '¿Por qué elegirnos?', 'selector': 'a:visible:nth-of-type(31)', 'tag': 'a'}, 
 {'id': 31, 'text': 'Top Doctors Awards', 'selector': 'a:visible:nth-of-type(32)', 'tag': 'a'}, 
 {'id': 32, 'text': 'Top Doctors en los medios', 'selector': 'a:visible:nth-of-type(33)', 'tag': 'a'},
 {'id': 33, 'text': 'España', 'selector': 'a:visible:nth-of-type(34)', 'tag': 'a'}, 
 {'id': 34, 'text': 'Reino Unido', 'selector': 'a:visible:nth-of-type(35)', 'tag': 'a'}, 
 {'id': 35, 'text': 'Italia', 'selector': 'a:visible:nth-of-type(36)', 'tag': 'a'}, 
 {'id': 36, 'text': 'México', 'selector': 'a:visible:nth-of-type(37)', 'tag': 'a'},
 {'id': 37, 'text': 'Colombia', 'selector': 'a:visible:nth-of-type(38)', 'tag': 'a'}, 
 {'id': 38, 'text': 'Chile', 'selector': 'a:visible:nth-of-type(39)', 'tag': 'a'}, 
 {'id': 39, 'text': 'Argentina', 'selector': 'a:visible:nth-of-type(40)', 'tag': 'a'},
 {'id': 40, 'text': 'Arabia Saudi', 'selector': 'a:visible:nth-of-type(41)', 'tag': 'a'}, 
 {'id': 41, 'text': 'Doctores', 'selector': 'a:visible:nth-of-type(42)', 'tag': 'a'}, 
 {'id': 42, 'text': 'Telemedicina', 'selector': 'a:visible:nth-of-type(43)', 'tag': 'a'}, 
 {'id': 43, 'text': 'Clínicas y centros', 'selector': 'a:visible:nth-of-type(44)', 'tag': 'a'}, 
 {'id': 44, 'text': 'Clínicas Dentales', 'selector': 'a:visible:nth-of-type(45)', 'tag': 'a'}, 
 {'id': 47, 'text': 'Alergólogos', 'selector': 'a:visible:nth-of-type(48)', 'tag': 'a'}, 
 {'id': 48, 'text': 'Cardiólogos', 'selector': 'a:visible:nth-of-type(49)', 'tag': 'a'},
 {'id': 49, 'text': 'Cirujanos Bariátricos', 'selector': 'a:visible:nth-of-type(50)', 'tag': 'a'}]
state = {'url': 'https://www.doctoralia.co/', 'title': 'Encuentra especialista, agenda cita médica - Doctoralia', 'interactive_elements': [{'id': 1, 'text': '¿Cómo protegemos los datos?', 'selector': 'a:visible:nth-of-type(2)', 'tag': 'a'}, {'id': 2, 'text': 'Pregunta al Experto', 'selector': 'a:visible:nth-of-type(3)', 'tag': 'a'}, {'id': 3, 'text': 'Registrarse gratuitamente', 'selector': 'a:visible:nth-of-type(4)', 'tag': 'a'}, {'id': 4, 'text': 'Iniciar sesión', 'selector': 'a:visible:nth-of-type(5)', 'tag': 'a'}, {'id': 5, 'text': '¿Eres un especialista?', 'selector': 'button:visible:nth-of-type(6)', 'tag': 'button'}, {'id': 6, 'text': 'Visita presencial\n\t\t\t\t\t\t\t\t\t\t\tVisita presencial', 'selector': 'button:visible:nth-of-type(7)', 'tag': 'button'}, {'id': 7, 'text': 'En línea\n\t\t\t\t\t\t\t\t\t\t\tEn línea', 'selector': 'button:visible:nth-of-type(8)', 'tag': 'button'}, {'id': 8, 'text': 'Buscar', 'selector': 'button:visible:nth-of-type(9)', 'tag': 'button'}, {'id': 9, 'text': 'Psicólogo', 'selector': 'a:visible:nth-of-type(10)', 'tag': 'a'}, {'id': 10, 'text': 'Ginecólogo', 'selector': 'a:visible:nth-of-type(11)', 'tag': 'a'}, {'id': 11, 'text': 'Dermatólogo', 'selector': 'a:visible:nth-of-type(12)', 'tag': 'a'}, {'id': 12, 'text': 'Pediatra', 'selector': 'a:visible:nth-of-type(13)', 'tag': 'a'}, {'id': 13, 'text': 'Ortopedista y traumatólogo', 'selector': 'a:visible:nth-of-type(14)', 'tag': 'a'}, {'id': 14, 'text': 'Otorrinolaringólogo', 'selector': 'a:visible:nth-of-type(15)', 'tag': 'a'}, {'id': 15, 'text': 'Urólogo', 'selector': 'a:visible:nth-of-type(16)', 'tag': 'a'}, {'id': 16, 'text': 'Cirujano plástico', 'selector': 'a:visible:nth-of-type(17)', 'tag': 'a'}, {'id': 17, 'text': 'Oftalmólogo', 'selector': 'a:visible:nth-of-type(18)', 'tag': 'a'}, {'id': 18, 'text': 'Internista', 'selector': 'a:visible:nth-of-type(19)', 'tag': 'a'}, {'id': 19, 'text': 'Endocrinólogo', 'selector': 'a:visible:nth-of-type(20)', 'tag': 'a'}, {'id': 20, 'text': 'Neurólogo', 'selector': 'a:visible:nth-of-type(21)', 'tag': 'a'}, {'id': 21, 'text': 'Gastroenterólogo', 'selector': 'a:visible:nth-of-type(22)', 'tag': 'a'}, {'id': 22, 'text': 'Cirujano general', 'selector': 'a:visible:nth-of-type(23)', 'tag': 'a'}, {'id': 23, 'text': 'Cardiólogo', 'selector': 'a:visible:nth-of-type(24)', 'tag': 'a'}, {'id': 24, 'text': 'Ver más\n\t\t\t\t\tEspecialidades más populares', 'selector': 'button:visible:nth-of-type(25)', 'tag': 'button'}, {'id': 25, 'text': 'Implante dental', 'selector': 'a:visible:nth-of-type(26)', 'tag': 'a'}, {'id': 26, 'text': 'Ortodoncia', 'selector': 'a:visible:nth-of-type(27)', 'tag': 'a'}, {'id': 27, 'text': 'Bichectomía', 'selector': 'a:visible:nth-of-type(28)', 'tag': 'a'}, {'id': 28, 'text': 'Blefaroplastia', 'selector': 'a:visible:nth-of-type(29)', 'tag': 'a'}, {'id': 29, 'text': 'Liposucción', 'selector': 'a:visible:nth-of-type(30)', 'tag': 'a'}, {'id': 30, 'text': 'Rinoplastia', 'selector': 'a:visible:nth-of-type(31)', 'tag': 'a'}, {'id': 31, 'text': 'Masaje relajante', 'selector': 'a:visible:nth-of-type(32)', 'tag': 'a'}, {'id': 32, 'text': 'Colonoscopia', 'selector': 'a:visible:nth-of-type(33)', 'tag': 'a'}, {'id': 33, 'text': 'Sueroterapia', 'selector': 'a:visible:nth-of-type(34)', 'tag': 'a'}, {'id': 34, 'text': 'Invisalign', 'selector': 'a:visible:nth-of-type(35)', 'tag': 'a'}, {'id': 35, 'text': 'Ver más\n\t\t\t\t\tServicios', 'selector': 'button:visible:nth-of-type(36)', 'tag': 'button'}, {'id': 36, 'text': 'Hola buenos días.\nEl kid cal le puede ayudar a aumentar el apetito?', 'selector': 'a:visible:nth-of-type(37)', 'tag': 'a'}, {'id': 37, 'text': 'Dr. Cesar Augusto Mayorga Molina', 'selector': 'a:visible:nth-of-type(38)', 'tag': 'a'}, {'id': 38, 'text': 'Hola yo tenía la t de cobre 8 años me la quité este martes y enseguida el mismo día mi ginecóloga me manda unas inyecciones , quería preguntar cuando puedo mantener relaciones sexuales con mi…', 'selector': 'a:visible:nth-of-type(39)', 'tag': 'a'}, {'id': 39, 'text': 'Dra. Nury Santiago Fonseca', 'selector': 'a:visible:nth-of-type(40)', 'tag': 'a'}, {'id': 40, 'text': 'Laura Valentina Salas Huertas', 'selector': 'a:visible:nth-of-type(41)', 'tag': 'a'}, {'id': 41, 'text': 'Angelica Tatiana Llanes', 'selector': 'a:visible:nth-of-type(42)', 'tag': 'a'}, {'id': 42, 'text': 'Paola Andrea Marín Gómez', 'selector': 'a:visible:nth-of-type(43)', 'tag': 'a'}, {'id': 43, 'text': 'Dra. Leidy Tatiana Mora Polindara\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tPsicólogo, Buga\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(44)', 'tag': 'a'}, {'id': 44, 'text': 'Dr. John Eder Martinez Borda\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tOptómetra, Fusagasugá\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(45)', 'tag': 'a'}, {'id': 45, 'text': 'Prof. Jose David Tavera Vega\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tPsicólogo, Bogotá\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(46)', 'tag': 'a'}, {'id': 46, 'text': 'Rafael Santiago  Cárdenas Urbano\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tPsiquiatra, Medellín\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(47)', 'tag': 'a'}, {'id': 47, 'text': 'Dra. Vanessa Usuga Gomez\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tOdontólogo, Medellín\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(48)', 'tag': 'a'}, {'id': 48, 'text': 'Dr. Juan Manuel Tobar Parra\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tGinecólogo, Popayán\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(49)', 'tag': 'a'}, {'id': 49, 'text': 'Prof. María Camila Jiménez Ramirez\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tFisioterapeuta, Envigado\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(50)', 'tag': 'a'}], 'clustered_actions': {'.nav-link': [{'id': 1, 'text': '¿Cómo protegemos los datos?', 'selector': 'a:visible:nth-of-type(2)', 'tag': 'a'}, {'id': 2, 'text': 'Pregunta al Experto', 'selector': 'a:visible:nth-of-type(3)', 'tag': 'a'}, {'id': 3, 'text': 'Registrarse gratuitamente', 'selector': 'a:visible:nth-of-type(4)', 'tag': 'a'}, {'id': 4, 'text': 'Iniciar sesión', 'selector': 'a:visible:nth-of-type(5)', 'tag': 'a'}], '.nav-item': [{'id': 5, 'text': '¿Eres un especialista?', 'selector': 'button:visible:nth-of-type(6)', 'tag': 'button'}], '.navigation': [{'id': 6, 'text': 'Visita presencial\n\t\t\t\t\t\t\t\t\t\t\tVisita presencial', 'selector': 'button:visible:nth-of-type(7)', 'tag': 'button'}, {'id': 7, 'text': 'En línea\n\t\t\t\t\t\t\t\t\t\t\tEn línea', 'selector': 'button:visible:nth-of-type(8)', 'tag': 'button'}], '#search': [{'id': 8, 'text': 'Buscar', 'selector': 'button:visible:nth-of-type(9)', 'tag': 'button'}], '#popular-queries-short': [{'id': 9, 'text': 'Psicólogo', 'selector': 'a:visible:nth-of-type(10)', 'tag': 'a'}, {'id': 10, 'text': 'Ginecólogo', 'selector': 'a:visible:nth-of-type(11)', 'tag': 'a'}, {'id': 11, 'text': 'Dermatólogo', 'selector': 'a:visible:nth-of-type(12)', 'tag': 'a'}, {'id': 12, 'text': 'Pediatra', 'selector': 'a:visible:nth-of-type(13)', 'tag': 'a'}, {'id': 13, 'text': 'Ortopedista y traumatólogo', 'selector': 'a:visible:nth-of-type(14)', 'tag': 'a'}, {'id': 14, 'text': 'Otorrinolaringólogo', 'selector': 'a:visible:nth-of-type(15)', 'tag': 'a'}, {'id': 15, 'text': 'Urólogo', 'selector': 'a:visible:nth-of-type(16)', 'tag': 'a'}, {'id': 16, 'text': 'Cirujano plástico', 'selector': 'a:visible:nth-of-type(17)', 'tag': 'a'}, {'id': 17, 'text': 'Oftalmólogo', 'selector': 'a:visible:nth-of-type(18)', 'tag': 'a'}, {'id': 18, 'text': 'Internista', 'selector': 'a:visible:nth-of-type(19)', 'tag': 'a'}, {'id': 19, 'text': 'Endocrinólogo', 'selector': 'a:visible:nth-of-type(20)', 'tag': 'a'}, {'id': 20, 'text': 'Neurólogo', 'selector': 'a:visible:nth-of-type(21)', 'tag': 'a'}, {'id': 21, 'text': 'Gastroenterólogo', 'selector': 'a:visible:nth-of-type(22)', 'tag': 'a'}, {'id': 22, 'text': 'Cirujano general', 'selector': 'a:visible:nth-of-type(23)', 'tag': 'a'}, {'id': 23, 'text': 'Cardiólogo', 'selector': 'a:visible:nth-of-type(24)', 'tag': 'a'}, {'id': 24, 'text': 'Ver más\n\t\t\t\t\tEspecialidades más populares', 'selector': 'button:visible:nth-of-type(25)', 'tag': 'button'}], '#most-demanded-services-short': [{'id': 25, 'text': 'Implante dental', 'selector': 'a:visible:nth-of-type(26)', 'tag': 'a'}, {'id': 26, 'text': 'Ortodoncia', 'selector': 'a:visible:nth-of-type(27)', 'tag': 'a'}, {'id': 27, 'text': 'Bichectomía', 'selector': 'a:visible:nth-of-type(28)', 'tag': 'a'}, {'id': 28, 'text': 'Blefaroplastia', 'selector': 'a:visible:nth-of-type(29)', 'tag': 'a'}, {'id': 29, 'text': 'Liposucción', 'selector': 'a:visible:nth-of-type(30)', 'tag': 'a'}, {'id': 30, 'text': 'Rinoplastia', 'selector': 'a:visible:nth-of-type(31)', 'tag': 'a'}, {'id': 31, 'text': 'Masaje relajante', 'selector': 'a:visible:nth-of-type(32)', 'tag': 'a'}, {'id': 32, 'text': 'Colonoscopia', 'selector': 'a:visible:nth-of-type(33)', 'tag': 'a'}, {'id': 33, 'text': 'Sueroterapia', 'selector': 'a:visible:nth-of-type(34)', 'tag': 'a'}, {'id': 34, 'text': 'Invisalign', 'selector': 'a:visible:nth-of-type(35)', 'tag': 'a'}, {'id': 35, 'text': 'Ver más\n\t\t\t\t\tServicios', 'selector': 'button:visible:nth-of-type(36)', 'tag': 'button'}], 'root_body': [{'id': 36, 'text': 'Hola buenos días.\nEl kid cal le puede ayudar a aumentar el apetito?', 'selector': 'a:visible:nth-of-type(37)', 'tag': 'a'}, {'id': 37, 'text': 'Dr. Cesar Augusto Mayorga Molina', 'selector': 'a:visible:nth-of-type(38)', 'tag': 'a'}, {'id': 38, 'text': 'Hola yo tenía la t de cobre 8 años me la quité este martes y enseguida el mismo día mi ginecóloga me manda unas inyecciones , quería preguntar cuando puedo mantener relaciones sexuales con mi…', 'selector': 'a:visible:nth-of-type(39)', 'tag': 'a'}, {'id': 39, 'text': 'Dra. Nury Santiago Fonseca', 'selector': 'a:visible:nth-of-type(40)', 'tag': 'a'}, {'id': 40, 'text': 'Laura Valentina Salas Huertas', 'selector': 'a:visible:nth-of-type(41)', 'tag': 'a'}, {'id': 41, 'text': 'Angelica Tatiana Llanes', 'selector': 'a:visible:nth-of-type(42)', 'tag': 'a'}, {'id': 42, 'text': 'Paola Andrea Marín Gómez', 'selector': 'a:visible:nth-of-type(43)', 'tag': 'a'}], '#new-doctors-carousel-item0': [{'id': 43, 'text': 'Dra. Leidy Tatiana Mora Polindara\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tPsicólogo, Buga\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(44)', 'tag': 'a'}], '#new-doctors-carousel-item1': [{'id': 44, 'text': 'Dr. John Eder Martinez Borda\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tOptómetra, Fusagasugá\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(45)', 'tag': 'a'}], '#new-doctors-carousel-item2': [{'id': 45, 'text': 'Prof. Jose David Tavera Vega\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tPsicólogo, Bogotá\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(46)', 'tag': 'a'}], '#new-doctors-carousel-item3': [{'id': 46, 'text': 'Rafael Santiago  Cárdenas Urbano\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tPsiquiatra, Medellín\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(47)', 'tag': 'a'}], '#new-doctors-carousel-item4': [{'id': 47, 'text': 'Dra. Vanessa Usuga Gomez\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tOdontólogo, Medellín\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(48)', 'tag': 'a'}], '#new-doctors-carousel-item5': [{'id': 48, 'text': 'Dr. Juan Manuel Tobar Parra\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tGinecólogo, Popayán\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(49)', 'tag': 'a'}], '#new-doctors-carousel-item6': [{'id': 49, 'text': 'Prof. María Camila Jiménez Ramirez\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tFisioterapeuta, Envigado\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'selector': 'a:visible:nth-of-type(50)', 'tag': 'a'}]}, 'llm_summary': '\n--- CLUSTER 0: .nav-link ---\n[1 | A]: ¿Cómo protegemos los datos?\n[2 | A]: Pregunta al Experto\n[3 | A]: Registrarse gratuitamente\n[4 | A]: Iniciar sesión\n\n--- CLUSTER 1: .nav-item ---\n[5 | BUTTON]: ¿Eres un especialista?\n\n--- CLUSTER 2: .navigation ---\n[6 | BUTTON]: Visita presencial\n\t\t\t\t\t\t\t\t\t\t\tVisita presencial\n[7 | BUTTON]: En línea\n\t\t\t\t\t\t\t\t\t\t\tEn línea\n\n--- CLUSTER 3: #search ---\n[8 | BUTTON]: Buscar\n\n--- CLUSTER 4: #popular-queries-short ---\n[9 | A]: Psicólogo\n[10 | A]: Ginecólogo\n[11 | A]: Dermatólogo\n[12 | A]: Pediatra\n[13 | A]: Ortopedista y traumatólogo\n[14 | A]: Otorrinolaringólogo\n[15 | A]: Urólogo\n[16 | A]: Cirujano plástico\n[17 | A]: Oftalmólogo\n[18 | A]: Internista\n[19 | A]: Endocrinólogo\n[20 | A]: Neurólogo\n[21 | A]: Gastroenterólogo\n[22 | A]: Cirujano general\n[23 | A]: Cardiólogo\n[24 | BUTTON]: Ver más\n\t\t\t\t\tEspecialidades más populares\n\n--- CLUSTER 5: #most-demanded-services-short ---\n[25 | A]: Implante dental\n[26 | A]: Ortodoncia\n[27 | A]: Bichectomía\n[28 | A]: Blefaroplastia\n[29 | A]: Liposucción\n[30 | A]: Rinoplastia\n[31 | A]: Masaje relajante\n[32 | A]: Colonoscopia\n[33 | A]: Sueroterapia\n[34 | A]: Invisalign\n[35 | BUTTON]: Ver más\n\t\t\t\t\tServicios\n\n--- CLUSTER 6: root_body ---\n[36 | A]: Hola buenos días.\nEl kid cal le puede ayudar a aumentar el apetito?\n[37 | A]: Dr. Cesar Augusto Mayorga Molina\n[38 | A]: Hola yo tenía la t de cobre 8 años me la quité este martes y enseguida el mismo día mi ginecóloga me manda unas inyecciones , quería preguntar cuando puedo mantener relaciones sexuales con mi…\n[39 | A]: Dra. Nury Santiago Fonseca\n[40 | A]: Laura Valentina Salas Huertas\n[41 | A]: Angelica Tatiana Llanes\n[42 | A]: Paola Andrea Marín Gómez\n\n--- CLUSTER 7: #new-doctors-carousel-item0 ---\n[43 | A]: Dra. Leidy Tatiana Mora Polindara\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tPsicólogo, Buga\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil\n\n--- CLUSTER 8: #new-doctors-carousel-item1 ---\n[44 | A]: Dr. John Eder Martinez Borda\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tOptómetra, Fusagasugá\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil\n\n--- CLUSTER 9: #new-doctors-carousel-item2 ---\n[45 | A]: Prof. Jose David Tavera Vega\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tPsicólogo, Bogotá\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil\n\n--- CLUSTER 10: #new-doctors-carousel-item3 ---\n[46 | A]: Rafael Santiago  Cárdenas Urbano\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tPsiquiatra, Medellín\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil\n\n--- CLUSTER 11: #new-doctors-carousel-item4 ---\n[47 | A]: Dra. Vanessa Usuga Gomez\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tOdontólogo, Medellín\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil\n\n--- CLUSTER 12: #new-doctors-carousel-item5 ---\n[48 | A]: Dr. Juan Manuel Tobar Parra\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tGinecólogo, Popayán\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil\n\n--- CLUSTER 13: #new-doctors-carousel-item6 ---\n[49 | A]: Prof. María Camila Jiménez Ramirez\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tFisioterapeuta, Envigado\n\t\t\t\t\t\t\t\t\t\t\tMostrar perfil', 'target_found': False}



In [15]:
state['llm_summary'].split('CLUSTER')

['\n--- ',
 ' 0: .nav-link ---\n[1 | A]: ¿Cómo protegemos los datos?\n[2 | A]: Pregunta al Experto\n[3 | A]: Registrarse gratuitamente\n[4 | A]: Iniciar sesión\n\n--- ',
 ' 1: .nav-item ---\n[5 | BUTTON]: ¿Eres un especialista?\n\n--- ',
 ' 2: .navigation ---\n[6 | BUTTON]: Visita presencial\n\t\t\t\t\t\t\t\t\t\t\tVisita presencial\n[7 | BUTTON]: En línea\n\t\t\t\t\t\t\t\t\t\t\tEn línea\n\n--- ',
 ' 3: #search ---\n[8 | BUTTON]: Buscar\n\n--- ',
 ' 4: #popular-queries-short ---\n[9 | A]: Psicólogo\n[10 | A]: Ginecólogo\n[11 | A]: Dermatólogo\n[12 | A]: Pediatra\n[13 | A]: Ortopedista y traumatólogo\n[14 | A]: Otorrinolaringólogo\n[15 | A]: Urólogo\n[16 | A]: Cirujano plástico\n[17 | A]: Oftalmólogo\n[18 | A]: Internista\n[19 | A]: Endocrinólogo\n[20 | A]: Neurólogo\n[21 | A]: Gastroenterólogo\n[22 | A]: Cirujano general\n[23 | A]: Cardiólogo\n[24 | BUTTON]: Ver más\n\t\t\t\t\tEspecialidades más populares\n\n--- ',
 ' 5: #most-demanded-services-short ---\n[25 | A]: Implante dental\n[26 

In [11]:
import json
import asyncio
from typing import List, Dict, Any, Optional
# Placeholder for future components
# import numpy as np 
# from gensim.models import KeyedVectors 
# from scipy.spatial.distance import cosine

# Core Playwright imports for asynchronous web interaction
from playwright.async_api import async_playwright, Page, Browser, Playwright

class GuidedRLAgent:
    """
    The central agent class for the guided reinforcement learning scraper.
    It manages the environment, state extraction (clustering), and will contain
    the logic for semantic scoring, action selection, and memory updating.
    """
    def __init__(self):
        """Initializes placeholders for async Playwright objects and RL memory."""
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.goal_vocab = {
            # This is where your layered vocabulary will be defined later
            "layer_1": ['doctor', 'especialistas', 'detalles de contacto'],
            "layer_2": ['especialidades', 'contacta un especialista', 'pediatria'],
        }
        self.policy_cache = {} # Placeholder for successful paths memory
        print("GuidedRLAgent initialized.")

    async def start(self):
        """Initializes the asynchronous Playwright instance and launches the browser."""
        if not self.browser:
            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(
                headless=False, 
                slow_mo=300 # Visible mode for observation
            )
            print("Asynchronous browser launched.")
            
    async def close(self):
        """Closes the browser and stops Playwright."""
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
        print("Browser and Playwright stopped.")

    # --- COMPONENT 1: OBSTACLE HANDLING ---
    async def _handle_initial_obstacles(self, page: Page):
        """
        Attempts to click common consent or cookie banners that might 
        block subsequent interaction on the initial page load.
        """
        consent_selectors = [
            'button:has-text("Aceptar")',
            'button:has-text("Accept")',
            'button:has-text("Allow")',
            '[aria-label*="consent"] button',
            '[id*="cookie"] button',
            '.cc-btn:has-text("Accept")', 
        ]
        
        print("Checking for initial obstacles (cookie/consent banners)...")
        
        for selector in consent_selectors:
            try:
                await page.click(selector, timeout=3000)
                print(f"Successfully clicked consent button using selector: {selector}")
                await page.wait_for_load_state("domcontentloaded", timeout=5000)
                return True
            except:
                continue 
        
        print("No blocking consent banner found or clicked.")
        return False

    # --- COMPONENT 1: DOM CLUSTERING JAVASCRIPT SNIPPET ---
    JS_GET_CONTAINER = """
    (element) => {
        let current = element;
        // Iterate up to 5 levels to find a meaningful container (id or class)
        for (let i = 0; i < 5 && current; i++) {
            // PRIORITY 1: Unique ID (most stable cluster identifier)
            if (current.id) {
                return `#${current.id}`;
            }
            if (current.className) {
                // PRIORITY 2: Descriptive Class Name
                const classes = current.className.split(' ').filter(c => c.length > 0);
                const descriptive_classes = classes.filter(c => 
                    c.toLowerCase().includes('nav') || 
                    c.toLowerCase().includes('menu') || 
                    c.toLowerCase().includes('bar') || 
                    c.toLowerCase().includes('footer') ||
                    c.toLowerCase().includes('filter') ||
                    c.toLowerCase().includes('main')
                );
                if (descriptive_classes.length > 0) {
                    // Use the first descriptive class found
                    return `.${descriptive_classes[0]}`;
                }
            }
            current = current.parentElement;
        }
        return 'root_body'; // Default if no defining container found
    }
    """
    
    # --- COMPONENT 1: STATE EXTRACTION AND CLUSTERING ---
    async def _get_state(self, page: Page) -> Dict[str, Any]:
        """
        Extracts all visible, interactive elements and groups them by their 
        structural container (ID or class), generating the LLM-ready summary.
        """
        elements = []
        try:
            # Locate all visible click-based and selection-based interactive elements
            locators = page.locator('a:visible, button:visible, [role="button"]:visible, [role="link"]:visible, select:visible')
            count = await locators.count()

            for i in range(min(count, 100)): 
                element = locators.nth(i)
                tag_name = (await element.evaluate("e => e.tagName")).lower()
                text = await element.text_content()
                descriptive_text = text.strip() if text and text.strip() else ""

                # Execute the JS snippet to find the structural cluster ID
                container_id = await element.evaluate(self.JS_GET_CONTAINER)

                if not descriptive_text or descriptive_text.startswith('<'):
                    descriptive_text = f"<{tag_name.upper()}> (No Text/Label)"
                
                # Filtering: skip if the text is not meaningful
                if "(No Text/Label)" in descriptive_text:
                    continue
                
                # Create a specific selector (reference)
                specific_selector = f'{tag_name}:visible:nth-of-type({i + 1})'
                
                elements.append({
                    "id": i,
                    "text": descriptive_text,
                    "selector": specific_selector, 
                    "tag": tag_name, 
                    "container_id": container_id 
                })
        except Exception as e:
            print(f"Error getting state: {e}")

        # CLUSTERING LOGIC AND SUMMARY GENERATION
        clustered_elements: Dict[str, List[Dict[str, Any]]] = {}
        for element in elements:
            container = element.pop('container_id') 
            if container not in clustered_elements:
                clustered_elements[container] = []
            clustered_elements[container].append(element)
        
        # Format the summary by cluster for the LLM
        llm_summary_parts = []
        cluster_id_counter = 0
        
        for container_name, cluster in clustered_elements.items():
            llm_summary_parts.append(f"\n--- CLUSTER {cluster_id_counter}: {container_name} ---")
            for e in cluster:
                # Format: [Element ID | TAG]: Descriptive Text
                llm_summary_parts.append(f"[{e['id']} | {e['tag'].upper()}]: {e['text']}")
            cluster_id_counter += 1

        llm_summary = (
            "No active interactive elements with meaningful text found."
            if not llm_summary_parts
            else "\n".join(llm_summary_parts)
        )
        
        return {
            "url": page.url,
            "title": await page.title(),
            "interactive_elements": elements, 
            "clustered_actions": clustered_elements, 
            "llm_summary": llm_summary, 
        }
    
    # --- COMPONENT 2: SEMANTIC SCORING (Placeholder) ---
    async def _semantic_score(self, element_text: str) -> float:
        """
        Placeholder for the core reward mechanism.
        TODO: 1. Convert element_text to embedding.
        TODO: 2. Convert goal_vocab layers to embeddings.
        TODO: 3. Calculate max cosine similarity between element_text embedding and goal embeddings.
        TODO: 4. Return score (e.g., higher score for layer 1 match).
        """
        # For now, return a placeholder score based on simple keyword search
        if any(keyword in element_text.lower() for keyword in self.goal_vocab['layer_1']):
            return 10.0 # High reward for direct goal words
        if any(keyword in element_text.lower() for keyword in self.goal_vocab['layer_2']):
            return 5.0 # Medium reward for related words
        return 0.0

    # --- COMPONENT 3 & 4: POLICY & MEMORY (Placeholders) ---
    async def _choose_action(self, state: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Policy: Uses LLM (or semantic score heuristic) to choose a cluster and then an element.
        """
        # Implementation to be added later (LLM integration)
        print("Action choice logic not yet implemented. Skipping step.")
        return None

    async def _update_memory(self, path: List[Dict[str, Any]], success: bool):
        """
        Memory: Updates the Q-table or policy cache based on success/failure.
        """
        # Implementation to be added later
        pass


    # --- MAIN EXECUTION LOOP ---
    async def navigate_and_extract_state(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Main public method to perform navigation, obstacle handling, and state extraction.
        """
        if not self.browser:
            print("Browser not started. Call await agent.start() first.")
            return None
        
        page: Page = await self.browser.new_page()
        try:
            print(f"Navigating to URL: {url}")
            # Relaxed wait_until to 'domcontentloaded' and increased timeout for complex sites.
            await page.goto(url, wait_until="domcontentloaded", timeout=120000)

            # Handle initial blocking obstacles (like cookie banners)
            await self._handle_initial_obstacles(page)
            
            # Extract and return the clustered state
            state = await self._get_state(page)
            return state
            
        except Exception as e:
            print(f"An unexpected error occurred during state extraction: {e}")
            return None
        finally:
            await page.close()


# --- EXAMPLE USAGE ---
async def main_extractor_example():
    """Example demonstration of the GuidedRLAgent's state extraction capability."""
    # Target URL with known dynamic elements and potential banners
    TARGET_URL = "https://corporativo.compensar.com/"
    
    agent = GuidedRLAgent()
    await agent.start()
    
    try:
        clustered_state = await agent.navigate_and_extract_state(TARGET_URL)
        
        if clustered_state:
            print("\n--- FINAL CLUSTERED STATE SUMMARY ---")
            print(f"URL: {clustered_state['url']}")
            print(f"Title: {clustered_state['title']}")
            print("\nLLM-READY ACTION SUMMARY:")
            print(clustered_state['llm_summary'])
        else:
            print("Failed to retrieve clustered state.")
            
    finally:
        await agent.close()
        return clustered_state




In [2]:
pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [7]:
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import nest_asyncio
#import pandas as pd

# --- CONFIGURATION ---
BASE_URL = "https://www.doctoralia.co/"
# ---------------------

async def identify_interactive_elements():
    """
    Navigates to the Doctoralia homepage and identifies all interactive elements
    (links, buttons, inputs) that a scraper could potentially use.
    """
    print(f"Starting analysis. Navigating to {BASE_URL}")
    interactive_data = []

    async with async_playwright() as p:
        # Launch browser (headless=True for speed, but set to False if you want to watch)
        browser = await p.chromium.launch(headless=True) 
        page = await browser.new_page()
        
        try:
            # STEP 1: Initial Navigation
            await page.goto(BASE_URL, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_load_state('networkidle')
            print("Successfully loaded the Doctoralia homepage.")

            # STEP 2: Use page.evaluate() to find elements via JavaScript
            # We look for common interactive tags and attributes
            interactive_data = await page.evaluate('''() => {
                const selectors = [
                    'a[href]',                     // All links
                    'button',                      // All buttons
                    'input:not([type="hidden"])',  // All visible inputs
                    'textarea',                    // Text areas
                    'select'                       // Dropdowns
                ];
                const elements = [];

                selectors.forEach(selector => {
                    document.querySelectorAll(selector).forEach((el, index) => {
                        let identifier = el.textContent.trim() || 
                                         el.getAttribute('placeholder') || 
                                         el.getAttribute('aria-label') || 
                                         el.getAttribute('title') || 
                                         el.name || 
                                         el.id || '';

                        // Format data
                        if (el.tagName !== 'INPUT' || identifier || el.type === 'submit') {
                            let data = {
                                Tag: el.tagName.toLowerCase(),
                                Type: el.getAttribute('type') || 'N/A',
                                Identifier: identifier.substring(0, 70).replace(/\s+/g, ' ') || '(No Visible Text)',
                                General_Selector: selector.split('[')[0] + (el.id ? `#${el.id}` : ''),
                            };
                            elements.push(data);
                        }
                    });
                });
                return elements;
            }''')
            
            print("\n--- Interactive Elements Found ---")
            
            # Use pandas to display results nicely
            if interactive_data:

                
                
                print(interactive_data)

            else:
                print("No interactive elements were detected.")


        except PlaywrightTimeoutError:
            print("\n[ERROR] Playwright Timeout occurred during navigation. The site may have failed to load within 30 seconds.")
        except Exception as e:
            print(f"\n[CRITICAL ERROR] An unexpected error occurred: {e}")

        finally:
            await page.close()
            await browser.close()
            print("\nAnalysis finished.")
            
    return interactive_data

if True:
    
    # Patch asyncio for running within interactive environments
    try:
        nest_asyncio.apply()
        # print("Asyncio patched.")
    except Exception as e:
        print(f"Warning: Could not apply nest_asyncio. Error: {e}")

    # Execute the coroutine
    dic = asyncio.run(identify_interactive_elements())

Starting analysis. Navigating to https://www.doctoralia.co/
Successfully loaded the Doctoralia homepage.

--- Interactive Elements Found ---
[{'Tag': 'a', 'Type': 'N/A', 'Identifier': 'Doctoralia - Página de inicio', 'General_Selector': 'a'}, {'Tag': 'a', 'Type': 'N/A', 'Identifier': '¿Cómo protegemos los datos?', 'General_Selector': 'a'}, {'Tag': 'a', 'Type': 'N/A', 'Identifier': 'Pregunta al Experto', 'General_Selector': 'a'}, {'Tag': 'a', 'Type': 'N/A', 'Identifier': 'Registrarse gratuitamente', 'General_Selector': 'a'}, {'Tag': 'a', 'Type': 'N/A', 'Identifier': 'Iniciar sesión', 'General_Selector': 'a'}, {'Tag': 'a', 'Type': 'N/A', 'Identifier': 'Registrarse gratuitamente', 'General_Selector': 'a'}, {'Tag': 'a', 'Type': 'N/A', 'Identifier': 'Soluciones para profesionales se abre en una nueva pestaña', 'General_Selector': 'a'}, {'Tag': 'a', 'Type': 'N/A', 'Identifier': 'Bogotá', 'General_Selector': 'a'}, {'Tag': 'a', 'Type': 'N/A', 'Identifier': 'Medellín', 'General_Selector': 'a'},

In [19]:
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from bs4 import BeautifulSoup
import nest_asyncio
import time
import re # For cleaning up text

# --- CONFIGURATION ---
BASE_URL = "https://www.doctoralia.co/"

# Define the fixed pairs of specialty and city for the search
SEARCH_PARAMETERS = [
    {"specialty": "Pediatra", "city": "Bogotá"},

]
# --- SELECTORS IDENTIFIED FROM ANALYSIS ---

# 1. Specialty/Doctor Input Field (Using the unique identifier/placeholder)
SPECIALTY_INPUT_SELECTOR = 'input[placeholder*="especialidad, enfermedad o nombre"]'

# 2. City Input Field (Using the unique identifier/placeholder)
CITY_INPUT_SELECTOR = 'input[placeholder*="p. ej. Bogotá"]'

# 3. Skip/Omit Button that appears after search (Assuming it has the text "Omitir")
OMITIR_BUTTON_SELECTOR = 'text="Omitir"'

# 4. NEW: Cookie Accept Button (Commonly uses the text "Aceptar")
COOKIE_ACCEPT_SELECTOR = 'text="Aceptar"'
# ---------------------

def extract_doctor_info(html_content, specialty, city):
    """
    Parses the raw HTML content of the search results page to extract structured doctor information.
    
    Args:
        html_content (str): The raw HTML string collected by Playwright.
        specialty (str): The specialty used in the search (for context).
        city (str): The city used in the search (for context).

    Returns:
        list: A list of dictionaries, where each dictionary represents a doctor.
    """
    
    # 1. Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    doctors_data = []

    # 2. Identify the container for each doctor listing.
    # Based on Doctoralia's typical structure, doctor cards are usually divs with a specific class.
    # We will search for elements that likely represent individual doctor profiles on the results page.
    doctor_cards = soup.select('.doctor-card') # This is a very common class name pattern on this site
    
    if not doctor_cards:
        print(f"Warning: No doctor cards found for {specialty} in {city} using selector '.doctor-card'.")
        # Fallback to another common search result selector if the main one fails
        doctor_cards = soup.select('.list-item.list-item-doctor')

    for i, card in enumerate(doctor_cards, 1):
        try:
            # --- Extract Doctor Name and Profile Link ---
            # Search for the main name link within the card
            name_tag = card.select_one('h2 a, .doctor-card__name a')
            name = name_tag.text.strip() if name_tag else f"Doctor {i}"
            profile_url = BASE_URL.rstrip('/') + name_tag['href'] if name_tag and 'href' in name_tag.attrs else 'N/A'

            # --- Extract Specialties (sometimes listed beneath the main name) ---
            specialty_tag = card.select_one('.doctor-card__specializations')
            # Clean up the text, removing extra spaces and newlines
            specialties = re.sub(r'\s+', ' ', specialty_tag.text.strip()) if specialty_tag else specialty

            # --- Extract Location/Address ---
            # Search for the address block. Doctoralia often lists multiple addresses.
            # We target the most visible or first address summary.
            address_tag = card.select_one('.doctor-card__address, .address-data')
            if address_tag:
                # Get all address lines, join them, and clean up
                address = ' | '.join([
                    re.sub(r'\s+', ' ', p.text.strip()) 
                    for p in address_tag.find_all('p') if p.text.strip()
                ])
                if not address:
                    address = re.sub(r'\s+', ' ', address_tag.text.strip())
            else:
                address = f"Search City: {city}"
            
            # --- Extract Rating and Opinion Count ---
            rating_tag = card.select_one('.doctor-card__rating-score, .rating-score')
            rating = rating_tag.text.strip() if rating_tag else 'No Rating'

            opinions_tag = card.select_one('.doctor-card__opinions, .opinion-count')
            opinions = re.sub(r'[()]', '', opinions_tag.text.strip()) if opinions_tag else '0 opiniones'


            # --- Compile Data ---
            doctors_data.append({
                "name": name,
                "profile_url": profile_url,
                "specialties": specialties,
                "address": address,
                "rating": rating,
                "opinions": opinions,
                "searched_specialty": specialty,
                "searched_city": city
            })
            
        except Exception as e:
            print(f"Error parsing data for doctor card {i} in {city}, {specialty}: {e}")
            continue # Skip to the next card if parsing fails for one

    return doctors_data

async def select_dropdown_option(page, input_selector, value, description):
    """
    Fills an input field and explicitly clicks the corresponding dropdown suggestion.
    ... (omitted for brevity, content is the same as previous file)
    """
    print(f"Typing {description}: {value}...")
    
    # 1. Fill the input field
    await page.locator(input_selector).fill(value, timeout=10000)
    await page.wait_for_timeout(500)

    # 2. Create a selector for the suggestion based on the typed text.
    suggestion_selector = f'a:has-text("{value}")' 
    
    try:
        # 3. Wait for the suggestion to appear and click it
        suggestion_locator = page.locator(suggestion_selector).first
        await suggestion_locator.click(timeout=10000)
        print(f"{description} suggestion clicked successfully.")
        
    except PlaywrightTimeoutError:
        # Fallback: if clicking the specific suggestion fails, try pressing Enter
        print(f"Warning: Specific {description} suggestion not found, pressing Enter as fallback.")
        await page.keyboard.press('Enter')
        
    await page.wait_for_timeout(1000) # Wait a moment for the value to register/page to update


async def scrape_doctoralia():
    """
    Automates the search process, extracts HTML, and then parses doctor information.
    """
    all_doctor_records = [] 

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True) 
        page = await browser.new_page()
        
        print(f"Starting crawl. Navigating to {BASE_URL}")

        try:
            await page.goto(BASE_URL, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_load_state('networkidle')
            print("Successfully loaded the homepage.")

            # --- Handle Cookie Consent ---
            try:
                print("Checking for cookie consent pop-up...")
                cookie_button = page.locator(COOKIE_ACCEPT_SELECTOR)
                await cookie_button.click(timeout=5000) 
                print("Cookie consent accepted successfully.")
                await page.wait_for_timeout(500)
            except PlaywrightTimeoutError:
                print("No cookie consent pop-up detected (or element not found quickly). Proceeding.")
            
            # --- END Cookie Handling ---


            for params in SEARCH_PARAMETERS:
                specialty = params["specialty"]
                city = params["city"]
                
                print(f"\n--- Performing Search for: {specialty} in {city} ---")
                
                # --- FILLING SEARCH FIELDS WITH DROPDOWN SELECTION ---
                await select_dropdown_option(page, SPECIALTY_INPUT_SELECTOR, specialty, "Specialty")
                await select_dropdown_option(page, CITY_INPUT_SELECTOR, city, "City")
                
                await page.wait_for_load_state('networkidle')

                # --- HANDLING THE INTERACTIVE INTERSTITIAL (OMITIR) ---
                try:
                    omitir_button = page.locator(OMITIR_BUTTON_SELECTOR)
                    await omitir_button.wait_for(state='visible', timeout=15000) 
                    
                    print("Found 'Omitir' button. Clicking to proceed to results...")
                    await omitir_button.click()
                    
                    await page.wait_for_load_state('networkidle')
                    
                except PlaywrightTimeoutError:
                    print("Note: 'Omitir' button not found within timeout (or page loaded directly to results). Proceeding.")
                
                
                # --- FINAL CONTENT EXTRACTION AND PARSING ---
                try:
                    # 1. Extract the raw HTML
                    page_content = await page.content() 
                    print(f"Page content (HTML) captured. Parsing data...")

                    # 2. Parse the HTML using the new function
                    parsed_doctors = extract_doctor_info(page_content, specialty, city)
                    all_doctor_records.extend(parsed_doctors)

                    print(f"Successfully extracted {len(parsed_doctors)} doctor(s) from this search.")

                except Exception as e:
                     print(f"Error capturing or parsing page content: {e}")
                

                # Go back to the homepage for the next iteration
                if params != SEARCH_PARAMETERS[-1]:
                    await page.goto(BASE_URL, wait_until="domcontentloaded")


        except PlaywrightTimeoutError:
            print(f"\n[ERROR] Playwright Timeout occurred. Check the network connection or if selectors have changed.")
        except Exception as e:
            print(f"\n[CRITICAL ERROR] An unexpected error occurred: {e}")

        finally:
            await browser.close()
            print(f"\nCrawl finished. Total doctor records collected: {len(all_doctor_records)}")
            
    return all_doctor_records

if True:
    
    try:
        nest_asyncio.apply()
    except Exception as e:
        print(f"Warning: Could not apply nest-asyncio. Error: {e}")

    # Execute the coroutine
    doctors = asyncio.run(scrape_doctoralia())
    
    if doctors:
        print("\n--- Summary of All Extracted Doctor Records ---")
        for i, doc in enumerate(doctors, 1):
            print(f"\nDoctor {i} (Searched: {doc['searched_specialty']} in {doc['searched_city']}):")
            print(f"  Name: {doc['name']}")
            print(f"  Specialties: {doc['specialties']}")
            print(f"  Address: {doc['address']}")
            print(f"  Rating/Opinions: {doc['rating']} ({doc['opinions']})")
            print(f"  Profile URL: {doc['profile_url']}")
        
        print(f"\nSuccessfully scraped and parsed a total of {len(doctors)} doctor records.")
    else:
        print("\nNo doctor records were extracted. Check the selectors in extract_doctor_info.")

Starting crawl. Navigating to https://www.doctoralia.co/
Successfully loaded the homepage.
Checking for cookie consent pop-up...
No cookie consent pop-up detected (or element not found quickly). Proceeding.

--- Performing Search for: Pediatra in Bogotá ---
Typing Specialty: Pediatra...
Specialty suggestion clicked successfully.
Typing City: Bogotá...
City suggestion clicked successfully.
Note: 'Omitir' button not found within timeout (or page loaded directly to results). Proceeding.
Page content (HTML) captured. Parsing data...
Successfully extracted 0 doctor(s) from this search.

Crawl finished. Total doctor records collected: 0

No doctor records were extracted. Check the selectors in extract_doctor_info.


In [20]:
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import nest_asyncio
import time

# Good extraction of the first one!!
BASE_URL = "https://www.doctoralia.co/"

# Define the fixed pairs of specialty and city for the search
SEARCH_PARAMETERS = [
    {"specialty": "Pediatra", "city": "Bogotá"},
    {"specialty": "Cardiología", "city": "Medellín"},
    {"specialty": "Dermatología", "city": "Cali"},
]
# --- SELECTORS IDENTIFIED FROM ANALYSIS ---

# 1. Specialty/Doctor Input Field (Using the unique identifier/placeholder)
SPECIALTY_INPUT_SELECTOR = 'input[placeholder*="especialidad, enfermedad o nombre"]'

# 2. City Input Field (Using the unique identifier/placeholder)
CITY_INPUT_SELECTOR = 'input[placeholder*="p. ej. Bogotá"]'

# 3. Skip/Omit Button that appears after search (Assuming it has the text "Omitir")
OMITIR_BUTTON_SELECTOR = 'text="Omitir"'

# 4. NEW: Cookie Accept Button (Commonly uses the text "Aceptar")
COOKIE_ACCEPT_SELECTOR = 'text="Aceptar"'
# ---------------------

async def select_dropdown_option(page, input_selector, value, description):
    """
    Fills an input field and explicitly clicks the corresponding dropdown suggestion.
    
    Args:
        page: Playwright Page object.
        input_selector: CSS selector for the main input field.
        value: The text to type (and the text expected in the suggestion).
        description: Friendly name for the field (e.g., "Specialty").
    """
    print(f"Typing {description}: {value}...")
    
    # 1. Fill the input field
    await page.locator(input_selector).fill(value, timeout=10000)
    await page.wait_for_timeout(500)

    # 2. Create a selector for the suggestion based on the typed text.
    # We look for a suggestion link or item that contains the exact value we typed.
    # This is often the most reliable way to select from an autocomplete list.
    suggestion_selector = f'a:has-text("{value}")' 
    
    # Alternatively, use a generic list item role:
    # suggestion_selector = f'[role="option"] >> text="{value}"' 
    
    try:
        # 3. Wait for the suggestion to appear and click it
        suggestion_locator = page.locator(suggestion_selector).first
        await suggestion_locator.click(timeout=10000)
        print(f"{description} suggestion clicked successfully.")
        
    except PlaywrightTimeoutError:
        # Fallback: if clicking the specific suggestion fails, try pressing Enter
        print(f"Warning: Specific {description} suggestion not found, pressing Enter as fallback.")
        await page.keyboard.press('Enter')
        
    await page.wait_for_timeout(1000) # Wait a moment for the value to register/page to update

async def scrape_doctoralia():
    """
    Automates the search process on Doctoralia: fills fields by selecting dropdown options, 
    skips the interstitial, and extracts the final search results PAGE CONTENT.
    """
    all_results = [] 

    async with async_playwright() as p:
        # Launch browser with headless=True for faster, non-visual execution
        browser = await p.chromium.launch(headless=True) 
        page = await browser.new_page()
        
        print(f"Starting crawl. Navigating to {BASE_URL}")

        try:
            # STEP 1: Initial Navigation
            await page.goto(BASE_URL, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_load_state('networkidle')
            print("Successfully loaded the homepage.")

            # --- STEP 1.5: Handle Cookie Consent ---
            try:
                print("Checking for cookie consent pop-up...")
                cookie_button = page.locator(COOKIE_ACCEPT_SELECTOR)
                await cookie_button.click(timeout=5000) 
                print("Cookie consent accepted successfully.")
                await page.wait_for_timeout(500)
            except PlaywrightTimeoutError:
                print("No cookie consent pop-up detected (or element not found quickly). Proceeding.")
            
            # --- END Cookie Handling ---


            for params in SEARCH_PARAMETERS:
                specialty = params["specialty"]
                city = params["city"]
                
                print(f"\n--- Performing Search for: {specialty} in {city} ---")
                
                # --- FILLING SEARCH FIELDS WITH DROPDOWN SELECTION ---
                
                # 2a. Select Specialty
                await select_dropdown_option(page, SPECIALTY_INPUT_SELECTOR, specialty, "Specialty")
                
                # 2b. Select City
                await select_dropdown_option(page, CITY_INPUT_SELECTOR, city, "City")
                
                # Wait for the page to navigate to results or show the interstitial
                await page.wait_for_load_state('networkidle')
                print("Initial search completed. Waiting for 'Omitir' button...")


                # --- HANDLING THE INTERACTIVE INTERSTITIAL (OMITIR) ---
                try:
                    omitir_button = page.locator(OMITIR_BUTTON_SELECTOR)
                    # Check visibility to avoid waiting the full timeout if it's not present
                    await omitir_button.wait_for(state='visible', timeout=15000) 
                    
                    print("Found 'Omitir' button. Clicking to proceed to results...")
                    await omitir_button.click()
                    
                    await page.wait_for_load_state('networkidle')
                    
                except PlaywrightTimeoutError:
                    print("Note: 'Omitir' button not found within timeout (or page loaded directly to results). Proceeding.")
                
                
                # --- FINAL CONTENT EXTRACTION ---
                try:
                    page_content = await page.content() 
                    
                    all_results.append({
                        "specialty": specialty,
                        "city": city,
                        "url": page.url,
                        "html_content": page_content
                    })
                    print(f"Page content (HTML) captured successfully. Content size: {len(page_content)} bytes.")

                except Exception as e:
                     print(f"Error capturing page content: {e}")
                

                # Go back to the homepage for the next iteration
                if params != SEARCH_PARAMETERS[-1]:
                    await page.goto(BASE_URL, wait_until="domcontentloaded")
                    # No need to explicitly clear, as we navigate back to the clean homepage


        except PlaywrightTimeoutError:
            print(f"\n[ERROR] Playwright Timeout occurred. Check the network connection or if selectors have changed.")
        except Exception as e:
            print(f"\n[CRITICAL ERROR] An unexpected error occurred: {e}")

        finally:
            await browser.close()
            print(f"\nCrawl finished. Total results collected: {len(all_results)}")
            
    return all_results

if True:
    
    try:
        nest_asyncio.apply()
    except Exception as e:
        print(f"Warning: Could not apply nest-asyncio. Error: {e}")

    # Execute the coroutine
    results = asyncio.run(scrape_doctoralia())
    
    if results:
        print("\n--- Extracted Search Result Summaries ---")
        for item in results:
            # Print a summary of the extracted content
            print(f"[{item['specialty']} in {item['city']}]: URL={item['url']}, HTML Content Size={len(item['html_content'])} bytes.")
        print("\nNote: The full HTML content for each search is stored in the 'html_content' key of the results list.")

Starting crawl. Navigating to https://www.doctoralia.co/
Successfully loaded the homepage.
Checking for cookie consent pop-up...
No cookie consent pop-up detected (or element not found quickly). Proceeding.

--- Performing Search for: Pediatra in Bogotá ---
Typing Specialty: Pediatra...
Specialty suggestion clicked successfully.
Typing City: Bogotá...
City suggestion clicked successfully.
Initial search completed. Waiting for 'Omitir' button...
Note: 'Omitir' button not found within timeout (or page loaded directly to results). Proceeding.
Page content (HTML) captured successfully. Content size: 542789 bytes.

--- Performing Search for: Cardiología in Medellín ---
Typing Specialty: Cardiología...
Typing City: Medellín...
Initial search completed. Waiting for 'Omitir' button...
Note: 'Omitir' button not found within timeout (or page loaded directly to results). Proceeding.
Page content (HTML) captured successfully. Content size: 315028 bytes.

--- Performing Search for: Dermatología in 

In [21]:
results

[{'specialty': 'Pediatra',
  'city': 'Bogotá',
  'url': 'https://www.doctoralia.co/pediatra/bogota',
 {'specialty': 'Cardiología',
  'city': 'Medellín',
  'url': 'https://www.doctoralia.co/',
 {'specialty': 'Dermatología',
  'city': 'Cali',
  'url': 'https://www.doctoralia.co/',