In [2]:
/** Importation */
const pioneer = await import("jsr:@astral/astral")
const browser = await pioneer.launch({ headless: false })

/**
 * Step by step
 * 1. wait for page to render
 * 2. Select "All" option in "Document Type" dropdown
 * 3. Select "Yesterday" option in "Date Range" dropdown
 * 4. Click "Search" button to query for recorded notices
 * 5. Capture the network request/response with the json response with results
 *
 * NOTE: only the page load is implemented in this cell; steps 2-5 are still
 * to do. For now the cell just prints the page title, link targets and meta
 * tags as a smoke test.
 */
/** Extractions (each function is serialized and executed inside the page) */
const title = () => document.title
const links = () => Array.from(document.querySelectorAll('a'), (anchor) => anchor.href)
const meta_tags = () => Array.from(document.querySelectorAll('meta'), ({ name, content }) => ({ name, content }))

/** Implementation */
try {
  const webpage = await browser.newPage("https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeDocType")

  console.log({
    // url: await webpage.url,
    // cookies: await webpage.cookies(),
    title: await webpage.evaluate(title),
    links: await webpage.evaluate(links),
    meta_tags: await webpage.evaluate(meta_tags)
  })
} finally {
  /** Cleanup: always close the browser, even when navigation or an evaluate call throws */
  await browser.close()
}

{
  title: "Search",
  links: [
    "https://recorderecomm.clarkcountynv.gov/static/updatebrowser.html",
    "",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Shopping/Cart",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Shopping/OrderStatusTracking",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Account/Login",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/#",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeSimpleSearch",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeName",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeParcel",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeInstrumentNumber",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeDocType",
    "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeBookPage",
    "https://recorderecomm.c

In [None]:
Implement this Python code in Deno, using the core Deno library and the Deno standard modules together with the TypeScript version of Playwright or Puppeteer, so that it works exactly the same way in Deno/TypeScript.

from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta
import logging
import subprocess
import sys
import os
from playwright.sync_api import sync_playwright, expect
import time
import csv
from collections import OrderedDict

def delay(ms):
    """Block the current thread for ``ms`` milliseconds.

    Used to slow the scraping process down so it is less likely to be
    flagged as a bot. Example: ``delay(1000)`` sleeps for one second.
    """
    seconds = ms / 1000  # time.sleep() takes seconds, callers pass milliseconds
    time.sleep(seconds)
    


# Import guard: log an actionable install hint when Playwright is missing,
# then re-raise so the module still fails to load.
# NOTE(review): playwright.sync_api is already imported unconditionally in the
# import block at the top of this script, so a missing package would raise
# there first and this guard would never log - confirm which import to keep.
try:
    from playwright.sync_api import sync_playwright
except ImportError:
    logging.error("Playwright is not installed. Please install it using 'pip install playwright'")
    raise

# Arguments passed to the DAG as ``default_args``: a failed task is retried
# once after five minutes, and no e-mail is sent on failure or retry.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(minutes=5),
)

def install_playwright_browsers():
    """Run ``playwright install`` with the current Python interpreter.

    Raises:
        subprocess.CalledProcessError: if the install command exits with a
            non-zero status (the failure is logged before being re-raised).
    """
    # Use the running interpreter so the browsers are installed for the same
    # environment this module executes in.
    install_command = [sys.executable, '-m', 'playwright', 'install']
    try:
        subprocess.check_call(install_command)
    except subprocess.CalledProcessError as e:
        logging.error(f"Failed to install Playwright browsers: {e}")
        raise
    else:
        logging.info("Playwright browsers installed successfully")

def wait_and_click_search_button(page, max_retries=3, retry_delay=5):
    """Click the Search button and wait for the search to finish, with retries.

    Args:
        page: Playwright sync ``Page`` that is showing the search form.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to sleep between a failed attempt and the next.

    Returns:
        True as soon as one attempt runs to completion.

    Raises:
        Exception: the error from the final attempt, once ``max_retries``
            attempts have all failed.
    """
    for attempt in range(max_retries):
        try:
            logging.info(f"Attempt {attempt + 1} to find and click the Search button")

            # Wait for the Search button to be visible and enabled
            search_button = page.wait_for_selector('#SearchBtn:not(:disabled)',
                                                   state='visible',
                                                   timeout=30000)

            # Ensure the button is clickable. wait_for_selector() returns an
            # ElementHandle, which Playwright's expect() does not accept (it
            # only takes Locator/Page/APIResponse), so use the handle's own
            # enabled check instead.
            if not search_button.is_enabled():
                raise Exception("Search button is not enabled")

            # Wait to press the button
            delay(5000)

            # Click the Search button
            search_button.click()
            logging.info("Clicked Search button successfully")

            # Wait for the search results to load
            delay(5000)

            # Wait for some indication that the search has started
            # This could be a loading spinner or a change in the page content
            page.wait_for_selector('.loading-indicator', state='visible', timeout=10000)
            logging.info("Search initiated")

            # Wait for the loading indicator to disappear
            # NOTE(review): 1000 ms is far shorter than every other timeout in
            # this function - confirm it is not a typo for 10000.
            page.wait_for_selector('.loading-indicator', state='hidden', timeout=1000)
            logging.info("Search completed")

            # Wait for the search results to load
            delay(3000)

            return True
        except Exception as e:
            logging.warning(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt < max_retries - 1:
                logging.info(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                logging.error("Max retries reached. Unable to click Search button.")
                raise

def select_document_type(page, max_retries=3, retry_delay=5):
    """Open a dropdown on the search form and pick its "Today" entry, with retries.

    NOTE(review): despite the name, this selects 'Today' from the dropdown at
    'div.col-md-4 > div:nth-of-type(2) span.k-input', which scrape_website
    treats as the Date Range dropdown - confirm the intended name/behaviour.

    Args:
        page: Playwright sync ``Page`` that is showing the search form.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to sleep between a failed attempt and the next.

    Returns:
        True once the option has been clicked.

    Raises:
        Exception: the error from the final attempt, once ``max_retries``
            attempts have all failed.
    """
    for attempt_number in range(1, max_retries + 1):
        try:
            logging.info(f"Attempt {attempt_number} to select 'Today' from the dropdown")

            # Open the dropdown once it is visible.
            dropdown_toggle = page.wait_for_selector(
                'div.col-md-4 > div:nth-of-type(2) span.k-input',
                state='visible',
                timeout=30000,
            )
            dropdown_toggle.click()
            logging.info("Clicked dropdown")

            # The option list is rendered only after the click above.
            page.wait_for_selector('ul.k-list li', state='visible', timeout=10000)

            # Look up the entry whose text is exactly "Today".
            today_option = page.query_selector('ul.k-list li:text-is("Today")')
            if not today_option:
                raise Exception("'Today' option not found in dropdown")

            today_option.click()
            logging.info("Selected 'Today' from dropdown")

            # Pause after selecting the option.
            delay(3000)
            return True

        except Exception as e:
            logging.warning(f"Attempt {attempt_number} failed: {str(e)}")
            if attempt_number >= max_retries:
                logging.error("Max retries reached. Unable to select 'Today' from dropdown")
                raise
            logging.info(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

def _run_yesterday_search(page):
    """Load the search form, pick 'Yesterday' in the Date Range dropdown and search."""
    # Set viewport size
    page.set_viewport_size({"width": 1114, "height": 1033})

    # Navigate directly to the search page
    page.goto("https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeDocType")
    logging.info("Navigated directly to the search page")

    # Wait for the page to load
    page.wait_for_load_state('networkidle', timeout=50000)

    delay(5000)
    # Click on the second dropdown (Date Range)
    page.click('div.col-md-4 > div:nth-of-type(2) span.k-input')
    logging.info("Clicked on Date Range dropdown")
    delay(5000)
    # Select 'Yesterday' from the Date Range dropdown
    page.click('text="Yesterday"')
    logging.info("Selected 'Yesterday' from Date Range dropdown")

    page.wait_for_load_state('networkidle', timeout=50000)
    delay(5000)
    # Click the Search button
    search_button = page.wait_for_selector('#SearchBtn', state='visible')
    search_button.click()
    logging.info("Clicked Search button for 'Yesterday'")
    delay(5000)
    # Add another wait after clicking the Search button
    page.wait_for_load_state('networkidle', timeout=50000)


def _retry_search_until_clear(page, max_retries=3, wait_time=3):
    """Repeat the search while the page shows one of the two known error messages.

    Up to ``max_retries`` retries are made. The page is checked again after
    the final retry, so a retry that succeeds is never reported as a failure.
    The Search button is looked up again by selector on every retry instead of
    reusing an element handle obtained before the page reloaded.

    Raises:
        Exception: if an error message is still present after all retries.
    """
    for attempt in range(max_retries + 1):
        too_soon_error_message = page.query_selector('text="You have attempted to perform a search too quickly. Please try again"')
        no_results_found_error = page.query_selector('text="No search results found. Please try your search again."')

        if not no_results_found_error and not too_soon_error_message:
            logging.info("No error messages detected. Continuing with the script.")
            return

        if attempt == max_retries:
            # Every retry has been used and the page still shows an error.
            break

        if no_results_found_error:
            # Let the page settle before touching the form again
            page.wait_for_load_state('networkidle', timeout=50000)

            # Click on the Date Range dropdown again
            page.click('div.col-md-4 > div:nth-of-type(2) span.k-select')
            logging.info("Clicked on Date Range dropdown again")

            # Select 'Yesterday' from the Date Range dropdown
            page.click('text="Yesterday"')
            logging.info("Selected 'Yesterday' from Date Range dropdown")

            # Click the Search button again
            page.click('#SearchBtn')
            logging.info("Clicked Search button for 'Yesterday'")

            # Wait for the search results to load
            page.wait_for_load_state('networkidle', timeout=30000)
        else:
            logging.info(f"Detected error message. Attempt {attempt + 1}: Waiting {wait_time} seconds before retrying.")
            time.sleep(wait_time)

            # Re-click the Search button
            page.click('#SearchBtn')
            logging.info("Clicked Search button again")

            # Wait for the search results to load
            page.wait_for_load_state('networkidle', timeout=50000)

            # Increase wait time for next attempt
            wait_time *= 2

    error_message = f"Failed to get past the error messages after {max_retries} attempts."
    logging.error(error_message)
    raise Exception(error_message)


def _export_results_to_csv(page):
    """Click 'Export to CSV' and save the download under ../downloads next to this file."""
    # Wait for the 'Export to CSV' button to be visible
    export_button = page.wait_for_selector('text="Export to CSV"', state='visible', timeout=30000)
    if not export_button:
        logging.error("'Export to CSV' button not found")
        raise Exception("'Export to CSV' button not found")

    # Set up a download handler before clicking the button
    with page.expect_download() as download_info:
        export_button.click()
        logging.info("Clicked 'Export to CSV' button")

    # Wait for the download to complete
    download = download_info.value
    logging.info("CSV export initiated")

    # Save the downloaded file; the name carries the date of the run
    date_str = datetime.now().strftime("%Y-%m-%d")
    download_path = os.path.join(os.path.dirname(__file__), '..', 'downloads',  f'clark_county_records.{date_str}.csv')
    download.save_as(download_path)
    logging.info("CSV file downloaded and saved successfully")

    delay(10000)


def scrape_website(**kwargs):
    """Search the Clark County recorder site for yesterday's records and export them to CSV.

    Args:
        **kwargs: Accepted so the function can be used as an Airflow
            ``python_callable``; the values are not read.

    Raises:
        Exception: any failure during scraping is logged and re-raised so the
            calling task is marked as failed.
    """
    logging.info("Starting web scraping task")
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=False)  # Set to True for production
            try:
                context = browser.new_context()
                page = context.new_page()
                logging.info("Browser launched successfully")

                _run_yesterday_search(page)
                _retry_search_until_clear(page)
                _export_results_to_csv(page)
            finally:
                # Close exactly once, and while Playwright is still running:
                # after the surrounding ``with`` block exits the driver is
                # stopped and the browser object can no longer be used.
                browser.close()
                logging.info("Browser closed")
    except Exception as e:
        logging.error(f"An error occurred during web scraping: {str(e)}")
        raise  # Re-raise the exception to mark the task as failed

# Daily DAG with two tasks: install the Playwright browsers, then scrape.
with DAG(
    'clark_county_scraping_dag',
    default_args=default_args,
    description='A DAG for scraping Clark County records',
    schedule_interval=timedelta(days=1),
    start_date=datetime(2024, 1, 1),
    catchup=False,  # do not backfill runs between start_date and now
) as dag:

    # Runs `python -m playwright install` so the browser binaries exist
    # before the scraper starts.
    install_browsers_task = PythonOperator(
        task_id='install_playwright_browsers',
        python_callable=install_playwright_browsers,
    )

    # provide_context is intentionally not passed: it is deprecated since
    # Airflow 2.0, where the task context is handed automatically to
    # callables that accept **kwargs (as scrape_website does).
    scrape_task = PythonOperator(
        task_id='scrape_clark_county_records',
        python_callable=scrape_website,
    )

    # The scraper only runs after the browser install task has succeeded.
    install_browsers_task >> scrape_task

In [None]:
{
    "title": "nv.clark-county",
    "steps": [
        {
            "type": "setViewport",
            "width": 920,
            "height": 813,
            "deviceScaleFactor": 1,
            "isMobile": false,
            "hasTouch": false,
            "isLandscape": false
        },
        {
            "type": "navigate",
            "assertedEvents": [
                {
                    "type": "navigation",
                    "url": "https://www.google.com/",
                    "title": "Google"
                }
            ],
            "url": "https://www.google.com/"
        },
        {
            "type": "keyDown",
            "target": "main",
            "key": "Meta"
        },
        {
            "type": "keyDown",
            "target": "main",
            "key": "l"
        },
        {
            "type": "navigate",
            "assertedEvents": [
                {
                    "type": "navigation",
                    "url": "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeDocType",
                    "title": ""
                }
            ],
            "url": "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeDocType"
        },
        {
            "type": "click",
            "target": "main",
            "selectors": [
                [
                    "xpath///*[@id=\"SearchCriteria\"]/div[1]/div[1]/div/div/div"
                ]
            ],
            "offsetX": 126.6953125,
            "offsetY": 17
        },
        {
            "type": "click",
            "target": "main",
            "selectors": [
                [
                    "aria/All"
                ],
                [
                    "xpath///*[@id=\"f2a93df7-2fa8-408f-9997-0212c0c9637c\"]"
                ]
            ],
            "offsetX": 86.6953125,
            "offsetY": 13.90625
        },
        {
            "type": "keyDown",
            "target": "main",
            "key": "Escape"
        },
        {
            "type": "keyUp",
            "target": "main",
            "key": "Escape"
        },
        {
            "type": "click",
            "target": "main",
            "selectors": [
                [
                    "xpath///*[@id=\"SearchCriteria\"]/div[1]/div[2]/div/span[2]/span/span[1]"
                ]
            ],
            "offsetX": 104.75,
            "offsetY": 4.90625
        },
        {
            "type": "click",
            "target": "main",
            "selectors": [
                [
                    "aria/Yesterday"
                ],
                [
                    "xpath///*[@id=\"DateRangeDropDown_listbox\"]/li[4]"
                ],
                [
                    "text/Yesterday"
                ]
            ],
            "offsetX": 85.75,
            "offsetY": 6.2109375
        },
        {
            "type": "click",
            "assertedEvents": [
                {
                    "type": "navigation",
                    "url": "https://recorderecomm.clarkcountynv.gov/AcclaimWeb/Search/SearchTypeDoctype",
                    "title": "Search"
                }
            ],
            "target": "main",
            "selectors": [
                [
                    "aria/Search[role=\"button\"]"
                ],
                [
                    "xpath///*[@id=\"SearchBtn\"]"
                ]
            ],
            "offsetX": 34.3515625,
            "offsetY": 12.90625
        },
        {
            "type": "click",
            "assertedEvents": [
                {
                    "type": "navigation"
                }
            ],
            "target": "main",
            "selectors": [
                [
                    "aria/Export to CSV"
                ],
                [
                    "xpath///*[@id=\"SearchResultGrid\"]/div[1]/a[1]"
                ],
                [
                    "text/Export to CSV"
                ]
            ],
            "offsetX": 39.96875,
            "offsetY": 11.59375
        }
    ]
}