In [None]:

async def _extract_result(element, site):
    """Extract the optional title, link and time fields from one matched element.

    Args:
        element: element handle returned by ``page.query_selector_all``.
        site: site configuration mapping. The optional keys ``select_title``,
            ``select_link``, ``select_time`` (selectors evaluated inside
            ``element``) and ``base_url`` are read here.

    Returns:
        dict with any of the keys ``title``, ``link``, ``link_full``,
        ``link_hash`` and ``time`` that could be extracted (possibly empty).
    """
    import hashlib

    found = {}

    # select title in element
    select_title = site.get('select_title')
    if select_title:
        print(f"Searching for {select_title} ...")
        title = await element.query_selector(select_title)
        if title:
            title_text = await title.inner_text()
            found['title'] = title_text
            print(title_text)

    # select link in element
    select_link = site.get('select_link')
    if select_link:
        link = await element.query_selector(select_link)
        link_href = await link.get_attribute('href') if link else None
        if link_href:
            # 'link' always keeps the raw href, query string included
            found['link'] = link_href

            # if link is relative, add base url (only when one is configured)
            base_url = site.get('base_url')
            if base_url and link_href.startswith('/'):
                # drop the query string before building the full link
                link_full = f"{base_url}{link_href.split('?')[0]}"
                found['link_full'] = link_full
                # md5 is used as a compact identifier of the link, not for security
                found['link_hash'] = hashlib.md5(link_full.encode()).hexdigest()

    # select time in element
    select_time = site.get('select_time')
    if select_time:
        time_element = await element.query_selector(select_time)
        if time_element:
            time_text = await time_element.get_attribute('datetime')
            if time_text:
                found['time'] = time_text

    return found


async def scrap_page (url, index, site):
    """Load ``url`` in headless Chromium, scrape it and save a screenshot.

    Args:
        url: page to open. When empty/None the function returns immediately.
        index: label inserted into the screenshot filename.
        site: configuration dict, mutated in place (one dict per matched
            element is appended to ``site['results']``). Keys read:
            ``selector`` (elements to collect), ``select_title``,
            ``select_link``, ``select_time``, ``base_url`` (per-element
            extraction), ``name`` (default ``'unknown'``) and ``output``
            (default ``'.'``) for the screenshot path, and ``show_image``
            (default False) to display the screenshot afterwards.

    Returns:
        Number of elements matched by ``site['selector']`` (0 when no selector
        is configured or scraping failed before the query). Returns None when
        ``url`` is empty.

    Any exception raised while scraping is printed and swallowed (best
    effort); the browser and the Playwright driver are released in all cases.
    """
    import os.path

    nb_found = 0
    # Stays None until a screenshot has actually been written, so the display
    # step below never touches an undefined name after a failed scrape.
    target_file = None

    # if url is not set, return
    if not url:
        return

    try:
        # Imported here so a missing Playwright install is reported like any
        # other scraping error instead of breaking the cell at definition time.
        from playwright.async_api import async_playwright

        # The context manager stops the Playwright driver even on errors.
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch()
            try:
                page = await browser.new_page()

                print(f"Scraping {url} ...")
                await page.goto(url)

                # wait for page to load
                await page.wait_for_load_state('networkidle')

                # scrap selector
                if 'selector' in site:
                    selector = site['selector']
                    print(f"Searching for {selector} ...")

                    elements = await page.query_selector_all(selector)
                    nb_found = len(elements)
                    print(f"Found {len(elements)} results")

                    for element in elements:
                        html = await element.inner_html()
                        print(html)

                        found = await _extract_result(element, site)
                        print(found)
                        site.setdefault('results', []).append(found)

                # save screenshot
                name = site.get('name', 'unknown')
                output = site.get('output', '.')
                screenshot_path = f"{output}/screenshot-{name}-{index}.png"
                await page.screenshot(path=screenshot_path, full_page=True)
                target_file = screenshot_path
            finally:
                # always release the browser process, even when scraping failed
                await browser.close()
    except Exception as e:
        print(e)

    print(site)

    # optional: display screenshot (only if one was written and still exists)
    if site.get('show_image', False) and target_file is not None:
        if os.path.isfile(target_file):
            # import display explicitly rather than relying on the notebook global
            from IPython.display import Image, display
            display(Image(filename=target_file))

    # return number of found elements
    return nb_found

In [None]:

#build index with timestamp like 2023-01-01-12-00
from datetime import datetime
now = datetime.now()
index = now.strftime("%Y%m%d-%H%M")
print(index)
await scrap_page('https://applh.com/', index, {
    'output': './my-output/',
    'name': 'applh',
    'index': index,
})


In [None]:
# Delete regular files under ./output that were last modified more than
# 15 minutes ago (-mmin +15).
# NOTE(review): the scrape cell in this notebook writes to ./my-output/, not
# ./output — confirm this is the directory that should be cleaned.
!find ./output -type f -mmin +15 -delete