In [None]:
import pandas
import asyncio
import sys
import re
import json
import traceback
from playwright.async_api import async_playwright, Playwright, expect
from playwright_stealth import stealth_async
from collections.abc import Callable

In [None]:
# Tunables shared by the cells below. Every value is expressed in seconds and
# is multiplied by 1000 at the point of use before being handed to Playwright.
configs = dict(
    NavigationDefaultTimeoutSec=5,  # fed to set_default_navigation_timeout
    PageDefaultTimeoutSec=5,        # fed to set_default_timeout
    WaitingSecWithinActions=1,      # pause after each highlighted step
)

class ManualLoopInterrupt(Exception):
    """Raised by a queued coroutine to make ``WebHandler.main_loop`` leave its polling loop."""

In [None]:
class WebHandler:
    """Keep one stealth-patched browser page alive and run coroutines handed to it.

    Schedule ``main_loop()`` as an asyncio task, then assign a coroutine
    function ``async def f(stealth_page, browser_context)`` to ``callee``.
    The loop picks it up on its next poll, awaits it once, and goes back to
    waiting. A callee that raises ``ManualLoopInterrupt`` ends the loop and
    closes the browser; any other ``Exception`` is printed and the loop keeps
    running.

    Args:
        poll_interval_sec: Seconds to sleep between two polls of ``callee``.
    """

    def __init__(self, poll_interval_sec=1):
        # Slot for the next coroutine function to run; None means "idle".
        self.callee = None
        self.poll_interval_sec = poll_interval_sec

    async def main_loop(self):
        """Launch the browser, serve ``callee`` requests until interrupted, then close it.

        The browser context is closed on every exit path, including task
        cancellation and unexpected errors during setup.
        """
        # Still published as a module-level global so the live context stays
        # reachable from the notebook namespace.
        global browser_context
        async with async_playwright() as playwright:
            browser_context = await playwright.chromium.launch_persistent_context(
                '',
                headless=False,
                # NOTE(review): this is a Firefox user agent on a Chromium
                # browser. The version suffix was truncated ("Firefox/131.")
                # and is completed here.
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0',
            )
            try:
                # Config values are in seconds; these setters take milliseconds.
                browser_context.set_default_navigation_timeout(configs['NavigationDefaultTimeoutSec'] * 1000)
                browser_context.set_default_timeout(configs['PageDefaultTimeoutSec'] * 1000)

                # Reuse the page the context starts with; open one if there is none.
                open_pages = browser_context.pages
                stealth_page = open_pages[0] if open_pages else await browser_context.new_page()
                await stealth_async(stealth_page)

                print('Launched!')
                await self._serve_callees(stealth_page, browser_context)
            finally:
                await browser_context.close()

    async def _serve_callees(self, stealth_page, browser_context):
        """Poll ``self.callee`` and await each queued coroutine function once."""
        while True:
            # Claim and clear the slot *before* awaiting, so a callee assigned
            # while this one is still running is not wiped out afterwards.
            callee, self.callee = self.callee, None
            try:
                if callable(callee):
                    await callee(stealth_page, browser_context)
            except ManualLoopInterrupt:
                break
            except Exception:
                # Only Exception: CancelledError and KeyboardInterrupt must
                # propagate so the task can actually be stopped.
                print(traceback.format_exc())
            await asyncio.sleep(self.poll_interval_sec)

In [None]:
wh = WebHandler()
# Needs an already running event loop. Keep a strong reference to the task:
# the event loop itself only holds weak references, so an unreferenced task
# can be garbage-collected before it finishes.
main_loop_task = asyncio.create_task(wh.main_loop())

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    print(type(browser_context))

# Queue the smoke check: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    print(f'stealth_page.url: {stealth_page.url}')
    await stealth_page.goto('https://www.google.com')
    print(f'stealth_page.url: {stealth_page.url}')

# Queue the Google navigation: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    print(f'stealth_page.url: {stealth_page.url}')
    await stealth_page.goto('https://arh.antoinevastel.com/bots/areyouheadless')
    print(f'stealth_page.url: {stealth_page.url}')

# Queue the headless-check navigation: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    msg_div = stealth_page.locator('#res')
    msg_contents = await msg_div.inner_text()
    print(f'msg_contents: {msg_contents}')

# Queue the #res read-out: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    print(f'stealth_page.url: {stealth_page.url}')
    await stealth_page.goto('https://www.useragents.me')
    print(f'stealth_page.url: {stealth_page.url}')

# Queue the useragents.me navigation: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    json_parent_div = stealth_page.locator('#most-common-desktop-useragents-json-csv')
    await json_parent_div.highlight()
    print(f'json_parent_div: {json_parent_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    json_div = json_parent_div.locator('div', has_text='JSON')
    await json_div.highlight()
    print(f'json_div: {json_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    json_textarea = json_div.locator('textarea')
    await json_textarea.highlight()
    print(f'json_textarea: {json_textarea}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    json_text = await json_textarea.input_value()
    parsed_json = json.loads(json_text)
    print(f'json_text: {str(json.dumps(parsed_json, indent=4))}')


# Queue the JSON extraction: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    # stealth_page = await browser_context.new_page()
    # await stealth_async(stealth_page)
    user_agent = await stealth_page.evaluate('() => navigator.userAgent')
    print(f'user_agent: {user_agent}')

# Queue the user-agent probe: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    api_req_context = browser_context.request
    response = await api_req_context.get('https://www.pagina12.com.ar/robots.txt')
    print(response)
    text = await response.text()
    print(text)

# Queue the robots.txt fetch: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    print(f'stealth_page.url: {stealth_page.url}')
    # await stealth_page.goto('https://www.pagina12.com.ar/800250-genealogistas')
    # await stealth_page.goto('https://www.pagina12.com.ar/810583-cambio-el-mundo')
    await stealth_page.goto('https://www.pagina12.com.ar/775639-el-futuro-de-la-ia-y-su-impacto-en-el-conocimiento-cambiara-')
    print(f'stealth_page.url: {stealth_page.url}')

# Queue the article navigation: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    header_div = stealth_page.locator('div.article-header')
    await header_div.highlight()
    print(f'header_div: {header_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    title_div = header_div.locator('h1')
    await title_div.highlight()
    print(f'title_div: {title_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    title_text = await title_div.inner_text()
    print(f'title_text: {title_text}')


# Queue the title extraction: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    desktop_only_div = stealth_page.locator('div.hide-on-mobile')
    await desktop_only_div.highlight()
    print(f'desktop_only_div: {desktop_only_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    article_info_div = desktop_only_div.locator('div.article-info')
    await article_info_div.highlight()
    print(f'article_info_div: {article_info_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    date_time = article_info_div.locator('time')
    await date_time.highlight()
    print(f'date_time: {date_time}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    date_text = await date_time.inner_text()
    print(f'date_text: {date_text}')


# Queue the date extraction: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    header_div = stealth_page.locator('div.article-header')
    await header_div.highlight()
    print(f'header_div: {header_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    author_div = header_div.locator('div.author')
    await author_div.highlight()
    print(f'author_div: {author_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    author_a = author_div.locator('a')
    await author_a.highlight()
    print(f'author_a: {author_a}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    author_text = await author_a.inner_text()
    author_text = re.sub(r'^\s*Por\s+', '', author_text)
    author_text = author_text.strip()
    print(f'author_text: {author_text}')


# Queue the author extraction: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    main_image_div = stealth_page.locator('div.article-main-image')
    await main_image_div.highlight()
    print(f'main_image_div: {main_image_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    image_img = main_image_div.locator('img')
    await image_img.highlight()
    print(f'image_img: {image_img}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    image_src = await image_img.get_attribute('src')
    print(f'image_src: {image_src}')


# Queue the image extraction: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test

In [None]:
async def test(stealth_page, browser_context):
    # raise ManualLoopInterrupt()
    main_content_div = stealth_page.locator('div.article-main-content')
    await main_content_div.highlight()
    print(f'main_content_div: {main_content_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)

    article_text_div = main_content_div.locator('div.article-text')
    await article_text_div.highlight()
    print(f'article_text_div: {article_text_div}')
    await stealth_page.wait_for_timeout(configs['WaitingSecWithinActions'] * 1000)
    
    article_text = await article_text_div.inner_text()
    print(f'date_text: {article_text}')


# Queue the article-body extraction: WebHandler.main_loop polls wh.callee, awaits it once and resets the slot to None.
wh.callee = test