# web

> Tools for the web / html

In [None]:
#| default_exp web

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import re
from bs4 import BeautifulSoup

In [None]:
#| export
def clean_text(soup: BeautifulSoup) -> str:
    """Extract the text of parsed HTML and collapse runs of blank lines.

    Args:
        soup: Parsed document (or tag) whose `get_text()` supplies the raw text.

    Returns:
        The text with whitespace-only lines replaced by newlines and every run
        of three or more newlines reduced to two, so blocks of text end up
        separated by at most one blank line.
    """
    text = soup.get_text()
    # convert any white space only lines to newlines
    text = re.sub(r'^\s*$', '\n', text, flags=re.MULTILINE)
    # squish any 3 or more consecutive newlines to 2 newlines
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text

In [None]:
from bs4 import BeautifulSoup




Example Document

Paragraph 1.
Paragraph 2.

Above is a line containing only tabs.
After running clean_text, the tabs are stripped.
Paragraph 3.
Paragraph 4.




In [None]:

# Sample document for exercising `clean_text`: it mixes runs of empty lines
# with a line made up only of tabs (the `\t` escapes below become real tabs).
example = """
<html>
<head><title>Example Document</title></head>
<body>
<p>Paragraph 1.</p>

<p>Paragraph 2.</p>

\t\t\t
Above is a line containing only tabs.
After running clean_text, the tabs are stripped.
<p>Paragraph 3.</p>



<p>Paragraph 4.</p>
</body>
</html>
"""
# Parse the sample and print the cleaned-up text.
soup = BeautifulSoup(example, 'html.parser')
print(clean_text(soup))

In [None]:
#| export
from pathlib import Path

In [None]:
#| export
async def request(url: str=None, search: str=None, browser="brave") -> str:
    """Load a page in a headless Chromium-based browser and return its HTML.

    Args:
        url: Address to open. Takes precedence over `search` when both are given.
        search: Query typed into the DuckDuckGo search box when `url` is not given.
        browser: `"brave"` launches the executable at `/usr/bin/brave-browser`;
            any other value launches Playwright's default Chromium.

    Returns:
        The result of `page.content()` once the page reaches the "networkidle"
        load state. If neither `url` nor `search` is given, no navigation is
        performed and the content of the freshly opened page is returned.

    Raises:
        ImportError: If `get_ipython` is present in this module's globals and
            `nest_asyncio` cannot be imported.
    """
    from playwright.async_api import async_playwright

    # if we are inside a Jupyter notebook, we have to patch the event loop
    if "get_ipython" in globals():
        try:
            import nest_asyncio
        except ImportError as e:
            # `e.args` is a tuple and cannot be appended to, so raise a new
            # ImportError chained to the original one instead.
            raise ImportError(
                "Detected we are inside a IPython or Jupyter environment. Import requires access to an event loop. Please install nest_asyncio to enable this functionality"
            ) from e
        else:
            nest_asyncio.apply()

    launch_options = {"headless": True}
    if browser == "brave":
        launch_options["executable_path"] = Path("/usr/bin/brave-browser")

    async with async_playwright() as p:
        # Use a separate name so the `browser` argument is not overwritten.
        instance = await p.chromium.launch(**launch_options)
        try:
            page = await instance.new_page()
            if url:
                await page.goto(url)
            elif search:
                await page.goto("https://duckduckgo.com")
                await page.click('input[name="q"]')
                await page.type('input[name="q"]', search)
                await page.press('input[name="q"]', "Enter")
            await page.wait_for_load_state("networkidle")
            return await page.content()
        finally:
            # Close the browser even when navigation or rendering fails.
            await instance.close()

We test our `request` function on a URL that is known to require client-side JavaScript rendering:

In [None]:
# Fetch the page through the browser-backed `request` helper.
r = await request("http://quotes.toscrape.com/js/") 
# Markup expected to appear in the rendered page source.
javascript_rendered_element = """<div class="quote"><span class="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span><span>by <small class="author">Albert Einstein</small></span><div class="tags">Tags: <a class="tag">inspirational</a> <a class="tag">life</a> <a class="tag">live</a> <a class="tag">miracle</a> <a class="tag">miracles</a></div></div>"""
# The element must be present in the HTML returned by `request`.
assert javascript_rendered_element in r

## Using DuckDuckGo for search

In [None]:
#| export
from pathlib import Path
import asyncio
import json
from playwright.async_api import async_playwright


In [None]:
#| export
async def handle_response(response, data):
    """Resolve `data` with the search results found in a DuckDuckGo `d.js` response.

    Intended to be called for every response a page receives. Responses whose
    URL does not contain `links.duckduckgo.com/d.js`, or whose body does not
    contain the results payload, leave `data` untouched.

    Args:
        response: Object exposing a `url` string and an awaitable `body()`
            that returns the raw bytes (e.g. the responses Playwright passes
            to a page's "response" listeners).
        data: Future that receives a list of `[u, t, a]` lists, one per payload
            entry with a truthy `"a"` field. It is only set if not already done.

    Raises:
        json.JSONDecodeError: If the extracted payload is not valid JSON.
    """
    if "links.duckduckgo.com/d.js" not in response.url:
        return

    # Decode the bytes instead of parsing their repr: the repr's quoting
    # varies with the content and non-ASCII text would come out as mojibake.
    text = (await response.body()).decode("utf-8", errors="replace")

    # by inspection, we know the data lives between these two function calls
    start = text.find("DDG.pageLayout.load('d")
    if start == -1:
        return  # this response does not carry the results payload
    end = text.find("DDG.duckbar.load('images", start)
    section = text[start:end] if end != -1 else text[start:]

    # after finding the location of the data, we extract out the relevant json
    first = section.find("[")
    last = section.rfind("]")
    if first == -1 or last < first:
        return
    entries = json.loads(section[first:last + 1])
    rows = [[r.get("u"), r.get("t"), r.get("a")] for r in entries if r.get("a")]

    # A later matching response must not raise InvalidStateError.
    if not data.done():
        data.set_result(rows)

async def ddg(q, wait=1, browser="brave", headless=True):
    """Run a DuckDuckGo search in a browser and return the results and page HTML.

    Args:
        q: Query typed into the DuckDuckGo search box.
        wait: Seconds to sleep after the page HTML has been captured and before
            returning; a falsy value skips the sleep.
        browser: `"brave"` launches the executable at `/usr/bin/brave-browser`;
            any other value launches Playwright's default Chromium.
        headless: Passed through to the browser launch call.

    Returns:
        A `(data, content)` tuple: `data` is the value `handle_response` sets
        on the results future, `content` is the result of `page.content()`.
        Both values are also printed to stdout.

    Raises:
        ImportError: If `get_ipython` is present in this module's globals and
            `nest_asyncio` cannot be imported.
    """
    # if we are inside a Jupyter notebook, we have to patch the event loop
    if "get_ipython" in globals():
        try:
            import nest_asyncio
        except ImportError as e:
            # `e.args` is a tuple and cannot be appended to, so raise a new
            # ImportError chained to the original one instead.
            raise ImportError(
                "Detected we are inside a IPython or Jupyter environment. Import requires access to an event loop. Please install nest_asyncio to enable this functionality"
            ) from e
        else:
            nest_asyncio.apply()

    launch_options = {"headless": headless}
    if browser == "brave":
        launch_options["executable_path"] = Path("/usr/bin/brave-browser")

    async with async_playwright() as p:
        # Use a separate name so the `browser` argument is not overwritten.
        instance = await p.chromium.launch(**launch_options)
        try:
            page = await instance.new_page()
            results = asyncio.get_running_loop().create_future()
            # Keep references to the handler tasks so they cannot be garbage
            # collected before they finish.
            pending = set()

            def on_response(response):
                task = asyncio.create_task(handle_response(response, results))
                pending.add(task)
                task.add_done_callback(pending.discard)

            page.on("response", on_response)
            await page.goto("https://duckduckgo.com")
            await page.click('input[name="q"]')
            await page.type('input[name="q"]', q)
            await page.press('input[name="q"]', "Enter")
            data = await results
            content = await page.content()
            # NOTE(review): the sleep happens after the HTML is captured, so it
            # does not affect `content` — confirm whether that is intended.
            if wait:
                await asyncio.sleep(wait)
            # NOTE(review): these prints look like debugging output; kept so
            # the function's observable behaviour is unchanged.
            print(data)
            print()
            print(content)
            return data, content
        finally:
            await instance.close()


In [None]:
# Run a sample search in a visible (non-headless) browser window, sleeping
# 10 seconds before `ddg` returns.
data, content = await ddg("chicken burger", wait=10, headless=False)

In [None]:
# Display the page HTML returned by `ddg`.
content

'<!DOCTYPE html><html lang="en-US" class="has-zcm  is-link-style-exp is-link-order-exp is-link-breadcrumb-exp is-related-search-exp is-vertical-tabs-exp js no-touch opacity csstransforms3d csstransitions svg cssfilters is-not-mobile-device full-urls breadcrumb-urls react has-footer has-right-rail-module" style="--sds-color-palette-black:#000000; --sds-color-palette-white:#FFFFFF; --sds-color-palette-transparent:transparent; --sds-color-palette-gray-100:#111111; --sds-color-palette-gray-90:#222222; --sds-color-palette-gray-85:#333333; --sds-color-palette-gray-80:#444444; --sds-color-palette-gray-70:#666666; --sds-color-palette-gray-60:#888888; --sds-color-palette-gray-50:#aaaaaa; --sds-color-palette-gray-40:#cccccc; --sds-color-palette-gray-30:#dddddd; --sds-color-palette-gray-20:#eeeeee; --sds-color-palette-gray-10:#f5f5f5; --sds-color-palette-gray-0:#fafafa; --sds-color-palette-red-100:#330B01; --sds-color-palette-red-90:#551605; --sds-color-palette-red-80:#77230C; --sds-color-palette

## Clients

In [None]:
#| export
from httpx import AsyncClient
from playwright.async_api import async_playwright
import asyncio

In [None]:
#| export
# `nest_asyncio` allows asyncio to run nested event loops, which is often necessary
# in a Jupyter notebook because the kernel itself is running an event loop.
# Without this, using asyncio-based libraries like httpx or aiohttp can cause errors.
if "get_ipython" in globals():
    try:
        import nest_asyncio
    except ImportError as e:
        # Adjacent string literals are concatenated as-is, so each sentence
        # carries its own trailing space.
        raise ImportError(
            "Detected we are inside a IPython or Jupyter environment. "
            "Import requires access to an event loop. "
            "Please install nest_asyncio to enable this functionality"
        ) from e
    else:
        nest_asyncio.apply()

In [None]:
class HttpClient:
    """Small asynchronous HTTP client built around a single `AsyncClient`."""

    def __init__(self):
        # One AsyncClient is created up front and reused for every request.
        self.client = AsyncClient()

    async def get(self, url):
        """Issue a GET request for `url` and return the client's response."""
        response = await self.client.get(url)
        return response

    async def close(self):
        """Shut the wrapped client down through its `aclose()` coroutine."""
        await self.client.aclose()
        
class HttpxClient:
    """Asynchronous HTTP client backed by a single httpx `AsyncClient`."""

    def __init__(self):
        # One AsyncClient is created up front and reused for every request.
        self.client = AsyncClient()

    async def get(self, url):
        """Send a GET request to `url` and return the client's response."""
        return await self.client.get(url)

    async def close(self):
        """Close the wrapped client so it can be released by callers."""
        await self.client.aclose()


class PlaywrightClient:
    """Client that fetches pages through a Playwright-driven Chromium browser.

    Playwright and the browser are started lazily on the first `get()` call
    (or explicitly through `start()`) and released by `close()`.
    """

    def __init__(self):
        self.playwright = None  # Playwright handle, set by start()
        self.browser = None  # launched browser, set by start()

    async def start(self):
        """Start Playwright and launch Chromium, skipping whatever is already running."""
        if self.playwright is None:
            # `start()` is the public counterpart of the `stop()` call in close().
            self.playwright = await async_playwright().start()
        if self.browser is None:
            self.browser = await self.playwright.chromium.launch()

    async def get(self, url):
        """Open `url` in a new page and return the result of `page.content()`."""
        if not self.browser:
            await self.start()
        page = await self.browser.new_page()
        try:
            await page.goto(url)
            return await page.content()
        finally:
            # Close the page even when navigation fails.
            await page.close()

    async def close(self):
        """Close the browser and stop Playwright; calling it again is a no-op."""
        # Clear the attributes first so a repeated call finds nothing to close.
        browser, self.browser = self.browser, None
        playwright, self.playwright = self.playwright, None
        try:
            if browser:
                await browser.close()
        finally:
            if playwright:
                await playwright.stop()


In [None]:
# Example usage:
async def main():
    """Fetch the same URL with each client and print what it returns."""
    url = "https://example.com"

    # Using HttpClient
    client = HttpClient()
    try:
        response = await client.get(url)
        print(response.text)
    finally:
        # Release the client even if the request fails.
        await client.close()

    # Using PlaywrightClient
    client = PlaywrightClient()
    try:
        response = await client.get(url)
        print(response)
    finally:
        # Release the browser even if the request fails.
        await client.close()

asyncio.run(main())

## Scrapers

In [None]:
import requests

from bs4 import BeautifulSoup
import json


In [None]:

class Scraper:
    """Fetch a URL with a pluggable client and parse the body by content type.

    Args:
        client: Object with a `get(url)` method returning a response that
            exposes `headers` and `content`. Defaults to `requests.Session()`.
        parser: Callable `(content, content_type) -> data`. Defaults to
            `default_parser`.
        serializer: Callable `(data, path)`. Defaults to `default_serializer`.
            It is stored on the instance; `scrape` does not call it.
        request_hook: Optional callable invoked with the URL before the request.
        response_hook: Optional callable invoked with the response afterwards.
    """

    def __init__(self, client=None, parser=None, serializer=None, request_hook=None, response_hook=None):
        self.client = client if client is not None else requests.Session()
        self.parser = parser if parser is not None else self.default_parser
        self.serializer = serializer if serializer is not None else self.default_serializer
        self.request_hook = request_hook
        self.response_hook = response_hook

    def default_parser(self, content, content_type):
        """Parse `content` according to `content_type`.

        Returns a `BeautifulSoup` tree for HTML, the decoded object for JSON,
        and `content` unchanged for anything else (including a missing type).
        """
        # Media types are case-insensitive, so normalise before matching.
        kind = (content_type or '').lower()
        if 'html' in kind:
            return BeautifulSoup(content, 'html.parser')
        if 'json' in kind:
            return json.loads(content)
        return content

    def default_serializer(self, data, path):
        """Write `data` to `path` as JSON."""
        # Explicit encoding keeps the output independent of the platform default.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f)

    def scrape(self, url):
        """Request `url`, run the hooks, and return the parsed response body."""
        # Call the request hook if it's set
        if self.request_hook:
            self.request_hook(url)

        # Make the request
        response = self.client.get(url)

        # Call the response hook if it's set
        if self.response_hook:
            self.response_hook(response)

        # Parse the response content
        content_type = response.headers.get('Content-Type', '')
        return self.parser(response.content, content_type)

# Usage:
def log_request(url):
    """Request hook: announce on stdout which URL is about to be fetched."""
    message = f"Making a request to {url}"
    print(message)

def log_response(response):
    """Response hook: report the response's `status_code` on stdout."""
    status = response.status_code
    print(f"Received a response with status code {status}")

# Build a scraper with the default client/parser and both logging hooks attached.
scraper = Scraper(request_hook=log_request, response_hook=log_response)
# Fetch a sample endpoint; the hooks print before and after the request.
data = scraper.scrape('https://httpbin.org/get')

# Do something with the data...
print(data)

In [None]:

class Scraper:
    """Fetch a URL with a pluggable client and parse the body by content type.

    Args:
        client: Object with a `get(url)` method returning a response that
            exposes `headers` and `content`. Defaults to `requests.Session()`.
        parser: Callable `(content, content_type) -> data`. Defaults to
            `default_parser`.
        serializer: Callable `(data, path)`. Defaults to `default_serializer`.
            It is stored on the instance; `scrape` does not call it.
        request_hook: Optional callable invoked with the URL before the request.
        response_hook: Optional callable invoked with the response afterwards.
    """

    def __init__(self, client=None, parser=None, serializer=None, request_hook=None, response_hook=None):
        self.client = client if client is not None else requests.Session()
        self.parser = parser if parser is not None else self.default_parser
        self.serializer = serializer if serializer is not None else self.default_serializer
        self.request_hook = request_hook
        self.response_hook = response_hook

    def default_parser(self, content, content_type):
        """Parse `content` according to `content_type`.

        Returns a `BeautifulSoup` tree for HTML, the decoded object for JSON,
        and `content` unchanged for anything else (including a missing type).
        """
        # Media types are case-insensitive, so normalise before matching.
        kind = (content_type or '').lower()
        if 'html' in kind:
            return BeautifulSoup(content, 'html.parser')
        if 'json' in kind:
            return json.loads(content)
        return content

    def default_serializer(self, data, path):
        """Write `data` to `path` as JSON."""
        # Explicit encoding keeps the output independent of the platform default.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f)

    def scrape(self, url):
        """Request `url`, run the hooks, and return the parsed response body."""
        # Call the request hook if it's set
        if self.request_hook:
            self.request_hook(url)

        # Make the request
        response = self.client.get(url)

        # Call the response hook if it's set
        if self.response_hook:
            self.response_hook(response)

        # Parse the response content
        content_type = response.headers.get('Content-Type', '')
        return self.parser(response.content, content_type)

# Usage:
def log_request(url):
    """Request hook: announce on stdout which URL is about to be fetched."""
    message = f"Making a request to {url}"
    print(message)

def log_response(response):
    """Response hook: report the response's `status_code` on stdout."""
    status = response.status_code
    print(f"Received a response with status code {status}")

# Build a scraper with the default client/parser and both logging hooks attached.
scraper = Scraper(request_hook=log_request, response_hook=log_response)
# Fetch a sample endpoint; the hooks print before and after the request.
data = scraper.scrape('https://httpbin.org/get')

# Do something with the data...
print(data)