In [1]:
# InfinitePay pages to scrape, in visiting order: the landing page first,
# then one entry per product page.
urls = [
    "https://www.infinitepay.io",
    "https://www.infinitepay.io/maquininha",
    "https://www.infinitepay.io/maquininha-celular",
    "https://www.infinitepay.io/tap-to-pay",
    "https://www.infinitepay.io/pdv",
    "https://www.infinitepay.io/receba-na-hora",
    "https://www.infinitepay.io/gestao-de-cobranca-2",
    "https://www.infinitepay.io/gestao-de-cobranca",
    "https://www.infinitepay.io/link-de-pagamento",
    "https://www.infinitepay.io/loja-online",
    "https://www.infinitepay.io/boleto",
    "https://www.infinitepay.io/conta-digital",
    "https://www.infinitepay.io/conta-pj",
    "https://www.infinitepay.io/pix",
    "https://www.infinitepay.io/pix-parcelado",
    "https://www.infinitepay.io/emprestimo",
    "https://www.infinitepay.io/cartao",
    "https://www.infinitepay.io/rendimento",
]

In [2]:
import json
from pathlib import Path

import selenium
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

In [3]:
# Configure and launch the Chrome instance shared by every helper below.
options = Options()
# Chrome's --window-size switch takes "width,height"; an "x" separator is not
# recognised by a regular (non-headless) browser, so the size would be ignored.
options.add_argument("--window-size=1920,1080")
options.add_argument("--verbose")
# Persistent profile directory passed to Chrome via its user-data-dir switch.
options.add_argument("user-data-dir=/tmp/selenium")

driver = selenium.webdriver.Chrome(options=options)

In [None]:
def go_to_url(url: str) -> str:
    """Load ``url`` in the shared browser and return a confirmation message.

    Surrounding whitespace is stripped before the address is handed to the
    driver; the returned message echoes ``url`` exactly as it was passed in.
    """
    target = url.strip()
    driver.get(target)
    return f"Navigated to URL: {url}"

def get_page_source() -> str:
    """Return the HTML of the page currently loaded in the browser.

    The result is truncated to at most 4,000,000 characters.
    """
    max_chars = 4000000
    html = driver.page_source
    return html[:max_chars]

def get_page_text() -> str:
    """Return the visible text of the current page with unwanted tags removed.

    The page source is parsed with BeautifulSoup, every ``script``, ``style``,
    ``nav``, ``footer``, ``aside``, ``header`` and ``path`` element is dropped,
    and the remaining text nodes are stripped and joined with single spaces.

    The text is returned unescaped. Quotation marks must not be
    backslash-escaped here: ``json.dump`` (used by ``save_kb``) already escapes
    them, and escaping twice leaves literal backslashes in the stored text.

    Returns:
        The cleaned page text.
    """
    page_source = get_page_source()

    soup = BeautifulSoup(page_source, 'html.parser')

    # Remove unwanted tags. The tree is re-queried for each tag name so that
    # elements already destroyed together with an earlier parent are not
    # visited again.
    tags_to_remove = ['script', 'style', 'nav', 'footer', 'aside', 'header', 'path']
    for tag in tags_to_remove:
        for element in soup.find_all(tag):
            element.decompose()

    # Get cleaned text
    return soup.get_text(separator=' ', strip=True)

def create_kb(urls: list[str]) -> dict:
    """Build a knowledge base by scraping each URL in turn.

    Every URL is announced on stdout, opened in the browser, and its cleaned
    page text is stored under the URL itself.

    Returns:
        A dict mapping each URL to the text extracted from its page.
    """
    pages = {}
    for page_url in urls:
        print(f"Scraping {page_url}...")
        go_to_url(page_url)
        pages[page_url] = get_page_text()
    return pages

def save_kb(kb: dict, file_path: str) -> None:
    """Write the knowledge base to ``file_path`` as JSON.

    Missing parent directories are created first, so a finished scrape is not
    lost to a ``FileNotFoundError`` at the very last step.

    Args:
        kb: JSON-serialisable mapping to store.
        file_path: Destination file; overwritten if it already exists.
    """
    destination = Path(file_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    # json.dump emits ASCII-only output by default, so pinning the encoding
    # does not change the bytes written; it only removes the locale dependence.
    with open(destination, 'w', encoding='utf-8') as f:
        json.dump(kb, f)

In [None]:
# Scrape every page and persist the result. The browser is always shut down
# afterwards, even if a page fails, so no Chrome process is left running.
# Re-run the driver-setup cell before scraping again.
try:
    knowledge_base = create_kb(urls)
    save_kb(knowledge_base, "../data/mock_knowledge_base.json")
finally:
    driver.quit()

Scraping https://www.infinitepay.io...
Scraping https://www.infinitepay.io/maquininha...
Scraping https://www.infinitepay.io/maquininha-celular...
Scraping https://www.infinitepay.io/tap-to-pay...
Scraping https://www.infinitepay.io/pdv...
Scraping https://www.infinitepay.io/receba-na-hora...
Scraping https://www.infinitepay.io/gestao-de-cobranca-2...
Scraping https://www.infinitepay.io/gestao-de-cobranca...
Scraping https://www.infinitepay.io/link-de-pagamento...
Scraping https://www.infinitepay.io/loja-online...
Scraping https://www.infinitepay.io/boleto...
Scraping https://www.infinitepay.io/conta-digital...
Scraping https://www.infinitepay.io/conta-pj...
Scraping https://www.infinitepay.io/pix...
Scraping https://www.infinitepay.io/pix-parcelado...
Scraping https://www.infinitepay.io/emprestimo...
Scraping https://www.infinitepay.io/cartao...
Scraping https://www.infinitepay.io/rendimento...
