In [1]:
from dotenv import load_dotenv

In [2]:
load_dotenv()

False

In [8]:
import re, os
import tiktoken
import logging
import requests

from bs4 import BeautifulSoup

from langchain_community.document_loaders import RecursiveUrlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import SKLearnVectorStore

In [9]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
def extract_urls_from_sitemap(url_sitemap):
    """
    Extract URLs and metadata from a sitemap XML file.
    PS: Designed for RoboCup Small Size League sitemap
    
    Args:
        url_sitemap (str): URL of the sitemap to parse
        
    Returns:
        list: List of dictionaries containing URL information
        Each dictionary contains:
            - url: The page URL
            - lastmod: Last modification date
            - changefreq: How often the page changes
            - priority: Page priority (0.0 to 1.0)
    """
    
    try:
        # Fetch sitemap
        response = requests.get(url_sitemap)
        response.raise_for_status()  # Raise exception for bad status codes
        
        # Parse XML with lxml parser
        soup = BeautifulSoup(response.content, 'lxml-xml')
        
        # Initialize results list
        urls_info = []
        
        # Check if this is a sitemap index
        if soup.find('sitemapindex'):
            logger.info("Found sitemap index, processing sub-sitemaps...")
            # Get all sitemap URLs
            sitemap_urls = [sitemap.find('loc').text for sitemap in soup.find_all('sitemap')]
            
            # Process each sub-sitemap
            for sitemap_url in sitemap_urls:
                try:
                    sub_response = requests.get(sitemap_url)
                    sub_response.raise_for_status()
                    sub_soup = BeautifulSoup(sub_response.content, 'lxml-xml')
                    
                    # Process URLs in sub-sitemap
                    for url_elem in sub_soup.find_all('url'):
                        url_info = extract_url_info(url_elem)
                        if url_info:
                            urls_info.append(url_info)
                            
                except Exception as e:
                    logger.error(f"Error processing sub-sitemap {sitemap_url}: {str(e)}")
                    continue
                    
        else:
            # Process URLs in single sitemap
            logger.info("Processing single sitemap...")
            for url_elem in soup.find_all('url'):
                url_info = extract_url_info(url_elem)
                if url_info:
                    urls_info.append(url_info)
        
        logger.info(f"Successfully extracted {len(urls_info)} URLs from sitemap")
        return urls_info
        
    except requests.RequestException as e:
        logger.error(f"Error fetching sitemap: {str(e)}")
        return []
    except Exception as e:
        logger.error(f"Unexpected error processing sitemap: {str(e)}")
        return []

def extract_url_info(url_elem):
    """
    Extract information from a URL element in the sitemap.
    
    Args:
        url_elem: BeautifulSoup element containing URL information
        
    Returns:
        dict: Dictionary containing URL information or None if invalid
    """
    try:
        # Extract required URL
        url = url_elem.find('loc')
        if not url:
            return None
            
        # Extract optional fields with defaults
        lastmod = url_elem.find('lastmod')
        changefreq = url_elem.find('changefreq')
        priority = url_elem.find('priority')
        
        # Build info dictionary
        url_info = {
            'url': url.text.strip(),
            'lastmod': lastmod.text.strip() if lastmod else None,
            'changefreq': changefreq.text.strip() if changefreq else None,
            'priority': float(priority.text.strip()) if priority else 0.5
        }
        
        return url_info
        
    except Exception as e:
        logger.error(f"Error extracting URL info: {str(e)}")
        return None

In [11]:
urls_info = extract_urls_from_sitemap("https://ssl.robocup.org/page-sitemap.html")

INFO:__main__:Processing single sitemap...
INFO:__main__:Successfully extracted 115 URLs from sitemap


In [15]:
website_urls = [info['url'] for info in urls_info]

print(f"Total URLs: {len(website_urls)}")
print(f"First 2 URLs: {website_urls[:2]}")
print(f"Last 2 URLs: {website_urls[-2:]}")


Total URLs: 115
First 2 URLs: ['https://ssl.robocup.org/rules/', 'https://ssl.robocup.org/robocups/robocup-2019/robocup-2019-committees/']
Last 2 URLs: ['https://ssl.robocup.org/technical-overview-of-the-small-size-league/', 'https://ssl.robocup.org/robocups/robocup-2025/robocup-2025-teams/']


In [16]:
def count_tokens(text, model="cl100k_base"):
    """
    Count the number of tokens in the text using tiktoken.
    
    Args:
        text (str): The text to count tokens for
        model (str): The tokenizer model to use (default: cl100k_base for GPT-4)
        
    Returns:
        int: Number of tokens in the text
    """
    encoder = tiktoken.get_encoding(model)
    return len(encoder.encode(text))

In [6]:
def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    
    # Target the main article content for documentation 
    main_content = soup.find("article", class_="md-content__inner")
    
    # If found, use that, otherwise fall back to the whole document
    content = main_content.get_text() if main_content else soup.text
    
    # Clean up whitespace
    content = re.sub(r"\n\n+", "\n\n", content).strip()
    
    return content

In [7]:
def load_site():
    """
    Load information from the official website.
    
    This function:
    1. Uses RecursiveUrlLoader to fetch pages from the website
    2. Counts the total documents and tokens loaded
    
    Returns:
        list: A list of Document objects containing the loaded content
        list: A list of tokens per document
    """
    print("Loading website...")

    docs = []
    for url in website_urls:

        loader = RecursiveUrlLoader(
            url,
            max_depth=5,
            extractor=bs4_extractor,
        )

        # Load documents using lazy loading (memory efficient)
        docs_lazy = loader.lazy_load()

        # Load documents and track URLs
        for d in docs_lazy:
            docs.append(d)

    print(f"Loaded {len(docs)} documents from website.")
    print("\nLoaded URLs:")
    for i, doc in enumerate(docs):
        print(f"{i+1}. {doc.metadata.get('source', 'Unknown URL')}")
    
    # Count total tokens in documents
    total_tokens = 0
    tokens_per_doc = []
    for doc in docs:
        total_tokens += count_tokens(doc.page_content)
        tokens_per_doc.append(count_tokens(doc.page_content))
    print(f"Total tokens in loaded documents: {total_tokens}")
    
    return docs, tokens_per_doc

In [8]:
def save_full_website(documents):
    """ Save the documents to a file """

    # Open the output file
    output_filename = "full_website.txt"

    with open(output_filename, "w") as f:
        # Write each document
        for i, doc in enumerate(documents):
            # Get the source (URL) from metadata
            source = doc.metadata.get('source', 'Unknown URL')
            
            # Write the document with proper formatting
            f.write(f"DOCUMENT {i+1}\n")
            f.write(f"SOURCE: {source}\n")
            f.write("CONTENT:\n")
            f.write(doc.page_content)
            f.write("\n\n" + "="*80 + "\n\n")

    print(f"Documents concatenated into {output_filename}")


In [9]:
def split_documents(documents):
    """
    Split documents into smaller chunks for improved retrieval.
    
    This function:
    1. Uses RecursiveCharacterTextSplitter with tiktoken to create semantically meaningful chunks
    2. Ensures chunks are appropriately sized for embedding and retrieval
    3. Counts the resulting chunks and their total tokens
    
    Args:
        documents (list): List of Document objects to split
        
    Returns:
        list: A list of split Document objects
    """
    print("Splitting documents...")
    
    # Initialize text splitter using tiktoken for accurate token counting
    # chunk_size=8,000 creates relatively large chunks for comprehensive context
    # chunk_overlap=500 ensures continuity between chunks
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=8000,  
        chunk_overlap=500  
    )
    
    # Split documents into chunks
    split_docs = text_splitter.split_documents(documents)
    
    print(f"Created {len(split_docs)} chunks from documents.")
    
    # Count total tokens in split documents
    total_tokens = 0
    for doc in split_docs:
        total_tokens += count_tokens(doc.page_content)
    
    print(f"Total tokens in split documents: {total_tokens}")
    
    return split_docs

In [10]:
def create_vectorstore(splits):
    """
    Create a vector store from document chunks using SKLearnVectorStore.
    
    This function:
    1. Initializes an embedding model to convert text into vector representations
    2. Creates a vector store from the document chunks
    
    Args:
        splits (list): List of split Document objects to embed
        
    Returns:
        SKLearnVectorStore: A vector store containing the embedded documents
    """
    print("Creating SKLearnVectorStore...")
    
    # Initialize OpenAI embeddings
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    
    # Create vector store from documents using SKLearn
    persist_path = os.getcwd()+"/sklearn_vectorstore.parquet"
    vectorstore = SKLearnVectorStore.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_path=persist_path   ,
        serializer="parquet",
    )
    print("SKLearnVectorStore created successfully.")
    
    vectorstore.persist()
    print("SKLearnVectorStore was persisted to", persist_path)

    return vectorstore

Métodos definidos.

Agora vamos de fato fazer o Scrapping do site da Capyba:

In [11]:
# Load the documents
documents, tokens_per_doc = load_site()


Loading Capyba website...
Loaded 16 documents from Capyba documentation.

Loaded URLs:
1. https://www.capyba.com/
2. https://www.capyba.com/cases/case-genomika-en
3. https://www.capyba.com/cases/cases-home
4. https://www.capyba.com/cases/case-inadash-en
5. https://www.capyba.com/about-us
6. https://www.capyba.com/our-process
7. https://www.capyba.com/blog
8. https://www.capyba.com/careers
9. https://www.capyba.com/get-in-touch
10. https://www.capyba.com/services
11. https://www.capyba.com/cases/case-fretapp-en
12. https://www.capyba.com/capybaday
13. https://www.capyba.com/services
14. https://www.capyba.com/our-process
15. https://www.capyba.com/cases/cases-home
16. https://www.capyba.com/about-us
Total tokens in loaded documents: 12385


In [12]:
# Confirming the amount of documents
len(documents)

16

In [13]:
# Confirming the amount of tokens report
len(tokens_per_doc)

16

In [14]:
# Seeing a preview of a document with the number of tokens
print(f"First document has {tokens_per_doc[0]} tokens. Content:\n{documents[0]}")

First document has 536 tokens. Content:
page_content='Capyba Software - Driving Transformation

ServicesOur processCasesBlogAboutGet in TouchJoin the Magic Team 🧙MenuMenuServicesOur ProcessCasesBlogAboutJoin the Magic Team 🧙Get in touchReady for#CapybaDay2023?Learn moreWe transformideas into digital products  Discover how we can help youGet in touchWhat We DoWe create, co-create and transform the digital business of global enterprises and help them grow.SoftwareDevelopmentSolid experience in building and scaling software on mobile, web and connected plataforms.See moreSee moreProductDesignDesigning delightful experiences with a human-centered approach by involving human perspective.See moreSee moreTechConsultingDigital transformation is not just a buzzword, it's the key to maintain your business at a high level.See moreSee moreCasesAlbert Einstein: GenomikaA web system to generate automatic medical reports from parameters and configurable rules based on genetic tests.TECHNOLOGIES:TOOLS

In [15]:
# Save the documents to a file
save_full_website(documents=documents)

Documents concatenated into capyba_full.txt


In [16]:
# Split the documents
split_docs = split_documents(documents)

Splitting documents...
Created 16 chunks from documents.
Total tokens in split documents: 12385


In [17]:
# Create the vector store
vectorstore = create_vectorstore(split_docs)

Creating SKLearnVectorStore...
SKLearnVectorStore created successfully.
SKLearnVectorStore was persisted to /home/tone/www/capyba/capyba_mcp/sklearn_vectorstore.parquet


Pronto. Com esses passos criamos um vectorstore que salvou todos documentos extraidos do site da Capyba.

Vamos fazer alguns testes para garantir que funciona:

In [18]:
# Create retriever to get relevant documents (k=3 means return top 3 matches)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [19]:
# Get relevant documents for the query
query = "What is the size of the goal ?"    
relevant_docs = retriever.invoke(query)
print(f"Query: {query}")
print(f"Retrieved {len(relevant_docs)} relevant documents")

for d in relevant_docs:
    print(d.metadata['source'])
    print(d.page_content[0:500])
    print("\n--------------------------------\n")

Query: Who are the people that Work at Capyba ?
Retrieved 3 relevant documents
https://www.capyba.com/about-us
About Us
ServicesOur processCasesBlogAboutGet in TouchJoin the Magic Team 🧙MenuMenuServicesOur ProcessCasesBlogAboutJoin the Magic Team 🧙Get in touchAbout UsWe're Capyba.A software studiomade by peopleWe seek innovation with personal growth, a fair and balanced work environment, transforming and generating social impact.This is being Capyba.Alessa AlvesProject Manageralessa@capyba.comAmanda CamposOperations Analystalessa@capyba.comAntônio GabrielSoftware Engineeralessa@capyba.comArmanda MariaPro

--------------------------------

https://www.capyba.com/about-us
About Us
ServicesOur processCasesBlogAboutGet in TouchJoin the Magic Team 🧙MenuMenuServicesOur ProcessCasesBlogAboutJoin the Magic Team 🧙Get in touchAbout UsWe're Capyba.A software studiomade by peopleWe seek innovation with personal growth, a fair and balanced work environment, transforming and generating social impact

Se você abrir o site, provavelmente vai entrar a página com os membros do time da Capyba.

> Inclusive, não estou lá :/

Com essa informação, uma LLM poderá gerar uma resposta!

> Daria para melhorar muito a extração de texto, removendo o cabeçalho e o footer, por exemplo. Aceitamos improovements!

Agora bora botar as LLM pra trabalhar!