In [1]:
import re, os
import tiktoken
import logging
import requests

from tqdm import tqdm

from bs4 import BeautifulSoup

from langchain_community.document_loaders import RecursiveUrlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import SKLearnVectorStore

In [2]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
def extract_urls_from_sitemap(url_sitemap):
    """
    Extract URLs and metadata from a sitemap XML file.
    PS: Designed for RoboCup Small Size League sitemap
    
    Args:
        url_sitemap (str): URL of the sitemap to parse
        
    Returns:
        list: List of dictionaries containing URL information
        Each dictionary contains:
            - url: The page URL
            - lastmod: Last modification date
            - changefreq: How often the page changes
            - priority: Page priority (0.0 to 1.0)
    """
    
    try:
        # Fetch sitemap
        response = requests.get(url_sitemap)
        response.raise_for_status()  # Raise exception for bad status codes
        
        # Parse XML with lxml parser
        soup = BeautifulSoup(response.content, 'lxml-xml')
        
        # Initialize results list
        urls_info = []
        
        # Check if this is a sitemap index
        if soup.find('sitemapindex'):
            logger.info("Found sitemap index, processing sub-sitemaps...")
            # Get all sitemap URLs
            sitemap_urls = [sitemap.find('loc').text for sitemap in soup.find_all('sitemap')]
            
            # Process each sub-sitemap
            for sitemap_url in sitemap_urls:
                try:
                    sub_response = requests.get(sitemap_url)
                    sub_response.raise_for_status()
                    sub_soup = BeautifulSoup(sub_response.content, 'lxml-xml')
                    
                    # Process URLs in sub-sitemap
                    for url_elem in sub_soup.find_all('url'):
                        url_info = extract_url_info(url_elem)
                        if url_info:
                            urls_info.append(url_info)
                            
                except Exception as e:
                    logger.error(f"Error processing sub-sitemap {sitemap_url}: {str(e)}")
                    continue
                    
        else:
            # Process URLs in single sitemap
            logger.info("Processing single sitemap...")
            for url_elem in soup.find_all('url'):
                url_info = extract_url_info(url_elem)
                if url_info:
                    urls_info.append(url_info)
        
        logger.info(f"Successfully extracted {len(urls_info)} URLs from sitemap")
        return urls_info
        
    except requests.RequestException as e:
        logger.error(f"Error fetching sitemap: {str(e)}")
        return []
    except Exception as e:
        logger.error(f"Unexpected error processing sitemap: {str(e)}")
        return []

def extract_url_info(url_elem):
    """
    Extract information from a URL element in the sitemap.
    
    Args:
        url_elem: BeautifulSoup element containing URL information
        
    Returns:
        dict: Dictionary containing URL information or None if invalid
    """
    try:
        # Extract required URL
        url = url_elem.find('loc')
        if not url:
            return None
            
        # Extract optional fields with defaults
        lastmod = url_elem.find('lastmod')
        changefreq = url_elem.find('changefreq')
        priority = url_elem.find('priority')
        
        # Build info dictionary
        url_info = {
            'url': url.text.strip(),
            'lastmod': lastmod.text.strip() if lastmod else None,
            'changefreq': changefreq.text.strip() if changefreq else None,
            'priority': float(priority.text.strip()) if priority else 0.5
        }
        
        return url_info
        
    except Exception as e:
        logger.error(f"Error extracting URL info: {str(e)}")
        return None

In [4]:
urls_info = extract_urls_from_sitemap("https://ssl.robocup.org/page-sitemap.html")

INFO:__main__:Processing single sitemap...
INFO:__main__:Successfully extracted 115 URLs from sitemap


In [5]:
default_urls = [
  "https://robocup-ssl.github.io/ssl-goals/sslgoals.html"
  "https://robocup-ssl.github.io/ssl-rules/sslrules.html"
  "https://github.com/orgs/RoboCup-SSL/repositories"
  "https://ssl.robocup.org/rules/",
  "https://ssl.robocup.org/tournament-rules/",
  "https://ssl.robocup.org/technical-overview-of-the-small-size-league/",
  "https://ssl.robocup.org/tournament-organization/",
  "https://ssl.robocup.org/divisions/",
  "https://ssl.robocup.org/open-source-contributions/",
  "https://ssl.robocup.org/history-of-open-source-submissions/",
  "https://ssl.robocup.org/scientific-publications/",
  "https://ssl.robocup.org/team-description-papers/",
  "https://ssl.robocup.org/robocups/robocup-2025/robocup-2025-teams/"
  "https://ssl.robocup.org/robocups/",
  "https://ssl.robocup.org/history-of-technical-challenges/",
  "https://ssl.robocup.org/match-statistics/",
  "https://ssl.robocup.org/contact/"
]

In [6]:
extracted_urls = [info['url'] for info in urls_info]

website_urls = default_urls + extracted_urls

website_urls = list(set(website_urls))


print(f"Total URLs: {len(website_urls)}")
print(f"First 2 URLs: {website_urls[:2]}")
print(f"Last 2 URLs: {website_urls[-2:]}")


Total URLs: 117
First 2 URLs: ['https://ssl.robocup.org/game-logs/', 'https://ssl.robocup.org/robocups/robocup-2022/robocup-2022-qualification/']
Last 2 URLs: ['https://ssl.robocup.org/robocups/robocup-2019/robocup-2019-open-source-submissions/', 'https://ssl.robocup.org/contact/']


In [7]:
def count_tokens(text, model="cl100k_base"):
    """
    Count the number of tokens in the text using tiktoken.
    
    Args:
        text (str): The text to count tokens for
        model (str): The tokenizer model to use (default: cl100k_base for GPT-4)
        
    Returns:
        int: Number of tokens in the text
    """
    encoder = tiktoken.get_encoding(model)
    return len(encoder.encode(text))

In [8]:
def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    
    # Target the main article content for documentation 
    main_content = soup.find("article", class_="status-publish")
    
    # If found, use that, otherwise fall back to the whole document
    content = main_content.get_text() if main_content else soup.text
    
    # Clean up whitespace
    content = re.sub(r"\n\n+", "\n\n", content).strip()
    
    return content

In [9]:
def load_site():
    """
    Load information from the official website.
    
    This function:
    1. Uses RecursiveUrlLoader to fetch pages from the website
    2. Counts the total documents and tokens loaded
    
    Returns:
        list: A list of Document objects containing the loaded content
        list: A list of tokens per document
    """
    print("Loading website...")

    docs = []

    # Show the progress
    for url in tqdm(website_urls, desc="Processing URLs and loading page content", total=len(website_urls), unit="URLs"):
        loader = RecursiveUrlLoader(
            url,
            max_depth=5,
            extractor=bs4_extractor,
        )

        # Load documents using lazy loading (memory efficient)
        docs_lazy = loader.lazy_load()

        # Load documents and track URLs
        for d in docs_lazy:
            docs.append(d)

    print(f"Loaded {len(docs)} documents from website.")
    
    # Count total tokens in documents
    total_tokens = 0
    tokens_per_doc = []
    for doc in tqdm(docs, desc="Counting tokens in documents", total=len(docs), unit="documents"):
        total_tokens += count_tokens(doc.page_content)
        tokens_per_doc.append(count_tokens(doc.page_content))
        
    print(f"Total tokens in loaded documents: {total_tokens}")
    
    return docs, tokens_per_doc

In [10]:
def save_full_website(documents):
    """ Save the documents to a file """

    # Open the output file
    output_filename = "full_website.txt"

    with open(output_filename, "w") as f:
        # Write each document
        for i, doc in enumerate(documents):
            # Get the source (URL) from metadata
            source = doc.metadata.get('source', 'Unknown URL')
            
            # Write the document with proper formatting
            f.write(f"DOCUMENT {i+1}\n")
            f.write(f"SOURCE: {source}\n")
            f.write("CONTENT:\n")
            f.write(doc.page_content)
            f.write("\n\n" + "="*80 + "\n\n")

    print(f"Documents concatenated into {output_filename}")


In [11]:
def split_documents(documents):
    """
    Split documents into smaller chunks for improved retrieval.
    
    This function:
    1. Uses RecursiveCharacterTextSplitter with tiktoken to create semantically meaningful chunks
    2. Ensures chunks are appropriately sized for embedding and retrieval
    3. Counts the resulting chunks and their total tokens
    
    Args:
        documents (list): List of Document objects to split
        
    Returns:
        list: A list of split Document objects
    """
    print("Splitting documents...")
    
    # Initialize text splitter using tiktoken for accurate token counting
    # chunk_size=8,000 creates relatively large chunks for comprehensive context
    # chunk_overlap=500 ensures continuity between chunks
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=8000,  
        chunk_overlap=500  
    )
    
    # Split documents into chunks
    split_docs = text_splitter.split_documents(documents)
    
    print(f"Created {len(split_docs)} chunks from documents.")
    
    # Count total tokens in split documents
    total_tokens = 0
    for doc in split_docs:
        total_tokens += count_tokens(doc.page_content)
    
    print(f"Total tokens in split documents: {total_tokens}")
    
    return split_docs

In [12]:
def create_vectorstore(splits):
    """
    Create a vector store from document chunks using SKLearnVectorStore.
    
    This function:
    1. Initializes an embedding model to convert text into vector representations
    2. Creates a vector store from the document chunks
    
    Args:
        splits (list): List of split Document objects to embed
        
    Returns:
        SKLearnVectorStore: A vector store containing the embedded documents
    """
    print("Creating SKLearnVectorStore...")
    
    # Initialize OpenAI embeddings
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    
    # Create vector store from documents using SKLearn
    persist_path = os.getcwd()+"/sklearn_vectorstore.parquet"
    vectorstore = SKLearnVectorStore.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_path=persist_path,
        serializer="parquet",
        
    )
    print("SKLearnVectorStore created successfully.")
    
    vectorstore.persist()
    print("SKLearnVectorStore was persisted to", persist_path)

    return vectorstore

Let's run the site scrapping:

In [13]:
# Load the documents
documents, tokens_per_doc = load_site()


Loading website...


Processing URLs and loading page content: 100%|██████████| 117/117 [04:31<00:00,  2.32s/URLs]


Loaded 224 documents from website.


Counting tokens in documents: 100%|██████████| 224/224 [00:00<00:00, 618.42documents/s]

Total tokens in loaded documents: 174455





In [14]:
# Confirming the amount of documents
len(documents)

224

In [15]:
# Confirming the amount of tokens report
len(tokens_per_doc)

224

In [16]:
# Seeing a preview of a document with the number of tokens
print(f"First document has {tokens_per_doc[0]} tokens. Content:\n{documents[0]}")

First document has 434 tokens. Content:
page_content='The league attempts to create log files of all official matches during each RoboCup. They contain the timestamped ProtoBuf messages of ssl-vision, ssl-game-controller and vision-tracker producers like the autoRefs.
Software
The league provides multiple tools to deal with log files of SSL games:

ssl-logtools (C++): Original tooling with recorder and player
ssl-go-tools (Go): Set of CLI tools to record, play and analyze log files

The ssl-logtools can be considered legacy and are not maintained anymore. However, the ssl-go-tools do not provide a UI for the player yet.
File Format
Each log file starts with the following header:

1: String – File type (“SSL_LOG_FILE”) 2: Int32 – Log file format version 

Format version 1 encodes the protobuf messages in the following format:

1: Int64 – Receiver timestamp in ns 2: Int32 – Message type 3: Int32 – Size of binary protobuf message 4: String – Binary protobuf message 

The message types are

In [17]:
# Save the documents to a file
save_full_website(documents=documents)

Documents concatenated into full_website.txt


In [18]:
# Split the documents
split_docs = split_documents(documents)

Splitting documents...
Created 224 chunks from documents.
Total tokens in split documents: 174455


In [19]:
# Create the vector store
vectorstore = create_vectorstore(split_docs)

Creating SKLearnVectorStore...


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (913,) + inhomogeneous part.

# Done.

With the vector store created, we can now create a retriever to get relevant documents.

Let's do some tests:

In [20]:
# Create retriever to get relevant documents (k=3 means return top 3 matches)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [None]:
# Get relevant documents for the query
query = "How to submit a paper?"    
relevant_docs = retriever.invoke(query)
print(f"Query: {query}")
print(f"Retrieved {len(relevant_docs)} relevant documents")

for d in relevant_docs:
    print(d.metadata['source'])
    print(d.page_content[0:500])
    print("\n--------------------------------\n")