In [3]:
import sys

# Put the parent directory on the import path, but only once: re-running this
# cell must not keep appending duplicate ".." entries to sys.path.
# NOTE(review): presumably this exposes a local package checkout one level up;
# the relative path depends on the kernel's working directory - confirm.
if ".." not in sys.path:
    sys.path.append("..")

In [4]:
import os
from aurelio_sdk import AsyncAurelioClient

# Both settings are looked up with os.environ[...], so a missing variable
# raises KeyError here rather than failing later on the first request.
client = AsyncAurelioClient(
    api_key=os.environ["AURELIO_API_KEY"],
    base_url=os.environ["BASE_URL"],
)

# Echo the configured endpoint so it is obvious which server this run targets.
print(client.base_url)


http://localhost:8000


## Chunk documents

Load data for chunking

In [5]:
from datasets import load_dataset

# Load the "train" split of the jamescalam/ai-arxiv2 dataset.
data = load_dataset(path="jamescalam/ai-arxiv2", split="train")

# Bare expression: the dataset summary becomes the cell output.
data

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'content', 'references'],
    num_rows: 2673
})

In [7]:
# Use the "content" field of row 3 as the sample document; the chunking cells
# further down pass this same string to client.chunk().
content: str = data[3]["content"]
# Preview only the first 1000 characters instead of printing the whole document.
# NOTE(review): the saved output shows mis-encoded apostrophes/ligatures, so the
# dataset text presumably contains mojibake - confirm before relying on exact text.
print(content[:1000])


# Mamba: Linear-Time Sequence Modeling with Selective State Spaces
# Albert Gu*1 and Tri Dao*2
1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me
# Abstract
Foundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâ computational ineï¬ciency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities

### Regex chunker

In [16]:
from aurelio_sdk import ChunkingOptions, ChunkResponse

# All options are optional
chunking_options = ChunkingOptions(
    chunker_type="regex", delimiters=[], max_chunk_length=400
)

response_regex: ChunkResponse = await client.chunk(
    content=content, processing_options=chunking_options
)

response_regex

ChunkResponse(status=<TaskStatus.completed: 'completed'>, usage=Usage(tokens=42937, pages=None, seconds=None), message=None, processing_options=ChunkingOptions(max_chunk_length=400, chunker_type='regex', window_size=1, delimiters=[]), document=ResponseDocument(id='doc_106b2d3b-b3b3-41c1-a6a5-745a9b364c31', content='# Mamba: Linear-Time Sequence Modeling with Selective State Spaces\n# Albert Gu*1 and Tri Dao*2\n1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me\n# Abstract\nFoundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâ\x80\x99 computational ineï¬\x83ciency on long sequences, but they h

In [19]:
# Number of chunks reported on the regex chunker's response document.
response_regex.document.num_chunks

97

In [None]:
import json

# Pretty-print the first three regex chunks as JSON, each followed by a
# 100-character dashed separator line.
for chunk in response_regex.document.chunks[:3]:
    print(json.dumps(chunk.model_dump(), indent=2), "-" * 100, sep="\n")

### Semantic chunker

In [6]:
from aurelio_sdk import ChunkingOptions, ChunkResponse

# All options are optional
chunking_options = ChunkingOptions(
    chunker_type="semantic", max_chunk_length=400, window_size=5
)

response_semantic: ChunkResponse = await client.chunk(
    content=content, processing_options=chunking_options
)

In [None]:
import json

# Pretty-print the first three semantic chunks as JSON, each followed by a
# 100-character dashed separator line.
for chunk in response_semantic.document.chunks[:3]:
    print(json.dumps(chunk.model_dump(), indent=2), "-" * 100, sep="\n")

#### Compare `regex` and `semantic` chunks
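Helper function that renders the semantic and regex chunks side by side, giving each chunk its own background color so the chunk boundaries are easy to compare.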

In [None]:
from IPython.display import HTML


def _chunks_to_html(chunks, colors):
    """Render chunks as consecutive <span> elements with cycling background colors."""
    # Local import keeps this cell self-contained without touching other cells.
    from html import escape

    return "".join(
        f'<span style="background-color:{colors[idx % len(colors)]};">'
        # Escape the text: chunk content is raw document text and may contain
        # "<", ">" or "&", which would otherwise be parsed as markup and
        # corrupt (or inject into) the rendered table.
        f"{escape(str(chunk.content))}</span>"
        for idx, chunk in enumerate(chunks)
    )


def display_chunks_side_by_side(semantic_chunks, regex_chunks):
    """Display two chunkings of a text side by side, one background color per chunk.

    Prints the number of chunks in each sequence, then renders a two-column
    HTML table on a white background: semantic chunks on the left, regex
    chunks on the right. Within a column the chunks are concatenated in order
    and cycle through five background colors, so adjacent chunks are visually
    distinct.

    Args:
        semantic_chunks: Sized iterable of objects exposing a ``content``
            attribute; rendered in the left column.
        regex_chunks: Sized iterable of objects exposing a ``content``
            attribute; rendered in the right column.

    Returns:
        None. The table is shown through ``display(HTML(...))``.

    Note:
        ``display`` is not imported in this cell; the function relies on the
        notebook environment providing it.
    """
    print(f"Semantic chunks: {len(semantic_chunks)}")
    print(f"Regex chunks: {len(regex_chunks)}")

    colors = ["#FFC0CB", "#ADD8E6", "#90EE90", "#FFFFE0", "#D3D3D3"]

    semantic_html = _chunks_to_html(semantic_chunks, colors)
    regex_html = _chunks_to_html(regex_chunks, colors)

    table_html = f"""
    <table style="width:100%; table-layout:fixed; background-color:white;">
        <tr>
            <th style="width:50%; text-align:left;">Semantic Chunking</th>
            <th style="width:50%; text-align:left;">Regex Chunking</th>
        </tr>
        <tr>
            <td style="vertical-align: top; text-align:left;">{semantic_html}</td>
            <td style="vertical-align: top; text-align:left;">{regex_html}</td>
        </tr>
    </table>
    """

    display(HTML(table_html))

In [None]:
# Render both chunkings of the sample document next to each other.
display_chunks_side_by_side(
    semantic_chunks=response_semantic.document.chunks,
    regex_chunks=response_regex.document.chunks,
)

In [None]:
# NOTE(review): this cell makes the same call, with the same arguments, as the
# cell directly before it - presumably a leftover duplicate; confirm whether it
# can be removed.
display_chunks_side_by_side(
    semantic_chunks=response_semantic.document.chunks,
    regex_chunks=response_regex.document.chunks,
)