In [None]:
import sys

# Append the parent directory to the module search path.
# NOTE(review): presumably so the in-repo `aurelio_sdk` package can be imported
# without installing it — confirm against the repository layout.
sys.path.append("..")

In [None]:
from aurelio_sdk import AsyncAurelioClient

# Async client shared by the chunking cells below (they `await client.chunk(...)`).
# NOTE(review): base_url is an empty string here — presumably a placeholder that
# must be replaced with the real API endpoint before running; confirm how the
# SDK treats "".
client = AsyncAurelioClient(base_url="")

## Chunk documents

Load data for chunking

In [None]:
from datasets import load_dataset

# Load the "train" split of the jamescalam/ai-arxiv2 dataset.
data = load_dataset("jamescalam/ai-arxiv2", split="train")
data  # bare last expression: the cell output is the dataset's repr

In [None]:
# Use the "content" field of the record at index 3 as the sample document
# that the chunking cells below operate on.
content: str = data[3]["content"]
# Preview only the first 1000 characters rather than the whole document.
print(content[:1000])

### Regex chunker

In [None]:
from aurelio_sdk import ChunkingOptions, ChunkResponse

# All options are optional
chunking_options = ChunkingOptions(
    chunker_type="regex", delimiters=[], max_chunk_length=400
)

response_regex: ChunkResponse = await client.chunk(
    content=content, processing_options=chunking_options
)

In [None]:
import json

# Show the first three regex chunks as indented JSON, each followed by a
# 100-character dashed rule.
for chunk in response_regex.document.chunks[:3]:
    print(json.dumps(chunk.model_dump(), indent=2), "-" * 100, sep="\n")

### Semantic chunker

In [None]:
from aurelio_sdk import ChunkingOptions, ChunkResponse

# All options are optional
chunking_options = ChunkingOptions(
    chunker_type="semantic", max_chunk_length=400, window_size=5
)

response_semantic: ChunkResponse = await client.chunk(
    content=content, processing_options=chunking_options
)

In [None]:
import json

# Show the first three semantic chunks as indented JSON, each followed by a
# 100-character dashed rule.
for chunk in response_semantic.document.chunks[:3]:
    print(json.dumps(chunk.model_dump(), indent=2), "-" * 100, sep="\n")

#### Compare `regex` and `semantic` chunks
Helper function that renders the semantic and regex chunks side by side, giving consecutive chunks different background colors.

In [None]:
from IPython.display import HTML


def display_chunks_side_by_side(semantic_chunks, regex_chunks):
    """Render two chunkings of the same text next to each other.

    Prints how many chunks each list holds, then displays a two-column HTML
    table on a white background: semantic chunks in the left column, regex
    chunks in the right one. Within a column the chunk texts are concatenated,
    each wrapped in a ``<span>`` whose background color cycles through a
    five-color palette so that neighbouring chunks are distinguishable.

    Chunk text is HTML-escaped before being placed in the markup, so characters
    such as ``<``, ``>`` and ``&`` in the document are shown literally instead
    of being interpreted as tags or entities.

    Args:
        semantic_chunks: Sized iterable of objects exposing a ``content``
            attribute (the chunk text).
        regex_chunks: Sized iterable of objects exposing a ``content``
            attribute (the chunk text).

    Returns:
        None. Output is produced via ``print`` and ``display``.
    """
    # Local import keeps the fix self-contained within this cell.
    from html import escape

    print(f"Semantic chunks: {len(semantic_chunks)}")
    print(f"Regex chunks: {len(regex_chunks)}")

    colors = ["#FFC0CB", "#ADD8E6", "#90EE90", "#FFFFE0", "#D3D3D3"]

    def _colored_spans(chunks):
        """Concatenate chunks into color-cycled, HTML-escaped ``<span>`` elements."""
        return "".join(
            f'<span style="background-color:{colors[idx % len(colors)]};">'
            # quote=False: the text lands in element content, not in an
            # attribute, so only &, < and > need escaping.
            f"{escape(str(chunk.content), quote=False)}</span>"
            for idx, chunk in enumerate(chunks)
        )

    semantic_html = _colored_spans(semantic_chunks)
    regex_html = _colored_spans(regex_chunks)

    table_html = f"""
    <table style="width:100%; table-layout:fixed; background-color:white;">
        <tr>
            <th style="width:50%; text-align:left;">Semantic Chunking</th>
            <th style="width:50%; text-align:left;">Regex Chunking</th>
        </tr>
        <tr>
            <td style="vertical-align: top; text-align:left;">{semantic_html}</td>
            <td style="vertical-align: top; text-align:left;">{regex_html}</td>
        </tr>
    </table>
    """

    # `display` is the IPython-provided builtin; `HTML` is imported earlier in this cell.
    display(HTML(table_html))

In [None]:
# Render the semantic (left) and regex (right) chunkings of the sample document.
display_chunks_side_by_side(
    semantic_chunks=response_semantic.document.chunks,
    regex_chunks=response_regex.document.chunks,
)

In [None]:
# NOTE(review): this cell repeats the preceding comparison call with identical
# arguments; it looks like a leftover duplicate — consider removing it.
display_chunks_side_by_side(
    response_semantic.document.chunks, response_regex.document.chunks
)