In [1]:
import io
import zipfile
import requests
import frontmatter
import re
import json
import os
from dotenv import load_dotenv
from pathlib import Path
from tqdm.auto import tqdm

# Ingest and Index Data

In [2]:
def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.
    
    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
    
    Returns:
        List of dictionaries containing file content and metadata
    """
    prefix = 'https://codeload.github.com' 
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/main'
    resp = requests.get(url)
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    for file_info in zf.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        if not (filename_lower.endswith('.md') 
            or filename_lower.endswith('.mdx')):
            continue
    
        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue
    
    zf.close()
    return repository_data

In [3]:
dtc_faq = read_repo_data('DataTalksClub', 'faq')
evidently_docs = read_repo_data('evidentlyai', 'docs')

print(f"FAQ documents: {len(dtc_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")

FAQ documents: 1219
Evidently documents: 95


In [4]:
# Previews Evidently document at index 45
evidently_doc = evidently_docs[45]
print(f"Filename: {evidently_doc['filename']}")
print(f"Metadata: {evidently_doc.get('attributes', {})}")
print(f"Content preview:\n{evidently_doc['content'][:1000]}...")

Filename: docs-main/examples/LLM_regression_testing.mdx
Metadata: {}
Content preview:
In this tutorial, you will learn how to perform regression testing for LLM outputs.

You can compare new and old responses after changing a prompt, model, or anything else in your system. By re-running the same inputs with new parameters, you can spot any significant changes. This helps you push updates with confidence or identify issues to fix.

<Info>
  **This example uses Evidently Cloud.** You'll run evals in Python and upload them. You can also skip the upload and view Reports locally. For self-hosted, replace `CloudWorkspace` with `Workspace`.
</Info>

# Tutorial scope

Here's what we'll do:

* **Create a toy dataset**. Build a small Q&A dataset with answers and reference responses.

* **Get new answers**. Imitate generating new answers to the same question.

* **Create and run a Report with Tests**. Compare the answers using LLM-as-a-judge to evaluate length, correctness and style consistency.


# Simple Chunking

In [5]:
def sliding_window(seq, size, step):
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk = seq[i:i+size]
        result.append({'start': i, 'chunk': chunk})
        if i + size >= n:
            break

    return result

In [6]:
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    evidently_chunks.extend(chunks)

In [7]:
# Preview first 3 chunks
for i, chunk in enumerate(evidently_chunks[:3]):
    print(f"\n--- Chunk {i} ---")
    print(f"Filename: {chunk.get('filename')}")
    print(f"Start Index: {chunk['start']}")
    print(f"Content Preview:\n{chunk['chunk'][:1000]}...")


--- Chunk 0 ---
Filename: docs-main/api-reference/introduction.mdx
Start Index: 0
Content Preview:
<Note>
  If you're not looking to build API reference documentation, you can delete
  this section by removing the api-reference folder.
</Note>

## Welcome

There are two ways to build API documentation: [OpenAPI](https://mintlify.com/docs/api-playground/openapi/setup) and [MDX components](https://mintlify.com/docs/api-playground/mdx/configuration). For the starter kit, we are using the following OpenAPI specification.

<Card
  title="Plant Store Endpoints"
  icon="leaf"
  href="https://github.com/mintlify/starter/blob/main/api-reference/openapi.json"
>
  View the OpenAPI specification file
</Card>

## Authentication

All API endpoints are authenticated using Bearer tokens and picked up from the specification file.

```json
"security": [
  {
    "bearerAuth": []
  }
]
```...

--- Chunk 1 ---
Filename: docs-main/changelog/changelog.mdx
Start Index: 0
Content Preview:
<Update label="2025-

# Split by Paragraphs and Section

In [14]:
# For simple document files
text = evidently_docs[45]['content']
paragraphs = re.split(r"\n\s*\n", text.strip())

In [15]:
def split_markdown_by_level(text, level=2):
    """
    Split markdown text by a specific header level.
    
    :param text: Markdown text as a string
    :param level: Header level to split on
    :return: List of sections as strings
    """
    # This regex matches markdown headers
    # For level 2, it matches lines starting with "## "
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)

    # Split and keep the headers
    parts = pattern.split(text)
    
    sections = []
    for i in range(1, len(parts), 3):
        # We step by 3 because regex.split() with
        # capturing groups returns:
        # [before_match, group1, group2, after_match, ...]
        # here group1 is "## ", group2 is the header text
        header = parts[i] + parts[i+1]  # "## " + "Title"
        header = header.strip()

        # Get the content after this header
        content = ""
        if i+2 < len(parts):
            content = parts[i+2].strip()

        if content:
            section = f'{header}\n\n{content}'
        else:
            section = header
        sections.append(section)
    
    return sections

In [16]:
sections = split_markdown_by_level(text, level=2)

In [17]:
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

In [18]:
# Preview first 3 chunks
print(f"Total sections (chunks) extracted: {len(evidently_chunks)}")
for i, chunk in enumerate(evidently_chunks[:3]):
    print(f"\n--- Chunk {i} ---")
    print(f"Filename: {chunk.get('filename')}")
    print(f"Metadata: {chunk.get('attributes', {})}")
    print(f"Section Preview:\n{chunk['section'][:1000]}...")

Total sections (chunks) extracted: 262

--- Chunk 0 ---
Filename: docs-main/api-reference/introduction.mdx
Metadata: {}
Section Preview:
## Welcome

There are two ways to build API documentation: [OpenAPI](https://mintlify.com/docs/api-playground/openapi/setup) and [MDX components](https://mintlify.com/docs/api-playground/mdx/configuration). For the starter kit, we are using the following OpenAPI specification.

<Card
  title="Plant Store Endpoints"
  icon="leaf"
  href="https://github.com/mintlify/starter/blob/main/api-reference/openapi.json"
>
  View the OpenAPI specification file
</Card>...

--- Chunk 1 ---
Filename: docs-main/api-reference/introduction.mdx
Metadata: {}
Section Preview:
## Authentication

All API endpoints are authenticated using Bearer tokens and picked up from the specification file.

```json
"security": [
  {
    "bearerAuth": []
  }
]
```...

--- Chunk 2 ---
Filename: docs-main/docs/library/data_definition.mdx
Metadata: {}
Section Preview:
## Basic flow

**Step 

# Intelligent Chunking with LLM

Models used:
- x-ai/grok-4-fast:free
- deepseek/deepseek-chat-v3.1:free
- google/gemma-3n-e2b-it:free

In [19]:
# Go up to root directory
env_path = Path(os.getcwd()).parents[0] / '.env'
load_dotenv(dotenv_path=env_path)

# Get the API key from environment variable
api_key = os.getenv("OPENROUTER_API_KEY")

# Check if API key was loaded
if not api_key:
    raise Exception("OPENROUTER_API_KEY not found in .env file")

# Send the request
response = requests.post(
    url="https://openrouter.ai/api/v1/chat/completions",
    headers={
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    },
    data=json.dumps({
        "model": "google/gemma-3n-e2b-it:free",
        "messages": [
            {
                "role": "user",
                "content": "What is the meaning of life?"
            }
        ]
    })
)

print(response.json())

{'id': 'gen-1759143822-AZrnbVBjPgy4BcSn7IJb', 'provider': 'Google AI Studio', 'model': 'google/gemma-3n-e2b-it:free', 'object': 'chat.completion', 'created': 1759143823, 'choices': [{'logprobs': None, 'finish_reason': 'stop', 'native_finish_reason': 'STOP', 'index': 0, 'message': {'role': 'assistant', 'content': 'Ah, the age-old question! The meaning of life is one of the most pondered and debated topics in human history, and there\'s no single, universally accepted answer. It\'s a deeply personal and philosophical exploration. Here\'s a breakdown of different perspectives, ranging from philosophical to religious to scientific, and a few thoughts on how you might find *your* meaning:\n\n**1. Philosophical Perspectives:**\n\n*   **Nihilism:** This view argues that life is inherently without objective meaning, purpose, or intrinsic value. There\'s no cosmic plan or destiny.  While seemingly bleak, some find freedom in this by accepting the lack of inherent meaning and creating their own.

In [20]:
# LLM call via OpenRouter
def llm(prompt, model='google/gemma-3n-e2b-it:free'):
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ]
    }

    response = requests.post(url, headers=headers, data=json.dumps(data))

    if response.status_code != 200:
        print("ERROR:", response.status_code, response.text)
        return ""

    response_json = response.json()
    return response_json['choices'][0]['message']['content']

In [21]:
# Prompt template for chunking
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

In [22]:
# Chunking logic
def intelligent_chunking(text):
    prompt = prompt_template.format(document=text)
    response = llm(prompt)
    sections = response.split('---')
    sections = [s.strip() for s in sections if s.strip()]
    return sections

In [None]:
evidently_chunks = []

for doc in tqdm(evidently_docs):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

In [None]:
# Print the chunks for inspection
for chunk in evidently_chunks[:5]:
    print("Section from:", chunk['title'])
    print(chunk['section'])
    print("\n---\n")