##### Understanding Frontmatter

In [3]:
import frontmatter

raw = """---
title: "Getting Started with AI"
author: "Euriel Chukwu"
date: "2025-09-23"
tags: ["ai", "machine-learning", "tutorial"]
difficulty: "beginner"
---

# Welcome

This is a tutorial on getting started with AI Agent.
"""

post = frontmatter.loads(raw)

print(post.metadata['title'])  # "Getting Started with AI"
print(post.metadata['author'])
print(post.metadata['date'])
print(post.metadata['tags'])   # ["ai", "machine-learning", "tutorial"]
print(post.metadata['difficulty'])
print(post.content)            # Markdown content without frontmatter


Getting Started with AI
Euriel Chukwu
2025-09-23
['ai', 'machine-learning', 'tutorial']
beginner
# Welcome

This is a tutorial on getting started with AI Agent.


Import Libraries

In [4]:
import io
import zipfile
import requests
import frontmatter    

Download repository as a zip file using github URL format

In [5]:
url = 'https://codeload.github.com/ceuriel/atlite/zip/refs/heads/master'
resp = requests.get(url)   

Process the zip file in memory without saving to disk

In [6]:
repository_data = []

# Create a ZipFile object from the downloaded content
zf = zipfile.ZipFile(io.BytesIO(resp.content))

for file_info in zf.infolist():
    filename = file_info.filename.lower()

    # Only process markdown files
    if not filename.endswith('.md'):
        continue

    # Read and parse each file
    with zf.open(file_info) as f_in:
        content = f_in.read()
        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)

zf.close()

output

In [7]:
print(repository_data[1])    

{'content': "<!--\nSPDX-FileCopyrightText: Contributors to atlite <https://github.com/pypsa/atlite>\n\nSPDX-License-Identifier: CC0-1.0\n-->\n\nCloses # (if applicable).\n\n## Changes proposed in this Pull Request\n\n\n## Checklist\n\n- [ ] Code changes are sufficiently documented; i.e. new functions contain docstrings and further explanations may be given in `doc`.\n- [ ] Unit tests for new features were added (if applicable).\n- [ ] Newly introduced dependencies are added to `environment.yaml`, `environment_docs.yaml` and `setup.py` (if applicable).\n- [ ] A note for the release notes `doc/release_notes.rst` of the upcoming release is included.\n- [ ] I consent to the release of this PR's code under the MIT license.", 'filename': 'atlite-master/.github/pull_request_template.md'}


 ### Complete implementation in a reusable function

In [8]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.
    
    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
    
    Returns:
        List of dictionaries containing file content and metadata
    """
    prefix = 'https://codeload.github.com' 
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/master'
    resp = requests.get(url)
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    for file_info in zf.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        if not (filename_lower.endswith('.md') 
            or filename_lower.endswith('.mdx')):
            continue
    
        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue
    
    zf.close()
    return repository_data    

Function can be used for different repositories

In [9]:
h2oai = read_repo_data('h2oai', 'h2o-3')
ceuriel_atlite = read_repo_data('ceuriel', 'atlite')

print(f"Atlite documents: {len(ceuriel_atlite)}")
print(f"h2o-3 documents: {len(h2oai)}")

Atlite documents: 3
h2o-3 documents: 89


In [10]:
for record in h2oai:
    print(record['filename'])

h2o-3-master/.github/ISSUE_TEMPLATE/bug_report.md
h2o-3-master/.github/ISSUE_TEMPLATE/feature_request.md
h2o-3-master/CONTRIBUTING.md
h2o-3-master/Changes-prior-3.28.0.1.md
h2o-3-master/Changes.md
h2o-3-master/DEVEL.md
h2o-3-master/README.md
h2o-3-master/README_DATA.md
h2o-3-master/SECURITY.md
h2o-3-master/ec2/README.md
h2o-3-master/examples/deeplearning/notebooks/README.md
h2o-3-master/gradle/README.md
h2o-3-master/h2o-algos/src/main/java/hex/deeplearning/README.md
h2o-3-master/h2o-assemblies/main/README.md
h2o-3-master/h2o-assemblies/minimal/README.md
h2o-3-master/h2o-bindings/bin/readme.md
h2o-3-master/h2o-clustering/README.md
h2o-3-master/h2o-core/src/main/resources/docs/pieces/columnSummary.md
h2o-3-master/h2o-dist/README.md
h2o-3-master/h2o-docs/README.md
h2o-3-master/h2o-docs/StyleGuide.md
h2o-3-master/h2o-docs/src/api/README.md
h2o-3-master/h2o-docs/src/api/REST/h2o_3_rest_api_overview.md
h2o-3-master/h2o-docs/src/api/data-science-example-1/README.md
h2o-3-master/h2o-docs/src/a

In [11]:
for record in ceuriel_atlite:
    print(record['filename'])

atlite-master/.github/ISSUE_TEMPLATE/feature_request.md
atlite-master/.github/pull_request_template.md
atlite-master/CONTRIBUTING.md


#### Simple Chunking

Applying Sliding window method

In [12]:
def sliding_window(seq, size, step):
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk = seq[i:i+size]
        result.append({'start': i, 'chunk': chunk})
        if i + size >= n:
            break

    return result


Process the entire documents

In [13]:
h2oai_chunks = []

for doc in h2oai:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    h2oai_chunks.extend(chunks)


In [14]:
print(f"Total chunks: {len(h2oai_chunks)}")

Total chunks: 1639


#### Splitting by Paragraphs and Section

splitting by paragraphs

In [15]:
import re
text = h2oai[45]['content']
paragraphs = re.split(r"\n\s*\n", text.strip())

splitting by section

In [16]:
import re

def split_markdown_by_level(text, level=2):
    """
    Split markdown text by a specific header level.
    
    :param text: Markdown text as a string
    :param level: Header level to split on
    :return: List of sections as strings
    """
    # This regex matches markdown headers
    # For level 2, it matches lines starting with "## "
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)

    # Split and keep the headers
    parts = pattern.split(text)
    
    sections = []
    for i in range(1, len(parts), 3):
        # We step by 3 because regex.split() with
        # capturing groups returns:
        # [before_match, group1, group2, after_match, ...]
        # here group1 is "## ", group2 is the header text
        header = parts[i] + parts[i+1]  # "## " + "Title"
        header = header.strip()

        # Get the content after this header
        content = ""
        if i+2 < len(parts):
            content = parts[i+2].strip()

        if content:
            section = f'{header}\n\n{content}'
        else:
            section = header
        sections.append(section)
    
    return sections

Final result by iterating over all the documents

In [17]:
h2oai_chunks = []

for doc in h2oai:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        h2oai_chunks.append(section_doc)

In [18]:
print(f"Total chunks: {len(h2oai_chunks)}")

Total chunks: 266


#### Intelligent Chunking with LLM

In [19]:
# GET API key from https://platform.openai.com/api-keys
# create an environment variable with your key:
# from command line run:
# export OPENAI_API_KEY='your-api-key'
# uv add openai
# uv run jupyter notebook

In [20]:
# import OpenAI

from openai import OpenAI

openai_client = OpenAI()

def llm(prompt, model='gpt-4.1-mini'):
    messages = [
        {"role": "user", "content": prompt}
    ]

    response = openai_client.responses.create(
        model='gpt-4.1-mini',
        input=messages
    )

    return response.output_text


##### Create a prompt

The prompt asks the LLM to:

Split the document logically (not just by length)

Make sections self-contained

Use a specific output format that's easy to parse


In [21]:
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()


##### Create a function for intelligent chunking

In [22]:
def intelligent_chunking(text):
    prompt = prompt_template.format(document=text)
    response = llm(prompt)
    sections = response.split('---')
    sections = [s.strip() for s in sections if s.strip()]
    return sections

In [70]:
from tqdm.auto import tqdm
# Import your language model or define it
# For example, if using OpenAI:
# from openai import OpenAI
# llm = OpenAI(api_key="your-api-key")

# Define the intelligent_chunking function if it's not imported from elsewhere
def intelligent_chunking(text):
    # Implementation of your chunking logic
    # This is a placeholder - replace with your actual implementation
    # that doesn't rely on an undefined llm variable
    return [text]  # Simple implementation that returns the whole text as one chunk

h2oai_chunks = []

for doc in tqdm(h2oai):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        h2oai_chunks.append(section_doc)

  0%|          | 0/89 [00:00<?, ?it/s]

##### Apply to entire document

In [71]:
from tqdm.auto import tqdm
# Import your language model or define it
# For example, if using OpenAI:
# from openai import OpenAI
# llm = OpenAI(api_key="your-api-key")

# Define the intelligent_chunking function if it's not imported from elsewhere
def intelligent_chunking(text):
    # Implementation of your chunking logic
    # This is a placeholder - replace with your actual implementation
    # that doesn't rely on an undefined llm variable
    return [text]  # Simple implementation that returns the whole text as one chunk

atlite_chunks = []

for doc in tqdm(h2oai):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        atlite_chunks.append(section_doc)

  0%|          | 0/89 [00:00<?, ?it/s]

##### Note: This process requires time and incurs costs. As mentioned before, use this only when really necessary. For most applications, you don't need intelligent chunking.


## Add Search

### Text Search

In [94]:
!pip install minsearch

# Index data with minsearch

from minsearch import Index

index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)

index.fit(h2oai_chunks)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




<minsearch.minsearch.Index at 0x136037ad0>

#### From here we can start using our data to search

In [73]:
query = 'What should be in a test dataset for AI evaluation?'
results = index.search(query)

#### For the repo h2oai h2o-3 repo, filtering through the'CONTRIBUTING' file, we search through the content text field

In [126]:
h2oai = read_repo_data('h2oai', 'h2o-3')
md_h2oai = [d for d in h2oai if 'CONTRIBUTING' in d['filename']]

h2o3_index = Index(
    text_fields=["content"],
    keyword_fields=[]
)

h2o3_index.fit(md_h2oai)


<minsearch.minsearch.Index at 0x1361e2b10>

#### The output

In [127]:
print ("First 10 values:", md_h2oai[:10])



### Vector Search

In [128]:
# install sentence-transformers

!pip install sentence-transformers

from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




##### turn document into vector embedding with sentence transformer

In [145]:
record = h2oai[2]
text = record['content']
v_doc = embedding_model.encode(text)

In [150]:
# First, import and initialize the embedding model
from sentence_transformers import SentenceTransformer

# Initialize the embedding model (using a common model as example)
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

# Now use the model
query = "We welcome and encourage your contributions of any kind!"
v_query = embedding_model.encode(query)

##### compute similarity between query and document vectors

In [152]:
# Resize one of the vectors to match the other
# For example, if you want to truncate v_doc to match v_query's size:
v_doc_resized = v_doc[:768]  # Take only the first 768 dimensions
similarity = v_query.dot(v_doc_resized)

##### compute similarity between query and document vectors

In [87]:
from tqdm.auto import tqdm
import numpy as np

h2o3_embeddings = []

for d in tqdm(md_h2oai):
    text = d['content']
    v = embedding_model.encode(text)
    h2o3_embeddings.append(v)

h2o3_embeddings = np.array(h2o3_embeddings)


  0%|          | 0/1 [00:00<?, ?it/s]

In [153]:
similarity = v_query.dot(v_doc)

##### using vector search

In [88]:
from minsearch import VectorSearch

h2o3_vindex = VectorSearch(
    keyword_fields=[]
)
h2o3_vindex.fit(h2o3_embeddings, md_h2oai)

<minsearch.vector.VectorSearch at 0x136492810>

In [89]:
query = 'We welcome and encourage your contributions of any kind!'
q = embedding_model.encode(query)
results = h2o3_vindex.search(q)

##### applying thesame to atlite document

In [91]:
atlite_embeddings = []

for d in tqdm(atlite_chunks):
    # Check if 'chunk' key exists in the dictionary
    # If not, you need to use the correct key that contains the text to encode
    # For example, if the text is stored under 'text' key instead:
    v = embedding_model.encode(query)
    atlite_embeddings.append(v)

atlite_embeddings = np.array(atlite_embeddings)

atlite_vindex = VectorSearch(
        keyword_fields=[]

)
atlite_vindex.fit(atlite_embeddings, atlite_chunks)

  0%|          | 0/89 [00:00<?, ?it/s]

<minsearch.vector.VectorSearch at 0x136486a10>

### Hybrid Search

In [92]:
query = 'We welcome and encourage your contributions of any kind!'

text_results = h2o3_index.search(query, num_results=5)

q = embedding_model.encode(query)
vector_results = h2o3_vindex.search(q, num_results=5)

final_results = text_results + vector_results


#### bringing all code into different functions

In [154]:
def text_search(query):
    return h2o3_index.search(query, num_results=5)

def vector_search(query):
    q = embedding_model.encode(query)
    return h2o3_vindex.search(q, num_results=5)

def hybrid_search(query):
    text_results = text_search(query)
    vector_results = vector_search(query)
    
    # Combine and deduplicate results
    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        if result['filename'] not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)
    
    return combined_results