In [2]:
import frontmatter

# A small markdown document with a YAML frontmatter header, used to
# demonstrate how `frontmatter` separates metadata from the body.
raw = """---
title: "Getting Started with AI"
author: "Euriel Chukwu"
date: "2025-09-23"
tags: ["ai", "machine-learning", "tutorial"]
difficulty: "beginner"
---

# Welcome

This is a tutorial on getting started with AI Agent.
"""

post = frontmatter.loads(raw)

# Show each frontmatter field in the order it is declared in the header.
for key in ('title', 'author', 'date', 'tags', 'difficulty'):
    print(post.metadata[key])

# The body that remains once the frontmatter header has been stripped off.
print(post.content)


Getting Started with AI
Euriel Chukwu
2025-09-23
['ai', 'machine-learning', 'tutorial']
beginner
# Welcome

This is a tutorial on getting started with AI Agent.


In [3]:
import io
import zipfile
import requests
import frontmatter    

In [4]:
# Zip snapshot of the master branch of ceuriel/atlite, served by GitHub's codeload host.
url = 'https://codeload.github.com/ceuriel/atlite/zip/refs/heads/master'

# Without a timeout, requests waits forever if the server stops responding.
resp = requests.get(url, timeout=30)

# Fail here with a clear HTTP error instead of later, when the response body
# is handed to zipfile and turns out not to be a zip archive.
resp.raise_for_status()

In [5]:
repository_data = []

# Open the downloaded bytes as an in-memory zip archive. The `with` block
# closes the archive even if parsing one of the files raises.
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    for file_info in zf.infolist():
        filename = file_info.filename

        # Only process markdown files; lowercase only for the extension check
        if not filename.lower().endswith('.md'):
            continue

        # Read the raw bytes of the entry, then parse them
        with zf.open(file_info) as f_in:
            content = f_in.read()
        post = frontmatter.loads(content)
        data = post.to_dict()

        # Store the archive path with its original case so it still matches
        # the real (case-sensitive) path, e.g. CONTRIBUTING.md.
        data['filename'] = filename
        repository_data.append(data)

In [6]:
# Print the second parsed record (index 1) to inspect the structure of one entry.
print(repository_data[1])    

{'content': "<!--\nSPDX-FileCopyrightText: Contributors to atlite <https://github.com/pypsa/atlite>\n\nSPDX-License-Identifier: CC0-1.0\n-->\n\nCloses # (if applicable).\n\n## Changes proposed in this Pull Request\n\n\n## Checklist\n\n- [ ] Code changes are sufficiently documented; i.e. new functions contain docstrings and further explanations may be given in `doc`.\n- [ ] Unit tests for new features were added (if applicable).\n- [ ] Newly introduced dependencies are added to `environment.yaml`, `environment_docs.yaml` and `setup.py` (if applicable).\n- [ ] A note for the release notes `doc/release_notes.rst` of the upcoming release is included.\n- [ ] I consent to the release of this PR's code under the MIT license.", 'filename': 'atlite-master/.github/pull_request_template.md'}


In [7]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='master', timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    The repository is fetched as one zip archive from codeload.github.com and
    processed in memory. Every archive entry whose name ends in ``.md`` or
    ``.mdx`` (case-insensitive) is decoded as UTF-8 and parsed with
    ``frontmatter``. A file that fails to parse is reported with ``print`` and
    skipped, so one bad file does not abort the whole run.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch to download. Defaults to 'master', which was
            previously the only supported value.
        timeout: Timeout in seconds passed to ``requests.get`` so the call
            cannot hang indefinitely on an unresponsive server.

    Returns:
        List of dictionaries, one per markdown file. Each is the result of
        ``post.to_dict()`` plus a 'filename' key holding the path inside the
        archive with its original case.

    Raises:
        RuntimeError: If the download does not answer with HTTP 200.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        # RuntimeError is a subclass of Exception, so existing
        # `except Exception` handlers keep working.
        raise RuntimeError(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if an unexpected error escapes.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            # Lowercase only for the extension check; the stored name keeps its case.
            if not filename.lower().endswith(('.md', '.mdx')):
                continue

            try:
                with zf.open(file_info) as f_in:
                    # errors='ignore' drops undecodable bytes instead of failing the file.
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                # Best effort: report the problem and move on to the next file.
                print(f"Error processing {filename}: {e}")

    return repository_data

In [11]:
# Fetch the markdown documents of both repositories (h2o-3 first, then atlite).
h2oai = read_repo_data(repo_owner='h2oai', repo_name='h2o-3')
ceuriel_atlite = read_repo_data(repo_owner='ceuriel', repo_name='atlite')

# Report how many markdown documents each repository yielded.
for label, docs in (("Atlite", ceuriel_atlite), ("h2o-3", h2oai)):
    print(f"{label} documents: {len(docs)}")

Atlite documents: 3
h2o-3 documents: 89


In [10]:
# List the archive path of every markdown document found in h2o-3.
for path in (doc['filename'] for doc in h2oai):
    print(path)

h2o-3-master/.github/ISSUE_TEMPLATE/bug_report.md
h2o-3-master/.github/ISSUE_TEMPLATE/feature_request.md
h2o-3-master/CONTRIBUTING.md
h2o-3-master/Changes-prior-3.28.0.1.md
h2o-3-master/Changes.md
h2o-3-master/DEVEL.md
h2o-3-master/README.md
h2o-3-master/README_DATA.md
h2o-3-master/SECURITY.md
h2o-3-master/ec2/README.md
h2o-3-master/examples/deeplearning/notebooks/README.md
h2o-3-master/gradle/README.md
h2o-3-master/h2o-algos/src/main/java/hex/deeplearning/README.md
h2o-3-master/h2o-assemblies/main/README.md
h2o-3-master/h2o-assemblies/minimal/README.md
h2o-3-master/h2o-bindings/bin/readme.md
h2o-3-master/h2o-clustering/README.md
h2o-3-master/h2o-core/src/main/resources/docs/pieces/columnSummary.md
h2o-3-master/h2o-dist/README.md
h2o-3-master/h2o-docs/README.md
h2o-3-master/h2o-docs/StyleGuide.md
h2o-3-master/h2o-docs/src/api/README.md
h2o-3-master/h2o-docs/src/api/REST/h2o_3_rest_api_overview.md
h2o-3-master/h2o-docs/src/api/data-science-example-1/README.md
h2o-3-master/h2o-docs/src/a

In [9]:
# List the archive path of every markdown document found in atlite.
for path in (doc['filename'] for doc in ceuriel_atlite):
    print(path)

atlite-master/.github/ISSUE_TEMPLATE/feature_request.md
atlite-master/.github/pull_request_template.md
atlite-master/CONTRIBUTING.md
