In [11]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main', timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    The repository is fetched as a single zip archive from
    codeload.github.com and read entirely in memory; nothing is written
    to disk.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch to download. The URL is built under ``refs/heads``,
            so this must be a branch name. Defaults to 'main'.
        timeout: Value passed to ``requests.get(timeout=...)`` so a stalled
            connection cannot hang the notebook. Pass None to wait
            indefinitely.

    Returns:
        Dict with separate lists for .md and .mdx files (keys "md" and
        "mdx"). Each list item is the dict returned by the parsed
        document's ``to_dict()``, with 'filename' (path inside the
        archive) and 'content' (document body) set.

    Raises:
        Exception: If the download does not return HTTP 200.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    md_files = []
    mdx_files = []

    # The context manager closes the archive even if something unexpected
    # raises while iterating.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename
            # Extension matching is case-insensitive (README.MD is accepted).
            filename_lower = filename.lower()

            if not filename_lower.endswith(('.md', '.mdx')):
                continue

            try:
                with zf.open(file_info) as f_in:
                    # Undecodable bytes are dropped rather than failing the file.
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                data['content'] = post.content
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole run.
                print(f"Error processing {filename}: {e}")
                continue

            if filename_lower.endswith('.md'):
                md_files.append(data)
            else:
                mdx_files.append(data)

    return {
        "md": md_files,
        "mdx": mdx_files
    }

In [12]:
# Download and parse the Qiskit documentation repository
qiskit_docs = read_repo_data('Qiskit', 'documentation')

# Tally the parsed documents per extension
num_md, num_mdx = (len(qiskit_docs[kind]) for kind in ("md", "mdx"))

print(f"Total .md files: {num_md}")
print(f"Total .mdx files: {num_mdx}")

Total .md files: 5
Total .mdx files: 8388


In [13]:
# List the first 5 archive paths for each extension.
# Each entry pairs the header line to print with the key into qiskit_docs.
filename_listings = (
    ("\nFirst 5 .md filenames:", "md"),
    ("\nFirst 5 .mdx filenames:", "mdx"),
)

for header, key in filename_listings:
    print(header)
    for file in qiskit_docs[key][:5]:
        print("-", file["filename"])


First 5 .md filenames:
- documentation-main/CODE_OF_CONDUCT.md
- documentation-main/README.md
- documentation-main/mdx-guide.md
- documentation-main/scripts/nb-tester/README.md
- documentation-main/style-guide.md

First 5 .mdx filenames:
- documentation-main/docs/accessibility.mdx
- documentation-main/docs/api/qiskit-addon-aqc-tensor/0.1/ansatz-generation-ansatz-block.mdx
- documentation-main/docs/api/qiskit-addon-aqc-tensor/0.1/ansatz-generation-kak.mdx
- documentation-main/docs/api/qiskit-addon-aqc-tensor/0.1/ansatz-generation-one-qubit-ansatz-block.mdx
- documentation-main/docs/api/qiskit-addon-aqc-tensor/0.1/ansatz-generation-two-qubit-ansatz-block.mdx


In [14]:
# Preview the first 300 characters of the first 5 documents per extension.
# Each entry pairs the heading line to print with the key into qiskit_docs.
content_previews = (
    ("\nSample content of first 5 .md files:", "md"),
    ("\nSample content of first 5 .mdx files:", "mdx"),
)

for heading, key in content_previews:
    print(heading)
    for file in qiskit_docs[key][:5]:
        # The trailing "..." is printed whether or not the body was cut short.
        print(f"\n--- {file['filename']} ---\n{file['content'][:300]}...\n")


Sample content of first 5 .md files:

--- documentation-main/CODE_OF_CONDUCT.md ---
<!-- Copyright Contributors to the Qiskit project. -->

# Code of Conduct

All members of this project agree to adhere to the Qiskit Code of Conduct listed at [quantum.cloud.ibm.com/docs/open-source/code-of-conduct](https://quantum.cloud.ibm.com/docs/open-source/code-of-conduct)

---

License: [CC B...


--- documentation-main/README.md ---
# Qiskit documentation

The documentation content home for https://quantum.cloud.ibm.com/docs and https://docs.quantum.ibm.com (excluding API reference).

Refer to:

- Our [MDX guide](./mdx-guide.md) for how to write documentation and use our variant of markdown.
- Our [style guide](./style-guide.md...


--- documentation-main/mdx-guide.md ---
# How to write the documentation

Refer to:

- The [README](./README.md) for how to set up the project.
- Our [style guide](./style-guide.md) for technical writing guidance.

We use [MDX](https://mdxjs.com), which is like norm