In [1]:
import io
import os
import zipfile

import frontmatter
import requests

In [2]:
# File extensions (lower-case, without the dot) that read_repo_data keeps.
# Documentation files are parsed for front matter; code files are kept as raw text.
doc_extensions = {'md', 'mdx'}
code_extensions = {'py', 'sql', 'java', 'ipynb'}

# Every extension that is accepted at all.
extensions = doc_extensions.union(code_extensions)

def read_repo_data(repo_owner, repo_name, branch='main', timeout=60):
    """
    Download a GitHub repository archive and parse its documentation and code files.

    Files whose extension is in ``doc_extensions`` are parsed with
    ``frontmatter`` (metadata plus content). Files whose extension is in
    ``code_extensions`` are kept as raw text and flagged with ``'code': True``.
    Directory entries, files whose name starts with '.', and files with any
    other extension are skipped.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch whose archive is downloaded (defaults to 'main')
        timeout: Seconds passed to ``requests.get`` as its timeout

    Returns:
        List of dictionaries containing file content and metadata. Each
        dictionary has a 'filename' key holding the path inside the repository
        (the archive's top-level folder is stripped).

    Raises:
        Exception: If the archive download does not return HTTP 200.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if an unexpected error escapes the loop.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filepath = file_info.filename
            filepath_lower = filepath.lower()

            # Directory entries end with '/'.
            if filepath_lower.endswith('/'):
                continue

            filename = filepath_lower.split('/')[-1]

            if filename.startswith('.'):
                continue

            ext = filename.split('.')[-1]

            if ext not in extensions:
                continue

            # Drop the archive's top-level folder. Using [-1] keeps the path
            # unchanged for an entry that has no folder prefix instead of
            # raising IndexError.
            filepath_edited = filepath.split('/', maxsplit=1)[-1]

            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode('utf-8', errors='ignore')

                if ext in doc_extensions:
                    post = frontmatter.loads(content)
                    data = post.to_dict()
                    data['filename'] = filepath_edited
                else:
                    data = {
                        'code': True,
                        'content': content,
                        'filename': filepath_edited
                    }
            except Exception as e:
                # Best effort: report the file (full path, since basenames such
                # as README.md repeat across folders) and keep going.
                print(f"Error processing {filepath_edited}: {e}")
                continue
            else:
                repository_data.append(data)

    return repository_data

In [3]:
# Download the DataTalksClub/data-engineering-zoomcamp repository and parse its files (network call).
de_zoomcamp_data = read_repo_data('DataTalksClub', 'data-engineering-zoomcamp')

In [4]:
# Number of records returned by read_repo_data.
len(de_zoomcamp_data)

192

In [5]:
# Look up records by their repository path; if two records share a filename,
# the later one wins.
index = {record['filename']: record for record in de_zoomcamp_data}

In [9]:
import nbformat
from nbconvert import MarkdownExporter
from nbconvert.preprocessors import ClearOutputPreprocessor

# Shared Markdown exporter used by format_notebook_as_md.
# ClearOutputPreprocessor is registered and enabled; presumably this drops cell
# outputs before export so only cell sources are converted — confirm against the nbconvert docs.
exporter = MarkdownExporter()
exporter.register_preprocessor(ClearOutputPreprocessor(), enabled=True)

def format_notebook_as_md(raw_notebook: str) -> str:
    """
    Convert the raw text of a notebook file into Markdown.

    Args:
        raw_notebook: Notebook file content as a string

    Returns:
        The Markdown body produced by the module-level ``exporter``.
    """
    # NO_CONVERT keeps the notebook in whatever nbformat version it was saved with.
    notebook_node = nbformat.reads(raw_notebook, as_version=nbformat.NO_CONVERT)

    # from_notebook_node returns (body, resources); only the body is needed.
    markdown_text, _resources = exporter.from_notebook_node(notebook_node)
    return markdown_text

In [50]:
def strip_code_fence(text: str) -> str:
    """
    Remove a wrapping Markdown code fence from ``text``.

    The text is stripped of surrounding whitespace first. If it does not start
    with three backticks it is returned as is. Otherwise the opening fence line
    is dropped, and the last line is dropped too when it is a bare closing fence.

    Args:
        text: Text that may be wrapped in a code fence

    Returns:
        The text without the opening (and, if present, closing) fence line.
    """
    stripped = text.strip()
    fence = "```"

    if stripped.startswith(fence):
        # Drop the opening fence line (it may carry a language tag).
        body = stripped.splitlines()[1:]

        # Drop the closing fence only when it is the very last line.
        if body and body[-1].strip() == fence:
            body.pop()

        return "\n".join(body)

    return stripped

In [15]:
from openai import OpenAI

# Shared client used by llm(). It is constructed without arguments, so no
# credentials are passed here — presumably they come from the environment
# (e.g. OPENAI_API_KEY); verify before running.
openai_client = OpenAI()

In [27]:
def llm(instructions, content, model='gpt-4o-mini'):
    """
    Send a system prompt and a user message to the model and return its text reply.

    Args:
        instructions: Text sent as the system message
        content: Text sent as the user message
        model: Name of the model to call

    Returns:
        The ``output_text`` of the response.
    """
    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": content}
    ]

    # Pass the caller's model through; it was previously hard-coded here,
    # which silently ignored the `model` argument.
    response = openai_client.responses.create(
        model=model,
        input=messages,
    )

    return response.output_text

In [68]:
# System prompt for notebooks: the user message is the Markdown produced by
# format_notebook_as_md, and the model is asked to return it as structured documentation.
notebook_editing_instructions = """
You're a professional coding editor.

You are given a Markdown file that was converted from a Jupyter notebook.  
The file already contains code blocks and inline comments.  

Your task:

- Turn it into clear, well-structured documentation.  
- Add section headers (##) where appropriate. Keep sections relatively large (8-10 paragraphs and code blocks)
- Add concise, high-level explanations for each code block.  
- Summarize what the code is doing without being overly verbose.  
- Keep the formatting in Markdown.
- Aim for a balance: clear enough to guide someone new, but not overloaded with detail. 

Output the improved Markdown file with the new documentation.
""".strip()

# System prompt for plain source files: the user message is the raw code, and
# the model is asked for a Markdown description rather than modified code.
code_doc_instructions = """
You are given a piece of source code.  

Your task:  
- Analyze the code and produce a clear, high-level description of what it does.  
- If the code defines functions, methods, or classes, describe their purpose and role.  
- If it’s just a script without explicit functions/classes, summarize what the script does step by step at a high level.  
- Add logical sections or headings (##) if needed. Sections must be relatively large (8-10 paragraphs and code blocks)
- Keep explanations concise and clear — avoid unnecessary verbosity.  
- Output the result in Markdown, structured like documentation.  
- Do not rewrite or modify the code itself, only provide descriptive documentation.
""".strip()

In [36]:
# One-off smoke test of the notebook prompt on a single notebook.
# The original cell referenced `system_prompt` and `md_body`, neither of which
# is defined by an earlier cell, so it failed on a fresh kernel.
# `result` stays None when the repository contains no notebooks.
result = None

sample_notebooks = [
    record
    for record in de_zoomcamp_data
    if record.get('code') and record['filename'].endswith('.ipynb')
]

if sample_notebooks:
    md_body = format_notebook_as_md(sample_notebooks[0]['content'])
    result = llm(notebook_editing_instructions, md_body)

In [41]:
from tqdm.auto import tqdm

In [62]:
# Notebooks are the code records whose path ends in .ipynb.
ipynb_data = [
    record
    for record in de_zoomcamp_data
    if record.get('code') == True and record['filename'].endswith('.ipynb')
]

print(f'processing {len(ipynb_data)} jupyter notebooks...')

# Replace each notebook's raw JSON with LLM-written Markdown documentation.
# Records are updated in place and un-flagged as code, so a second run of this
# cell selects nothing.
for record in tqdm(ipynb_data):
    md_body = format_notebook_as_md(record['content'])
    new_content = strip_code_fence(llm(notebook_editing_instructions, md_body))
    record['content'] = new_content
    record['code'] = False

processing 0 jupyter notebooks...


0it [00:00, ?it/s]

In [65]:
# Code records still flagged as code, excluding notebooks (handled separately).
plain_code_extensions = code_extensions - {'ipynb'}

code_data = [
    record
    for record in de_zoomcamp_data
    if record.get('code') == True
    and record['filename'].split('.')[-1] in plain_code_extensions
]

print(f'processing {len(code_data)} code files...')

processing 78 code files...


In [69]:
# Replace each source file's content with an LLM-written description.
# Records are updated in place and un-flagged as code.
for record in tqdm(code_data):
    documentation = llm(code_doc_instructions, record['content'])
    record['content'] = strip_code_fence(documentation)
    record['code'] = False

  0%|          | 0/78 [00:00<?, ?it/s]

In [73]:
import json

In [76]:
# Create the output folder; exist_ok makes the cell safe to re-run and
# independent of the shell that `!mkdir` would use.
os.makedirs('data', exist_ok=True)

In [77]:
# Write the processed records to disk as indented JSON.
output_file = 'data/de-zoomcamp-processed.json'

with open(output_file, 'w', encoding='utf-8') as f_out:
    f_out.write(json.dumps(de_zoomcamp_data, indent=2))

In [79]:
# Peek at the first lines of the exported JSON file.
!head data/de-zoomcamp-processed.json

[
  {
    "content": "## Terraform Overview\n\n[Video](https://www.youtube.com/watch?v=18jIzE41fJ4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=2)\n\n### Concepts\n\n#### Introduction\n\n1. What is [Terraform](https://www.terraform.io)?\n   * open-source tool by [HashiCorp](https://www.hashicorp.com), used for provisioning infrastructure resources\n   * supports DevOps best practices for change management\n   * Managing configuration files in source control to maintain an ideal provisioning state \n     for testing and production environments\n2. What is IaC?\n   * Infrastructure-as-Code\n   * build, change, and manage your infrastructure in a safe, consistent, and repeatable way \n     by defining resource configurations that you can version, reuse, and share.\n3. Some advantages\n   * Infrastructure lifecycle management\n   * Version control commits\n   * Very useful for stack-based deployments, and with cloud providers such as AWS, GCP, Azure, K8S\u2026\n   * State-based approach t

In [80]:
def sliding_window(seq, size, step):
    """
    Split a sequence into overlapping windows.

    Windows start at 0, step, 2*step, ... and each holds up to ``size`` items.
    Iteration stops after the first window that reaches the end of ``seq``.

    Args:
        seq: Any sliceable sequence with a length (string, list, ...)
        size: Maximum number of items per window
        step: Distance between the starts of consecutive windows

    Returns:
        List of dictionaries with keys 'start' (window offset) and 'chunk'
        (the slice itself). Empty for an empty sequence.

    Raises:
        ValueError: If ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []
    start = 0

    while start < total:
        end = start + size
        windows.append({'start': start, 'chunk': seq[start:end]})

        # This window already covers the tail, so a further one would add nothing new.
        if end >= total:
            break

        start += step

    return windows

In [88]:
# Cut every document into 2000-character windows that start 1000 characters
# apart, and attach the document's other fields (filename, metadata) to each chunk.
de_zoomcamp_chunks = []

for doc in de_zoomcamp_data:
    doc_fields = {key: value for key, value in doc.items() if key != 'content'}

    for window in sliding_window(doc['content'], 2000, 1000):
        # Document fields are applied last, so they win on a key clash with 'start'/'chunk'.
        de_zoomcamp_chunks.append({**window, **doc_fields})

In [91]:
# Total number of chunks across all documents.
len(de_zoomcamp_chunks)

865

In [93]:
# Spot-check a single chunk record (index 100).
de_zoomcamp_chunks[100]

{'start': 2000,
 'chunk': "d (e.g., '2019', '2020').\n- `service`: The type of taxi service (e.g., 'green', 'yellow').\n\n### Step-by-Step Walkthrough\n\n1. **Monthly Loop**: The function iterates over the months from January to December.\n  \n2. **File Naming**: For each month, it constructs the name of the CSV file using the service type and year. Months are formatted to ensure two digits (e.g., '01', '02').\n\n3. **HTTP Request**: It downloads the CSV file from the constructed URL using the `requests` library. The downloaded content is saved as a local file.\n\n4. **Data Reading and Conversion**: The local CSV file is read into a pandas DataFrame with gzip compression. It is then converted and saved as a Parquet file for efficient storage and querying.\n\n5. **GCS Upload**: The Parquet file is uploaded to the specified GCS bucket using the `upload_to_gcs` function.\n\n6. **Logging**: Throughout the process, the script outputs messages to provide feedback about the local file's creat