In [1]:
import io
import zipfile
import requests
import frontmatter

In [2]:
# Extensions that read_repo_data parses with `frontmatter` (documentation files).
doc_extensions = {'mdx', 'md'}

# Extensions that read_repo_data returns as raw text flagged with 'code': True.
code_extensions = {'ipynb', 'java', 'sql', 'py'}

# Every extension read_repo_data keeps; files with any other extension are skipped.
extensions = doc_extensions.union(code_extensions)

def read_repo_data(repo_owner, repo_name, branch='main'):
    """
    Download a GitHub repository archive and parse its documentation and code files.

    The branch is fetched as a zip from codeload.github.com and read in memory.
    Directory entries, dot-files, files without an extension, and files whose
    lower-cased extension is not in ``extensions`` are skipped.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch to download; defaults to 'main'

    Returns:
        List of dictionaries, one per processed file. Documentation files
        (``doc_extensions``) yield the dict produced by
        ``frontmatter.loads(...).to_dict()`` plus a 'filename' key. Code files
        (``code_extensions``) yield
        ``{'code': True, 'content': <text>, 'filename': <path>}``.
        'filename' is the path inside the archive with the leading top-level
        folder removed (original letter case preserved).

    Raises:
        Exception: If the download does not return HTTP 200.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    # Without a timeout a stalled connection would block the notebook forever.
    resp = requests.get(url, timeout=30)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if something below raises.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filepath = file_info.filename

            # Directory entries end with a slash and carry no content.
            if filepath.endswith('/'):
                continue

            filename = filepath.lower().split('/')[-1]

            if filename.startswith('.'):
                continue

            # rpartition tells us whether a dot exists at all, so a file that
            # is literally named "py" or "md" is not mistaken for that type.
            _, dot, ext = filename.rpartition('.')

            if not dot or ext not in extensions:
                continue

            # Drop the archive's top-level folder; fall back to the full path
            # if the entry unexpectedly has no folder component.
            _, sep, relative_path = filepath.partition('/')
            filepath_edited = relative_path if sep else filepath

            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode('utf-8', errors='ignore')

                if ext in doc_extensions:
                    post = frontmatter.loads(content)
                    data = post.to_dict()
                    data['filename'] = filepath_edited
                elif ext in code_extensions:
                    data = {
                        'code': True,
                        'content': content,
                        'filename': filepath_edited
                    }
                else:
                    # Listed in `extensions` but in neither category: there is
                    # no record to build, so do not append anything.
                    continue

                repository_data.append(data)
            except Exception as e:
                # Best effort: one unreadable file should not abort the whole
                # repository; report it and move on.
                print(f"Error processing {filepath_edited}: {e}")
                continue

    return repository_data

In [3]:
# Fetch and parse the DataTalksClub data-engineering-zoomcamp repository.
de_zoomcamp_data = read_repo_data(
    repo_owner='DataTalksClub',
    repo_name='data-engineering-zoomcamp',
)

In [4]:
# Number of documentation and code files collected from the repository.
len(de_zoomcamp_data)

192

In [7]:
# Build a filename -> record lookup, echoing each path as it is indexed.
index = dict()

for record in de_zoomcamp_data:
    record_filename = record['filename']
    index[record_filename] = record
    print(record_filename)

01-docker-terraform/1_terraform_gcp/1_terraform_overview.md
01-docker-terraform/1_terraform_gcp/2_gcp_overview.md
01-docker-terraform/1_terraform_gcp/README.md
01-docker-terraform/1_terraform_gcp/terraform/README.md
01-docker-terraform/1_terraform_gcp/windows.md
01-docker-terraform/2_docker_sql/README.md
01-docker-terraform/2_docker_sql/data-loading-parquet.ipynb
01-docker-terraform/2_docker_sql/data-loading-parquet.py
01-docker-terraform/2_docker_sql/ingest_data.py
01-docker-terraform/2_docker_sql/pg-test-connection.ipynb
01-docker-terraform/2_docker_sql/pipeline.py
01-docker-terraform/2_docker_sql/upload-data.ipynb
01-docker-terraform/README.md
02-workflow-orchestration/README.md
03-data-warehouse/README.md
03-data-warehouse/big_query.sql
03-data-warehouse/big_query_hw.sql
03-data-warehouse/big_query_ml.sql
03-data-warehouse/extract_model.md
03-data-warehouse/extras/README.md
03-data-warehouse/extras/web_to_gcs.py
04-analytics-engineering/README.md
04-analytics-engineering/SQL_refres

In [23]:
# Pick one SQL file from the index to use as the documentation example.
dim_zones_path = '04-analytics-engineering/taxi_rides_ny/models/core/dim_zones.sql'
cnt = index[dim_zones_path]
code = cnt['content']

In [10]:
from openai import OpenAI

# Client used by llm(); no API key is passed explicitly here.
# NOTE(review): presumably the SDK reads credentials from the environment — confirm.
openai_client = OpenAI()

In [11]:
def llm(instructions, content, model='gpt-4o-mini'):
    """
    Send a system/user message pair through ``openai_client`` and return the reply text.

    Args:
        instructions: Text sent as the system message.
        content: Text sent as the user message.
        model: Name of the model to call; defaults to 'gpt-4o-mini'.

    Returns:
        The ``output_text`` attribute of the response.
    """
    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": content}
    ]

    # Honor the caller's `model` argument instead of a fixed model name.
    response = openai_client.responses.create(
        model=model,
        input=messages,
    )

    return response.output_text

In [24]:
# System prompt passed to llm() to turn a source file into Markdown documentation.
code_doc_instructions = """
You are given a piece of source code.  

Your task:  
- Analyze the code and produce a clear, high-level description of what it does.  
- If the code defines functions, methods, or classes, describe their purpose and role.  
- If it’s just a script without explicit functions/classes, summarize what the script does step by step at a high level.  
- Add logical sections or headings (##) if needed. Sections must be relatively large (8-10 paragraphs and code blocks)
- Keep explanations concise and clear — avoid unnecessary verbosity.  
- Output the result in Markdown, structured like documentation.  
- Do not rewrite or modify the code itself, only provide descriptive documentation.
"""

In [25]:
# Ask the model to document the selected SQL file.
result = llm(
    instructions=code_doc_instructions,
    content=code,
)

In [26]:
# Show the generated documentation as plain text.
print(result)

# High-Level Description of the Code

This code snippet is written in SQL and is intended for use within a data transformation and modeling tool, likely a system like dbt (data build tool). It transforms data from a reference table, typically in a data warehouse, into another table for further analysis or reporting.

## Materialization Configuration

```sql
{{ config(materialized='table') }}
```

This line indicates that the SQL query should be materialized as a table in the database. Materialization refers to the way the result of a query is stored; in this case, the results will be saved in a physical table rather than just a view or temporary output. This is beneficial for performance reasons if the query will be run frequently, as it reduces the need to recompute the data every time it is accessed.

## Data Selection

```sql
select 
    locationid, 
    borough, 
    zone, 
    replace(service_zone,'Boro','Green') as service_zone 
from {{ ref('taxi_zone_lookup') }}
```

This `SELEC

In [27]:
# Show the original source that was sent to the model, for comparison with the generated text.
print(code)

{{ config(materialized='table') }}

select 
    locationid, 
    borough, 
    zone, 
    replace(service_zone,'Boro','Green') as service_zone 
from {{ ref('taxi_zone_lookup') }}
