In [35]:
import io
import zipfile
import requests
import frontmatter

# File extensions (lowercase, without the leading dot) that read_repo_data
# keeps; archive entries with any other extension are skipped.
extensions = {
    "md",
    "mdx",
    "py",
    "sql",
    "java",
    "ipynb",
}

def read_repo_data(repo_owner, repo_name, branch='main', timeout=60):
    """
    Download a GitHub repository archive and parse its text/source files.

    The branch is fetched as a single zip from codeload.github.com and read
    entirely in memory. Only files whose extension (compared
    case-insensitively) is in the module-level ``extensions`` set are kept;
    directories, dot-files and files without an extension are skipped.
    Every kept file is passed through ``frontmatter.loads``, so files
    without a front-matter block simply end up with their text as content.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch to download (defaults to 'main')
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            so a stalled connection cannot hang the notebook forever

    Returns:
        List of dictionaries, one per parsed file: the result of
        ``post.to_dict()`` plus a 'filename' key holding the path relative
        to the repository root. Note that 'filename' overwrites a
        front-matter key of the same name, if the file declares one.

    Raises:
        Exception: if the archive download does not return HTTP 200.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if an entry raises.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filepath = file_info.filename

            # Directory entries end with a slash and carry no content.
            if filepath.endswith('/'):
                continue

            filename = filepath.rsplit('/', 1)[-1]

            if filename.startswith('.'):
                continue

            # Require a real "name.ext" shape: a file literally called "py"
            # has no extension and must not match the 'py' entry.
            _, dot, ext = filename.rpartition('.')
            if not dot or ext.lower() not in extensions:
                continue

            # GitHub archives wrap everything in a "<repo>-<branch>/" folder;
            # drop it. Fall back to the full path if there is no such folder.
            _, sep, relative_path = filepath.partition('/')
            if not sep:
                relative_path = filepath

            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = relative_path
                repository_data.append(data)
            except Exception as e:
                # Best effort: report the offending file and keep going.
                print(f"Error processing {relative_path}: {e}")
                continue

    return repository_data

In [36]:
# Download and parse the DataTalksClub data-engineering-zoomcamp repository.
de_zoomcamp_data = read_repo_data(
    repo_owner='DataTalksClub',
    repo_name='data-engineering-zoomcamp',
)

In [37]:
# Number of files that passed the extension filter and were parsed.
len(de_zoomcamp_data)

192

In [38]:
# Print the repository-relative path of every collected file, one per line.
for record in de_zoomcamp_data:
    print(record['filename'])

01-docker-terraform/1_terraform_gcp/1_terraform_overview.md
01-docker-terraform/1_terraform_gcp/2_gcp_overview.md
01-docker-terraform/1_terraform_gcp/README.md
01-docker-terraform/1_terraform_gcp/terraform/README.md
01-docker-terraform/1_terraform_gcp/windows.md
01-docker-terraform/2_docker_sql/README.md
01-docker-terraform/2_docker_sql/data-loading-parquet.ipynb
01-docker-terraform/2_docker_sql/data-loading-parquet.py
01-docker-terraform/2_docker_sql/ingest_data.py
01-docker-terraform/2_docker_sql/pg-test-connection.ipynb
01-docker-terraform/2_docker_sql/pipeline.py
01-docker-terraform/2_docker_sql/upload-data.ipynb
01-docker-terraform/README.md
02-workflow-orchestration/README.md
03-data-warehouse/README.md
03-data-warehouse/big_query.sql
03-data-warehouse/big_query_hw.sql
03-data-warehouse/big_query_ml.sql
03-data-warehouse/extract_model.md
03-data-warehouse/extras/README.md
03-data-warehouse/extras/web_to_gcs.py
04-analytics-engineering/README.md
04-analytics-engineering/SQL_refres