In [None]:
import io
import zipfile
import requests
import frontmatter

def build_repository_url(owner: str, repo: str, branch: str = 'main') -> str:
    """Return the codeload.github.com zip URL for one branch of a repository.

    Args:
        owner: Account or organisation segment of the URL.
        repo: Repository name segment of the URL.
        branch: Branch name placed after ``refs/heads/``; defaults to ``'main'``.

    Returns:
        The assembled URL string. The arguments are inserted verbatim, with
        no escaping or validation.
    """
    template = 'https://codeload.github.com/{owner}/{repo}/zip/refs/heads/{branch}'
    return template.format(owner=owner, repo=repo, branch=branch)

def process_markdown_file(z: zipfile.ZipFile, file_info: zipfile.ZipInfo) -> dict[str, object] | None:
    """Read one archive member and parse it as a front-matter document.

    Args:
        z: Open zip archive that contains the member.
        file_info: Entry describing the member to read.

    Returns:
        The dict produced by ``post.to_dict()`` with an extra ``'filename'``
        key holding the member's path inside the archive, or ``None`` when
        reading, decoding or parsing fails (the error is printed).
    """
    try:
        # Only the read needs the member handle; close it before parsing.
        with z.open(file_info) as f:
            content = f.read().decode('utf-8')

        post = frontmatter.loads(content)
        data = post.to_dict()
        # Store the path string rather than the ZipInfo object itself, so the
        # value under 'filename' is a plain string that can be compared,
        # printed and serialised.
        data['filename'] = file_info.filename
        return data
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # processing of the rest of the archive.
        print(f"Error processing {file_info.filename}: {e}")
        return None

def read_zip_from_url(url: str, *, timeout: float = 60.0) -> list[dict]:
    """Download a zip archive and parse every ``.md`` / ``.mdx`` file in it.

    Args:
        url: Address of the zip archive to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled connection cannot block forever.

    Returns:
        One dict per markdown file that ``process_markdown_file`` parsed
        successfully. Files that failed to parse are omitted. If the HTTP
        response is an error or the payload is not a readable zip, the error
        is printed and whatever was collected so far (possibly nothing) is
        returned.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout); these are not caught here.
    """
    response = requests.get(url, timeout=timeout)
    repository_data = []
    try:
        # Surface HTTP errors (404, 500, ...) as themselves instead of letting
        # the error page be reported as "File is not a zip file".
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            for file_info in z.infolist():
                if not file_info.filename.endswith(('.md', '.mdx')):
                    continue
                data = process_markdown_file(z, file_info)
                if data is not None:
                    repository_data.append(data)
    except Exception as e:
        # Best-effort: keep the documents gathered before the failure.
        print(f"Error reading zip file: {e}")

    return repository_data

def main():
    """Download two documentation repositories and print their document counts.

    Both archives are fetched and parsed before anything is printed.
    """
    faq_url = build_repository_url('DataTalksClub', 'faq')
    dtc_faq = read_zip_from_url(faq_url)

    evidently_url = build_repository_url('evidentlyai', 'docs')
    evidently_docs = read_zip_from_url(evidently_url)

    faq_count = len(dtc_faq)
    evidently_count = len(evidently_docs)
    print(f"FAQ documents: {faq_count}")
    print(f"Evidently documents: {evidently_count}")

# Run main() only when this code is executed as the entry point, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


FAQ documents: 1219
Evidently documents: 95
