In [1]:
import logging
import pandas as pd
import os

# Configure logging up front so every later message carries a timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Directory layout: where the input parquet files live and where clones are placed.
PARQUET_DIR = "parquet"
GITHUB_REPOS_DIR = 'github-repos'

# Pause (in seconds) applied after each clone attempt.
CLONE_SLEEP_SECONDS = 150

# Load the repository listing and log how many rows it contains.
repo_list_path = os.path.join(PARQUET_DIR, 'q-repo-list_q-kdb.parquet')
df = pd.read_parquet(repo_list_path)
logging.info(f"{len(df)=}")

2024-11-07 16:19:49,408 - INFO - len(df)=44


In [2]:
# Tally how many repositories share each license record, then display the tally.
license_counts = df['license'].value_counts()
license_counts

license
{'key': 'apache-2.0', 'name': 'Apache License 2.0', 'node_id': 'MDc6TGljZW5zZTI=', 'spdx_id': 'Apache-2.0', 'url': 'https://api.github.com/licenses/apache-2.0'}                             21
{'key': 'mit', 'name': 'MIT License', 'node_id': 'MDc6TGljZW5zZTEz', 'spdx_id': 'MIT', 'url': 'https://api.github.com/licenses/mit'}                                                         18
{'key': 'cc-by-4.0', 'name': 'Creative Commons Attribution 4.0 International', 'node_id': 'MDc6TGljZW5zZTI1', 'spdx_id': 'CC-BY-4.0', 'url': 'https://api.github.com/licenses/cc-by-4.0'}     3
{'key': 'cc0-1.0', 'name': 'Creative Commons Zero v1.0 Universal', 'node_id': 'MDc6TGljZW5zZTY=', 'spdx_id': 'CC0-1.0', 'url': 'https://api.github.com/licenses/cc0-1.0'}                     2
Name: count, dtype: int64

In [3]:
import os
import subprocess
import time

# Clone a GitHub repository unless a local copy is already present
def check_and_download_repo(org_name, repo_name):
    """
    Make sure a local clone of ``org_name/repo_name`` exists, cloning it if absent.

    The clone is placed at ``GITHUB_REPOS_DIR/<org_name>/<repo_name>``. When that
    path already exists, nothing is downloaded and the function only logs the fact.

    Args:
        org_name (str): The GitHub organization or user that owns the repository.
        repo_name (str): The repository name.

    Returns:
        None

    Side Effects:
        - Creates the organization directory when it is missing.
        - Runs ``git clone`` against ``https://github.com/<org_name>/<repo_name>.git``.
        - Sleeps for ``CLONE_SLEEP_SECONDS`` after every clone attempt, whether
          the clone succeeded or failed.
        - Logs progress at INFO level and clone failures at ERROR level.

    Note:
        A failing ``git clone`` (``subprocess.CalledProcessError``) is caught and
        logged here; it is not propagated to the caller.
    """
    full_name = f'{org_name}/{repo_name}'
    org_path = os.path.join(GITHUB_REPOS_DIR, org_name)
    repo_path = os.path.join(org_path, repo_name)

    # Guard clause: an existing path means there is nothing to fetch (and no sleep).
    if os.path.exists(repo_path):
        logging.info(f"Repository {repo_name} already exists in {org_path}.")
        return

    repo_url = f'https://github.com/{full_name}.git'
    os.makedirs(org_path, exist_ok=True)
    logging.info(f"Downloading repository {full_name} into {GITHUB_REPOS_DIR}...")

    clone_command = ['git', 'clone', repo_url, repo_path]
    try:
        subprocess.run(clone_command, check=True)
    except subprocess.CalledProcessError as clone_error:
        logging.error(f"Failed to clone repository {full_name} from {repo_url}. Error: {clone_error}")

    # The pause applies after the attempt regardless of its outcome.
    time.sleep(CLONE_SLEEP_SECONDS)

# Loop through the DataFrame and download the repositories.
# MAX_REPOS caps how many rows are processed; DataFrame.head() returns at most
# that many rows, so a frame with fewer rows is processed in full.
MAX_REPOS = 232
for _, row in df.head(MAX_REPOS).iterrows():
    # Keep org_name and repo_name separate since path separator may be different on different OS.
    org_name = row['owner']['login']
    repo_name = row['name']
    check_and_download_repo(org_name, repo_name)


2024-11-07 16:19:59,347 - INFO - Downloading repository KxSystems/kdb into github-repos...
Cloning into 'github-repos/KxSystems/kdb'...
Updating files: 100% (229/229), done.
2024-11-07 16:23:46,284 - INFO - Downloading repository KxSystems/pyq into github-repos...
Cloning into 'github-repos/KxSystems/pyq'...
Updating files: 100% (83/83), done.
2024-11-07 16:26:27,396 - INFO - Downloading repository psaris/qtips into github-repos...
Cloning into 'github-repos/psaris/qtips'...
2024-11-07 16:29:04,473 - INFO - Downloading repository psaris/funq into github-repos...
Cloning into 'github-repos/psaris/funq'...
Updating files: 100% (88/88), done.
2024-11-07 16:31:45,281 - INFO - Downloading repository qbists/studyq into github-repos...
Cloning into 'github-repos/qbists/studyq'...
Updating files: 100% (207/207), done.
2024-11-07 16:34:36,525 - INFO - Downloading repository KxSystems/jupyterq into github-repos...
Cloning into 'github-repos/KxSystems/jupyterq'...
Updating files: 100% (98/98), do

In [4]:
# Log a completion marker; with the configured log format this line carries a timestamp.
logging.info('Done')

2024-11-07 18:23:40,204 - INFO - Done
