In [None]:
import os
import json
import time
import math
import pickle
import requests
from tqdm import tqdm


#~~~~~~~~~~~~~~~~~~
# GitHub credentials sent as HTTP basic auth with every API request.
# Set the GITHUB_USER / GITHUB_TOKEN environment variables to avoid storing a
# real token in this file; the placeholders below are only the fallback.
USER = os.environ.get('GITHUB_USER', 'YOUR_GITHUB_LOGIN')
TOKEN = os.environ.get('GITHUB_TOKEN', 'YOUR_GITHUB_TOKEN')
#~~~~~~~~~~~~~~~~~~

# Requests left before get_request() pauses; it counts this down and resets it
# to 30 after sleeping for 60 seconds.
REMAINING_REQUESTS = 30


def save_checkpoint(lower_bound: int, upper_bound: int) -> None:
    """Deduplicate the global ``repo_list`` and pickle it with the size range.

    Writes the tuple ``(lower_bound, upper_bound, repo_list)`` to
    ``repo_checkpoint.pkl`` in the current working directory, overwriting any
    previous checkpoint.

    Args:
        lower_bound: Lower end of the size range being processed.
        upper_bound: Upper end of the size range being processed.
    """
    global repo_list
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # checkpoint (and the CSV later written from it) is reproducible between
    # runs; set() would hand the entries back in arbitrary order.
    repo_list = list(dict.fromkeys(repo_list))
    print(f'Saving checkpoint {lower_bound, upper_bound}...')
    with open('repo_checkpoint.pkl', 'wb') as f:
        pickle.dump((lower_bound, upper_bound, repo_list), f)

def _save_and_exit(lower_bound: int, upper_bound: int):
    """Persist the current progress, then terminate the program."""
    save_checkpoint(lower_bound, upper_bound)
    print('Exiting program')
    exit()


def get_request(lower_bound: int, upper_bound: int, page: int=1):
    """Fetch one page of GitHub repository-search results for a size range.

    Searches for Python repositories with more than 99 stars whose size is in
    ``lower_bound..upper_bound``, 100 results per page, authenticating with the
    module-level ``USER`` / ``TOKEN``.

    Args:
        lower_bound: Lower end of the ``size:`` qualifier.
        upper_bound: Upper end of the ``size:`` qualifier.
        page: 1-based result page to request.

    Returns:
        The ``requests.Response`` when the status is 200, or ``False`` when the
        API answers 422. Callers must check for ``False`` by identity, because
        the truthiness of a ``Response`` depends on its status code.

    On a network error, HTTP 403, or any other unexpected status, a checkpoint
    is saved and the program exits. After every 30 successful requests a
    checkpoint is saved and the function sleeps for 60 seconds.
    """
    global REMAINING_REQUESTS

    url = f'https://api.github.com/search/repositories?q=language:Python+size:{lower_bound}..{upper_bound}+stars:>99&per_page=100&page={page}'
    try:
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url, auth=(USER, TOKEN), timeout=30)
    except requests.RequestException as err:
        # Treat connection problems like any other fatal response: keep the
        # progress made so far instead of dying with a traceback.
        print(f'Request failed: {err}')
        _save_and_exit(lower_bound, upper_bound)

    if r.status_code == 403:
        print('API rate limit exceeded')
        _save_and_exit(lower_bound, upper_bound)
    if r.status_code == 422:
        return False
    if r.status_code != 200:
        print(f'Unexpected status code. Status code returned is {r.status_code}')
        print(r.text)
        _save_and_exit(lower_bound, upper_bound)

    REMAINING_REQUESTS -= 1

    if REMAINING_REQUESTS <= 0:
        # Save before sleeping so an interrupt during the pause loses nothing.
        save_checkpoint(lower_bound, upper_bound)
        print('Sleeping 60 seconds to stay under GitHub API rate limit...')
        time.sleep(60)
        REMAINING_REQUESTS = 30

    return r

def download_range(lower_bound, upper_bound):
    """Collect every retrievable search result for one size range.

    Requests up to 10 pages through ``get_request`` and appends a
    ``(full_name, stargazers_count, language)`` tuple per repository to the
    module-level ``repo_list``.

    Args:
        lower_bound: Lower end of the size range to query.
        upper_bound: Upper end of the size range to query.

    Returns:
        The ``total_count`` reported on the first page, or 0 if the first
        request was rejected with HTTP 422.
    """
    n_results = 0
    n_query_pages = 0
    for page in range(1, 11):
        r = get_request(lower_bound=lower_bound, upper_bound=upper_bound, page=page)
        # get_request signals HTTP 422 with False. Compare by identity: a
        # Response object's truthiness reflects its status code.
        if r is False:
            return n_results

        payload = r.json()  # decode the body once per page
        if page == 1:
            n_results = payload['total_count']
            # At 100 results per page, never ask for more than 10 pages.
            n_query_pages = min(math.ceil(n_results / 100), 10)

        repo_list.extend(
            (repository['full_name'],
             repository['stargazers_count'],
             repository['language'])
            for repository in payload['items']
        )

        if page >= n_query_pages:
            return n_results

    return n_results

        
if __name__ == '__main__':
    # Sweep stops once the lower bound of the size range reaches this value.
    MAX_REPO_SIZE = 10000000

    # Resume from a previous run if a checkpoint exists, else start at size 0.
    if os.path.isfile('repo_checkpoint.pkl'):
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only resume from a checkpoint this script wrote itself.
        with open('repo_checkpoint.pkl', 'rb') as f:
            lower_bound, upper_bound, repo_list = pickle.load(f)
        print(f'Loading from {lower_bound}..{upper_bound}')
    else:
        lower_bound = 0
        upper_bound = 0
        repo_list = []

    if lower_bound >= MAX_REPO_SIZE:
        print('Completed. Remove repo_checkpoint.pkl and restart')
        exit()

    while lower_bound < MAX_REPO_SIZE:
        print(f'Downloading repositories in size range {lower_bound}..{upper_bound}')
        # download_range already returns the total count from its first page,
        # so no separate probing request is needed to report it.
        n_results = download_range(lower_bound, upper_bound)
        print(f'Size {lower_bound}..{upper_bound} ~> {n_results} results')
        if n_results > 1000:
            # download_range fetches at most 10 pages of 100 results.
            print(f'Warning: only the first 1000 of {n_results} results '
                  f'were retrieved for size range {lower_bound}..{upper_bound}')
        # Advance to the next window: 0..0, then 1..100, 101..200, ...
        lower_bound = upper_bound + 1
        upper_bound += 100

    save_checkpoint(lower_bound, upper_bound)

    # Export one "name,stars,language" line per collected repository.
    with open('github_repositories_upd.csv', 'w') as f:
        f.writelines(
            f'{name},{stars},{lang}\n' for name, stars, lang in repo_list
        )

In [None]:
import os
import csv
from tqdm import tqdm
from joblib import Parallel, delayed


# Directory that download_repo() clones into, one sub-folder per repository.
path = 'github-data'


def download_repo(repo):
    """Shallow-clone a GitHub repository into ``path`` unless already present.

    Args:
        repo: Repository identifier in ``owner/name`` form. Only the part after
            the last ``/`` is used as the local folder name.

    Returns:
        None. A failed clone is not reported to the caller; git prints its own
        error output.
    """
    import subprocess

    file_name = repo.split('/')[-1]
    target = f'{path}/{file_name}'
    # NOTE(review): two repositories with the same name under different owners
    # map to the same folder, so the second one is reported as already
    # downloaded and skipped.
    if os.path.exists(target):
        print(f'Already downloaded {repo}')
        return

    # Create the download directory on first use instead of failing.
    os.makedirs(path, exist_ok=True)
    # Pass an argument list rather than a shell string: the repository name
    # comes from a CSV file and must never be interpreted by a shell.
    subprocess.run(
        ['git', 'clone', '--depth', '1', '--single-branch',
         f'https://github.com/{repo}', target],
        check=False,
    )

# Load the exported repository list, keeping only rows whose last column
# (the language) is exactly 'Python'.
# newline='' lets the csv module handle line endings itself.
with open('github_repositories_upd.csv', 'r', newline='') as f:
    repositories = [
        tuple(row)
        for row in csv.reader(f)
        # csv.reader yields [] for blank lines; skip them rather than letting
        # row[-1] raise IndexError.
        if row and row[-1] == 'Python'
    ]

# The first column is the repository's "owner/name" identifier.
repo_names = [repo[0] for repo in repositories]
Parallel(n_jobs=1, prefer='threads')(
    delayed(download_repo)(name) for name in tqdm(repo_names))