In [1]:
import glob
import json
import os
from datetime import datetime

from github import Github, RateLimitExceededException

# GitHub API tokens are read from the environment variables GH_KEY1..GH_KEY9.
# Any subset may be set (unset or empty ones are skipped) so the notebook also
# runs with fewer than nine tokens; at least one is required.
GH_KEYS = [
    os.environ[var]
    for var in ('GH_KEY{}'.format(i) for i in range(1, 10))
    if os.environ.get(var)
]
if not GH_KEYS:
    raise RuntimeError(
        'no GitHub API keys found: set GH_KEY1 (and optionally GH_KEY2..GH_KEY9)'
    )

In [2]:
# List every repository of the 18F organization and keep the names in
# alphabetical order.

gh = Github(GH_KEYS[0])
org = gh.get_organization('18F')
repo_names = [repository.name for repository in org.get_repos()]
repo_names.sort()

# Quick sanity check: how many repos were found, and the first few names.
print(len(repo_names))
print(repo_names[:5])

839
['14c-prototype', '18f-bot', '18f-cli', '18f-education-discovery', '18f-reveal.js-theme']


In [3]:
# Settings for downloading and saving commits (since 2016) per repo.

# Passed as the `since` filter when listing a repository's commits.
SINCE = datetime(year=2016, month=1, day=1)

# Bounds of the slice of the sorted repo-name list processed in one run
# (repo_names[IDX_START:IDX_END]).
IDX_START, IDX_END = 0, 100


class Extract:
    """Download commit data for 18F repositories, rotating API keys on rate limits.

    Instance state:
      keys -- API keys copied from GH_KEYS that have not been put into use yet.
      gh   -- Github client built from the key currently in use.
    """

    def __init__(self):
        # Work on a copy so the module-level GH_KEYS list is never mutated.
        self.keys = GH_KEYS[:]
        # Activate the first key by removing it from the queue. If it stayed
        # queued, the first rotation would re-select the key that has just been
        # rate-limited and waste a retry.
        self.update_key()

    def fetch(self, name):
        """Return ``{'repo': name, 'commits': [...]}`` for one 18F repository.

        ``commits`` holds the ``raw_data`` of every commit returned by
        ``get_commits(since=SINCE)``. When the current key hits the rate limit,
        the next key is activated and the repository is fetched again from the
        start (partial results are discarded).

        Raises:
            RuntimeError: every available API key has been used up.
        """
        # Loop instead of recursing so retries never deepen the call stack.
        while True:
            try:
                repo = self.gh.get_organization('18F').get_repo(name)
                commits = repo.get_commits(since=SINCE)
                return {'repo': name, 'commits': [c.raw_data for c in commits]}
            except RateLimitExceededException:
                self.update_key()

    def save(self, data, name):
        """Write ``data`` as JSON to ``data/repos/<name>.json``."""
        # Create the output directory on first use so a fresh checkout works.
        os.makedirs('data/repos', exist_ok=True)
        with open('data/repos/{}.json'.format(name), 'w') as f:
            json.dump(data, f)

    def update_key(self):
        """Activate the next unused API key.

        Raises:
            RuntimeError: no unused keys are left.
        """
        if not self.keys:
            raise RuntimeError('no more API keys to use :(')
        key = self.keys.pop(0)
        # Never echo the token itself: notebook output is saved with the file.
        print('using next API key ({} left in reserve)...'.format(len(self.keys)))
        self.gh = Github(key)

        
# Fetch and store the commits of each repository in the configured slice,
# numbering the progress lines by position in the full repo list.
e = Extract()
for position, repo_name in enumerate(repo_names[IDX_START:IDX_END], start=IDX_START):
    print('{}. fetching commits for {}...'.format(position, repo_name))
    payload = e.fetch(repo_name)
    e.save(payload, repo_name)

In [4]:
# combine & save into one big dataset

# Only the per-repo JSON files are read, in sorted order, so the key order of
# the combined file is reproducible (glob returns paths in arbitrary order).
data_files = sorted(glob.glob('data/repos/*.json'))
results = {}

for fname in data_files:
    with open(fname, encoding='utf-8') as f:
        d = json.load(f)
    results[d['repo']] = d['commits']

# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the platform's default
# encoding, which may be unable to encode them.
with open('data/commits.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False)