In [2]:
import os
import sys
import re
import time
from pymongo import MongoClient
from github import Github
from random import randint
from pprint import pprint

# Credentials

In [3]:
pw_file = 'credentials/pw.txt'

# Fail fast with a clear message: every later cell needs pub_ip / mongo_usr /
# mongo_usr_pw, so silently skipping a missing file only postpones the failure
# to an unrelated-looking NameError.
if not os.path.exists(pw_file):
    raise FileNotFoundError('Credentials file not found: {0}'.format(pw_file))

# The file is read as three lines of ', '-separated fields:
#   line 1 -> email, indeed_pw
#   line 2 -> username, pia_pw
#   line 3 -> pub_ip, mongo_usr, mongo_usr_pw
with open(pw_file, 'r') as f:
    email, indeed_pw = f.readline().strip().split(', ')
    username, pia_pw = f.readline().strip().split(', ')
    pub_ip, mongo_usr, mongo_usr_pw = f.readline().strip().split(', ')

# Connect to MongoDB

In [4]:
# connect to the MongoDB server at pub_ip (read from the credentials file), port 27017
# (the original note describes this host as an EC2 instance)
client = MongoClient('{0}:27017'.format(pub_ip))

# get a reference to the 'github_db' database
db = client['github_db']

# authenticate against that database with the credentials read from the credentials file
db.authenticate(mongo_usr, mongo_usr_pw)

True

# Create a MongoDB collection

In [6]:
# collections this notebook expects to exist in the 'github_db' database
lst_collections = ['git_users_meta', 'git_users_following', 'git_users_followers', 'git_users_starred', 
                   'git_repos_meta', 'git_repos_docs', 'git_repos_subscribers', 'git_repos_stargazers', 
                   'git_repos_contributors']

# ask the server for the existing collection names once, not once per loop iteration
existing_collections = set(db.collection_names())

# create only the collections that are missing, so re-running this cell is harmless
for collection in lst_collections:
    if collection not in existing_collections:
        db.create_collection(collection)

In [94]:
def show_collections():
    '''
    IN: None
    RETURN: None - prints one line per collection in db (alphabetical order)
            showing the collection name and its document count
    '''
    template = 'Documents in \"{0}\" collection: {1}'
    ordered_names = sorted(db.collection_names())

    for coll_name in ordered_names:
        doc_total = db[coll_name].count()
        print(template.format(coll_name, doc_total))

In [95]:
# print the current document count of every collection in db
show_collections()

Documents in "git_repos_contributors" collection: 801
Documents in "git_repos_docs" collection: 27367
Documents in "git_repos_meta" collection: 801
Documents in "git_repos_stargazers" collection: 0
Documents in "git_repos_subscribers" collection: 801
Documents in "git_users_followers" collection: 698
Documents in "git_users_following" collection: 698
Documents in "git_users_meta" collection: 698
Documents in "git_users_starred" collection: 0


# Get authenticated access to GitHub for a higher request rate limit

In [8]:
# read the private GitHub token from the first line of the credentials file;
# the second line is stored in `user` (it is not passed to Github() below)
with open('credentials/token.txt', 'r') as infile:
    token = infile.readline().strip()
    user = infile.readline().strip()

# client object used for every GitHub API call in this notebook
git_client = Github(token)

# Rate limits

In [96]:
def get_remaining(_type ='core'):
    '''
    IN: which rate-limit bucket to look up
        _type => 'core' (regular requests) or 'search' (search requests)
    
    RETURN: remaining number of requests for that bucket, as an int
    '''
    # raw rate-limit payload, keyed as resources -> bucket -> remaining
    limits = dict(git_client.get_rate_limit().raw_data)
    bucket = limits['resources'][_type]
    
    return int(bucket['remaining'])

In [97]:
# report the remaining request budget for the 'core' and 'search' rate-limit buckets
print('Resource rate limit remaining: {0}'.format(get_remaining('core')))
print('Search rate limit remaining: {0}'.format(get_remaining('search')))

Resource rate limit remaining: 5000
Search rate limit remaining: 30


# Wait function

In [11]:
def limit_wait(location, ct=0):
    '''
    IN: 
        location => function name (input text) where program paused till rate is reset
        ct => counter to show length of pause; grows by 5 per 5-minute sleep
    RETURN: 
        None - program will halt here if limit is below buffer threshold and continue while it is above/reset
    '''
    # get_remaining returns a single int (the remaining request count), so it
    # must not be tuple-unpacked
    rate = get_remaining('core')
    
    if rate < 150: #buffer
        print('Stopped @ {0}'.format(location))
    
    # poll every 5 minutes until the remaining budget is back above the buffer
    while rate < 150:
        print('Rate: {0} ===> Waiting...{1}'.format(rate, ct))
        time.sleep(300)
        rate = get_remaining('core')
        ct += 5

# Repo Metadata

In [12]:
def repo_metadata(repo):
    '''
    IN: github repository object
    RETURN: dictionary holding the selected repo fields, a trimmed 'owner'
            sub-dictionary, and '_id' set to the repo id (to be uploaded into mongodb)
    '''
    # pause first if the request budget is running low
    limit_wait('repo_metadata')
    
    # top-level fields copied from the raw payload ('owner' is trimmed further below)
    wanted_repo_fields = ['owner', 'name', 'full_name', 'description', 'fork', 'html_url', 'homepage', 
                          'language', 'forks_count', 'size', 'open_issues_count', 'has_issues', 
                          'has_wiki', 'has_downloads', 'pushed_at', 'created_at', 'subscribers_count', 
                          'stargazers_count']
    
    # fields kept from the nested owner record
    wanted_owner_fields = ['type', 'login', 'id', 'site_admin']
    
    payload = dict(repo.raw_data)
    
    trimmed = {}
    for field in wanted_repo_fields:
        trimmed[field] = payload[field]
    
    # swap the full owner record for its trimmed version
    trimmed['owner'] = {field: payload['owner'][field] for field in wanted_owner_fields}
    trimmed['_id'] = repo.id
    
    return trimmed

In [13]:
def repo_metadata_upsert(repo):
    '''
    IN: github repository object
    RETURN: None - data returned from repo_metadata function is upserted into
            the 'git_repos_meta' collection, keyed by the repo id
    '''
    try:
        fields = repo_metadata(repo)
        # '_id' is already the filter key; keep it out of $set so the update
        # never tries to rewrite the identifier
        fields.pop('_id', None)
        
        db['git_repos_meta'].update_one(
            {'_id': repo.id}, 
            {'$set': fields}, 
            upsert=True)
    except Exception as err:
        # best effort: report the failure and let the crawl continue
        print('FAILED repo_metadata_upsert: {0}'.format(err))
    return None

# Repo Subscribers

In [14]:
def repo_subscribers(repo):
    '''
    IN: github repository object
    RETURN: dictionary with '_id', 'full_name' and 'repo_subscribers' (list of
            subscriber login names; empty list if they could not be fetched)
            to be uploaded into mongodb
    '''
    # check for rate limit
    limit_wait('repo_subscribers')
    
    repo_dict = {}
    repo_dict['_id'] = repo.id
    repo_dict['full_name'] = repo.full_name
    
    try:
        repo_dict['repo_subscribers'] = [f.login for f in repo.get_subscribers()]
        
    except Exception:
        # catch Exception rather than everything, so Ctrl-C still stops a long crawl
        repo_dict['repo_subscribers'] = []
        
    return repo_dict

In [15]:
def repo_subscribers_upsert(repo):
    '''
    IN: github repository object
    RETURN: None - data returned from repo_subscribers function is upserted into
            the 'git_repos_subscribers' collection, keyed by the repo id
    '''
    try:
        fields = repo_subscribers(repo)
        # '_id' is already the filter key; keep it out of $set so the update
        # never tries to rewrite the identifier
        fields.pop('_id', None)
        
        db['git_repos_subscribers'].update_one(
            {'_id': repo.id}, 
            {'$set': fields}, 
            upsert=True)
    except Exception as err:
        # best effort: report the failure and let the crawl continue
        print('FAILED repo_subscribers_upsert: {0}'.format(err))
    return None

# Repo Contributors

In [18]:
def repo_contributors(repo):
    '''
    IN: github repository object
    RETURN: dictionary with '_id', 'full_name' and 'repo_contributors' (list of
            contributor login names; empty list if they could not be fetched)
            to be uploaded into mongodb
    '''
    # check for rate limit
    limit_wait('repo_contributors')
    
    repo_dict = {}
    repo_dict['_id'] = repo.id
    repo_dict['full_name'] = repo.full_name
    
    try:
        repo_dict['repo_contributors'] = [f.login for f in repo.get_contributors()]
        
    except Exception:
        # fall back to an empty list (not '') so the field always has the same
        # type, matching the other repo_*/user_* list fields in this notebook
        repo_dict['repo_contributors'] = []
        
    return repo_dict

In [19]:
def repo_contributors_upsert(repo):
    '''
    IN: github repository object
    RETURN: None - data returned from repo_contributors function is upserted into
            the 'git_repos_contributors' collection, keyed by the repo id
    '''
    try:
        fields = repo_contributors(repo)
        # '_id' is already the filter key; keep it out of $set so the update
        # never tries to rewrite the identifier
        fields.pop('_id', None)
        
        db['git_repos_contributors'].update_one(
            {'_id': repo.id}, 
            {'$set': fields}, 
            upsert=True)
    except Exception as err:
        # best effort: report the failure and let the crawl continue
        print('FAILED repo_contributors_upsert: {0}'.format(err))
    return None

# User Metadata

In [20]:
def user_metadata(user):
    '''
    IN: github NamedUser object
    RETURN: dictionary holding the selected user fields plus '_id' set to the
            user id (to be uploaded into mongodb)
    '''
    # pause first if the request budget is running low
    limit_wait('user_metadata')
    
    # fields copied from the raw payload
    wanted_fields = ['email', 'followers', 'hireable', 'login', 'bio', 'avatar_url', 'company', 
                     'updated_at', 'type', 'created_at', 'name', 'location', 'html_url', 'public_repos', 
                     'blog', 'public_gists', 'following']
        
    payload = dict(user.raw_data)
    
    trimmed = {}
    for field in wanted_fields:
        trimmed[field] = payload[field]
    trimmed['_id'] = user.id
    
    return trimmed

In [21]:
def user_metadata_upsert(user):
    '''
    IN: github NamedUser object
    RETURN: None - data returned from user_metadata function is upserted into
            the 'git_users_meta' collection, keyed by the user id
    '''
    try:
        fields = user_metadata(user)
        # '_id' is already the filter key; keep it out of $set so the update
        # never tries to rewrite the identifier
        fields.pop('_id', None)
        
        db['git_users_meta'].update_one(
            {'_id': user.id}, 
            {'$set': fields}, 
            upsert=True)
    except Exception as err:
        # best effort: report the failure and let the crawl continue
        print('FAILED user_metadata_upsert: {0}'.format(err))
    return None

# User Following

In [22]:
def user_following(user):
    '''
    IN: github NamedUser object
    RETURN: dictionary with '_id', 'login' and 'user_following' (list of login
            names followed by the passed user; empty list if they could not be
            fetched) to be uploaded into mongodb
    '''
    # check for rate limit
    limit_wait('user_following') 
    
    user_dict = {}
    user_dict['_id'] = user.id
    user_dict['login'] = user.login
        
    try:
        # query the user that was passed in (the previous code referenced `repo`,
        # which is not a name in this function)
        user_dict['user_following'] = [f.login for f in user.get_following()]
        
    except Exception:
        # catch Exception rather than everything, so Ctrl-C still stops a long crawl
        user_dict['user_following'] = []
        
    return user_dict

In [23]:
def user_following_upsert(user):
    '''
    IN: github NamedUser object
    RETURN: None - data returned from user_following function is upserted into
            the 'git_users_following' collection, keyed by the user id
    '''
    try:
        fields = user_following(user)
        # '_id' is already the filter key; keep it out of $set so the update
        # never tries to rewrite the identifier
        fields.pop('_id', None)
        
        db['git_users_following'].update_one(
            {'_id': user.id}, 
            {'$set': fields}, 
            upsert=True)
    except Exception as err:
        # best effort: report the failure and let the crawl continue
        print('FAILED user_following_upsert: {0}'.format(err))
    return None

# User Followers

In [24]:
def user_followers(user):
    '''
    IN: github NamedUser object
    RETURN: dictionary with '_id', 'login' and 'user_followers' (list of login
            names who follow the passed user; empty list if they could not be
            fetched) to be uploaded into mongodb
    '''
    # check for rate limit
    limit_wait('user_followers')
    
    user_dict = {}
    user_dict['_id'] = user.id
    user_dict['login'] = user.login
        
    try:
        # query the user that was passed in (the previous code referenced `repo`,
        # which is not a name in this function)
        user_dict['user_followers'] = [f.login for f in user.get_followers()]
        
    except Exception:
        # catch Exception rather than everything, so Ctrl-C still stops a long crawl
        user_dict['user_followers'] = []
    
    return user_dict

In [25]:
def user_followers_upsert(user):
    '''
    IN: github NamedUser object
    RETURN: None - data returned from user_followers function is upserted into
            the 'git_users_followers' collection, keyed by the user id
    '''
    try:
        fields = user_followers(user)
        # '_id' is already the filter key; keep it out of $set so the update
        # never tries to rewrite the identifier
        fields.pop('_id', None)
        
        db['git_users_followers'].update_one(
            {'_id': user.id}, 
            {'$set': fields}, 
            upsert=True)
    except Exception as err:
        # best effort: report the failure and let the crawl continue
        print('FAILED user_followers_upsert: {0}'.format(err))
    return None

# Repo Files

In [28]:
def repo_scripts_upsert(repo, file_name, raw_file_cont, file_ext, doc_name):
    '''
    IN: 
        repo ==> github repository object
        file_name ==> file name pulled from github repo
        raw_file_cont ==> raw file contents of file_name
        file_ext ==> file extension of file_name
        doc_name ==> generic, sequentially numbered 'clean' name for storing documents in mongodb
    RETURN: 
        None - file details/information upserted into the 'git_repos_docs' collection
    '''
    col = db['git_repos_docs']
    
    # prefix with the repo id so document names from different repos cannot clash
    doc_name = '{0}_{1}'.format(repo.id, doc_name)
    
    # flattened copy of the content: newline and tab chars replaced by spaces
    strip_file_cont = raw_file_cont.replace('\n', ' ').replace('\t', ' ')

    try:
        # 'document' is the match key; on insert it is taken from the filter,
        # so it does not need to be repeated inside $set
        col.update_one(
            {'document': doc_name}, 
            {'$set': {
                    'file_name': file_name,
                    'raw_file': raw_file_cont,
                    'strip_file': strip_file_cont,
                    'extension': file_ext,
                    'full_name': repo.full_name}
                }, 
            upsert=True)
    except Exception as err:
        # best effort: report the failure and let the crawl continue
        print('FAILED repo_scripts_upsert ({0}): {1}'.format(doc_name, err))
    return None

In [29]:
def repo_scripts(repo, doc_ct=0, _path='.'):
    '''
    IN: 
        repo ==> github repository object
        doc_ct ==> starting value of the sequential number used for 'clean' file names inserted into mongodb
        _path ==> path to folder to extract repository files; '.' is top level repository folder
    RETURN: 
        None - every file under _path whose extension is in the allowed list
        ('py', 'md') is upserted into mongodb via repo_scripts_upsert
    '''
    _repo_scripts_walk(repo, doc_ct, _path)
    return None


def _repo_scripts_walk(repo, doc_ct, _path):
    '''
    Upsert the matching files of one directory (recursing into sub-directories)
    and return the updated file counter, so numbering continues across
    directories instead of restarting and producing duplicate document names.
    '''
    # only files with these extensions
    extensions = ['py', 'md']
    
    # check for rate limit before spending a request on the directory listing
    limit_wait('repo_scripts')
    
    for content in repo.get_dir_contents(_path):
        # if item is a directory then recursively navigate lower to get files
        # inside, carrying the counter back out
        if content.type == 'dir':
            doc_ct = _repo_scripts_walk(repo, doc_ct, content.path)
            continue

        # every file advances the counter, whether or not it is stored
        doc_ct += 1
        
        # get file extension
        file_ext = content.name.split('.')[-1]
        if file_ext not in extensions:
            continue

        # try to decode, but fall back to blank content on failure
        try:
            raw_file_content = content.decoded_content.decode(errors='replace')
        except Exception:
            raw_file_content = ''
            
        # actual file name, mongodb doc stored name
        file_name = content.name
        doc_name = 'doc_{0}'.format(doc_ct)
        
        # only store content below the 16,000,000 cutoff (presumably chosen to
        # stay under MongoDB's document size limit); note sys.getsizeof reports
        # the in-memory size of the str object, not its encoded byte length
        file_size = sys.getsizeof(raw_file_content)
        if file_size < 16000000:
            repo_scripts_upsert(repo, file_name, raw_file_content, file_ext, doc_name)
        else:
            print('FILE SIZE LIMIT: {0} -- {1}'.format(file_name, file_size))
    
    return doc_ct

# Search GitHub for 'Python' repositories

In [76]:
def get_repo(repo, ct=None):
    '''
    IN: 
        repo ==> github repository object
        ct ==> optional running counter shown in the progress message
               ('?' is printed when it is not supplied)
    RETURN: None - all desired repository data uploaded to mongodb
    '''
    # skip repositories whose metadata is already stored
    if db['git_repos_meta'].find({'full_name': repo.full_name}).count() == 0:
        # `ct` is a parameter now: it was never defined in this function's
        # scope, so the progress message raised NameError
        label = '?' if ct is None else ct
        print('Getting.....#{0} - {1}'.format(label, repo.full_name))
        
        # metadata is written after the files; it is the record the check above looks for
        repo_scripts(repo)
        repo_metadata_upsert(repo)
        repo_subscribers_upsert(repo)
        repo_contributors_upsert(repo)
    return None

In [77]:
def get_repo_owner(repo, ct=None):
    '''
    IN: 
        repo ==> github repository object
        ct ==> optional running counter shown in the progress message
               ('?' is printed when it is not supplied)
    RETURN: None - all desired data pertaining to repository owner uploaded to mongodb
    '''
    user = repo.owner
    
    # skip owners whose metadata is already stored
    if db['git_users_meta'].find({'_id': user.id}).count() == 0:
        # `ct` is a parameter now: it was never defined in this function's
        # scope, so the progress message raised NameError
        label = '?' if ct is None else ct
        print('Getting.....#{0} - {1}'.format(label, user.login))
        
        user_metadata_upsert(user)
        user_following_upsert(user)
        user_followers_upsert(user)
    return None

In [79]:
def python_top_repos(ct=0):
    '''
    IN: counter initializer for number of repositories traversed
    RETURN: None (prints 'DONE' when finished traversing)
    
    Walks the GitHub search results for 'python' (sorted by stars, descending)
    and stores each Python-language repository together with its owner.
    '''
    for repo in git_client.search_repositories('python', sort='stars', order='desc'):
        try:
            # ensure it is a python tagged repository; language may be missing
            language = repo.language or ''
            if language.lower() == 'python':
                ct += 1
                get_repo(repo)
                # store the owner of this repo (the previous code called
                # get_repo(owner) with an undefined name)
                get_repo_owner(repo)
        except Exception as err:
            # best effort: report the failure and move on to the next repository
            print('SKIPPED repo #{0}: {1}'.format(ct, err))

        # don't get cut off - throttle after every repository, including failed ones
        time.sleep(randint(2,9))
    
    print('DONE')
    return None

### 804 Repos Pulled - 09/05/16

# Get repos under top-level user/organization

In [86]:
# preview: take a single stored user document (slice 2:3 of the cursor, i.e. the third one),
# look that login up on GitHub and print the full name of each of its repositories
for user_meta in db['git_users_meta'].find()[2:3]:
    USER = user_meta['login']
    _user = git_client.get_user(USER)
    for repo in _user.get_repos():
        print(repo.full_name)

scrapy/cssselect
scrapy/dirbot
scrapy/gsoc2014-integration-tests
scrapy/loginform
scrapy/parsel
scrapy/queuelib
scrapy/scrapely
scrapy/scrapy
scrapy/scrapy-itemloader
scrapy/scrapy.org
scrapy/scrapyd
scrapy/scrapyd-client
scrapy/slybot
scrapy/w3lib


In [80]:
def repos_of_top_repos(ct=0):
    '''
    IN: counter initializer for number of repositories traversed
    RETURN: None (prints 'DONE' when finished traversing)
    
    For the selected stored users, fetches every repository they own and
    stores it via get_repo.
    '''
    # NOTE(review): the [2:3] slice limits this to a single stored user (the
    # third document) - looks like a trial-run setting; confirm before widening
    for user_meta in db['git_users_meta'].find()[2:3]:
        git_user = user_meta['login']
        _user = git_client.get_user(git_user)

        for repo in _user.get_repos():
            try:
                ct += 1
                get_repo(repo)
                #get_repo_owner(repo) - do not need, already pulled from top_repos
            except Exception as err:
                # best effort: report the failure and move on to the next repository
                print('SKIPPED repo #{0}: {1}'.format(ct, err))

            # don't get cut off - throttle after every repository, including failed ones
            time.sleep(randint(2,9))
    
    print('DONE')
    return None

In [None]:
# run the crawl over the repositories owned by the selected stored user(s)
repos_of_top_repos()

In [68]:
#cursor = db['git_repos_meta'].find({'owner.login':'scikit-learn'})
#for i in cursor:
#    pprint(i)

{'_id': 843222,
 'created_at': '2010-08-17T09:43:38Z',
 'description': 'scikit-learn: machine learning in Python',
 'fork': False,
 'forks_count': 7596,
 'full_name': 'scikit-learn/scikit-learn',
 'has_downloads': True,
 'has_issues': True,
 'has_wiki': True,
 'homepage': 'http://scikit-learn.org',
 'html_url': 'https://github.com/scikit-learn/scikit-learn',
 'language': 'Python',
 'name': 'scikit-learn',
 'open_issues_count': 1280,
 'owner': {'id': 365630,
           'login': 'scikit-learn',
           'site_admin': False,
           'type': 'Organization'},
 'pushed_at': '2016-09-04T13:07:11Z',
 'size': 122720,
 'stargazers_count': 13136,
 'subscribers_count': 1358}
{'_id': 1402233,
 'created_at': '2011-02-23T13:12:44Z',
 'description': 'Benchmarks for various machine learning packages',
 'fork': False,
 'forks_count': 28,
 'full_name': 'scikit-learn/ml-benchmarks',
 'has_downloads': True,
 'has_issues': True,
 'has_wiki': True,
 'homepage': 'http://scikit-learn.github.com/ml-benchma