In [1]:
import os
import sys
import re
import time
from github import Github
from random import randint
from pprint import pprint

# Credentials

In [2]:
from pymongo import MongoClient
pw_file = 'credentials/pw.txt'
if os.path.exists(pw_file): 
    with open(pw_file, 'r') as f:
        email, indeed_pw = f.readline().strip().split(', ')
        username, pia_pw = f.readline().strip().split(', ')
        pub_ip, mongo_usr, mongo_usr_pw = f.readline().strip().split(', ')

# Connect to MongoDB

In [3]:
# connect to ec2 mongo client
client = MongoClient('{0}:27017'.format(pub_ip))

# get reference to  resume_db
db = client['github_db']

# authenticate user for database
db.authenticate(mongo_usr, mongo_usr_pw)

True

# Create a MongoDB collection

In [4]:
for name in db.collection_names():
    #db.drop_collection(name)
    print('Documents in \"{0}\" collection: {1}'.format(name, db[name].count()))

In [5]:
# create a collection in the 'github_db' database
lst_collections = ['git_users_meta', 'git_users_following', 'git_users_followers', 'git_users_starred', 
                   'git_repos_meta', 'git_repos_docs', 'git_repos_subscribers', 'git_repos_stargazers', 
                  'git_repos_contributors']

for collection in lst_collections:
    if collection not in db.collection_names():
        db.create_collection(collection)

In [6]:
for name in db.collection_names():
    print('Documents in \"{0}\" collection: {1}'.format(name, db[name].count()))

Documents in "git_repos_stargazers" collection: 0
Documents in "git_users_starred" collection: 0
Documents in "git_repos_meta" collection: 0
Documents in "git_users_followers" collection: 0
Documents in "git_repos_contributors" collection: 0
Documents in "git_repos_subscribers" collection: 0
Documents in "git_users_meta" collection: 0
Documents in "git_users_following" collection: 0
Documents in "git_repos_docs" collection: 0


# Get authenticated access to Github for higher requests limit

In [7]:
# import github private token
with open('credentials/token.txt', 'r') as infile:
    token = infile.readline().strip()
    user = infile.readline().strip()

git_client = Github(token)

# Rate limits

In [8]:
def get_remaining(_type ='core'):
    '''
    _type = 'core' or 'search'
    Return: tuple of remaining rate limit quantity, and time till reset
    '''
    
    rate_limit = git_client.get_rate_limit()
    raw = dict(rate_limit.raw_data)
    
    remaining = int(raw['resources'][_type]['remaining'])
    reset = int(raw['resources'][_type]['reset'])
    
    return (remaining, reset)

In [9]:
print('Resource rate limit remaining: {0}'.format(get_remaining('core')))
print('Search rate limit remaining: {0}'.format(get_remaining('search')))

Resource rate limit remaining: (4611, 1472854009)
Search rate limit remaining: (30, 1472852919)


# Wait function

In [10]:
def limit_wait():
    # wait till rate limit is refreshed
    rate, _ = get_remaining('core')
    while (rate < 10):
        time.sleep(300)

# X X X X X X X X X X X X X X X X X X X X X X X X X X 

In [11]:
USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'

# check for rate limit
limit_wait()

user = git_client.get_user(USER)
repo = user.get_repo(REPO)

# Repo Metadata

In [12]:
def repo_metadata(repo):
    repo_keys = ['owner', 'name', 'full_name', 'description', 'fork', 'html_url', 'homepage', 
             'language', 'forks_count', 'size', 'open_issues_count', 'has_issues', 'has_wiki', 'has_downloads', 
             'pushed_at', 'created_at', 'subscribers_count', 'stargazers_count']
    
    owner_keys = ['type', 'login', 'id', 'site_admin']
    
    # check for rate limit
    limit_wait()
    
    # get returned payload raw data into dictionary
    raw_dict = dict(repo.raw_data)
    
    # trim repo by keys listed above
    repo_dict = {k: raw_dict[k] for k in repo_keys}
    
    # trim owner by keys listed above
    owner_dict = {k: raw_dict['owner'][k] for k in owner_keys}
    repo_dict['owner'] = owner_dict
    repo_dict['_id'] = repo.id
    
    return repo_dict

In [13]:
def repo_metadata_upsert(repo):
    col = db['git_repos_meta']
    
    try:
        col.update_one(
            {'_id': repo.id}, 
            {'$set':{col.insert_one(repo_metadata(repo))}}, 
            upsert=True)
    except:
        pass
    return None

In [14]:
_ = db.drop_collection('git_repos_meta')
_ = db.create_collection('git_repos_meta')

In [15]:
_ = repo_metadata_upsert(repo)
print(db.git_repos_meta.count())

1


# Repo Subscribers

In [16]:
def repo_subscribers(repo):
    '''
    IN: github repo object
    RETURN: dictionary containing repo subscribers
    '''
    repo_dict = {}
    
    # check for rate limit
    limit_wait()
    
    repo_dict['_id'] = repo.id
    repo_dict['full_name'] = repo.full_name
    
    try:
        repo_dict['repo_subscribers'] = [f.login for f in repo.get_subscribers()]
    except:
        repo_dict['repo_subscribers'] = []
        
    return repo_dict

In [17]:
def repo_subscribers_upsert(repo):
    col = db['git_repos_subscribers']
    
    try:
        col.update_one(
            {'_id': repo.id}, 
            {'$set': {col.insert_one(repo_subscribers(repo))}
            }, upsert=True)
    except:
        pass
    return None

In [18]:
_ = db.drop_collection('git_repos_subscribers')
_ = db.create_collection('git_repos_subscribers')

In [19]:
_ = repo_subscribers_upsert(repo)
print(db.git_repos_subscribers.count())

1


In [20]:
ct = db.git_repos_subscribers.find()
ct = ct[0]
print(len(ct['repo_subscribers']))

172


# Repo Stargazers

In [21]:
def repo_stargazers(repo):
    '''
    IN: github repo object
    RETURN: dictionary containing list of stargazers
    '''
    repo_dict = {}
    
    # check for rate limit
    limit_wait()

    repo_dict['_id'] = repo.id
    repo_dict['full_name'] = repo.full_name
    
    try:
        repo_dict['repo_stargazers'] = [f.login for f in repo.get_stargazers()]
    except:
        repo_dict['repo_stargazers'] = []

    return repo_dict

In [22]:
def repo_stargazers_upsert(repo):
    col = db['git_repos_stargazers']
    
    try:
        col.update_one(
            {'_id': repo.id}, 
            {'$set': {col.insert_one(repo_stargazers(repo))}
            }, upsert=True)
    except:
        pass
    return None

In [23]:
_ = db.drop_collection('git_repos_stargazers')
_ = db.create_collection('git_repos_stargazers')

In [24]:
_ = repo_stargazers_upsert(repo)
print(db.git_repos_stargazers.count())

1


In [25]:
ct = db.git_repos_stargazers.find()
ct = ct[0]
print(len(ct['repo_stargazers']))

1074


# Repo Contributors

In [26]:
def repo_contributors(repo):
    '''
    IN: github repo object
    RETURN: dictionary of contributors keyed by repo full_name
    '''
    repo_dict = {}
    
    # check for rate limit
    limit_wait()
    
    repo_dict['_id'] = repo.id
    repo_dict['full_name'] = repo.full_name
    
    try:        
        repo_dict['repo_contributors'] = [f.login for f in repo.get_contributors()]
    except:
        repo_dict['repo_contributors'] = ''
        
    return repo_dict

In [27]:
def repo_contributors_upsert(repo):
    col = db['git_repos_contributors']
    
    try:
        col.update_one(
            {'_id': repo.id}, 
            {'$set': {col.insert_one(repo_contributors(repo))}
            }, upsert=True)
    except:
        pass
    return None

In [28]:
_ = db.drop_collection('git_repos_contributors')
_ = db.create_collection('git_repos_contributors')

In [29]:
_ = repo_contributors_upsert(repo)
print(db.git_repos_contributors.count())

1


In [30]:
ct = db.git_repos_contributors.find()
ct = ct[0]
print(len(ct['repo_contributors']))

7


In [31]:
for i in db.git_repos_contributors.find():
    pprint(i)

{'_id': 1040700,
 'full_name': 'ptwobrussell/Mining-the-Social-Web',
 'repo_contributors': ['ptwobrussell',
                       'sammyrulez',
                       'cb372',
                       'vkrest',
                       'gar',
                       'jeffhoek',
                       'megansquire']}


# User Metadata

In [32]:
def user_metadata(user):
    user_keys = ['email', 'followers', 'hireable', 'login', 'bio', 'avatar_url', 'company', 
                 'updated_at', 'type', 'created_at', 'name', 'location', 'html_url', 'public_repos', 
                 'blog', 'public_gists', 'following']
    # check for rate limit
    limit_wait()
        
    # get returned payload raw data into dictionary
    raw_dict = dict(user.raw_data)
    
    # trim user by keys listed above
    user_dict = {k: raw_dict[k] for k in user_keys}
    user_dict['_id'] = user.id
    
    return user_dict

In [33]:
def user_metadata_upsert(user):
    col = db['git_users_meta']
    
    try:
        col.update_one(
            {'_id': user.id}, 
            {'$set':{col.insert_one(user_metadata(user))}}, 
            upsert=True)
    except:
        pass
    return None

In [34]:
_ = db.drop_collection('git_users_meta')
_ = db.create_collection('git_users_meta')

In [35]:
_ = user_metadata_upsert(user)
print(db.git_users_meta.count())

1


In [36]:
for i in db.git_users_meta.find():
    pprint(i)

{'_id': 98668,
 'avatar_url': 'https://avatars.githubusercontent.com/u/98668?v=3',
 'bio': None,
 'blog': 'http://twitter.com/ptwobrussell',
 'company': None,
 'created_at': '2009-06-25T00:56:57Z',
 'email': None,
 'followers': 622,
 'following': 0,
 'hireable': None,
 'html_url': 'https://github.com/ptwobrussell',
 'location': 'Franklin, TN',
 'login': 'ptwobrussell',
 'name': 'Matthew A. Russell',
 'public_gists': 12,
 'public_repos': 20,
 'type': 'User',
 'updated_at': '2016-08-27T18:00:41Z'}


# User Following

In [37]:
def user_following(user):
    '''
    IN: github user object
    RETURN: dictionary of users (login name) followed by user (passed) 
            keyed by passed user's login
    '''
    user_dict = {}
    
    # check for rate limit
    limit_wait()

    user_dict['_id'] = user.id
    user_dict['login'] = user.login
        
    try:
        user_dict['user_following'] = [f.name for f in user.get_following()]
    except:
        user_dict['user_following'] = []
        
    return user_dict

In [38]:
def user_following_upsert(user):
    col = db['git_users_following']
    
    try:
        col.update_one(
            {'_id': user.id}, 
            {'$set': {col.insert_one(user_following(user))}
            }, upsert=True)
    except:
        pass
    return None

In [39]:
_ = db.drop_collection('git_users_following')
_ = db.create_collection('git_users_following')

In [40]:
_ = user_following_upsert(user)
print(db.git_users_following.count())

1


In [41]:
ct = db.git_users_following.find()
ct = ct[0]
print(len(ct['user_following']))

0


In [42]:
for i in db.git_users_following.find():
    pprint(i)

{'_id': 98668, 'login': 'ptwobrussell', 'user_following': []}


# User Followers

In [43]:
def user_followers(user):
    '''
    IN: github user object
    RETURN: dictionary of users (login name) who follow user (passed)
            keyed by passed user's login
    '''
    user_dict = {}
    
    # check for rate limit
    limit_wait()
    
    user_dict['_id'] = user.id
    user_dict['login'] = user.login
        
    try:
        user_dict['user_followers'] = [f.login for f in user.get_followers()]
    except:
        user_dict['user_followers'] = []
    
    return user_dict

In [44]:
def user_followers_upsert(user):
    col = db['git_users_followers']
    
    try:
        col.update_one(
            {'_id': user.id}, 
            {'$set': {col.insert_one(user_followers(user))}
            }, upsert=True)
    except:
        pass
    return None

In [45]:
_ = db.drop_collection('git_users_followers')
_ = db.create_collection('git_users_followers')

In [46]:
_ = user_followers_upsert(user)
print(db.git_users_followers.count())

1


In [47]:
ct = db.git_users_followers.find()
ct = ct[0]
print(len(ct['user_followers']))

622


# Users Starred

In [48]:
def user_starred(user):
    '''
    IN: github user object
    RETURN: dictionary of repos (full_name) starred by user (passed)
            keyed by passed user's login
    '''
    user_dict = {}
    
    # check for rate limit
    limit_wait()
    
    user_dict['_id'] = user.id
    user_dict['login'] = user.login
    
    try:
        user_dict['starred_repos'] = [f.full_name for f in user.get_starred()]
    except:
        user_dict['starred_repos'] = []
        
    return user_dict

In [49]:
def user_starred_upsert(user):
    col = db['git_users_starred']
    
    try:
        col.update_one(
            {'_id': user.id}, 
            {'$set': {col.insert_one(user_starred(user))}
            }, upsert=True)
    except:
        pass
    return None

In [50]:
_ = db.drop_collection('git_users_starred')
_ = db.create_collection('git_users_starred')

In [51]:
_ = user_starred_upsert(user)
print(db.git_users_starred.count())

1


In [52]:
ct = db.git_users_starred.find()
ct = ct[0]
print(len(ct['starred_repos']))

131


# Repo Files

In [53]:
def repo_scripts_upsert(repo, file_name, raw_file_cont, file_ext, doc_name):
    '''
    '''
    # strip newline and tab chars from raw file content
    col = db['git_repos_docs']
    doc_name = '{0}_{1}'.format(repo.id, doc_name)
    strip_file_cont = raw_file_cont.replace('\n', ' ')
    strip_file_cont = strip_file_cont.replace('\t', ' ')

    try:
        col.update_one(
            {'document' : doc_name}, 
            {'$set':{
                    'file_name': file_name,
                    #'raw_file': raw_file_cont,
                    #'strip_file': strip_file_cont,
                    'extension': file_ext,
                    'full_name' : repo.full_name,
                    'document' : doc_name}
                }, 
            upsert=True)
    except:
        pass
    return None

In [54]:
def repo_scripts(repo, doc_ct=0, _path='.'):
    '''
    IN: github repo object and path to folder
    RETRUN: all files from repo except those in the exclusion list
    '''
    # only files with these extensions
    extensions = ['py', 'md']

    # check for rate limit
    limit_wait()
    
    # grab all contents in the main directory
    dir_contents = repo.get_dir_contents(_path)
    repo_fullname = repo.full_name

    for content in dir_contents:
        # if item is a directory then recursively navigate lower to get files inside
        if content.type == 'dir':
            repo_scripts(repo, doc_ct, _path=content.path)

        else:
            doc_ct += 1
            # get file extension
            file_ext = content.name.split('.')[-1]
            #file_ext = file_split[-1]

            if file_ext in extensions:
                # try to decode, but return blank if fail
                try:
                    raw_file_content = content.decoded_content.decode(errors='replace')
                except:
                    raw_file_content = ''
                    
                # actual file name, mongodb doc stored name
                file_name = content.name
                doc_name = 'doc_{0}'.format(doc_ct)
                
                # add to collection if under limit (>16mb)
                file_size = sys.getsizeof(raw_file_content)
                if file_size < 16000000:
                    repo_scripts_upsert(repo, file_name, raw_file_content, file_ext, doc_name)
                else:
                    print('FILE SIZE LIMIT: {0} -- {1}'.format(file_name, file_size))
    return None

In [55]:
_ = db.drop_collection('git_repos_docs')
_ = db.create_collection('git_repos_docs')

In [56]:
_ = repo_scripts(repo)
print(db.git_repos_docs.count())

85


In [57]:
for name in db.collection_names():
    #db.drop_collection(name)
    print('Documents in \"{0}\" collection: {1}'.format(name, db[name].count()))

Documents in "git_repos_stargazers" collection: 1
Documents in "git_users_starred" collection: 1
Documents in "git_repos_meta" collection: 1
Documents in "git_users_followers" collection: 1
Documents in "git_repos_contributors" collection: 1
Documents in "git_repos_subscribers" collection: 1
Documents in "git_users_meta" collection: 1
Documents in "git_users_following" collection: 1
Documents in "git_repos_docs" collection: 85


# Search Github for 'Python' repositories

In [58]:
for repo in git_client.search_repositories('python')[5:6]:
    doc_ct = 0
    
    # ensure its a python tagged dirctory
    if repo.language.lower() == 'python':
        
        #############################
        # INSERT FURTHER CALLS HERE #
        
        print('Working on {0}'.format(repo.full_name))
        repo_scripts(repo, doc_ct)

        # insert metadata
        repo_metadata_upsert(repo)
        repo_stargazers_upsert(repo)
        repo_subscribers_upsert(repo)
        repo_contributors_upsert(repo)
        
        user_metadata_upsert(user)
        user_following_upsert(user)
        user_followers_upsert(user)
        user_starred_upsert(user)
        #############################
        
    # don't get cut off
    time.sleep(randint(2,9))
    
print('DONE')

Working on joe011/python
DONE


In [59]:
for name in db.collection_names():
    print('Number of documents in \"{0}\" collection: {1}'.format(name, db[name].count()))

Number of documents in "git_repos_stargazers" collection: 2
Number of documents in "git_users_starred" collection: 1
Number of documents in "git_repos_meta" collection: 2
Number of documents in "git_users_followers" collection: 1
Number of documents in "git_repos_contributors" collection: 2
Number of documents in "git_repos_subscribers" collection: 2
Number of documents in "git_users_meta" collection: 1
Number of documents in "git_users_following" collection: 1
Number of documents in "git_repos_docs" collection: 90


In [None]:
#for cursor in db['github_repos'].find():
#    pprint(cursor)