In [1]:
import os
import re
import time
from github import Github
from random import randint

# Credentials

In [2]:
from pymongo import MongoClient
# Location of the plain-text credentials file. Three lines are read from it,
# each holding values separated by ', '.
pw_file = 'credentials/pw.txt'

# Fail fast with a clear message: the MongoDB connection cell needs pub_ip,
# mongo_usr and mongo_usr_pw, so silently skipping a missing file would only
# surface later as a confusing NameError.
if not os.path.exists(pw_file):
    raise IOError('Credentials file not found: {0}'.format(pw_file))

with open(pw_file, 'r') as f:
    # line 1 -> email, indeed_pw
    email, indeed_pw = f.readline().strip().split(', ')
    # line 2 -> username, pia_pw
    username, pia_pw = f.readline().strip().split(', ')
    # line 3 -> MongoDB host address, MongoDB user, MongoDB user password
    pub_ip, mongo_usr, mongo_usr_pw = f.readline().strip().split(', ')

# Connect to MongoDB

In [3]:
# connect to the ec2-hosted mongo server on port 27017, using the host read
# from the credentials file
client = MongoClient('{0}:27017'.format(pub_ip))

# get a reference to the 'github_db' database
db = client['github_db']

# authenticate the user for this database; as the last expression of the cell
# its return value is shown as the cell output
# NOTE(review): Database.authenticate is deprecated in newer PyMongo releases
# and no longer available in PyMongo 4 - confirm the installed version
db.authenticate(mongo_usr, mongo_usr_pw)

True

# Create a MongoDB collection

In [4]:
# make sure the 'github_db' database has a 'github_users' collection,
# creating it only when it is not already listed
existing_collections = db.collection_names()
if 'github_users' not in existing_collections:
    db.create_collection('github_users')

In [5]:
# list every collection currently present in the database
print('Collections in database: {0}'.format(db.collection_names()))

# label the document count with the one collection it belongs to; the list of
# all collection names was interpolated here before, which mislabelled the count
print('Number of items in {0} collection: {1}'.format('github_users', db['github_users'].count()))

Collections in database: ['github_users']
Number of items in ['github_users'] collection: 0


# Upsert MongoDB document (insert/update)

In [6]:
def upsert_doc(collection, link_id, term, res_txt):
    '''
    Insert or update a single document through the module-level `db` handle.

    collection: name of the mongodb collection the data is upserted into
    link_id: value of the 'link_id' field used to look the document up
    term: value stored under 'search_term'
    res_txt: value stored under 'resume_text'
    RETURN: None
    '''
    match_on = {'link_id': link_id}
    new_values = {'$set': {'search_term': term, 'resume_text': res_txt}}

    # upsert=True creates the document when nothing matches link_id
    db[collection].update_one(match_on, new_values, upsert=True)

# Get authenticated access to GitHub for a higher request rate limit

In [7]:
# read the private github token (first line) and the user value (second line)
with open('credentials/token.txt', 'r') as infile:
    token, user = (infile.readline().strip() for _ in range(2))

# client built from the private token, used for the github api calls
git_client = Github(token)

# Rate limits

In [8]:
def get_remaining(_type ='core'):
    '''
    Look up the current rate limit through the module-level `git_client`.

    _type = 'core' or 'search'
    Return: tuple of (remaining rate limit quantity, reset value), both as int
    '''
    # copy the raw payload and narrow it down to the requested resource
    payload = dict(git_client.get_rate_limit().raw_data)
    limits = payload['resources'][_type]

    return (int(limits['remaining']), int(limits['reset']))

In [9]:
# each value is the (remaining, reset) tuple returned by get_remaining
core_limit = get_remaining('core')
print('Resource rate limit remaining: {0}'.format(core_limit))

search_limit = get_remaining('search')
print('Search rate limit remaining: {0}'.format(search_limit))

Resource rate limit remaining: (4981, 1472678113)
Search rate limit remaining: (30, 1472675665)


# REMOVE ME - ONLY FOR DEV

In [10]:
# development-only sample: a fixed user login and repo name used to try out
# the helper functions defined in the following cells
USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'

# NOTE: this rebinds `user`, which the token cell set to the second line read
# from credentials/token.txt; from here on `user` is the object returned by
# git_client.get_user
user = git_client.get_user(USER)
repo = user.get_repo(REPO)

In [11]:
def repo_data(repo):
    '''
    IN: github repo object (its raw_data payload is read)
    RETURN: dict holding only the selected repo fields, with 'owner' reduced
            to the selected owner fields
    '''
    wanted_repo_fields = (
        'id', 'owner', 'name', 'full_name', 'description', 'private', 'fork',
        'html_url', 'homepage', 'language', 'forks_count', 'size',
        'open_issues_count', 'has_issues', 'has_wiki', 'has_downloads',
        'pushed_at', 'created_at', 'subscribers_count', 'stargazers_count',
    )
    wanted_owner_fields = (
        'gravatar_id', 'type', 'login', 'id', 'site_admin', 'url',
        'avatar_url', 'html_url',
    )

    # work on a copy of the returned payload
    payload = dict(repo.raw_data)

    # keep only the repo fields listed above
    trimmed = {}
    for field in wanted_repo_fields:
        trimmed[field] = payload[field]

    # swap the full owner payload for its trimmed version
    owner_payload = payload['owner']
    trimmed['owner'] = {field: owner_payload[field] for field in wanted_owner_fields}

    return trimmed

In [12]:
def list_subscribers(repo):
    '''
    IN: github repo object
    RETURN: list of login names of the repo's subscribers, or '' when they
            could not be fetched
    '''
    try:
        return [subscriber.login for subscriber in repo.get_subscribers()]
    except Exception:
        # best effort: any lookup failure falls back to the empty marker.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long crawl can still be stopped.
        return ''

In [13]:
# repo_data(repo)

# User data support functions

In [14]:
def list_following(user):
    '''
    IN: github user object
    RETURN: list of login names of the users followed by user (passed),
            or '' when the user follows nobody
    '''
    if user.following > 0:
        # collect login names, as documented and as list_followers does;
        # the display name used before is a different field from the login
        following = [followed.login for followed in user.get_following()]
    else:
        following = ''

    return following

In [15]:
def list_followers(user):
    '''
    IN: github user object
    RETURN: list of login names of the user's followers, or '' when the
            follower count is not above zero
    '''
    # nothing to fetch when the profile reports no followers
    if not user.followers > 0:
        return ''

    logins = []
    for follower in user.get_followers():
        logins.append(follower.login)

    return logins

In [16]:
def list_starred(user):
    '''
    IN: github user object
    RETURN: list of full names of the repos starred by user (passed), or ''
            when they could not be fetched
    '''
    try:
        return [starred_repo.full_name for starred_repo in user.get_starred()]
    except Exception:
        # best effort: any lookup failure falls back to the empty marker.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long crawl can still be stopped.
        return ''

# User data

In [17]:
def user_data(user):
    '''
    IN: github user object (its raw_data payload is read)
    RETURN: dict holding the selected profile fields plus 'list_followers',
            'list_following' and 'list_starred' as produced by the helper
            functions of the same names
    '''
    wanted_fields = (
        'email', 'followers', 'hireable', 'login', 'id', 'bio', 'avatar_url',
        'company', 'updated_at', 'type', 'created_at', 'name', 'location',
        'html_url', 'public_repos', 'blog', 'public_gists', 'following',
    )

    # work on a copy of the returned payload
    payload = dict(user.raw_data)

    # keep only the profile fields listed above
    profile = {}
    for field in wanted_fields:
        profile[field] = payload[field]

    # attach the related lists: followers, then following, then starred repos
    for key, fetch in (('list_followers', list_followers),
                       ('list_following', list_following),
                       ('list_starred', list_starred)):
        profile[key] = fetch(user)

    return profile

# Download Python files

In [18]:
def directory_download(directory, location, path='.'):
    '''
    Download all *.py files inside a repo, including those in sub-directories.

    directory: github repo object offering get_dir_contents(path)
    location: existing local folder the files are written into; files are
              stored flat under their base name, so two files with the same
              name in different repo folders overwrite each other
    path: repo-relative directory to list; callers rely on the default and
          the recursion passes sub-directory paths
    RETURN: None. Best effort: a failure while listing or writing is
            reported with print and ends the walk of that directory only.
    '''
    try:
        # grab all contents of the requested directory
        for content in directory.get_dir_contents(path):

            # a directory entry: walk into it through the same repo object.
            # Passing the entry itself, as was done before, relied on it
            # having get_dir_contents, and any failure was silently dropped.
            if content.type == 'dir':
                directory_download(directory, location, content.path)

            # a python script: match the full '.py' suffix so names that merely
            # end in "py" (e.g. "numpy") are not downloaded
            elif content.path.endswith('.py'):
                print('Processing - {0}'.format(content.name))
                with open('{0}/{1}'.format(location, content.name), 'wb') as outfile:
                    outfile.write(content.decoded_content)
    except Exception as err:
        # stay best effort, but say what was skipped; KeyboardInterrupt and
        # SystemExit are no longer swallowed
        print('Skipping {0} - {1}'.format(path, err))

# Repo crawler

In [None]:
# create the folder that stores the repos' python files once, before crawling
if not os.path.exists('files'):
    os.makedirs('files')

# search through the authenticated `git_client`; the name `g` used here before
# is not defined anywhere in the notebook
for repo in git_client.search_repositories('python')[5:15]:
    try:
        # ensure it's a python tagged repo; treat a missing language as no match
        if (repo.language or '').lower() == 'python':
            # named owner_login so the `username` read from the credentials
            # file is not overwritten
            owner_login, dir_name = repo.full_name.split('/')
            print('Name: {0} \t Username: {1} \t Repo: {2}'.format(repo.owner.name, owner_login, dir_name))

            # make directory named after the owner and repo name
            folder = 'files/{0}__{1}'.format(owner_login, dir_name)

            # create folder to store this repo's python files
            if not os.path.exists(folder):
                os.makedirs(folder)

            # download repo files
            directory_download(repo, folder)

            # delete empty directories (did not have python files)
            if not os.listdir(folder):
                os.rmdir(folder)
    except Exception as err:
        # keep crawling the remaining repos, but report what went wrong
        print('Skipping repo after error: {0}'.format(err))

    # don't get cut off: pause between repos, also after a failed one
    time.sleep(randint(2, 9))