In [1]:
import os
import sys
import re
import time
from github import Github
from random import randint
from pprint import pprint

# Credentials

In [2]:
from pymongo import MongoClient
# Load service credentials from a local file. Each of the first three lines
# holds its fields separated by a comma followed by a space.
pw_file = 'credentials/pw.txt'
if os.path.exists(pw_file):
    with open(pw_file, 'r') as f:
        email, indeed_pw = f.readline().strip().split(', ')
        username, pia_pw = f.readline().strip().split(', ')
        pub_ip, mongo_usr, mongo_usr_pw = f.readline().strip().split(', ')
else:
    # Without the file, pub_ip / mongo_usr / mongo_usr_pw are never assigned and
    # the MongoDB connection code further down fails with a NameError that
    # gives no hint about the real cause, so say what is missing right here.
    print('WARNING: credentials file "{0}" not found; pub_ip, mongo_usr and '
          'mongo_usr_pw are not set'.format(pw_file))

# Connect to MongoDB

In [3]:
# Open a client against the MongoDB server at pub_ip, port 27017
mongo_address = '{0}:27017'.format(pub_ip)
client = MongoClient(mongo_address)

# Handle to the 'github_db' database
db = client['github_db']

# Authenticate this user against the database; the call's result is the cell output
db.authenticate(mongo_usr, mongo_usr_pw)

True

# Create a MongoDB collection

In [4]:
#db.drop_collection('github_repos')

In [5]:
# Report which collections exist and how many documents each one holds
print('Collections in database: {0}'.format(db.collection_names()))
for name in db.collection_names():
    item_count = db[name].count()
    print('Number of items in \"{0}\" collection: {1}'.format(name, item_count))

Collections in database: ['github_users']
Number of items in "github_users" collection: 0


In [6]:
# Collections this notebook needs in the 'github_db' database
lst_collections = ['github_users', 'github_repos']

# Create only the ones that are not there yet
for collection in lst_collections:
    already_present = collection in db.collection_names()
    if already_present:
        continue
    db.create_collection(collection)

In [7]:
# Summarise the database again: collection names, then one count line per collection
count_line = 'Number of items in \"{0}\" collection: {1}'
print('Collections in database: {0}'.format(db.collection_names()))
for name in db.collection_names():
    print(count_line.format(name, db[name].count()))

Collections in database: ['github_users', 'github_repos']
Number of items in "github_users" collection: 0
Number of items in "github_repos" collection: 0


# Get authenticated access to GitHub for a higher request rate limit

In [8]:
# Read the private GitHub token from the first line of the credentials file
# and `user` from the second line
with open('credentials/token.txt', 'r') as infile:
    token, user = [infile.readline().strip() for _ in range(2)]

# GitHub API client authenticated with that token
git_client = Github(token)

# Rate limits

In [9]:
def get_remaining(_type ='core'):
    '''
    Look up the current GitHub API quota for one resource category.

    IN: _type = 'core' or 'search'
    RETURN: tuple (remaining, reset), both converted to int, taken from the
            'remaining' and 'reset' entries of that category in the
            rate-limit payload
    '''
    resources = dict(git_client.get_rate_limit().raw_data)['resources']
    quota = resources[_type]

    return (int(quota['remaining']), int(quota['reset']))

In [10]:
# Show the remaining quota for regular API calls, then for search calls
core_quota = get_remaining('core')
print('Resource rate limit remaining: {0}'.format(core_quota))
search_quota = get_remaining('search')
print('Search rate limit remaining: {0}'.format(search_quota))

Resource rate limit remaining: (3513, 1472763563)
Search rate limit remaining: (30, 1472761287)


# REMOVE ME - ONLY FOR DEV

In [11]:
def list_subscribers(repo):
    '''
    IN: github repo object
    RETURN: list of login names of the repo's subscribers; '' (empty string)
            if the lookup raises, the same fallback value callers got before
    '''
    try:
        subscribers = [f.login for f in repo.get_subscribers()]
    except Exception:
        # Best effort: a failed lookup must not abort the caller. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit still stop a long-running crawl.
        subscribers = ''

    return subscribers

In [12]:
def list_stargazers(repo):
    '''
    IN: github repo object
    RETURN: list of login names of the repo's stargazers; '' (empty string)
            if the lookup raises, the same fallback value callers got before
    '''
    try:
        stargazers = [f.login for f in repo.get_stargazers()]
    except Exception:
        # Best effort: a failed lookup must not abort the caller. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit still stop a long-running crawl.
        stargazers = ''

    return stargazers

In [13]:
def list_contributors(repo):
    '''
    IN: github repo object
    RETURN: list of login names of the repo's contributors; '' (empty string)
            if the lookup raises, the same fallback value callers got before
    '''
    try:
        contributors = [f.login for f in repo.get_contributors()]
    except Exception:
        # Best effort: a failed lookup must not abort the caller. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit still stop a long-running crawl.
        contributors = ''

    return contributors

In [14]:
def repo_data(repo):
    '''
    IN: github repo object
    RETURN: dict holding a trimmed copy of the repo's raw payload (selected
            repo fields, with 'owner' reduced to selected owner fields) plus
            'list_subscribers', 'list_stargazers' and 'list_contributors'
    '''
    wanted_repo_fields = (
        'id', 'owner', 'name', 'full_name', 'description', 'private', 'fork',
        'html_url', 'homepage', 'language', 'forks_count', 'size',
        'open_issues_count', 'has_issues', 'has_wiki', 'has_downloads',
        'pushed_at', 'created_at', 'subscribers_count', 'stargazers_count',
    )
    wanted_owner_fields = (
        'gravatar_id', 'type', 'login', 'id', 'site_admin', 'url',
        'avatar_url', 'html_url',
    )

    # work on a plain-dict copy of the payload returned by the API
    payload = dict(repo.raw_data)

    # keep only the repo-level fields of interest
    trimmed = {}
    for field in wanted_repo_fields:
        trimmed[field] = payload[field]

    # swap the full owner record for its trimmed version
    owner_payload = payload['owner']
    trimmed['owner'] = dict((field, owner_payload[field]) for field in wanted_owner_fields)

    # attach the login-name lists: subscribers, stargazers, contributors
    trimmed['list_subscribers'] = list_subscribers(repo)
    trimmed['list_stargazers'] = list_stargazers(repo)
    trimmed['list_contributors'] = list_contributors(repo)

    return trimmed

In [15]:
# a = repo_data(repo)

# User data support functions

In [16]:
def list_following(user):
    '''
    IN: github user object
    RETURN: list of login names of the users followed by user (passed);
            '' (empty string) when user.following is not greater than 0
    '''
    if user.following > 0:
        # Use .login, as the contract above states and as list_followers does.
        # .name is the display name, a different attribute.
        following = [f.login for f in user.get_following()]
    else:
        following = ''

    return following

In [17]:
def list_followers(user):
    '''
    IN: github user object
    RETURN: list of login names of the user's followers, or '' (empty string)
            when user.followers is not greater than 0
    '''
    # only query the follower listing when the profile reports at least one
    if user.followers > 0:
        return [follower.login for follower in user.get_followers()]

    return ''

In [18]:
def list_starred(user):
    '''
    IN: github user object
    RETURN: list of full names of the repos starred by user (passed);
            '' (empty string) if the lookup raises, the same fallback value
            callers got before
    '''
    try:
        starred = [f.full_name for f in user.get_starred()]
    except Exception:
        # Best effort: a failed lookup must not abort the caller. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit still stop a long-running crawl.
        starred = ''

    return starred

# User data

In [19]:
def user_data(user):
    '''
    IN: github user object
    RETURN: dict holding the selected fields of the user's raw payload plus
            'list_followers', 'list_following' and 'list_starred'
    '''
    wanted_fields = (
        'email', 'followers', 'hireable', 'login', 'id', 'bio', 'avatar_url',
        'company', 'updated_at', 'type', 'created_at', 'name', 'location',
        'html_url', 'public_repos', 'blog', 'public_gists', 'following',
    )

    # work on a plain-dict copy of the payload returned by the API
    payload = dict(user.raw_data)

    # keep only the profile fields of interest
    profile = dict((field, payload[field]) for field in wanted_fields)

    # attach the follower / following login lists and the starred repo names
    profile['list_followers'] = list_followers(user)
    profile['list_following'] = list_following(user)
    profile['list_starred'] = list_starred(user)

    return profile

# Upsert MongoDB document (insert/update)

In [50]:
def files_upsert_doc(repo_fullname, file_name, raw_file_contents, file_extension):
    '''
    Store one file's contents on the 'github_repos' document of its repo.

    IN: repo_fullname - value matched against the document's 'full_name'
        file_name - document field under which the file is stored
        raw_file_contents - file contents as a string
        file_extension - stored alongside the contents as 'extension'
    RETURN: None. The document is created when no match exists (upsert).
            A failed write is reported with a printed message, not raised.
    '''
    # second copy of the contents with newline and tab chars turned into spaces
    stripped_file_contents = raw_file_contents.replace('\n', ' ').replace('\t', ' ')

    try:
        db['github_repos'].update_one(
            {'full_name': repo_fullname},
            {
                '$set': {
                    file_name: {
                        'raw_file': raw_file_contents,
                        'stripped_file': stripped_file_contents,
                        'extension': file_extension,
                    }
                }
            },
            upsert=True,
        )
    except Exception as err:
        # Still best effort (one bad file must not stop the upload), but the
        # failure is now visible instead of being dropped without a trace, and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print('UPSERT FAILED: {0} -- {1} ({2})'.format(repo_fullname, file_name, err))
    return None

In [51]:
def file_too_large(file_str):
    '''
    IN: file contents as string of text
    RETURN: True when sys.getsizeof reports more than 16000000 bytes for the
            string (the notebook's cut-off for a mongodb document), else False
            *** oversized files will eventually need GridFS to reach mongo ***
    '''
    return sys.getsizeof(file_str) > 16000000

In [52]:
def upload_files(repo, _path='.'):
    '''
    Walk a repo directory tree and upsert every non-excluded file into mongo.

    IN: github repo object and path to folder
    RETURN: None; each accepted file is passed to files_upsert_doc, files at
            or above the size limit are reported with a printed message
    '''
    # exclude files with these extensions (compared case-insensitively)
    exclude = ['png', 'jpg', 'gif', 'tif']

    # grab all contents in the requested directory
    dir_contents = repo.get_dir_contents(_path)
    repo_fullname = repo.full_name

    for content in dir_contents:
        # if item is a directory then recursively navigate lower to get files inside
        if content.type == 'dir':
            upload_files(repo, content.path)
            continue

        # text after the last period; a name without a period yields the
        # whole name here
        file_ext = content.name.split('.')[-1]

        # lower() so that e.g. 'LOGO.PNG' is skipped just like 'logo.png'
        if file_ext.lower() in exclude:
            continue

        try:
            # decode the file's bytes to text
            raw_file_content = content.decoded_content.decode(errors='replace')
        except Exception:
            # best effort: store an empty body when the content cannot be
            # read, but let KeyboardInterrupt / SystemExit through
            raw_file_content = ''

        file_name = content.name.replace('.', '||') # periods not allowed in keys

        # only files under the size limit are added to the collection
        file_size = sys.getsizeof(raw_file_content)

        if file_size < 16000000:
            files_upsert_doc(repo_fullname, file_name, raw_file_content, file_ext)
        else:
            print('FILE SIZE LIMIT: {0} -- {1}'.format(file_name, file_size))
    return None

In [53]:
# upload_files(repo)

In [54]:
#cursor = db.find(repo.full_name)
#for document in cursor: 
#    pprint(document.keys())

In [55]:
#for cursor in db['github_repo'].find()[0]:
    #pprint(cursor)

# Search GitHub for 'Python' repositories

In [56]:
import time

# One entry per attempt: (message printed when the upload starts, seconds to
# sleep when the core rate limit is too low at that attempt)
attempts = (('Working on', 1800), ('Retrying', 1800), ('Final retry', 900))

for repo in git_client.search_repositories('python')[5:6]:

    # ensure it's a python tagged repository; `or ''` keeps a repo whose
    # language is None from raising on .lower()
    if (repo.language or '').lower() == 'python':

        #############################
        # INSERT FURTHER CALLS HERE #

        for label, wait_seconds in attempts:
            # re-read the quota on every attempt: a value read before
            # sleeping says nothing about the quota after the sleep
            rate = get_remaining('core')[0]

            if rate > 10:
                print('{0} {1}'.format(label, repo.full_name))
                upload_files(repo)
                break

            print('Waiting to continue...')
            time.sleep(wait_seconds)

        #############################

    # don't get cut off
    time.sleep(randint(2,9))

print('DONE')

Working on joe011/python


In [57]:
# List the collections now present in the database (shown as the cell output)
db.collection_names()

['github_users', 'github_repos']

In [58]:
# Number of documents currently stored in the 'github_repos' collection
db.github_repos.count()

1

In [None]:
# Cursor over every document in 'github_repos'. The previous spelling,
# 'gihub_repos', addressed a different collection name.
db.github_repos.find({})

In [61]:
# Pretty-print every document stored in the 'github_repos' collection
repo_documents = db['github_repos'].find()
for document in repo_documents:
    pprint(document)

{'README||md': {'extension': 'md',
                            '\t\n'
                            '##本项目为日常工作中的使用的python脚本\n'
                            '\n'
                            '\n'
                            '###1.  ssh_thread.py  '
                            '是一个批量执行命令的脚本，支持直接执行ssh命令及文件传输，支持多线程\n'
                            '\n'
                            '\t\t使用说明如下：\n'
                            '\t\n'
                            '\t\t-h,-H,--help         帮助页面 \n'
                            '        -C, --cmd            执行命令模式 \n'
                            '        -M, --command        执行命令模式 \n'
                            '        -S, --sendfile       传输文件模式 \n'
                            '        -L, --localpath      本地文件路径 \n'
                            '        -R, --remotepath     远程服务器路径 \n'
                            '\n'
                            '\t    IP列表格式:\n'
                            '\n'
                            '   \t    IP地址\t\t用户名     