In [1]:
import os
import re
import time
from github import Github

# Credentials

In [2]:
from pymongo import MongoClient
pw_file = 'credentials/pw.txt'
if os.path.exists(pw_file): 
    with open(pw_file, 'r') as f:
        email, indeed_pw = f.readline().strip().split(', ')
        username, pia_pw = f.readline().strip().split(', ')
        pub_ip, mongo_usr, mongo_usr_pw = f.readline().strip().split(', ')

# Connect to MongoDB

In [3]:
# connect to ec2 mongo client
client = MongoClient('{0}:27017'.format(pub_ip))

# get reference to  resume_db
db = client['github_db']

# authenticate user for database
db.authenticate(mongo_usr, mongo_usr_pw)

True

# Create a collection

In [4]:
# create a collection called 'github'
if 'github_users' not in db.collection_names():
    db.create_collection('github_users')

In [5]:
db.collection_names()

['github_users']

# Upsert Document (Insert/Update)

In [6]:
def upsert_doc(collection, link_id, term, res_txt):
    '''
    collection: mongodb collection data will be upserted into
    link_id: unique id for users resume online
    term: term searched when resume was found
    res_text: text only from resume
    '''
    db[collection].update_one({
        'link_id': link_id,
    }, 
    {
        '$set':{
            'search_term': term,
            'resume_text': res_txt,
        }
    }, upsert=True
    )

    return None

# Get authenticated access to Github for higher requests limit

In [10]:
# import github private token
with open('credentials/token.txt', 'r') as infile:
    token = infile.readline().strip()
    user = infile.readline().strip()

git_client = Github(token)

In [11]:
USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'

user = git_client.get_user(USER)
repo = user.get_repo(REPO)

# Repo Info

In [133]:
print('Repo created: {0}'.format(repo.created_at))
print('Repo description: {0}'.format(repo.description))
print('Repo owner: {0}'.format(repo.owner.name))
print('############## GO DEEPER INTO OWNER ##############')
print('Repo number of forks: {0}'.format(repo.forks_count))
print('Repo full (semi-unique) name: {0}'.format(repo.full_name))
print('Repo name: {0}'.format(repo.name))
print('Repo homepage: {0}'.format(repo.homepage))
print('Repo id: {0}'.format(repo.id))
print('Repo language: {0}'.format(repo.language))
print('Repo size: {0}'.format(repo.size))
print('Repo last pushed at (is this pull requests?): {0}'.format(repo.pushed_at))

Repo created: 2010-11-01 01:26:07
Repo description: The official online compendium for Mining the Social Web (O'Reilly, 2011)
Repo owner: Matthew A. Russell
############## GO DEEPER INTO OWNER ##############
Repo number of forks: 451
Repo full (semi-unique) name: ptwobrussell/Mining-the-Social-Web
Repo name: Mining-the-Social-Web
Repo homepage: http://bit.ly/135dHfs
Repo id: 1040700
Repo language: JavaScript
Repo number of open issues: 10
Repo size: 2016
Repo last pushed at (is this pull requests?): 2013-10-18 13:41:23


# Repo Open Issues Notes

In [163]:
# issues
print('Repo number of open issues: {0}'.format(repo.open_issues_count))
if repo.open_issues_count > 0:
    issues = [f for f in repo.get_issues()]
    issues = issues[0]
    issue = issues.body
else:
    issue = ''
print('Repo stargazers: {0}'.format(issue))

Repo number of open issues: 10
Repo stargazers: On pages 25-26, there is this code:

q = '#MentionSomeoneImportantForYou'

count = 100

search_results = twitter_api.search.tweets(q=q, count=count)
#twitter_api is predefined and is working fine.

statuses = search_results['statuses']

for _ in range(5):
   print "Length of statuses", len(statuses)
   try:
       next_results = search_results['search_metadata']['next_results']
   except KeyError, e: # No more results when next_results doesn't exist
       break

kwargs = dict([ kv.split('=') for kv in next_results[1:].split("&") ])

The last code throws an error that 'next_results' is not defined.


# Repo Stargazer IDs

In [165]:
# stargazers
print('Repo number of stargazers: {0}'.format(repo.stargazers_count))
if repo.stargazers_count > 0:
    stargazers = [f.id for f in repo.get_stargazers()]
    stargazers = stargazers[:5]
else:
    stargazers = ''
print('Repo stargazers: {0}'.format(stargazers))

Repo number of stargazers: 1072
Repo stargazers: [224, 1080, 1231, 1411, 2664]


# Repo Owner Info

In [174]:
print('Repo owner name: {0}'.format(repo.owner.name))
print('Repo owner id: {0}'.format(repo.owner.id))
print('Repo owner html url: {0}'.format(repo.owner.html_url))
print('Repo owner login: {0}'.format(repo.owner.login))
print('Repo owner type: {0}'.format(repo.owner.type))
print('Repo owner avatar url: {0}'.format(repo.owner.avatar_url))
print('Repo owner bio: {0}'.format(repo.owner.bio))
print('Repo owner blog: {0}'.format(repo.owner.blog))
print('Repo owner company: {0}'.format(repo.owner.company))
print('Repo owner email: {0}'.format(repo.owner.email))
print('Repo owner email: {0}'.format(repo.owner.get_public_events))

Repo owner name: Matthew A. Russell
Repo owner id: 98668
Repo owner html url: https://github.com/ptwobrussell
Repo owner login: ptwobrussell
Repo owner type: User
Repo owner avatar url: https://avatars.githubusercontent.com/u/98668?v=3
Repo owner bio: None
Repo owner blog: http://twitter.com/ptwobrussell
Repo owner company: None
Repo owner email: None
Repo owner email: <bound method NamedUser.get_public_events of <github.NamedUser.NamedUser object at 0x106fd8080>>


# Repo Owner Following

In [171]:
# following
print('Repo owner number following: {0}'.format(repo.owner.following))
if repo.owner.following > 0:
    following = [f.name for f in repo.owner.get_following()[:10]]
else:
    following = ''
print('Repo owner (return list of following): {0}'.format(following))

Repo owner number following: 0
Repo owner (return list of following): 


# Repo Owner Followers

In [168]:
# followers
print('Repo owner number of followers: {0}'.format(repo.owner.followers))
if repo.owner.followers > 0:
    followers = [f.id for f in repo.owner.get_followers()[:5]]
else:
    followers = ''
print('Repo owner (return list of followers): {0}'.format(followers))

Repo owner number of followers: 621
Repo owner (return list of followers): [2836, 4171, 5047, 12640, 13137]


# Repo Owner Orgs

In [173]:
# orgs
orgs = [f.name for f in repo.owner.get_orgs()]
if len(orgs) > 0:
    orgs = orgs[:10]
else:
    orgs = ''
print('Repo owner orgs: {0}'.format(orgs))

Repo owner orgs: 


In [178]:
# public events
events = [f.name for f in repo.owner.get_public_events()]
if len(events) > 0:
    events = events[:5]
else:
    events = ''
print('Repo owner public events: {0}'.format(events))

Repo owner public events: 


# Get Contributors

In [58]:
contributors = [c for c in repo.get_contributors()]

contrib = contributors[2]
print('Contributors name: {0}'.format(contrib.name))
print('Contributors id: {0}'.format(contrib.id))
print('Contributors etag: {0}'.format(contrib.etag))
print('Contributors company: {0}'.format(contrib.company))
print('Contributors email: {0}'.format(contrib.email))
print('Contributors number of followers: {0}'.format(contrib.followers))

Contributors name: Chris Birchall
Contributors id: 106760
Contributors etag: "266076ceed369fa1c37f94b207700f67"
Contributors company: Guardian News & Media
Contributors email: chris.birchall@gmail.com
Contributors number of followers: 78


In [66]:
watches = [w for w in contrib.get_watched()]
watch = watches[1]
watch.raw_data

{'archive_url': 'https://api.github.com/repos/taku910/crfpp/{archive_format}{/ref}',
 'assignees_url': 'https://api.github.com/repos/taku910/crfpp/assignees{/user}',
 'blobs_url': 'https://api.github.com/repos/taku910/crfpp/git/blobs{/sha}',
 'branches_url': 'https://api.github.com/repos/taku910/crfpp/branches{/branch}',
 'clone_url': 'https://github.com/taku910/crfpp.git',
 'collaborators_url': 'https://api.github.com/repos/taku910/crfpp/collaborators{/collaborator}',
 'comments_url': 'https://api.github.com/repos/taku910/crfpp/comments{/number}',
 'commits_url': 'https://api.github.com/repos/taku910/crfpp/commits{/sha}',
 'compare_url': 'https://api.github.com/repos/taku910/crfpp/compare/{base}...{head}',
 'contents_url': 'https://api.github.com/repos/taku910/crfpp/contents/{+path}',
 'contributors_url': 'https://api.github.com/repos/taku910/crfpp/contributors',
 'created_at': '2015-03-14T05:51:04Z',
 'default_branch': 'master',
 'deployments_url': 'https://api.github.com/repos/taku9

In [59]:
orgs = [o for o in contrib.get_orgs()]
org = orgs[0] # May not have org
org.name

'The Guardian'

# Get Followers/Following

In [52]:
who_i_follow = [f for f in contrib.get_followers()]
a = who_i_follow[0]
a.name

'Sergey N. Poulikov'

# Convert Jupyter notebook to Python script file (NOT BEING USED)

# Download Python files

In [8]:
def directory_download(directory, location):
    '''
    Download all *.py files inside repo
    '''
    try:
        # grab all contents in the main directory
        dir_contents = directory.get_dir_contents('.')

        for content in dir_contents:
            
            # if item grabbed is a directory then recursively navigate lower to get files inside
            if content.type == 'dir':
                directory_download(content, location)
            
            # if item grabbed is a python script, download
            elif content.path[-2:] == 'py':
                print('Processing - {0}'.format(content.name))
                with open('{0}/{1}'.format(location, content.name), 'wb') as outfile:
                    outfile.write(content.decoded_content)
    except:
        pass

# Repo crawler

In [9]:
for repo in g.search_repositories('*.ipynb')[5:15]:
    # create folder to store repos python files
    if not os.path.exists('files'):
        os.makedirs('files')
        
    # ensure language is python
    try:
        # ensure its a python tagged dirctory
        if repo.language.lower() == 'python':
            username, dir_name = repo.full_name.split('/')
            print('Name: {0} \t Username: {1} \t Repo: {2}'.format(repo.owner.name, username, dir_name))
            
            # make directory named after the repo name
            folder = 'files/{0}__{1}'.format(username, dir_name)
            
            # create folder to store repos python files
            if not os.path.exists(folder):
                os.makedirs(folder)
            
            # download repo files
            directory_download(repo, folder)
            
            # delete empty directories (did not have python files)
            if not os.listdir(folder):
                os.rmdir(folder)
        
        # don't get cut off
        time.sleep(3)
    except:
        pass

Name: tjwei 	 Username: tjwei 	 Repo: tjw_ipynb
Processing - VideoSlidesSyncWrite.py
Processing - atitweak.py
Processing - personal_info.example.py
Name: Charl P. Botha 	 Username: cpbotha 	 Repo: django-shell-ipynb
Processing - setup.py
Name: Yiannis Gatsoulis 	 Username: gatsoulis 	 Repo: py2ipynb
Processing - py2ipynb.py


# What data do I want from the repo
* Python files
* Owners name
* Owners username
* Number of stars
* Number of watchers
* Number of forks
* 

# Experimental

In [10]:
for repo in g.search_repositories('python')[:15]:
    print('{0} -- {1}'.format(repo.url, repo.stargazers_count))
    
    print(repo.network_count)

https://api.github.com/repos/geekcomputers/Python -- 1169
1090
https://api.github.com/repos/xxg1413/python -- 549
540
https://api.github.com/repos/poise/python -- 448
684
https://api.github.com/repos/Show-Me-the-Code/python -- 654
950
https://api.github.com/repos/docker-library/python -- 248
120
https://api.github.com/repos/python-git/python -- 485
277
https://api.github.com/repos/joe011/python -- 158
176
https://api.github.com/repos/TheAlgorithms/Python -- 337
67
https://api.github.com/repos/pubnub/python -- 61
73
https://api.github.com/repos/zhanghe06/python -- 87
97
https://api.github.com/repos/rippleblue/Python -- 78
72
https://api.github.com/repos/Tim9Liu9/python -- 143
88
https://api.github.com/repos/bigmlcom/python -- 222
91
https://api.github.com/repos/smilejay/python -- 30
36
https://api.github.com/repos/LightTable/Python -- 86
59


In [11]:
# access users github repos
lst_repos = []
for repo in g.get_user().get_repos():
    lst_repos.append(repo)

In [12]:
rep = lst_repos[-1]

# username/repo name
username, repo_name = rep.full_name.split('/')

# owner/repo => base repo
print(rep.full_name)

# repo owner name
print(rep.owner.name)

print(username)
print(repo_name)
print(rep.full_name)
print(rep.forks)
print(rep.owner.collaborators)
print(rep.owner.company)
print(rep.owner.followers)
print(rep.owner.following)
print(rep.owner.id)
print(rep.network_count) # number of contributors

thisismetis/nyc16_ds8
Metis
thisismetis
nyc16_ds8
thisismetis/nyc16_ds8
28
None
None
0
0
6126166
28


In [14]:
rep.stargazers_count

1

In [23]:
repo.stargazers

AttributeError: 'Repository' object has no attribute 'stargazers'