In [24]:
import pandas as pd
from geopy.geocoders import Nominatim
from github import Github
from creds import GH_TOKEN
from retrying import retry
import matplotlib.pyplot as plt

%matplotlib inline

In [25]:
# wait_fixed is in milliseconds: pause 1s between attempts so a failed call is
# not retried instantly against the (rate-limited) geocoding service
@retry(stop_max_attempt_number=3, wait_fixed=1000)
def do_geocode(location,geolocator):
    ''' Geocode a free-text location with geopy, retrying on failure.

    The call is attempted up to 3 times (any exception triggers a retry);
    if the last attempt also raises, that exception propagates to the caller.

    Args:
        location: query passed straight to ``geolocator.geocode``.
        geolocator: geopy geocoder instance; it must accept the
            ``addressdetails`` keyword (the notebook passes a ``Nominatim``).

    Returns:
        Whatever ``geolocator.geocode`` returns. Callers in this notebook
        treat a falsy result as "no match".
    '''
    # addressdetails=True asks for the address breakdown in the raw response,
    # which is where the country code is read from later on
    return geolocator.geocode(location,addressdetails=True)

In [26]:
# Authenticated GitHub API client, built from the token kept in creds.py
g = Github(GH_TOKEN)

# Repository whose stargazers are analysed in the cells below ("owner/name")
repo_full_name = "netdata/netdata"
repo = g.get_repo(repo_full_name)

RateLimitExceededException: 403 {'message': 'API rate limit exceeded for user ID 2178292.', 'documentation_url': 'https://developer.github.com/v3/#rate-limiting'}

In [None]:
# get info on users who starred the repo
# n_max caps how many stargazers are collected
n_max = 50
cols = ['starred_at','name','company','email','location','followers','url','bio']

data = []
for x in repo.get_stargazers_with_dates():
    # stop once enough rows are collected (also covers n_max == 0)
    if len(data) >= n_max:
        break
    user = x.user
    data.append([x.starred_at, user.name, user.company, user.email, user.location, user.followers, user.url, user.bio])
df = pd.DataFrame(data,columns=cols)

# look at data
print(df.shape)
# sample() raises ValueError when asked for more rows than exist, so cap at len(df)
df.sample(min(5, len(df)))

In [None]:
# derive the email domain: split each address on '@' and keep the last piece
# (missing emails stay missing)
email_parts = df['email'].str.split('@')
df['email_domain'] = email_parts.str[-1]

In [None]:
# add country code via geopy

# define geo app
geolocator = Nominatim(user_agent="andrewm4894_dev")

# geocode each unique location once, then map the result back onto every row
user_country_map = dict()
for location in df['location'].unique():
    # default to none
    country_code = None
    # only geocode real values; pd.notna rejects both None and NaN
    if pd.notna(location):
        try:
            geo_res = do_geocode(location,geolocator)
        except Exception as e:
            # best effort: report the failure and leave this location unmapped
            print(e)
            geo_res = None
        if geo_res:
            # not every match carries a country (or even an address block),
            # so look the keys up defensively instead of raising KeyError
            address = geo_res.raw.get('address') or {}
            country_code = address.get('country_code')
    user_country_map[location] = country_code

# map lookup to col in df
df['country_code'] = df['location'].map(user_country_map)

In [None]:
# look at data
print(df.shape)
# sample() raises ValueError when asked for more rows than exist, so cap at len(df)
df.sample(min(5, len(df)))

In [None]:
# rank stargazers by follower count, most-followed first, and show the top 25
by_followers = df.sort_values(by='followers', ascending=False)
by_followers.head(25)

In [None]:
# look at some plots: the 20 most common values of each categorical column
for col in ['country_code','company','email_domain']:
    # count non-null values, keep the top 20, then sort ascending so the
    # largest bar is drawn at the top of the horizontal chart
    counts = df[col].value_counts(dropna=True).head(20).sort_values()
    if counts.empty:
        # nothing to draw (e.g. no stargazer exposes this field); skip the
        # column rather than plotting an empty series
        print('no non-null values in {!r}, skipping plot'.format(col))
        continue
    ax = counts.plot(title=col, kind='barh', figsize=(10,6))
    plt.show()