# Getting Github statistics from PyPI packages

### Requirements

* Python 3.5+ only
* ~1GB spare ram
* 7 minutes of waiting (for github & pypi)

#### External packages

Run `pip install pandas bs4 requests joblib tqdm lxml`

In [1]:
from itertools  import islice
from urllib.parse import urlparse
from typing import List, Iterable, Dict, Union, Tuple, Callable, Any
import datetime
import json
import locale
import logging
import re
import time

from joblib import Parallel, delayed
from tqdm import tqdm
import bs4
import pandas as pd
import requests

# Short alias so annotations and calls can use the bare class name.
BeautifulSoup = bs4.BeautifulSoup

# I use this locale to interpret the comma-separated starred count on GitHub.
# Is GitHub locale dependent?
locale.setlocale(locale.LC_ALL, 'en_US.UTF8')

'en_US.UTF8'

### Setup logging with H:M:S + milliseconds time

In [2]:
class MyFormatter(logging.Formatter):
    """Formatter whose ``%(asctime)s`` is rendered as ``HH:MM:SS,mmm``."""

    def formatTime(self, record, datefmt=None):
        """Return the creation time of ``record`` as text.

        ``logging.Formatter.format`` calls this hook as
        ``formatTime(record, self.datefmt)``, so the ``datefmt`` parameter
        must be accepted even though the default output ignores it.

        Args:
            record: The log record being formatted.
            datefmt: Optional ``strftime`` pattern. When given it is used
                as-is; otherwise the time is ``HH:MM:SS`` followed by a comma
                and the zero-padded milliseconds.

        Returns:
            The formatted time string.
        """
        ct = datetime.datetime.fromtimestamp(record.created)
        if datefmt:
            return ct.strftime(datefmt)
        # record.msecs is a float; %03d truncates it to whole milliseconds.
        return "%s,%03d" % (ct.strftime("%H:%M:%S"), record.msecs)
# Build a stream handler that emits INFO-and-above records using the
# custom time format.
handler = logging.StreamHandler()
fmt = MyFormatter("%(asctime)s: %(message)s")
handler.setLevel(logging.INFO)
handler.setFormatter(fmt)

# Module logger used by the scraping functions below.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
# Start from an empty handler list (presumably so that re-running this cell
# does not attach a second handler and duplicate every message).
log.handlers = []
log.addHandler(handler)

In [3]:
def timeit(func: Callable) -> Callable:
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all arguments to ``func``, prints
        ``'<name>': <seconds> sec`` once the call returns, and hands back the
        result of ``func`` unchanged. Nothing is printed if ``func`` raises.
    """
    import functools

    # functools.wraps keeps the wrapped function's __name__ and __doc__ on
    # the wrapper, so decorated functions stay introspectable.
    @functools.wraps(func)
    def timed(*args: Any, **kw: Any) -> Any:
        # perf_counter is monotonic, unlike time.time, so the measured
        # duration cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        result = func(*args, **kw)
        end = time.perf_counter()
        print('%r: %2.2f sec' % (func.__name__, end - start))
        return result
    return timed

# Functions which extract data from pypi

In [4]:
@timeit
def get_pypi_table(pypi_topic_listing_url: str) -> bs4.element.Tag:
    """Download a pypi classifier package list and return its table body.

    Args:
        pypi_topic_listing_url: fully qualified url to the pypi package listing.
        e.g. 'https://pypi.python.org/pypi?:action=browse&show=all&c=595'

    Returns:
        The <tbody> tag of the ``<table class="list">`` element when that
        table exists and has one; otherwise the first <tbody> found anywhere
        in the page. ``None`` if the page has no <tbody> at all.
    """
    response = requests.get(pypi_topic_listing_url)
    content = BeautifulSoup(response.content, "lxml")
    link_table = content.find("table", attrs={"class": "list"})
    # Prefer the body of the package listing itself so that an unrelated
    # table earlier in the page cannot be picked up by mistake.
    tbody = link_table.find("tbody") if link_table is not None else None
    if tbody is None:
        # Fall back to a document-wide search.
        tbody = content.find("tbody")
    return tbody


def yield_valid_links(pypi_tbody: bs4.element.Tag) -> Iterable[str]:
    """Get valid links from the pypi table body.

    Note:
        Some of the table rows do not contain packages/links

    Args:
        pypi_tbody: The table body of a Pypi classifier list.

    Yields:
        The non-empty href of the first <a> found in each child tag of
        `pypi_tbody`. They only contain the path from the web root.
        Children that are not tags, have no <a>, or whose <a> has no href
        are skipped.
    """
    for child in pypi_tbody.children:
        # Direct children also include text nodes (whitespace between rows);
        # only real tags can contain a link.
        if not isinstance(child, bs4.element.Tag):
            continue
        anchor = child.find("a")
        if anchor is None:
            continue
        # .get() avoids a KeyError for anchors that carry no href attribute.
        href = anchor.get("href")
        if href:
            yield href


def get_link_contents(pypi_package_href: str) -> dict:
    """Take href of package, get pypi json data.

    Args:
        pypi_package_href: A url path, excluding everything else including
            the host. Should be a url to a pypi package.
            E.g. '/pypi/12factor-vault/'. A missing trailing slash is added
            before the ``json`` suffix is appended.

    Returns:
        A dictionary of the json found for the package.

    Raises:
        ValueError: If the response body is not valid JSON (raised by
            ``json.loads``).
    """
    # The JSON endpoint is "<package path>/json"; without the separator an
    # href such as '/pypi/foo' would turn into '/pypi/foojson'.
    href = pypi_package_href
    if not href.endswith("/"):
        href += "/"
    url = "https://pypi.python.org{href}json".format(href=href)
    response = requests.get(url)
    return json.loads(response.content.decode())

# Here I use the pypi table for python 3 exclusive packages

In [5]:
# Does IO: downloads the listing page for classifier c=595 and keeps its table body.
tbody = get_pypi_table("https://pypi.python.org/pypi?:action=browse&show=all&c=595")

'get_pypi_table': 0.69 sec


# Downloading from pypi is embarrassingly slow without parallelization

Sorry pypi

In [6]:
@timeit
def retrieve_pypi_json_dicts(tbody: bs4.element.Tag, n_jobs: int = 32) -> List[dict]:
    """Download the pypi json data of every package linked from `tbody`.

    Args:
        tbody: The table body of a Pypi classifier list.
        n_jobs: Number of joblib workers used to download concurrently.

    Returns:
        A list with one json dictionary per package link, in link order.
    """
    link_list = yield_valid_links(tbody)
    return Parallel(n_jobs=n_jobs)(
        delayed(get_link_contents)(link) for link in link_list
    )

# IO here: fetches the json of every package linked from the listing.
# This took me about 30 seconds. Without parallelization it takes minutes.
pypi_json_dicts = retrieve_pypi_json_dicts(tbody)


'retrieve_pypi_json_dicts': 34.56 sec


# The code for downloading GitHub info

In [7]:
def normalize_github_path(path: str) -> str:
    """Trim a github url path down to its first two segments.

    Home pages sometimes point below the repository root (a wiki page, a
    sub-directory, ...); everything after the second path segment is dropped.
    Paths that are already that short are returned untouched.

    Args:
        path: The path component of a github url.

    Returns:
        The path cut after its second segment.

    >>> normalize_github_path('/foo/bar/baz')
    '/foo/bar'
    >>> normalize_github_path('/foo/bar')
    '/foo/bar'
    """
    segments = path.split("/")
    # A leading slash produces an empty first piece, so three pieces already
    # cover '/<first>/<second>'.
    if len(segments) <= 3:
        return path
    return "/".join(segments[:3])
    

def yield_valid_github_url(pypi_json_dicts: Iterable[dict]) -> Iterable[str]:
    """Yield homepage urls that are pointing to github.

    Args:
        pypi_json_dicts: A container of json dictionaries for each pypi
            package. Each one must have an "info" dictionary; its
            "home_page" entry may be missing, empty or None.

    Yields:
        A fully qualified ``https://github.com/...`` url, normalized with
        `normalize_github_path`, for each package whose home page host is
        github.com (compared case-insensitively, with or without ``www.``).
    """
    github_hosts = ("github.com", "www.github.com")
    for pypi_json in pypi_json_dicts:
        home_page = pypi_json["info"].get("home_page")
        if not home_page:
            # urlparse(None) returns a bytes result, which would make the
            # str-based path handling below raise TypeError.
            continue
        parse_result = urlparse(home_page)
        # Host names are case-insensitive.
        if parse_result.netloc.lower() not in github_hosts:
            continue
        repo_path = normalize_github_path(parse_result.path)
        yield "https://github.com{}".format(repo_path)

def yield_github_soup(github_urls: Iterable[str]) -> Iterable[Tuple[BeautifulSoup, str]]:
    """Yield github soup for each github url in url list.

    Note:
        At least one of the github links on pypi no longer exists.

    Args:
        github_urls: A container for github repository urls

    Yields:
        A ``(soup, url)`` tuple for every url that was fetched successfully:
        the parsed BeautifulSoup of the page and the url itself. Urls that
        answer with an error status (404, throttling, server errors, ...)
        are skipped and logged at debug level.
    """
    for url in github_urls:
        response = requests.get(url)
        # response.ok is False for every 4xx/5xx status, so error pages are
        # never handed to the scrapers as if they were repository pages.
        if response.ok:
            yield BeautifulSoup(response.content, "lxml"), url
        else:
            log.debug("%s for %s", response.status_code, url)

# No need for parallelization here. GitHub throttles

I tried to use the GitHub JSON API, but immediately hit the rate limit.

In [8]:
@timeit
def get_gh_soup_list(pypi_json_dicts):
    """Fetch the github page of every package whose home page is on github.

    Args:
        pypi_json_dicts: A container of json dictionaries for each pypi package

    Returns:
        A list of ``(soup, url)`` tuples, one per page yielded by
        `yield_github_soup`.
    """
    collected = []
    github_urls = yield_valid_github_url(pypi_json_dicts)
    # Does IO. The loop is slow, so every url is sent to the debug log as its
    # page arrives, which gives an indication of progress.
    for soup, page_url in yield_github_soup(github_urls):
        log.debug("url: %s", page_url)
        collected.append((soup, page_url))
    return collected

# Does IO: downloads and parses one github page per package with a github home page.
gh_soup_list = get_gh_soup_list(pypi_json_dicts)

# All done with retrieving data from urls

'get_gh_soup_list': 392.73 sec


In [9]:
# Compare how many packages were listed with how many github pages were retrieved.
print("There were %s packages" % len(pypi_json_dicts))
print("%s had valid github urls" % len(gh_soup_list))

There were 1109 packages
734 had valid github urls


# The code which scrapes data from Github

Scraping GitHub is non-ideal since the data is somewhat inconsistent. The alternative would be to check each download for unavailable data and retry if any is missing; however, I don't think the missing data is important enough to lengthen the download time any further.

In [10]:
def get_contributors(gh_soup: BeautifulSoup,
                     contributors_regex=re.compile("contributors")) -> str:
    """Scrape the contributor count from a parsed github repository page.

    Args:
        gh_soup: The parsed source of a github repository in a BeautifulSoup
        contributors_regex: Pattern used to locate the text node that
            mentions contributors.

    Returns:
        The stripped text of the element that precedes the matched text
        node, or an empty string when the lookup fails.
    """
    try:
        label_node = gh_soup.find(text=contributors_regex)
        count_text = label_node.previous.strip()
    except (TypeError, AttributeError) as err:
        # Github sometimes doesn't load the contributors for some reason
        log.debug("contributors exception %s", err)
        return ""
    return count_text

def get_starred_count(gh_soup: BeautifulSoup,
                      starred_substring_regex=re.compile('starred')) -> int:
    """Get the number of times a repository has been starred.

    Args:
        gh_soup: The parsed source of a github repository in a BeautifulSoup
        starred_substring_regex: Pattern matched against the ``aria-label``
            attribute of <a> tags to locate the star counter.

    Returns:
        The star count, or 0 when the page has no matching link, in line
        with the other scrapers in this file, which return an empty value
        instead of raising when github did not provide the data.

    Raises:
        ValueError: If the link text cannot be parsed as an integer under
            the current locale (raised by ``locale.atoi``).
    """
    star_link = gh_soup.find("a", attrs={'aria-label': starred_substring_regex})
    if star_link is None:
        log.debug("starred count not found")
        return 0
    # locale.atoi follows the process locale when handling digit grouping.
    return locale.atoi(star_link.text.strip())


def get_github_description(gh_soup: BeautifulSoup) -> str:
    """Scrape the description text from a parsed github repository page.

    Args:
        gh_soup: The parsed source of a github repository in a BeautifulSoup

    Returns:
        The stripped text of the ``<span itemprop="about">`` found inside the
        ``js-details-container`` div, or an empty string when either element
        is missing.
    """
    details_div = gh_soup.find("div", class_="js-details-container")
    # No container usually means there is no description; a missing span
    # occurs when github cannot retrieve it.
    about_tag = None
    if details_div is not None:
        about_tag = details_div.find("span", itemprop="about")
    return "" if about_tag is None else about_tag.text.strip()


def get_last_commit_time(gh_soup: BeautifulSoup) -> str:
    """Scrape the last commit date from a parsed github repository page.

    Args:
        gh_soup: The parsed source of a github repository in a BeautifulSoup

    Returns:
        The ``datetime`` attribute of the first <relative-time> tag,
        reformatted as "<month name> <day> <year>", or an empty string when
        the page has no such tag (which happens for a lot of the urls).
    """
    time_tag = gh_soup.find("relative-time")
    if time_tag is not None:
        parsed = datetime.datetime.strptime(
            time_tag.attrs['datetime'], "%Y-%m-%dT%H:%M:%SZ"
        )
        return parsed.strftime("%B %d %Y")
    # For some of the urls, github didn't load a 'last commit' time.
    return ""

In [11]:
# One (url, stars, description, last_commit, contributors) tuple per page.
github_info = []
# Leave out entries that are missing or whose parsed page is missing.
clean_gh_soup_list = [
    entry for entry in gh_soup_list
    if not (entry is None or entry[0] is None)
]
for gh_soup, url in clean_gh_soup_list:
    stars = get_starred_count(gh_soup)
    description = get_github_description(gh_soup)
    last_commit = get_last_commit_time(gh_soup)
    contributors = get_contributors(gh_soup)
    github_info.append(
        (url, stars, description, last_commit, contributors)
    )

# Organizing the data via pandas

In [12]:
# One row per scraped repository, then ordered from most to fewest stars.
df = pd.DataFrame.from_records(github_info, columns=["url", "stars", "description", "last_commit", "contributors"])
df_sorted = df.sort_values(by="stars", ascending=False)

# Result

In [13]:
# Keep only repositories with more than 100 stars.
df_100stars = df_sorted[df_sorted.stars > 100]
# Bare expression: shown as the output of this notebook cell.
df_100stars

Unnamed: 0,url,stars,description,last_commit,contributors
724,https://github.com/MagicStack/uvloop,2571,Ultra fast implementation of asyncio event loo...,November 28 2016,9.0
247,https://github.com/np1/mps-youtube,2071,Terminal based YouTube player and downloader,November 11 2016,41.0
371,https://github.com/tdryer/hangups,1231,the first third-party instant messaging client...,December 03 2016,28.0
3,https://github.com/yadayada/acd_cli,1113,A command line interface and FUSE filesystem f...,November 24 2016,
450,https://github.com/rasguanabana/ytfs,979,YouTube File System,July 31 2016,
639,https://github.com/cosven/FeelUOwn,913,nothing but the alternate,October 13 2016,12.0
185,https://github.com/BigchainDB/bigchaindb,840,BigchainDB is a scalable blockchain database,December 10 2016,28.0
335,https://github.com/jarun/Buku,807,Powerful command-line bookmark manager. Your m...,December 11 2016,
480,https://github.com/mschwager/dhcpwn,512,All your IPs are belong to us.,,
235,https://github.com/machinalis/iepy,480,Information Extraction in Python,October 13 2016,12.0


# Printing table in markdown

In [14]:
def pandas_df_to_markdown_table(df: pd.DataFrame):
    """Takes a dataframe and creates a markdown object from it.

    The markdown is compatible with reddit's markdown syntax.

    Args:
        df: The dataframe to render.

    Returns:
        An ``IPython.display.Markdown`` object whose text is the pipe
        separated table: the column names, a row of ``---`` separators, then
        one line per dataframe row (the index is not included).

    Source:

    - http://stackoverflow.com/questions/33181846/programmatically-convert-pandas-dataframe-to-markdown-table
    """
    from IPython.display import Markdown
    # Markdown tables need a '---' cell under every header cell; it is
    # prepended as an ordinary first row so that to_csv writes it right
    # after the header line.
    separator_row = pd.DataFrame([['---'] * len(df.columns)], columns=df.columns)
    df_formatted = pd.concat([separator_row, df])
    return Markdown(df_formatted.to_csv(sep="|", index=False))


In [19]:
# Only these four columns go into the table; the contributors column is left out.
markdown_obj = pandas_df_to_markdown_table(df_100stars[["url", "stars", "description", "last_commit"]])

I pasted the markdown to reddit -- 
Here I shortened the url links by injecting markdown links.

In [37]:
# Turn each bare repository url cell into a markdown link labelled with the
# path after the host. The dot in the host name is escaped so that it only
# matches a literal '.'.
print(re.sub(r"(https://github\.com/(.*?))\|", r"[\2](\1)|", markdown_obj.data))

url|stars|description|last_commit
---|---|---|---
[MagicStack/uvloop](https://github.com/MagicStack/uvloop)|2571|Ultra fast implementation of asyncio event loop on top of libuv.|November 28 2016
[np1/mps-youtube](https://github.com/np1/mps-youtube)|2071|Terminal based YouTube player and downloader|November 11 2016
[tdryer/hangups](https://github.com/tdryer/hangups)|1231|the first third-party instant messaging client for Google Hangouts|December 03 2016
[yadayada/acd_cli](https://github.com/yadayada/acd_cli)|1113|A command line interface and FUSE filesystem for Amazon (Cloud) Drive|November 24 2016
[rasguanabana/ytfs](https://github.com/rasguanabana/ytfs)|979|YouTube File System|July 31 2016
[cosven/FeelUOwn](https://github.com/cosven/FeelUOwn)|913|nothing but the alternate|October 13 2016
[BigchainDB/bigchaindb](https://github.com/BigchainDB/bigchaindb)|840|BigchainDB is a scalable blockchain database|December 10 2016
[jarun/Buku](https://github.com/jarun/Buku)|807|Powerful command-lin