In [None]:
import plotly.express
import pandas
import psycopg2


# TODO: move this into a shared place rather than copy-pasting
def call_cached(fn):
    """Return the result of ``fn()``, memoised on disk as a parquet file.

    The cache file is named after ``fn.__name__`` plus a SHA-256 digest of
    ``fn``'s own source text, so editing the function makes the next call
    miss the old file and recompute instead of serving stale data.

    Args:
        fn: Zero-argument callable whose return value has a ``to_parquet``
            method (a ``pandas.DataFrame``). Its source must be retrievable
            with ``inspect.getsource``.

    Returns:
        The frame read from the cache file when that file exists; otherwise
        the fresh result of ``fn()``, which is written to the cache file
        before being returned.
    """
    import inspect
    import hashlib
    import os

    # Hash the source of the function being cached. Hashing a fixed, unrelated
    # function here would give every `fn` the same digest regardless of its code.
    digest = hashlib.sha256(inspect.getsource(fn).encode()).hexdigest()
    cache_filename = f"../cache/{fn.__name__}-{digest}.parquet"

    try:
        return pandas.read_parquet(cache_filename)
    except FileNotFoundError:
        # Cache miss: fall through and compute the data.
        pass

    data = fn()
    # Make sure the cache directory exists so a fresh checkout does not throw
    # away the (potentially expensive) result when the write fails.
    os.makedirs(os.path.dirname(cache_filename), exist_ok=True)
    data.to_parquet(cache_filename)
    # TODO:
    # * Delete every file matching f"../cache/{fn.__name__}-*.parquet"
    #   other than `cache_filename`.
    #   (or touch the file we used, if we want an LRU cache with size bigger than 1)
    # * Log cache misses and timings.
    return data

# TODO: move this into a shared place rather than copy-pasting
def get_downloads_data():
    """Query per-version download counts for 2021-03-29 from the ``cratesio`` database.

    Connects with psycopg2 using only the database name, joins
    ``version_downloads`` to ``versions`` and ``crates``, and keeps the rows
    whose ``date`` is '2021-03-29'.

    Returns:
        pandas.DataFrame with columns ``package_name``, ``package_version``
        and ``downloads``, ordered by package name and then version.
    """
    query = """
        select
            c.name as package_name, v.num as package_version, d.downloads
        from
            version_downloads as d
        join
            versions as v on v.id = d.version_id
        join
            crates as c on c.id = v.crate_id
        where
            date = '2021-03-29'
        order by
            package_name, package_version
        ;
    """
    conn = psycopg2.connect(
        database="cratesio",
    )
    try:
        downloads = pandas.read_sql_query(query, conn)
    finally:
        # Close explicitly: psycopg2's `with conn:` only ends the transaction
        # and would leave the connection open.
        conn.close()
    return downloads

# Load the download counts; `call_cached` returns the parquet copy when one
# exists and only runs the database query otherwise.
downloads = call_cached(get_downloads_data)

# Bare trailing expression so the notebook renders the frame as the cell output.
downloads

In [None]:
# Order the rows from most to least downloaded; `downloads` itself is left untouched.
sorted_downloads = downloads.sort_values(
    by='downloads',
    ascending=False,
)

In [None]:
# Running total of downloads, accumulated in the current row order of
# `sorted_downloads`. Re-running this cell just overwrites the column with
# the same values.
cumulative_downloads = sorted_downloads['downloads'].cumsum()
sorted_downloads['cumulative_downloads'] = cumulative_downloads

# Take the grand total from sum() rather than the last cumulative value:
# `.iloc[-1]` raises IndexError on an empty frame and is NaN when the final
# row is null, which would silently select nothing.
total_downloads = sorted_downloads['downloads'].sum()

# Rows whose running total is still strictly below a quarter of all downloads.
in_25_percent_downloads = cumulative_downloads < (total_downloads / 4)

sorted_downloads[in_25_percent_downloads]

In [None]:
# Scatter each selected row's own download count against the running total,
# with the package name shown on hover. Left as the trailing expression so the
# notebook displays the figure.
plotly.express.scatter(
    sorted_downloads[in_25_percent_downloads],
    x='downloads',
    y='cumulative_downloads',
    hover_name='package_name',
)