In [1]:
import aggregate

# Invoke the aggregate module's main() entry point before any data is loaded.
# NOTE(review): presumably this (re)generates the CSV files at aggregate.PATH and
# aggregate.CATEGORY_USAGE_PATH that the next cell reads — confirm.
aggregate.main()

In [2]:
import altair as alt
import pandas as pd

import aggregate

# Module-level table; the two commit-date columns are parsed into datetimes on load.
df = pd.read_csv(
    aggregate.PATH,
    parse_dates=["oldest_commit", "newest_commit"],
)

# Category-level usage table (used by the "By category" charts).
category_usage_df = pd.read_csv(aggregate.CATEGORY_USAGE_PATH)

# Latest value found in the newest_commit column.
newest_commit = df["newest_commit"].max()

# Whole days elapsed between each row's oldest and newest commit.
df["age"] = df["newest_commit"].sub(df["oldest_commit"]).dt.days


# By module

## Number of public modules

In [3]:
# Number of rows in the module table.
df.shape[0]

208

## Modules required to run `pass`

In [4]:
# Keep only the rows whose `required` flag is set, report how many there are,
# then display the filtered table as the cell's output.
required_modules = df.loc[df["required"]]
print(f"{len(required_modules)} modules required")
required_modules

14 modules required


Unnamed: 0,name,required,category,project_count,oldest_commit,newest_commit,commit_count,age
1,_thread,True,concurrency,80,2008-05-25,2021-02-19,119,4653
2,abc,True,python,604,2007-06-14,2020-10-26,50,4883
28,codecs,True,binary,767,2000-03-10,2021-01-06,175,7607
52,encodings,True,binary,36,2000-03-10,2021-01-08,307,7609
80,imp,True,superseded,202,1990-10-14,2021-02-19,799,11086
81,importlib,True,modules,600,2009-01-18,2021-03-04,490,4428
83,io,True,allos,1542,2007-02-27,2021-02-22,719,5109
95,marshal,True,persistence,49,1991-06-04,2020-12-01,243,10773
120,posix,True,unix,12,1990-10-14,2021-02-04,1171,11071
145,signal,True,ipc,354,1994-05-11,2021-03-05,247,9795


## Modules sorted by how many of the top 4000 most downloaded projects on PyPI use the module

In [5]:
# Bar chart with one bar per module name; bars are ordered by descending y value
# (project_count) and the tooltip shows the name and its count.
# `alt` is provided by the notebook's import cell instead of being re-imported here.
alt.Chart(df).mark_bar().encode(
    alt.X("name:N", sort="-y"),
    alt.Y("project_count:Q"),
    tooltip=["name", "project_count"],
).properties(
    title="Modules by project usage count",
)

## Modules sorted by commit count

In [6]:
# Bar chart with one bar per module name; bars are ordered by descending y value
# (commit_count) and the tooltip shows the name and its count.
# `alt` is provided by the notebook's import cell instead of being re-imported here.
alt.Chart(df).mark_bar().encode(
    alt.X("name:N", sort="-y"),
    alt.Y("commit_count:Q"),
    tooltip=["name", "commit_count"],
).properties(
    title="Modules by commit count",
)

## Modules sorted by average number of days between commits

In [7]:
# Derived column: each row's age divided by its commit count.
df["age_commit_ratio"] = df["age"] / df["commit_count"]

# Bar chart of the derived ratio per module name, ordered by descending y value.
# `alt` is provided by the notebook's import cell instead of being re-imported here.
alt.Chart(df).mark_bar().encode(
    alt.X("name:N", sort="-y"),
    alt.Y("age_commit_ratio:Q"),
    tooltip=["name", "age_commit_ratio"],
).properties(
    title="Modules by age / commit count",
)

## Modules sorted by project usage / commit count

In [8]:
# Derived column: each row's project count divided by its commit count.
df["usage_commit_ratio"] = df["project_count"] / df["commit_count"]

# Bar chart of the derived ratio per module name, ordered by descending y value.
# `alt` is provided by the notebook's import cell instead of being re-imported here.
alt.Chart(df).mark_bar().encode(
    alt.X("name:N", sort="-y"),
    alt.Y("usage_commit_ratio:Q"),
    tooltip=["name", "usage_commit_ratio"],
).properties(
    title="Modules by project usage / commit count",
)

## Modules sorted by project usage / age

In [9]:
# Derived column: each row's project count divided by its age.
df["usage_age_ratio"] = df["project_count"] / df["age"]

# Bar chart of the derived ratio per module name, ordered by descending y value.
# `alt` is provided by the notebook's import cell instead of being re-imported here.
alt.Chart(df).mark_bar().encode(
    alt.X("name:N", sort="-y"),
    alt.Y("usage_age_ratio:Q"),
    tooltip=["name", "usage_age_ratio"],
).properties(
    title="Modules by project usage / age",
)

# By category

## Module categories sorted by how many of the top 4000 most downloaded projects on PyPI use a module from that category

In [10]:
# Bar chart with one bar per category name; bars are ordered by descending y value
# (project_count) and the tooltip shows the name and its count.
# `alt` is provided by the notebook's import cell instead of being re-imported here.
alt.Chart(category_usage_df).mark_bar().encode(
    alt.X("name:N", sort="-y"),
    alt.Y("project_count:Q"),
    tooltip=["name", "project_count"],
).properties(
    title="Categories by project usage",
)

## Module categories sorted by project usage / commit count

In [11]:
# Derived column: each category's project count divided by its `commits` value.
category_usage_df["usage_commit_ratio"] = (
    category_usage_df["project_count"] / category_usage_df["commits"]
)

# Bar chart of the derived ratio per category name, ordered by descending y value.
# `alt` is provided by the notebook's import cell instead of being re-imported here.
alt.Chart(category_usage_df).mark_bar().encode(
    alt.X("name:N", sort="-y"),
    alt.Y("usage_commit_ratio:Q"),
    tooltip=["name", "usage_commit_ratio"],
).properties(
    title="Categories by project usage / commit count",
)