In [1]:
import aggregate

# Run the aggregate module's entry point before any data is loaded.
# NOTE(review): presumably this (re)generates the CSV files found at
# aggregate.PATH and aggregate.CATEGORY_USAGE_PATH, which the next cell
# reads — confirm against aggregate.py.
aggregate.main()

In [2]:
import pandas as pd
import aggregate

# Per-category statistics.
category_usage_df = pd.read_csv(aggregate.CATEGORY_USAGE_PATH)

# Per-module statistics. Both commit columns are parsed as datetimes so that
# they can be subtracted from one another below.
df = pd.read_csv(
    aggregate.PATH,
    parse_dates=["oldest_commit", "newest_commit"],
)

# Latest value found in the newest_commit column across all rows.
newest_commit = df["newest_commit"].max()

# Whole days elapsed between each row's oldest_commit and newest_commit.
df["age"] = df["newest_commit"].sub(df["oldest_commit"]).dt.days


# By module

## Number of public modules
How big is the stdlib?

In [3]:
# Row count of the per-module table.
len(df.index)

208

## Modules required to run `pass`
How big is the bare minimum stdlib to CPython?

In [4]:
# Keep only the rows whose boolean "required" flag is set.
required_modules = df.loc[df["required"]]
print(required_modules.shape[0], "modules required")
# Bare last expression so the frame is rendered as the cell output.
required_modules

14 modules required


Unnamed: 0,name,required,category,project_count,oldest_commit,newest_commit,commit_count,pr_count,age
1,_thread,True,concurrency,80,2008-05-25,2021-02-19,119,10,4653
2,abc,True,python,604,2007-06-14,2020-10-26,50,2,4883
28,codecs,True,binary,767,2000-03-10,2021-01-06,175,2,7607
52,encodings,True,binary,36,2000-03-10,2021-01-08,307,9,7609
80,imp,True,superseded,202,1990-10-14,2021-02-19,799,5,11086
81,importlib,True,modules,600,2009-01-18,2021-03-04,490,11,4428
83,io,True,allos,1542,2007-02-27,2021-02-22,719,27,5109
95,marshal,True,persistence,49,1991-06-04,2020-12-01,243,7,10773
120,posix,True,unix,12,1990-10-14,2021-02-04,1171,31,11071
145,signal,True,ipc,354,1994-05-11,2021-03-05,247,2,9795


## Modules with no usage in the 4000 most downloaded projects on PyPI over a year

In [5]:
# Rows whose project_count is exactly zero.
df.loc[df["project_count"].eq(0)]

Unnamed: 0,name,required,category,project_count,oldest_commit,newest_commit,commit_count,pr_count,age
4,antigravity,False,eastereggs,0,2008-10-19,2020-04-14,5,0,4195
16,binhex,False,netdata,0,1995-08-07,2020-11-01,48,0,9218
24,chunk,False,mm,0,1999-06-09,2017-04-05,20,0,6510
69,graphlib,False,datatypes,0,2020-06-01,2020-10-12,2,0,133
94,mailcap,False,netdata,0,1995-09-30,2020-04-02,22,0,8951
104,nis,False,unix,0,1992-08-12,2020-06-12,44,2,10166
127,pyclbr,False,language,0,1995-07-28,2021-02-01,53,1,9320
184,turtledemo,False,frameworks,0,2010-11-01,2020-04-29,39,0,3467
198,winsound,False,windows,0,1999-02-04,2017-11-05,34,0,6849


## Modules sorted by project usage
What modules are directly used the most*?

In [6]:
import altair as alt

# Horizontal bars, one per module with a non-zero project_count, ordered by
# descending bar length and coloured by category.
alt.Chart(df.loc[df["project_count"].gt(0)]).mark_bar().encode(
    y=alt.Y("name:N", sort="-x"),
    x=alt.X("project_count:Q"),
    color=alt.Color("category"),
    tooltip=["name", "project_count"],
).properties(title="Modules by project usage count")

## Modules sorted by commit count
What's the cost* of a module so far?

In [7]:
import altair as alt

# Horizontal bars for every module, ordered by descending commit_count and
# coloured by category.
alt.Chart(df).mark_bar().encode(
    y=alt.Y("name:N", sort="-x"),
    x=alt.X("commit_count:Q"),
    color=alt.Color("category"),
    tooltip=["name", "commit_count"],
).properties(title="Modules by commit count")

## Modules sorted by age / commits
What modules are the cheapest* to maintain over time*?

In [8]:
import altair as alt

# Days of age per commit: a larger value means fewer commits over the same
# span of time.
df["age_commit_ratio"] = df["age"].div(df["commit_count"])

# Horizontal bars for every module, ordered by descending ratio.
alt.Chart(df).mark_bar().encode(
    y=alt.Y("name:N", sort="-x"),
    x=alt.X("age_commit_ratio:Q"),
    color=alt.Color("category"),
    tooltip=["name", "age_commit_ratio"],
).properties(title="Modules by age / commit count")

## Modules sorted by project usage / commit count
What's a module's sunk cost worth* to the public*?

In [9]:
import altair as alt

# Number of using projects per commit made to the module.
df["usage_commit_ratio"] = df["project_count"].div(df["commit_count"])

# Only modules with a strictly positive ratio are drawn.
alt.Chart(df.loc[df["usage_commit_ratio"].gt(0)]).mark_bar().encode(
    y=alt.Y("name:N", sort="-x"),
    x=alt.X("usage_commit_ratio:Q"),
    color=alt.Color("category"),
    tooltip=["name", "usage_commit_ratio"],
).properties(title="Modules by project usage / commit count")

## Modules sorted by project usage / age
What's a module's popularity* relative to how long it has existed*?

I.e., try to take length of availability out of popularity.

In [10]:
import altair as alt

# Number of using projects per day of the module's age.
df["usage_age_ratio"] = df["project_count"].div(df["age"])

# Only modules with a strictly positive ratio are drawn.
alt.Chart(df.loc[df["usage_age_ratio"].gt(0)]).mark_bar().encode(
    y=alt.Y("name:N", sort="-x"),
    x=alt.X("usage_age_ratio:Q"),
    color=alt.Color("category"),
    tooltip=["name", "usage_age_ratio"],
).properties(title="Modules by project usage / age")

## Modules with no open PRs

In [11]:
# Rows whose pr_count is exactly zero.
df.loc[df["pr_count"].eq(0)]

Unnamed: 0,name,required,category,project_count,oldest_commit,newest_commit,commit_count,pr_count,age,age_commit_ratio,usage_commit_ratio,usage_age_ratio
4,antigravity,False,eastereggs,0,2008-10-19,2020-04-14,5,0,4195,839.0,0.0,0.0
11,atexit,False,python,256,2007-03-21,2021-02-19,25,0,5084,203.36,10.24,0.050354
16,binhex,False,netdata,0,1995-08-07,2020-11-01,48,0,9218,192.041667,0.0,0.0
17,bisect,False,datatypes,125,1992-09-02,2020-10-19,42,0,10274,244.619048,2.97619,0.012167
24,chunk,False,mm,0,1999-06-09,2017-04-05,20,0,6510,325.5,0.0,0.0
29,codeop,False,custominterp,9,1998-10-22,2021-02-13,23,0,8150,354.347826,0.391304,0.001104
31,colorsys,False,mm,34,1992-09-07,2020-11-28,10,0,10309,1030.9,3.4,0.003298
36,contextvars,False,concurrency,39,2018-01-22,2020-11-04,5,0,1017,203.4,7.8,0.038348
37,copy,False,datatypes,1002,1995-01-10,2020-05-28,78,0,9270,118.846154,12.846154,0.108091
39,crypt,False,unix,12,2011-02-22,2020-02-17,22,0,3282,149.181818,0.545455,0.003656


## Modules by open PR count
Which modules have the most open PRs*?

In [12]:
import altair as alt

# Horizontal bars, one per module with at least one open PR, ordered by
# descending pr_count and coloured by category.
alt.Chart(df.loc[df["pr_count"].gt(0)]).mark_bar().encode(
    y=alt.Y("name:N", sort="-x"),
    x=alt.X("pr_count:Q"),
    color=alt.Color("category"),
    tooltip=["name", "pr_count"],
).properties(title="Modules by open PR count")

## Modules by usage / open PR count
How is a module's popularity* in terms of open PR count*?

In [13]:
import altair as alt
import numpy as np

# Number of using projects per open PR.
df["usage_pr_ratio"] = df["project_count"].div(df["pr_count"])

# Keep only rows whose ratio compares below +inf; the comparison is False
# for both inf and NaN, so those rows are dropped from the chart.
alt.Chart(df.loc[df["usage_pr_ratio"].lt(np.inf)]).mark_bar().encode(
    y=alt.Y("name:N", sort="-x"),
    x=alt.X("usage_pr_ratio:Q"),
    color=alt.Color("category"),
    tooltip=["name", "usage_pr_ratio"],
).properties(title="Modules by project usage / open PRs")

# By category

## Categories sorted by usage by the top 4000 downloaded projects
Which categories are used the most*?

In [14]:
import altair as alt

# Vertical bars, one per category, ordered by descending project_count.
alt.Chart(category_usage_df).mark_bar().encode(
    x=alt.X("name:N", sort="-y"),
    y=alt.Y("project_count:Q"),
    tooltip=["name", "project_count"],
).properties(title="Categories by project usage")

## Categories by project usage / commit count
Which categories get the most usage* based on their sunk cost*?

In [15]:
import altair as alt

# Number of using projects per commit, computed per category.
category_usage_df["usage_commit_ratio"] = category_usage_df["project_count"].div(
    category_usage_df["commits"]
)

# Vertical bars, one per category, ordered by descending ratio.
alt.Chart(category_usage_df).mark_bar().encode(
    x=alt.X("name:N", sort="-y"),
    y=alt.Y("usage_commit_ratio:Q"),
    tooltip=["name", "usage_commit_ratio"],
).properties(title="Categories by project usage / commit count")

In [16]:
import altair as alt

# Box plot of the per-module usage / commit ratio, one box per category.
# Modules whose ratio is not strictly positive are left out.
alt.Chart(df.loc[df["usage_commit_ratio"].gt(0)]).mark_boxplot().encode(
    y=alt.Y("category:N", sort="-x"),
    x=alt.X("usage_commit_ratio:Q"),
    tooltip=["name", "usage_commit_ratio"],
).properties(title="Modules by project usage / commit count")

In [17]:
import altair as alt

# Same data as the box plot of this ratio, drawn as one point per module so
# individual modules can be inspected through the tooltip.
alt.Chart(df.loc[df["usage_commit_ratio"].gt(0)]).mark_point().encode(
    y=alt.Y("category:N", sort="-x"),
    x=alt.X("usage_commit_ratio:Q"),
    tooltip=["name", "usage_commit_ratio"],
).properties(title="Modules by project usage / commit count")

## Categories by open PR count
Which categories are the most neglected*?

In [18]:
import altair as alt

# Vertical bars, one per category, ordered by descending open-PR count.
# The chart is built from the per-category frame, so the title names
# categories rather than modules.
alt.Chart(category_usage_df).mark_bar().encode(
    alt.X("name:N", sort="-y"),
    alt.Y("pr_count:Q"),
    tooltip=["name", "pr_count"]
).properties(
    title="Categories by open PR count"
)

## Categories by usage / open PRs
Which categories get used the most in spite of their open PR count?

In [19]:
import altair as alt
# Imported here so the cell does not rely on another cell having bound `np`.
import numpy as np

# Number of using projects per open PR, computed per category.
category_usage_df["usage_pr_ratio"] = category_usage_df["project_count"] / category_usage_df["pr_count"]

# Plot the per-category frame that the ratio was just added to (not the
# per-module `df`). Rows whose ratio does not compare below +inf (inf or NaN,
# e.g. from a zero pr_count) are excluded.
alt.Chart(category_usage_df[category_usage_df["usage_pr_ratio"] < np.inf]).mark_bar().encode(
    alt.X("name:N", sort="-y"),
    alt.Y("usage_pr_ratio:Q"),
    tooltip=["name", "usage_pr_ratio"]
).properties(
    title="Categories by project usage / open PRs"
)

In [20]:
import altair as alt
import numpy as np

# Box plot of the per-module usage / open-PR ratio, one box per category.
# Rows whose ratio does not compare below +inf are left out.
alt.Chart(df.loc[df["usage_pr_ratio"].lt(np.inf)]).mark_boxplot().encode(
    y=alt.Y("category:N", sort="-x"),
    x=alt.X("usage_pr_ratio:Q"),
    tooltip=["name", "usage_pr_ratio"],
).properties(title="Modules by project usage / open PRs")

In [21]:
import altair as alt
import numpy as np

# One point per module, grouped by category, so individual modules can be
# inspected through the tooltip. Rows whose ratio does not compare below
# +inf are left out.
alt.Chart(df.loc[df["usage_pr_ratio"].lt(np.inf)]).mark_point().encode(
    y=alt.Y("category:N", sort="-x"),
    x=alt.X("usage_pr_ratio:Q"),
    tooltip=["name", "usage_pr_ratio"],
).properties(title="Modules by project usage / open PRs")