# Analysing mybinder.org launches

The first few cells download and massage the data. Later on we answer questions on which repositories are popular and such.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import datetime

import pandas as pd

In [2]:
index = pd.read_json("https://archive.analytics.mybinder.org/index.jsonl",
                     lines=True)

In [3]:
# get all days since start of December 2018
now = datetime.datetime.now()
n = (now - datetime.datetime(2018, 12, 1)).days

frames = []
for idx, day in index.sort_index(ascending=False).iterrows():
    df = pd.read_json("https://archive.analytics.mybinder.org/{}".format(day['name']),
                      lines=True, )
    frames.append(df)
    if len(frames) > n:
        break
        
print(f"Fetched data for {n} days.")

Fetched data for 89 days.


In [4]:
df = pd.concat(frames)

In [5]:
# make it easier to grab the ref
df['repo'] = df['spec'].apply(lambda s: s.rsplit("/", 1)[0])
df['org'] = df['spec'].apply(lambda s: s.split("/", 1)[0])
df['ref'] = df['spec'].apply(lambda s: s.rsplit("/", 1)[1])

In [6]:
# take a look at the data, does it look sensible?
df.sample(10)

Unnamed: 0,provider,schema,spec,status,timestamp,version,repo,org,ref
9162,GitHub,binderhub.jupyter.org/launch,ysalaun1/snt/master,success,2018-12-20 22:50:00,1,ysalaun1/snt,ysalaun1,master
8723,GitHub,binderhub.jupyter.org/launch,Microsoft/cognitive-services-notebooks/master,success,2018-12-07 17:46:00,1,Microsoft/cognitive-services-notebooks,Microsoft,master
11367,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2018-12-13 15:40:00,1,ipython/ipython-in-depth,ipython,master
16282,GitHub,binderhub.jupyter.org/launch,rationalmatter/juno-demo-notebooks/master,success,2019-01-23 23:52:00,1,rationalmatter/juno-demo-notebooks,rationalmatter,master
6428,GitHub,binderhub.jupyter.org/launch,bokeh/bokeh-notebooks/master,success,2019-01-24 08:37:00,1,bokeh/bokeh-notebooks,bokeh,master
8674,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2019-01-18 15:45:00,1,ipython/ipython-in-depth,ipython,master
16617,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2019-02-11 23:03:00,1,ipython/ipython-in-depth,ipython,master
13127,GitHub,binderhub.jupyter.org/launch,rationalmatter/juno-demo-notebooks/master,success,2019-02-08 21:16:00,1,rationalmatter/juno-demo-notebooks,rationalmatter,master
5193,GitHub,binderhub.jupyter.org/launch,jupyterlab/jupyterlab-demo/master,success,2019-02-23 15:30:00,1,jupyterlab/jupyterlab-demo,jupyterlab,master
1864,GitHub,binderhub.jupyter.org/launch,jupyterlab/jupyterlab-demo/master,success,2019-02-13 04:16:00,1,jupyterlab/jupyterlab-demo,jupyterlab,master


In [7]:
df = df.drop(columns=['schema', 'version', 'spec'])

In [8]:
df.head()

Unnamed: 0,provider,status,timestamp,repo,org,ref
0,GitHub,success,2019-02-28,binder-examples/julia-python,binder-examples,master
1,GitHub,success,2019-02-28,ipython/ipython-in-depth,ipython,master
2,GitHub,success,2019-02-28,ipython/ipython-in-depth,ipython,master
3,GitHub,success,2019-02-28,takluyver/mobilechelonian,takluyver,master
4,GitHub,success,2019-02-28,ipython/ipython-in-depth,ipython,master


In [9]:
# Sneak peek: total launches!
df.shape

(999979, 6)

In [10]:
# add a nnew column showing total launches per repo
totals_per_repo = (df.groupby(["repo"])
 .size()
 .reset_index(name='repo_counts'))
totals_per_repo.head()

Unnamed: 0,repo,repo_counts
0,00251716/juliasets,4
1,00quanta/practicalAI,1
2,00quanta/text,5
3,0Shie0/Study,1
4,1-Nameless-1/Lign167.git,21


In [11]:
# add a nnew column showing total launches per org
totals_per_org = (df.groupby(["org"])
 .size()
 .reset_index(name='org_counts'))
totals_per_org.head()

Unnamed: 0,org,org_counts
0,00251716,4
1,00quanta,6
2,0Shie0,1
3,1-Nameless-1,21
4,10446012,8


In [12]:
df_ = pd.merge(df, totals_per_repo, on='repo')
df_ = pd.merge(df_, totals_per_org, on='org')
df_.sample(10)

Unnamed: 0,provider,status,timestamp,repo,org,ref,repo_counts,org_counts
452246,GitHub,success,2018-12-11 12:29:00,ipython/ipython-in-depth,ipython,master,471795,472453
136146,GitHub,success,2019-02-13 09:20:00,ipython/ipython-in-depth,ipython,master,471795,472453
747235,GitHub,success,2019-02-25 23:52:00,QuantStack/xeus-cling,QuantStack,stable,13266,15039
355335,GitHub,success,2019-01-06 06:16:00,ipython/ipython-in-depth,ipython,master,471795,472453
225697,GitHub,success,2019-01-30 15:06:00,ipython/ipython-in-depth,ipython,master,471795,472453
752972,GitHub,success,2019-01-22 10:07:00,QuantStack/xeus-cling,QuantStack,stable,13266,15039
483019,GitHub,success,2018-12-05 16:10:00,ipython/ipython-in-depth,ipython,master,471795,472453
342186,GitHub,success,2019-01-09 20:42:00,ipython/ipython-in-depth,ipython,master,471795,472453
317033,GitHub,success,2019-01-14 09:17:00,ipython/ipython-in-depth,ipython,master,471795,472453
648366,GitHub,success,2019-01-25 18:18:00,DS-100/textbook,DS-100,master,32741,32744


## Popular repositories and their branches

In [13]:
(df_.groupby(["org", "repo", "ref", "repo_counts", "org_counts"])
 .size()
 # give the column a nice name
 .reset_index(name='ref_counts')
 # sort first by total launches, then within a repo by ref launches
 .sort_values(['org_counts', 'repo_counts', 'ref_counts'],
              ascending=[False,False, False])
 .set_index(["org", 'repo', 'ref'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,repo_counts,org_counts,ref_counts
org,repo,ref,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ipython,ipython/ipython-in-depth,master,471795,472453,471795
ipython,ipython/ipython,6.x,652,472453,605
ipython,ipython/ipython,master,652,472453,36
ipython,ipython/ipython,1.x,652,472453,7
ipython,ipython/ipython,2.x,652,472453,4
ipython,ipython/ipyparallel,master,3,472453,3
ipython,ipython/ipynb,master,1,472453,1
ipython,ipython/ipython.git,master,1,472453,1
ipython,ipython/ipywidgets,master,1,472453,1
jupyterlab,jupyterlab/jupyterlab-demo,master,120026,120154,117887


## Where are repositories hosted?

In [14]:
(df.groupby("provider")
   .size()
   .reset_index(name='counts')
   .sort_values('counts', ascending=False))

Unnamed: 0,provider,counts
1,GitHub,995605
2,GitLab,3682
0,Git,692


## Estimate number of unique repositories

Expect the raw number of launches to be bigger than the number of repositories launched more than once. Those launched only once might have been accidents.

In [15]:
len(set(df.repo))

8882

In [16]:
len(set(df_.repo[df_.total_counts>1]))

AttributeError: 'DataFrame' object has no attribute 'total_counts'