# Analysing mybinder.org launches

The first few cells download and massage the data. Later on we answer questions on which repositories are popular and such.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import datetime

import pandas as pd

In [2]:
index = pd.read_json("https://archive.analytics.mybinder.org/index.jsonl",
                     lines=True)

In [3]:
# get all days since start of December 2018
now = datetime.datetime.now()
n = (now - datetime.datetime(2018, 12, 1)).days

frames = []
for idx, day in index.sort_index(ascending=False).iterrows():
    df = pd.read_json("https://archive.analytics.mybinder.org/{}".format(day['name']),
                      lines=True, )
    frames.append(df)
    if len(frames) > n:
        break
        
print(f"Fetched data for {n} days.")

Fetched data for 88 days.


In [17]:
df = pd.concat(frames)

In [18]:
# make it easier to grab the ref
df['repo'] = df['spec'].apply(lambda s: s.rsplit("/", 1)[0])
df['org'] = df['spec'].apply(lambda s: s.split("/", 1)[0])
df['ref'] = df['spec'].apply(lambda s: s.rsplit("/", 1)[1])

In [19]:
# take a look at the data, does it look sensible?
df.sample(10)

Unnamed: 0,provider,schema,spec,status,timestamp,version,repo,org,ref
6218,GitHub,binderhub.jupyter.org/launch,binder-examples/jupyter-rise/master,success,2018-12-10 12:42:00,1,binder-examples/jupyter-rise,binder-examples,master
9243,GitHub,binderhub.jupyter.org/launch,NumEconCopenhagen/exercises-2019/master,success,2019-02-27 14:12:00,1,NumEconCopenhagen/exercises-2019,NumEconCopenhagen,master
9822,GitHub,binderhub.jupyter.org/launch,binder-examples/r/master,success,2019-01-14 17:13:00,1,binder-examples/r,binder-examples,master
1688,GitHub,binderhub.jupyter.org/launch,aprashant1/widget2/master,success,2019-02-08 04:21:00,1,aprashant1/widget2,aprashant1,master
10751,GitHub,binderhub.jupyter.org/launch,mlrubio/medcomp_1/master,success,2019-01-29 16:26:00,1,mlrubio/medcomp_1,mlrubio,master
9881,GitHub,binderhub.jupyter.org/launch,jupyterlab/jupyterlab-demo/master,success,2019-02-04 16:46:00,1,jupyterlab/jupyterlab-demo,jupyterlab,master
5215,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2019-01-27 17:32:00,1,ipython/ipython-in-depth,ipython,master
2188,GitHub,binderhub.jupyter.org/launch,stencila/examples/elife-30274-binder,success,2019-02-23 06:56:00,1,stencila/examples,stencila,elife-30274-binder
5761,GitHub,binderhub.jupyter.org/launch,jupyterlab/jupyterlab-demo/master,success,2019-01-14 12:19:00,1,jupyterlab/jupyterlab-demo,jupyterlab,master
12111,GitHub,binderhub.jupyter.org/launch,StarkDmt/Computational-Physics/master,success,2019-02-27 17:45:00,1,StarkDmt/Computational-Physics,StarkDmt,master


In [20]:
df = df.drop(columns=['schema', 'version', 'spec'])

In [21]:
df.head()

Unnamed: 0,provider,status,timestamp,repo,org,ref
0,GitHub,success,2019-02-27 00:00:00,QISKit/qiskit-tutorial,QISKit,master
1,GitHub,success,2019-02-27 00:00:00,DS-100/textbook,DS-100,master
2,GitHub,success,2019-02-27 00:00:00,ipython/ipython-in-depth,ipython,master
3,GitHub,success,2019-02-27 00:01:00,jupyterlab/jupyterlab-demo,jupyterlab,18a9793b58ba86660b5ab964e1aeaf7324d667c8
4,GitHub,success,2019-02-27 00:01:00,jupyterlab/jupyterlab-demo,jupyterlab,master


In [22]:
# Sneak peek: total launches!
df.shape

(996027, 6)

In [24]:
# add a nnew column showing total launches per repo
totals = (df.groupby(["repo"])
 .size()
 .reset_index(name='total_counts'))
totals.head()

Unnamed: 0,repo,total_counts
0,00251716/juliasets,4
1,00quanta/practicalAI,1
2,00quanta/text,5
3,0Shie0/Study,1
4,1-Nameless-1/Lign167.git,21


In [25]:
df_ = pd.merge(df, totals, on='repo')
df_.sample(10)

Unnamed: 0,provider,status,timestamp,repo,org,ref,total_counts
9436,GitHub,success,2018-12-09 14:10:00,QISKit/qiskit-tutorial,QISKit,master,10228
865132,GitHub,success,2019-02-05 03:07:00,losc-tutorial/LOSC_Event_tutorial,losc-tutorial,master,349
692401,GitHub,success,2019-01-13 17:00:00,numba/numba-examples,numba,master,7081
257703,GitHub,success,2019-01-25 00:12:00,ipython/ipython-in-depth,ipython,master,470476
329569,GitHub,success,2019-01-13 11:32:00,ipython/ipython-in-depth,ipython,master,470476
900289,GitHub,success,2019-01-11 17:09:00,AStrittmatter/CDA,AStrittmatter,master,356
536137,GitHub,success,2019-02-14 22:35:00,jupyterlab/jupyterlab-demo,jupyterlab,master,119539
982415,GitHub,success,2019-01-11 19:27:00,wiringa/Spatial-Data-Sources-and-Tools,wiringa,master,61
516464,GitHub,success,2019-02-26 17:09:00,jupyterlab/jupyterlab-demo,jupyterlab,master,119539
135797,GitHub,success,2019-02-14 10:52:00,ipython/ipython-in-depth,ipython,master,470476


## Popular repositories and their branches

In [34]:
(df_.groupby(["repo", "ref", "total_counts", "org"])
 .size()
 # give the column a nice name
 .reset_index(name='counts')
 # sort first by total launches, then within a repo by ref launches
 .sort_values(['total_counts', 'counts'], ascending=[False, False])
 .set_index(["org", 'repo'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,ref,total_counts,counts
org,repo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ipython,ipython/ipython-in-depth,master,470476,470476
jupyterlab,jupyterlab/jupyterlab-demo,master,119539,117414
jupyterlab,jupyterlab/jupyterlab-demo,18a9793b58ba86660b5ab964e1aeaf7324d667c8,119539,2113
jupyterlab,jupyterlab/jupyterlab-demo,0b0bb42e3e43ee2ebe1c0424d3a88a9b9edcd055,119539,11
jupyterlab,jupyterlab/jupyterlab-demo,c9df996a3bd27d9715de6da51b3c52d35def90f9,119539,1
ines,ines/spacy-io-binder,live,41613,41523
ines,ines/spacy-io-binder,master,41613,74
ines,ines/spacy-io-binder,nightly,41613,16
DS-100,DS-100/textbook,master,32124,32124
bokeh,bokeh/bokeh-notebooks,master,21645,21645


## Where are repositories hosted?

In [13]:
(df.groupby("provider")
   .size()
   .reset_index(name='counts')
   .sort_values('counts', ascending=False))

Unnamed: 0,provider,counts
1,GitHub,991661
2,GitLab,3677
0,Git,689


## Estimate number of unique repositories

Expect the raw number of launches to be bigger than the number of repositories launched more than once. Those launched only once might have been accidents.

In [14]:
len(set(df.repo))

8856

In [15]:
len(set(df_.repo[df_.total_counts>1]))

5891