# Analysing mybinder.org launches

The first few cells download and massage the data. Later on we answer questions on which repositories are popular and such.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import datetime

import pandas as pd

In [2]:
index = pd.read_json("https://archive.analytics.mybinder.org/index.jsonl",
                     lines=True)

In [3]:
# get all days since start of 2019
now = datetime.datetime.now()
n = (now - datetime.datetime(2019, 1, 1)).days

frames = []
for idx, day in index.sort_index(ascending=False).iterrows():
    df = pd.read_json("https://archive.analytics.mybinder.org/{}".format(day['name']),
                      lines=True, )
    frames.append(df)
    if len(frames) > n:
        break
        
print(f"Fetched data for {n} days.")

Fetched data for 59 days.


In [4]:
df = pd.concat(frames)

In [5]:
# make it easier to grab the ref
df['repo'] = df['spec'].apply(lambda s: s.rsplit("/", 1)[0])
df['org'] = df['spec'].apply(lambda s: s.split("/", 1)[0])
df['ref'] = df['spec'].apply(lambda s: s.rsplit("/", 1)[1])

In [6]:
# take a look at the data, does it look sensible?
df.sample(10)

Unnamed: 0,provider,schema,spec,status,timestamp,version,repo,org,ref
4563,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2019-02-27 08:43:00,1,ipython/ipython-in-depth,ipython,master
4254,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2019-02-19 07:38:00,1,ipython/ipython-in-depth,ipython,master
2456,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2019-01-25 05:59:00,1,ipython/ipython-in-depth,ipython,master
8167,GitHub,binderhub.jupyter.org/launch,ELC/8fdc0f490b3058872a7014f01416dfb6/master,success,2019-01-25 15:28:00,1,ELC/8fdc0f490b3058872a7014f01416dfb6,ELC,master
1251,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2019-02-25 03:29:00,1,ipython/ipython-in-depth,ipython,master
1389,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2019-02-19 03:29:00,1,ipython/ipython-in-depth,ipython,master
6087,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2019-01-31 11:15:00,1,ipython/ipython-in-depth,ipython,master
8517,GitHub,binderhub.jupyter.org/launch,rationalmatter/juno-demo-notebooks/master,success,2019-01-17 14:33:00,1,rationalmatter/juno-demo-notebooks,rationalmatter,master
1991,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2019-02-22 05:14:00,1,ipython/ipython-in-depth,ipython,master
4141,GitHub,binderhub.jupyter.org/launch,alperyilmaz/jupyterlab-python-intro/master,success,2019-01-17 09:03:00,1,alperyilmaz/jupyterlab-python-intro,alperyilmaz,master


In [7]:
df = df.drop(columns=['schema', 'version', 'spec'])

In [8]:
df.head()

Unnamed: 0,provider,status,timestamp,repo,org,ref
0,GitHub,success,2019-03-01,DS-100/textbook,DS-100,master
1,GitHub,success,2019-03-01,ipython/ipython-in-depth,ipython,master
2,GitHub,success,2019-03-01,DS-100/textbook,DS-100,master
3,GitHub,success,2019-03-01,stencila/examples,stencila,elife-30274-binder
4,GitHub,success,2019-03-01,DS-100/textbook,DS-100,master


## Total launches

In [9]:
# Sneak peek: total launches!
df.shape

(753973, 6)

In [10]:
# add a nnew column showing total launches per repo
totals_per_repo = (df.groupby(["repo"])
 .size()
 .reset_index(name='repo_counts'))

In [11]:
# add a nnew column showing total launches per org
totals_per_org = (df.groupby(["org"])
 .size()
 .reset_index(name='org_counts'))

In [12]:
df_ = pd.merge(df, totals_per_repo, on='repo')
df_ = pd.merge(df_, totals_per_org, on='org')
#df_.sample(10)

## Estimate number of unique repositories

Expect the raw number of launches to be bigger than the number of repositories launched more than once. Those launched only once might have been accidents.

In [13]:
len(set(df.repo))

7295

In [14]:
len(set(df_.repo[df_.repo_counts>1]))

4841

## Where are repositories hosted?

In [15]:
(df.groupby("provider")
   .size()
   .reset_index(name='Launches')
   .sort_values('Launches', ascending=False))

Unnamed: 0,provider,Launches
1,GitHub,750792
2,GitLab,2709
0,Git,472


## Popular repositories and their branches

In [16]:
(df_.groupby(["org", "repo", "ref", "repo_counts", "org_counts"])
 .size()
 # give the column a nice name
 .reset_index(name='ref_counts')
 # sort first by total launches, then within a repo by ref launches
 .sort_values(['org_counts', 'repo_counts', 'ref_counts'],
              ascending=[False,False, False])
 .set_index(["org", 'repo', 'ref'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,repo_counts,org_counts,ref_counts
org,repo,ref,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ipython,ipython/ipython-in-depth,master,355781,356329,355781
ipython,ipython/ipython,6.x,543,356329,505
ipython,ipython/ipython,master,543,356329,29
ipython,ipython/ipython,1.x,543,356329,5
ipython,ipython/ipython,2.x,543,356329,4
ipython,ipython/ipyparallel,master,2,356329,2
ipython,ipython/ipynb,master,1,356329,1
ipython,ipython/ipython.git,master,1,356329,1
ipython,ipython/ipywidgets,master,1,356329,1
jupyterlab,jupyterlab/jupyterlab-demo,master,86744,86841,85089
