In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import datetime
import pandas as pd
import requests
import time

In [2]:
# The launches shown in the gallery are stored in the database from 10/05/19
# onwards, so query from 9 May 2019 to make sure all stored data is included.
from_dt = datetime.datetime(year=2019, month=5, day=9).isoformat()
url = f'https://notebooks.gesis.org/gallery/api/v1.0/launches/{from_dt}/'

In [3]:
launches = []
# The API is paginated (100 results per page), so every page has to be fetched
# to get the complete data set. The loop ends when the response reports no
# further page (`next_page` is None).
next_page = 1
while next_page is not None:
    api_url = f'{url}?page={next_page}'
    # Without a timeout a stalled connection would block the notebook forever;
    # 30 seconds is a generous upper bound for a single page.
    r = requests.get(api_url, timeout=30)
    response = r.json()
    # When the query limit (per second / per minute) is exceeded, the response
    # carries one of these messages instead of data: wait and retry the same
    # page (`next_page` is deliberately left unchanged).
    message = response.get("message", "")
    if message in ("2 per 1 second", "100 per 1 minute"):
        time.sleep(1)
        continue
    launches.extend(response['launches'])
    next_page = response['next_page']
    

In [4]:
# One row per launch record collected from the API; preview the first rows.
data = pd.DataFrame(launches)
data.head()

Unnamed: 0,provider,schema,spec,status,timestamp,version
0,GitHub,binderhub.jupyter.org/launch,minrk/ligo-binder/master,success,2019-05-10T08:29:40,2
1,GitHub,binderhub.jupyter.org/launch,gesiscss/wikiwho_demo/master,success,2019-05-10T08:29:43,2
2,GitHub,binderhub.jupyter.org/launch,jakevdp/PythonDataScienceHandbook/master,success,2019-05-10T08:29:44,2
3,GitHub,binderhub.jupyter.org/launch,gesiscss/binder-stats/master,success,2019-05-10T08:31:04,2
4,GitHub,binderhub.jupyter.org/launch,CWTSLeiden/CSSS/master,success,2019-05-10T08:33:54,2


In [5]:
# Split each spec into separate columns to make grouping easier: the part
# before the last "/" is the repo, the part before the first "/" is the org,
# and the part after the last "/" is the ref.
spec_column = data['spec']
repo_ref_pairs = spec_column.apply(lambda s: s.rsplit("/", 1))
data['repo'] = repo_ref_pairs.apply(lambda pair: pair[0])
data['org'] = spec_column.apply(lambda s: s.split("/", 1)[0])
data['ref'] = repo_ref_pairs.apply(lambda pair: pair[1])
# Drop the raw columns now that the derived ones exist.
data = data.drop(columns=['schema', 'version', 'spec'])

In [6]:
# Preview the reshaped table (first five rows).
data.head(n=5)

Unnamed: 0,provider,status,timestamp,repo,org,ref
0,GitHub,success,2019-05-10T08:29:40,minrk/ligo-binder,minrk,master
1,GitHub,success,2019-05-10T08:29:43,gesiscss/wikiwho_demo,gesiscss,master
2,GitHub,success,2019-05-10T08:29:44,jakevdp/PythonDataScienceHandbook,jakevdp,master
3,GitHub,success,2019-05-10T08:31:04,gesiscss/binder-stats,gesiscss,master
4,GitHub,success,2019-05-10T08:33:54,CWTSLeiden/CSSS,CWTSLeiden,master


# Where are repositories hosted?

In [7]:
# Count launches per hosting provider and list the most-used provider first.
provider_counts = data.groupby("provider").size().reset_index(name='Launches')
provider_counts.sort_values(by='Launches', ascending=False)

Unnamed: 0,provider,Launches
2,GitHub,1359
1,Git,14
0,Gist,4


In [8]:
# Build a table with the total number of launches per repo
# (columns: repo, repo_counts).
totals_per_repo = data.groupby(["repo"]).size().rename('repo_counts').reset_index()

In [9]:
# Build a table with the total number of launches per org
# (columns: org, org_counts).
totals_per_org = data.groupby(["org"]).size().rename('org_counts').reset_index()

In [10]:
# Attach the per-repo and per-org launch totals to every launch row.
data_ = (
    data
    .merge(totals_per_repo, on='repo')
    .merge(totals_per_org, on='org')
)

In [11]:
# Launches per (org, repo, ref); the repo and org totals are part of the
# grouping key so they are carried through as columns.
group_keys = ["org", "repo", "ref", "repo_counts", "org_counts"]
ref_launches = data_.groupby(group_keys).size().reset_index(name='ref_counts')
# Sort by the org total first, then the repo total, then the launches per ref,
# each in descending order.
ref_launches_sorted = ref_launches.sort_values(
    by=['org_counts', 'repo_counts', 'ref_counts'],
    ascending=False,
)
ref_launches_sorted.set_index(["org", 'repo', 'ref'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,repo_counts,org_counts,ref_counts
org,repo,ref,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gesiscss,gesiscss/ptm,master,58,190,58
gesiscss,gesiscss/wikiwho_demo,master,58,190,58
gesiscss,gesiscss/introduction_networkx,master,21,190,21
gesiscss,gesiscss/wikiwho_tutorial,master,19,190,19
gesiscss,gesiscss/workshop_girls_day,master,12,190,12
gesiscss,gesiscss/binder-stats,master,11,190,11
gesiscss,gesiscss/methods_seminar_2019,master,5,190,5
gesiscss,gesiscss/wikiwho_chobj,master,3,190,3
gesiscss,gesiscss/btw17_sample_scripts,master,1,190,1
gesiscss,gesiscss/flow,master,1,190,1
