In [1]:
from datetime import datetime, timedelta, timezone

import pandas as pd
import requests

In [2]:
# Maps the provider name passed to binder_url() to the short prefix that is
# placed in the path of the generated binder link.
PROVIDER_PREFIXES = dict([
    ('Git', 'git'),
    ('GitHub', 'gh'),
    ('GitLab', 'gl'),
    # gist entry is currently disabled:
    # ('gist.github.com', 'gist'),
])

In [3]:
def binder_url(provider, org, repo):
    """Build the GESIS binder launch link for a repository.

    Args:
        provider: Provider name; looked up in ``PROVIDER_PREFIXES``.
        org: Organisation / user segment of the repository.
        repo: Repository name.

    Returns:
        The launch URL as a string, or ``''`` when ``provider`` has no
        known prefix.
    """
    prefix = PROVIDER_PREFIXES.get(provider, '')
    if not prefix:
        # unknown provider: there is no link to build
        return ''
    # The git ref is fixed to 'master'.
    # NOTE(review): BinderHub's GitLab and Git providers appear to expect a
    # URL-escaped spec rather than plain org/repo - confirm these links resolve.
    base = 'https://notebooks.gesis.org/binder/v2'
    return f'{base}/{prefix}/{org}/{repo}/master'

In [4]:
def ts_to_dt(ts):
    """Convert a Unix timestamp to a naive ``datetime`` expressed in UTC.

    ``datetime.utcfromtimestamp`` is deprecated since Python 3.12, so the
    conversion goes through an aware UTC datetime and then drops the tzinfo.
    The result stays naive so it can be compared with other naive UTC values.

    Args:
        ts: Seconds since the Unix epoch (int or float).

    Returns:
        A naive ``datetime`` holding the corresponding UTC wall-clock time.
    """
    return datetime.fromtimestamp(ts, tz=timezone.utc).replace(tzinfo=None)

# Launch count data

In [5]:
# 'org/repo' identifiers that process_launch_data() leaves out of its result.
REPOS_TO_FILTER = ['gesiscss/binder-stats']

In [6]:
def get_launch_data(time_range='90d', filter_="{status='success'}", timeout=120):
    """Fetch raw ``binderhub_launch_count_total`` samples from Prometheus.

    Args:
        time_range: Range-selector duration appended to the metric, e.g. '90d'.
        filter_: Label matcher (including the braces) placed after the metric name.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a stalled
            connection cannot block the notebook forever.

    Returns:
        The ``data.result`` list of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # The raw counter is queried instead of
    #   increase(binderhub_launch_count_total{...}[range])
    # because increase() doesn't return the first +1 (0->1).
    query = f"binderhub_launch_count_total{filter_}[{time_range}]"
    print(query)
    resp = requests.get('https://notebooks.gesis.org/prometheus/api/v1/query',
                        params={'query': query},
                        timeout=timeout)
    # Fail with the HTTP error itself rather than a KeyError / JSON decoding
    # error further down when the query is rejected or the server is unavailable.
    resp.raise_for_status()
    return resp.json()['data']['result']

In [7]:
def process_launch_data(data):
    """Reduce raw launch-counter series to one row per repository.

    Args:
        data: Iterable of series dicts. Each one is read for
            ``['metric']['repo']`` (repository URL), ``['metric']['provider']``
            and ``['values']`` (pairs of ``(timestamp, launch_count)``).

    Returns:
        A list of rows ``[repo, org, provider, history, repo_url, binder_url]``
        where ``history`` is a list of ``(datetime, cumulative_launch_count)``
        tuples, one per observed change of the counter. Repositories listed in
        ``REPOS_TO_FILTER`` and series whose counter never left 0 are omitted.
    """
    # {(provider, org, repo): [repo, org, provider, [series, ...], repo_url, binder_url]}
    # Keyed by the full identity: keying by the bare repo name would merge
    # same-named repositories that belong to different owners.
    rows = {}
    for container in data:
        # first get meta data
        metric = container['metric']
        repo_url = metric['repo']
        provider = metric['provider']

        path = repo_url.replace('https://', '')
        # Drop a literal '.git' suffix. str.rstrip('.git') is wrong here: it
        # strips any trailing run of the characters '.', 'g', 'i', 't'
        # (e.g. 'config' -> 'conf').
        if path.endswith('.git'):
            path = path[:-len('.git')]
        parts = path.rsplit('/', 2)
        if len(parts) == 3:
            _host, org, repo = parts
        elif len(parts) == 2:
            # some repo urls (e.g. gist) don't contain org/user info;
            # the host then takes the place of the provider
            provider, repo = parts
            org = ''
        else:
            # no '/' at all: treat the whole string as the repo name
            org, repo = '', parts[0]
        if f'{org}/{repo}' in REPOS_TO_FILTER:
            continue

        # detect changes of launch count: keep the sample of each change
        increases = []
        previous_count = 0
        for ts, launch_count in container['values']:
            launch_count = int(launch_count)
            if launch_count != previous_count:
                increases.append((ts, launch_count))
            previous_count = launch_count
        if not increases:
            # Nothing was launched in this series; an empty list would also
            # break the timestamp sort and the running offset below.
            continue

        key = (provider, org, repo)
        if key not in rows:
            rows[key] = [repo, org, provider, [increases], repo_url,
                         binder_url(provider, org, repo)]
        else:
            # same repo can be launched on different instances
            # (after a new deployment/update)
            rows[key][3].append(increases)

    # sort the series of each repo and flatten them into one cumulative history
    for row in rows.values():
        # order the series by the timestamp of their first change
        ordered_series = sorted(row[3], key=lambda series: series[0][0])
        history = []
        offset = 0
        for series in ordered_series:
            for ts, launch_count in series:
                history.append((ts_to_dt(ts), launch_count + offset))
            # the next series continues counting from the last total
            offset = history[-1][1]
        row[3] = history

    return list(rows.values())

In [8]:
# Query the raw counter series once and reduce them to one row per repository;
# the ranking cells further down all start from this `launch_data`.
launch_data = process_launch_data(get_launch_data())

binderhub_launch_count_total{status='success'}[90d]


# Popular Repos

In [9]:
def get_popular_repos(launch_data, time_range):
    """Rank repositories by the number of launches within a recent time window.

    Args:
        launch_data: Rows of the form
            ``[repo, org, provider, history, repo_url, binder_url]`` where
            ``history`` is a list of ``(naive UTC datetime, cumulative_count)``.
            The rows are not modified.
        time_range: Window length counted back from now, as ``'<n>h'`` (hours)
            or ``'<n>d'`` (days), e.g. ``'24h'`` or ``'7d'``.

    Returns:
        New rows, sorted by launch count (descending), in which the history
        is replaced by the number of launches inside the window. Repositories
        whose count did not change inside the window are left out.

    Raises:
        ValueError: If ``time_range`` does not end in 'h' or 'd', or its
            numeric part is not an integer.
    """
    unit_names = {'h': 'hours', 'd': 'days'}
    unit = time_range[-1:]
    if unit not in unit_names:
        raise ValueError('Time range must be in hours or days.')
    time_delta = timedelta(**{unit_names[unit]: int(time_range[:-1])})
    # naive UTC "now", so it can be compared with the naive UTC history entries
    start_dt = datetime.now(timezone.utc).replace(tzinfo=None) - time_delta

    popular_repos = []
    for row in launch_data:
        history = row[3]
        if not history:
            # No samples at all: no launches. Without this guard the last
            # count of the previous repository would be reused.
            continue
        # get the launch count just before time_range
        first_value = 0
        for dt, launch_count in history:
            if dt < start_dt:
                first_value = launch_count
        # increase in launch count during time_range
        launches = history[-1][1] - first_value
        if launches != 0:
            # build a new row instead of overwriting the caller's history
            ranked_row = list(row)
            ranked_row[3] = launches
            popular_repos.append(ranked_row)

    # sort according to launch count
    popular_repos.sort(key=lambda ranked_row: ranked_row[3], reverse=True)
    total = sum(ranked_row[3] for ranked_row in popular_repos)
    print("Total number of launches: " + str(total) + " in " + str(time_range))
    return popular_repos

In [10]:
def display_popular_repos(data):
    """Show ranked repository rows as a styled table with clickable links.

    Args:
        data: Rows of six values, in the order
            repo, org, provider, launches, repo_url, binder_url.
    """
    column_names = ['repo', 'org', 'provider', 'launches', 'repo_url', 'binder_url']

    def repo_link(x):
        # anchor that opens the repository URL in a new tab
        return f'<a target="_blank" href="{x}">repo url</a>'

    def binder_link(x):
        # anchor that opens the binder launch URL in a new tab
        return f'<a target="_blank" href="{x}">binder url</a>'

    table = pd.DataFrame(data, columns=column_names)
    table = table.sort_values('launches', ascending=False)
    table = table.reset_index(drop=True)
    styled = table.style.format({'repo_url': repo_link, 'binder_url': binder_link})
    display(styled)

In [11]:
from copy import deepcopy

## Popular repositories in the last hour

In [12]:
# Rank on a private copy so the shared `launch_data` rows stay untouched for the other cells.
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, time_range='1h')
display_popular_repos(popular_repos)

Total number of launches: 7 in 1h


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,wikiwho_tutorial,gesiscss,GitHub,3,repo url,binder url
1,stmdemo,arnim,GitHub,2,repo url,binder url
2,RStan-Binder,arnim,GitHub,1,repo url,binder url
3,PythonDataScienceHandbook,gesiscss,GitHub,1,repo url,binder url


## Popular repositories in the last 24 hours

In [13]:
# Rank on a private copy so the shared `launch_data` rows stay untouched for the other cells.
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, time_range='24h')
display_popular_repos(popular_repos)

Total number of launches: 221 in 24h


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,66,repo url,binder url
1,pymc3,pymc-devs,GitHub,22,repo url,binder url
2,ptm,gesiscss,GitHub,16,repo url,binder url
3,wikiwho_tutorial,gesiscss,GitHub,8,repo url,binder url
4,BIGSSS,JuKo007,GitHub,6,repo url,binder url
5,bokeh-notebooks,bokeh,GitHub,4,repo url,binder url
6,dask-examples,dask,GitHub,4,repo url,binder url
7,workshop_girls_day,gesiscss,GitHub,4,repo url,binder url
8,ipython-in-depth,ipython,GitHub,4,repo url,binder url
9,jupyterlab-latex,jupyterlab,GitHub,3,repo url,binder url


## Popular repositories in the last week

In [14]:
# Rank on a private copy so the shared `launch_data` rows stay untouched for the other cells.
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, time_range='7d')
display_popular_repos(popular_repos)

Total number of launches: 345 in 7d


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,75,repo url,binder url
1,ptm,gesiscss,GitHub,26,repo url,binder url
2,workshop_girls_day,gesiscss,GitHub,23,repo url,binder url
3,pymc3,pymc-devs,GitHub,22,repo url,binder url
4,wikiwho_tutorial,gesiscss,GitHub,13,repo url,binder url
5,requirements,binder-examples,GitHub,9,repo url,binder url
6,bokeh-notebooks,bokeh,GitHub,9,repo url,binder url
7,dataverse-R,wrathofquan,GitHub,7,repo url,binder url
8,simple-binder-repo,bitnik,GitHub,7,repo url,binder url
9,BIGSSS,JuKo007,GitHub,7,repo url,binder url


## Most popular repositories in the last 30 days 

In [15]:
# Rank on a private copy so the shared `launch_data` rows stay untouched for the other cells.
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, time_range='30d')
display_popular_repos(popular_repos)

Total number of launches: 619 in 30d


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,133,repo url,binder url
1,ptm,gesiscss,GitHub,41,repo url,binder url
2,workshop_girls_day,gesiscss,GitHub,32,repo url,binder url
3,pymc3,pymc-devs,GitHub,26,repo url,binder url
4,bokeh-notebooks,bokeh,GitHub,17,repo url,binder url
5,textbook,DS-100,GitHub,13,repo url,binder url
6,wikiwho_tutorial,gesiscss,GitHub,13,repo url,binder url
7,gesis-meta-analysis-2018,berndweiss,GitHub,13,repo url,binder url
8,BIGSSS,JuKo007,GitHub,12,repo url,binder url
9,requirements,binder-examples,GitHub,12,repo url,binder url


## Most popular repositories in the last 60 days 

In [16]:
# Rank on a private copy so the shared `launch_data` rows stay untouched for the other cells.
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, time_range='60d')
display_popular_repos(popular_repos)

Total number of launches: 619 in 60d


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,133,repo url,binder url
1,ptm,gesiscss,GitHub,41,repo url,binder url
2,workshop_girls_day,gesiscss,GitHub,32,repo url,binder url
3,pymc3,pymc-devs,GitHub,26,repo url,binder url
4,bokeh-notebooks,bokeh,GitHub,17,repo url,binder url
5,textbook,DS-100,GitHub,13,repo url,binder url
6,wikiwho_tutorial,gesiscss,GitHub,13,repo url,binder url
7,gesis-meta-analysis-2018,berndweiss,GitHub,13,repo url,binder url
8,BIGSSS,JuKo007,GitHub,12,repo url,binder url
9,requirements,binder-examples,GitHub,12,repo url,binder url
