In [1]:
# import numpy as np
import html
from datetime import datetime, timedelta, timezone

import pandas as pd
import requests

In [2]:
def binder_url(org, repo):
    """Build the launch URL on the GESIS BinderHub for a repository.

    The URL always uses the ``gh`` provider segment and the ``master`` ref.

    Args:
        org: owner (user or organisation) segment of the repository path.
        repo: repository name.

    Returns:
        The launch URL as a string.
    """
    launch_template = 'https://notebooks.gesis.org/binder/v2/gh/{org}/{repo}/master'
    return launch_template.format(org=org, repo=repo)

In [3]:
def ts_to_dt(ts):
    """Convert a Unix timestamp to a naive ``datetime`` expressed in UTC.

    Args:
        ts: seconds since the Unix epoch (int or float).

    Returns:
        A timezone-naive ``datetime`` holding the UTC wall-clock time of ``ts``.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Convert with
    # an explicit UTC tzinfo and strip it again so the result stays naive.
    return datetime.fromtimestamp(ts, tz=timezone.utc).replace(tzinfo=None)

# The most popular repositories

In [4]:
def query(time_range):
    """Fetch the successful-launch counter samples for the last ``time_range``.

    Sends an instant query with a range-vector selector to the GESIS
    Prometheus HTTP API and prints the query expression before sending it.

    Args:
        time_range: range duration placed inside the ``[...]`` selector,
            e.g. ``'1h'`` or ``'30d'``.

    Returns:
        The ``data.result`` list of the JSON response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer in time.
    """
    # (connect, read) timeouts in seconds. Without one, requests waits
    # forever on a stalled server; the read timeout is generous because
    # long ranges can be slow to evaluate.
    timeout = (10, 300)
    # Named `promql` rather than `query` so the function name is not shadowed.
    promql = f"binderhub_launch_time_seconds_count{{status='success'}}[{time_range}]"
    print(promql)
    resp = requests.get(
        'https://notebooks.gesis.org/prometheus/api/v1/query',
        params={'query': promql},
        timeout=timeout,
    )
    # Fail with the HTTP status rather than a KeyError/JSON error further down.
    resp.raise_for_status()
    return resp.json()['data']['result']

In [5]:
def process_data(data, time_range_beginning):
    """Aggregate launch-counter series into one launch count per repository.

    Args:
        data: list of series as returned by ``query``; each item is read for
            ``item['metric']['repo']`` (the repository URL) and
            ``item['values']`` (a list of ``[timestamp, value]`` samples).
        time_range_beginning: naive UTC ``datetime`` marking the start of the
            queried time range.

    Returns:
        A dict of equally long lists under the keys ``'name'``, ``'org'``,
        ``'provider'``, ``'launches'``, ``'repo_url'`` and ``'binder_url'``,
        with one entry per distinct repository URL in first-seen order.
    """
    d = {'name': [], 'org': [], 'provider': [], 'launches': [], 'repo_url': [], 'binder_url': []}
    # Maps repo_url -> row index in the lists of `d`. Rows are keyed on the
    # full URL, not the bare repo name, so same-named repositories of
    # different owners are never merged into one row.
    row_by_url = {}
    # Prometheus scrapes data each minute, so ignore seconds in the comparison.
    range_start = time_range_beginning.replace(second=0, microsecond=0)

    for container in data:
        repo_url = container['metric']['repo']
        provider, org, repo = repo_url.replace('https://', '').rsplit('/', 2)

        # Calculate the number of launches for each repo and each binder
        # container/deployment.
        samples = container['values']
        values = [int(sample[1]) for sample in samples]
        first_value_dt = ts_to_dt(samples[0][0])
        if first_value_dt.replace(second=0, microsecond=0) > range_start:
            # This container was created after the beginning of the time range,
            # so the counter started inside it.
            # NOTE: the first value can be > 1 if there are simultaneous launches.
            launches = max(values)
        else:
            # This container was created before the beginning of the time range:
            # only the increase inside the range counts.
            launches = max(values) - min(values)

        row = row_by_url.get(repo_url)
        if row is not None:
            # The same repo can have status success with different retries
            # values, i.e. several series; add them up.
            d['launches'][row] += launches
        else:
            row_by_url[repo_url] = len(d['name'])
            d['launches'].append(launches)
            d['name'].append(repo)
            d['org'].append(org)
            d['provider'].append(provider)
            d['repo_url'].append(repo_url)
            d['binder_url'].append(binder_url(org, repo))
    return d

In [6]:
def makedf(time_range, time_delta):
    """Build the styled "most popular repositories" table for a time range.

    Args:
        time_range: range duration passed to ``query``, e.g. ``'1h'``.
        time_delta: ``timedelta`` of the same length as ``time_range``; used
            to compute the beginning of the range.

    Returns:
        A pandas ``Styler`` over a frame sorted by ``launches`` in descending
        order, with the ``repo_url`` and ``binder_url`` columns rendered as
        HTML links. The underlying DataFrame is available as ``.data``.
    """
    def as_link(label):
        # The URLs come from metric labels, so escape them before placing
        # them inside an HTML attribute.
        return lambda url: f'<a target="_blank" href="{html.escape(str(url), quote=True)}">{label}</a>'

    data = query(time_range)
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware "now"
    # and drop the tzinfo, because process_data compares naive UTC datetimes.
    time_range_beginning = datetime.now(timezone.utc).replace(tzinfo=None) - time_delta
    data = process_data(data, time_range_beginning)
    df = pd.DataFrame(data)
    # Sum launches over rows that describe the same repository, most launched first.
    df = (
        df.groupby(['name', 'org', 'provider', 'repo_url', 'binder_url'])
        .sum()
        .reset_index()
        .sort_values('launches', ascending=False)
    )
    return df.style.format({
        'repo_url': as_link('repo url'),
        'binder_url': as_link('binder url'),
    })

## Most popular repositories in the last hour

In [7]:
# Look-back window of one hour, as a range string for the query and as a
# matching timedelta.
hour = 1
time_range, time_delta = f'{hour}h', timedelta(hours=hour)
df = makedf(time_range, time_delta)
display(df)
# makedf returns a Styler, so the plain DataFrame is read through `.data`.
f"Total number of launches: {sum(df.data['launches'])} in {time_range}"

binderhub_launch_time_seconds_count{status='success'}[1h]


Unnamed: 0,name,org,provider,repo_url,binder_url,launches
1,binder-stats,gesiscss,github.com,repo url,binder url,1
4,requirements,binder-examples,github.com,repo url,binder url,1
5,tidyverse,tidyverse,github.com,repo url,binder url,1
0,BIGSSS,JuKo007,github.com,repo url,binder url,0
2,bokeh,binder-examples,github.com,repo url,binder url,0
3,data-quilt,binder-examples,github.com,repo url,binder url,0


'Total number of launches: 3 in 1h'

## Most popular repositories in the last day

In [8]:
# Look-back window of one day, as a range string for the query and as a
# matching timedelta.
day = 1
time_range, time_delta = f'{day}d', timedelta(days=day)
df = makedf(time_range, time_delta)
display(df)
# makedf returns a Styler, so the plain DataFrame is read through `.data`.
f"Total number of launches: {sum(df.data['launches'])} in {time_range}"

binderhub_launch_time_seconds_count{status='success'}[1d]


Unnamed: 0,name,org,provider,repo_url,binder_url,launches
10,pydata-networkx,mriduls,github.com,repo url,binder url,12
2,binder-stats,gesiscss,github.com,repo url,binder url,5
11,requirements,binder-examples,github.com,repo url,binder url,2
0,BIGSSS,JuKo007,github.com,repo url,binder url,1
1,PythonDataScienceHandbook,gesiscss,github.com,repo url,binder url,1
3,bokeh,binder-examples,github.com,repo url,binder url,1
4,data-quilt,binder-examples,github.com,repo url,binder url,1
12,tidyverse,tidyverse,github.com,repo url,binder url,1
5,jupyter-renderers,jupyterlab,github.com,repo url,binder url,0
6,jupyter-sos,binder-examples,github.com,repo url,binder url,0


'Total number of launches: 24 in 1d'

## Most popular repositories in the last 30 days 

In [9]:
# Look-back window of 30 days, as a range string for the query and as a
# matching timedelta.
day = 30
time_range, time_delta = f'{day}d', timedelta(days=day)
df = makedf(time_range, time_delta)
display(df)
# makedf returns a Styler, so the plain DataFrame is read through `.data`.
f"Total number of launches: {sum(df.data['launches'])} in {time_range}"

binderhub_launch_time_seconds_count{status='success'}[30d]


Unnamed: 0,name,org,provider,repo_url,binder_url,launches
49,workshop_girls_day,gesiscss,github.com,repo url,binder url,153
4,PythonDataScienceHandbook,gesiscss,github.com,repo url,binder url,95
7,binder-stats,gesiscss,github.com,repo url,binder url,68
16,flow,gesiscss,github.com,repo url,binder url,51
36,ptm,gesiscss,github.com,repo url,binder url,39
32,ligo-binder,minrk,github.com,repo url,binder url,26
37,pydata-networkx,mriduls,github.com,repo url,binder url,25
1,CSSproject,nadjalc,github.com,repo url,binder url,13
5,RStan-Binder,arnim,github.com,repo url,binder url,12
42,requirements,binder-examples,github.com,repo url,binder url,11


'Total number of launches: 586 in 30d'

## Most popular repositories in the last 60 days 

In [10]:
# Look-back window of 60 days, as a range string for the query and as a
# matching timedelta.
day = 60
time_range, time_delta = f'{day}d', timedelta(days=day)
df = makedf(time_range, time_delta)
display(df)
# makedf returns a Styler, so the plain DataFrame is read through `.data`.
f"Total number of launches: {sum(df.data['launches'])} in {time_range}"

binderhub_launch_time_seconds_count{status='success'}[60d]


Unnamed: 0,name,org,provider,repo_url,binder_url,launches
58,workshop_girls_day,gesiscss,github.com,repo url,binder url,258
5,PythonDataScienceHandbook,gesiscss,github.com,repo url,binder url,140
22,flow,gesiscss,github.com,repo url,binder url,85
11,binder-stats,gesiscss,github.com,repo url,binder url,73
23,gesis-meta-analysis-2018,berndweiss,github.com,repo url,binder url,70
44,ptm,gesiscss,github.com,repo url,binder url,57
3,GitHub_traffic_crawler,gesiscss,github.com,repo url,binder url,40
38,ligo-binder,minrk,github.com,repo url,binder url,37
45,pydata-networkx,mriduls,github.com,repo url,binder url,25
8,YoutubeScrapingEmojis,JuKo007,github.com,repo url,binder url,23


'Total number of launches: 994 in 60d'