In [1]:
# import numpy as np
import pandas as pd
import requests
from datetime import datetime, timedelta

In [2]:
def binder_url(org, repo, ref='master'):
    """Build the launch URL on notebooks.gesis.org/binder for a repository.

    Args:
        org: Owner (organisation or user) segment of the repository path.
        repo: Repository name segment.
        ref: Branch, tag or commit placed in the last path segment.
            Defaults to ``'master'``, the value that used to be hard-coded,
            so existing two-argument calls return the same URL as before.

    Returns:
        str: ``https://notebooks.gesis.org/binder/v2/gh/<org>/<repo>/<ref>``.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote

    # A ref such as 'feature/x' contains '/', which would otherwise be read
    # as an extra path segment; percent-encode it so it stays one segment.
    encoded_ref = quote(ref, safe='')
    return f'https://notebooks.gesis.org/binder/v2/gh/{org}/{repo}/{encoded_ref}'

In [3]:
def ts_to_dt(ts):
    """Convert a POSIX timestamp into a naive ``datetime`` expressed in UTC.

    Args:
        ts: Seconds since the Unix epoch (int or float).

    Returns:
        datetime: Naive (``tzinfo is None``) datetime holding the UTC wall
        clock time, i.e. the same value ``datetime.utcfromtimestamp`` gave.
    """
    # Function-scope import: the notebook's import cell only pulls in
    # ``datetime`` and ``timedelta``.
    from datetime import timezone

    # ``datetime.utcfromtimestamp`` is deprecated since Python 3.12. Convert
    # with an explicit UTC zone, then drop tzinfo so the result stays
    # comparable with the naive datetimes used elsewhere in this notebook.
    return datetime.fromtimestamp(ts, tz=timezone.utc).replace(tzinfo=None)

# The most popular repositories

In [4]:
def query(time_range, timeout=30):
    """Fetch successful-launch counter samples from the GESIS Prometheus API.

    Builds the range selector
    ``binderhub_launch_time_seconds_count{status='success'}[<time_range>]``,
    prints it, and sends it to the ``/api/v1/query`` endpoint.

    Args:
        time_range: Duration placed inside the ``[...]`` range selector,
            e.g. ``'1h'`` or ``'30d'``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits forever, which would hang the notebook
            if the server stops answering.

    Returns:
        The ``data.result`` member of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    selectors = "{status='success'}"
    promql = 'binderhub_launch_time_seconds_count{}[{}]'.format(selectors, time_range)
    print(promql)
    resp = requests.get(
        'https://notebooks.gesis.org/prometheus/api/v1/query',
        params={'query': promql},
        timeout=timeout,
    )
    # Fail loudly on an HTTP error instead of tripping over a missing 'data'
    # key (or an undecodable body) further down.
    resp.raise_for_status()
    return resp.json()['data']['result']

In [5]:
def process_data(data, time_range_beginning):
    """Aggregate per-series launch counters into one row per repository.

    Args:
        data: Iterable of series dicts. Each one is read as
            ``series['metric']['repo']`` (repository URL) and
            ``series['values']`` (list of ``[timestamp, value]`` pairs).
        time_range_beginning: Naive UTC ``datetime`` marking the start of the
            queried window. It must be naive because it is compared with
            naive datetimes derived from the sample timestamps.

    Returns:
        dict: Column-oriented table with the keys ``name``, ``org``,
        ``provider``, ``launches``, ``repo_url`` and ``binder_url``; every
        value is a list with one entry per distinct repository URL, in order
        of first appearance.
    """
    # Function-scope import: the notebook's import cell only pulls in
    # ``datetime`` and ``timedelta``.
    from datetime import timezone

    d = {'name': [], 'org': [], 'provider': [], 'launches': [], 'repo_url': [], 'binder_url': []}
    # repo_url -> row index in ``d``. Rows are keyed on the full URL, not the
    # bare repo name: two organisations can both own a repo called e.g.
    # "requirements", and their launches must not be added together.
    row_of = {}
    # prometheus scrapes data each minute, so ignore seconds while comparing
    range_start = time_range_beginning.replace(second=0, microsecond=0)

    for container in data:
        samples = container['values']
        if not samples:
            # nothing recorded for this series, so nothing to count
            continue

        repo_url = container['metric']['repo']
        provider, org, repo = repo_url.replace('https://', '').rsplit('/', 2)

        # calculate number of launches for each repo and each binder container/deployment
        values = [int(sample[1]) for sample in samples]
        first_value_dt = datetime.fromtimestamp(samples[0][0], tz=timezone.utc).replace(tzinfo=None)
        if first_value_dt.replace(second=0, microsecond=0) > range_start:
            # this container is created after beginning of time range
            # NOTE first value in container can be > 1 if there are simultaneous launches
            launches = max(values)
        else:
            # this container is created before beginning of time range, so
            # only the increase inside the window counts
            launches = max(values) - min(values)

        if repo_url in row_of:
            # same repo can have status success with different retries values
            d['launches'][row_of[repo_url]] += launches
        else:
            row_of[repo_url] = len(d['name'])
            d['launches'].append(launches)
            d['name'].append(repo)
            d['org'].append(org)
            d['provider'].append(provider)
            d['repo_url'].append(repo_url)
            d['binder_url'].append(binder_url(org, repo))
    return d

In [6]:
def makedf(time_range, time_delta):
    """Query launch counts for a time window and return a styled ranking table.

    Args:
        time_range: Duration string handed to ``query`` (e.g. ``'1h'``).
        time_delta: ``timedelta`` of the same length as ``time_range``; it is
            subtracted from the current UTC time to get the window start that
            ``process_data`` compares sample timestamps against.

    Returns:
        pandas Styler wrapping a frame with one row per
        name/org/provider/repo_url/binder_url combination, sorted by
        ``launches`` descending. The two URL columns are rendered as HTML
        links; the underlying frame is available as ``.data``.
    """
    # Function-scope imports: neither name is in the notebook's import cell.
    import html
    from datetime import timezone

    # Sample "now" BEFORE the request. The query carries no explicit
    # evaluation time, so sampling after the round trip would shift the
    # window start later by the request latency. Kept naive (UTC) because
    # ``process_data`` compares it with naive datetimes.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    time_range_beginning = now_utc - time_delta

    raw_series = query(f'{time_range}')
    launches = pd.DataFrame(process_data(raw_series, time_range_beginning))
    launches = (
        launches
        .groupby(['name', 'org', 'provider', 'repo_url', 'binder_url'])
        .sum()
        .reset_index()
        .sort_values('launches', ascending=False)
    )

    def _link(label):
        # The URL ends up inside an HTML attribute; escape it so characters
        # such as '&' or '"' cannot break out of the href.
        return lambda url: f'<a target="_blank" href="{html.escape(str(url), quote=True)}">{label}</a>'

    return launches.style.format({'repo_url': _link('repo url'),
                                  'binder_url': _link('binder url')})

## Most popular repositories in the last hour

In [7]:
# Window: the most recent hour.
hour = 1
time_range = '{}h'.format(hour)
time_delta = timedelta(hours=hour)
# makedf returns a pandas Styler; the plain frame is reachable as df.data.
df = makedf(time_range, time_delta)
display(df)
# Last expression of the cell, so it is shown as the cell's output.
f"Total number of launches: {sum(df.data['launches'])} in {time_range}"

binderhub_launch_time_seconds_count{status='success'}[1h]


Unnamed: 0,name,org,provider,repo_url,binder_url,launches
2,binder-stats,gesiscss,github.com,repo url,binder url,7
6,flow,gesiscss,github.com,repo url,binder url,1
0,Mathics,wolfv,github.com,repo url,binder url,0
12,notebooks,Naereen,github.com,repo url,binder url,0
19,stencils,consideratio,github.com,repo url,binder url,0
18,sage-binder-env,sagemath,github.com,repo url,binder url,0
17,requirements,binder-examples,github.com,repo url,binder url,0
16,python-conda_pip,binder-examples,github.com,repo url,binder url,0
15,ptm,gesiscss,github.com,repo url,binder url,0
14,openrefineder,betatim,github.com,repo url,binder url,0


'Total number of launches: 8 in 1h'

## Most popular repositories in the last day

In [8]:
# Window: the most recent day.
day = 1
time_range = '{}d'.format(day)
time_delta = timedelta(days=day)
# makedf returns a pandas Styler; the plain frame is reachable as df.data.
df = makedf(time_range, time_delta)
display(df)
# Last expression of the cell, so it is shown as the cell's output.
f"Total number of launches: {sum(df.data['launches'])} in {time_range}"

binderhub_launch_time_seconds_count{status='success'}[1d]


Unnamed: 0,name,org,provider,repo_url,binder_url,launches
20,workshop_girls_day,gesiscss,github.com,repo url,binder url,46
2,binder-stats,gesiscss,github.com,repo url,binder url,9
6,flow,gesiscss,github.com,repo url,binder url,1
16,python-conda_pip,binder-examples,github.com,repo url,binder url,1
14,openrefineder,betatim,github.com,repo url,binder url,1
0,Mathics,wolfv,github.com,repo url,binder url,1
5,examples,nteract,github.com,repo url,binder url,0
7,gmt6demo,GenericMappingTools,github.com,repo url,binder url,0
8,gwapps,gwastro,github.com,repo url,binder url,0
9,ijava-binder,SpencerPark,github.com,repo url,binder url,0


'Total number of launches: 59 in 1d'

## Most popular repositories in the last 30 days 

In [9]:
# Window: the most recent 30 days.
day = 30
time_range = '{}d'.format(day)
time_delta = timedelta(days=day)
# makedf returns a pandas Styler; the plain frame is reachable as df.data.
df = makedf(time_range, time_delta)
display(df)
# Last expression of the cell, so it is shown as the cell's output.
f"Total number of launches: {sum(df.data['launches'])} in {time_range}"

binderhub_launch_time_seconds_count{status='success'}[30d]


Unnamed: 0,name,org,provider,repo_url,binder_url,launches
44,workshop_girls_day,gesiscss,github.com,repo url,binder url,160
4,PythonDataScienceHandbook,gesiscss,github.com,repo url,binder url,57
16,flow,gesiscss,github.com,repo url,binder url,56
7,binder-stats,gesiscss,github.com,repo url,binder url,45
2,GitHub_traffic_crawler,gesiscss,github.com,repo url,binder url,27
28,ligo-binder,minrk,github.com,repo url,binder url,26
33,ptm,gesiscss,github.com,repo url,binder url,20
1,CSSproject,nadjalc,github.com,repo url,binder url,15
17,gesis-meta-analysis-2018,berndweiss,github.com,repo url,binder url,15
38,requirements,binder-examples,github.com,repo url,binder url,11


'Total number of launches: 506 in 30d'

## Most popular repositories in the last 60 days 

In [10]:
# Window: the most recent 60 days.
day = 60
time_range = '{}d'.format(day)
time_delta = timedelta(days=day)
# makedf returns a pandas Styler; the plain frame is reachable as df.data.
df = makedf(time_range, time_delta)
display(df)
# Last expression of the cell, so it is shown as the cell's output.
f"Total number of launches: {sum(df.data['launches'])} in {time_range}"

binderhub_launch_time_seconds_count{status='success'}[60d]


Unnamed: 0,name,org,provider,repo_url,binder_url,launches
30,gesis-meta-analysis-2018,berndweiss,github.com,repo url,binder url,406
66,workshop_girls_day,gesiscss,github.com,repo url,binder url,275
7,PythonDataScienceHandbook,gesiscss,github.com,repo url,binder url,109
29,flow,gesiscss,github.com,repo url,binder url,81
17,binder-stats,gesiscss,github.com,repo url,binder url,46
43,ligo-binder,minrk,github.com,repo url,binder url,43
4,GitHub_traffic_crawler,gesiscss,github.com,repo url,binder url,40
52,ptm,gesiscss,github.com,repo url,binder url,39
8,RStan-Binder,arnim,github.com,repo url,binder url,26
65,vtna_frontend,marvinf95,github.com,repo url,binder url,24


'Total number of launches: 1267 in 60d'