In [1]:
import html
from datetime import datetime, timedelta

import pandas as pd
import requests

In [2]:
def binder_url(org, repo, ref='master'):
    """Build the launch URL on the GESIS BinderHub for a repository.

    The URL uses the ``gh`` path segment, so it is only meaningful for
    repositories addressed as ``org/repo``.

    Args:
        org: organisation or user name that owns the repository.
        repo: repository name.
        ref: branch, tag or commit to launch. Defaults to ``'master'``, which
            was previously hard-coded; pass e.g. ``'main'`` for repositories
            that use a different default branch.

    Returns:
        The launch URL as a string.
    """
    return f'https://notebooks.gesis.org/binder/v2/gh/{org}/{repo}/{ref}'

In [3]:
def ts_to_dt(ts):
    """Convert a POSIX timestamp (seconds) into a naive ``datetime`` in UTC."""
    utc_naive = datetime.utcfromtimestamp(ts)
    return utc_naive

# Launch count data

In [4]:
# Repositories, written as 'org/repo', that are skipped when launch data is processed.
REPOS_TO_FILTER = ['gesiscss/binder-stats']

In [5]:
def get_launch_data(time_range='90d', filter="{status='success'}", timeout=60):
    """Fetch raw ``binderhub_launch_count_total`` samples from Prometheus.

    Args:
        time_range: Prometheus range selector duration, e.g. ``'90d'``.
        filter: label matcher block appended to the metric name.
        timeout: seconds to wait for the HTTP request before giving up, so an
            unreachable server cannot hang the notebook indefinitely.

    Returns:
        The ``data.result`` list of the Prometheus query response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    # The raw counter samples are queried instead of increase(...), because
    # increase() doesn't return the first +1 (0->1) of a series.
    query = f"binderhub_launch_count_total{filter}[{time_range}]"
    print(query)
    resp = requests.get('https://notebooks.gesis.org/prometheus/api/v1/query',
                        params={'query': query},
                        timeout=timeout)
    # Fail loudly on 4xx/5xx instead of with a confusing KeyError/JSON error below.
    resp.raise_for_status()
    return resp.json()['data']['result']

In [6]:
def process_launch_data(data):
    """Merge raw launch-counter series into one cumulative timeline per repository.

    Args:
        data: list of series as returned by ``get_launch_data``. Each item is a
            dict with ``metric`` (holding the ``repo`` and ``provider`` labels)
            and ``values`` (a list of ``[timestamp, count]`` samples).

    Returns:
        A list of rows ``[repo, org, provider, launches, repo_url, binder_url]``.
        ``launches`` is a list of ``(datetime, cumulative_count)`` tuples, one
        per observed change of the counter, accumulated over all series that
        belong to the same repository. Repositories listed in
        ``REPOS_TO_FILTER`` and repositories without any counted launch are
        left out.
    """
    # {(provider, org, repo): [repo, org, provider, [series, ...], repo_url, binder_url]}
    # Keyed by the full identity so that equally named repos of different
    # orgs are not merged into one row.
    rows = {}
    for container in data:
        repo_url = container['metric']['repo']
        provider = container['metric']['provider']

        repo_path = repo_url.replace('https://', '')
        # str.rstrip('.git') would strip any trailing '.', 'g', 'i', 't'
        # characters (e.g. 'digit' -> 'd'), so remove the suffix explicitly.
        if repo_path.endswith('.git'):
            repo_path = repo_path[:-len('.git')]

        parts = repo_path.rsplit('/', 2)
        if len(parts) == 3:
            _, org, repo = parts
        elif len(parts) == 2:
            # some repo urls (e.g. gist) don't contain org/user info;
            # the host part is used as provider in that case
            provider, repo = parts
            org = ''
        else:
            # no '/' at all: the whole string is the only name available
            org, repo = '', parts[0]

        if f'{org}/{repo}' in REPOS_TO_FILTER:
            continue

        # collect the timestamp of every increase of the launch counter
        increases = []
        count_prev = 0
        for ts, launch_count in container['values']:
            # sample values arrive as strings and may be float-formatted
            launch_count = int(float(launch_count))
            if launch_count != count_prev:
                assert launch_count > count_prev, 'launch counter decreased within one series'
                increases.append((ts, launch_count))
            count_prev = launch_count

        if not increases:
            # nothing was launched in this series; it would only break the
            # sorting/flattening below
            continue

        key = (provider, org, repo)
        if key not in rows:
            rows[key] = [repo, org, provider, [increases], repo_url, binder_url(org, repo)]
        else:
            # same repo can be launched on different instances (after a new deployment/update)
            rows[key][3].append(increases)

    # order the series of each repo by their first timestamp and chain them
    # into a single cumulative timeline
    for row in rows.values():
        timeline = []
        offset = 0
        for series in sorted(row[3], key=lambda s: s[0][0]):
            for ts, count in series:
                timeline.append((ts_to_dt(ts), count + offset))
            # the next series starts counting from zero again
            offset = timeline[-1][1]
        row[3] = timeline

    return list(rows.values())

In [7]:
# Query the default window once and convert it into per-repo timelines.
raw_launch_data = get_launch_data()
launch_data = process_launch_data(raw_launch_data)

binderhub_launch_count_total{status='success'}[90d]


# Popular Repos

In [8]:
def get_popular_repos(launch_data, time_range, now=None):
    """Count the launches of every repository within the most recent time window.

    Args:
        launch_data: rows ``[repo, org, provider, launches, repo_url, binder_url]``
            where ``launches`` is a list of ``(datetime, cumulative_count)``
            tuples with naive UTC datetimes. The rows are not modified.
        time_range: window length such as ``'24h'`` or ``'7d'``.
        now: end of the window as a naive UTC ``datetime``. Defaults to the
            current UTC time.

    Returns:
        New rows with the same layout as the input, but with the ``launches``
        entry replaced by the number of launches inside the window. Rows
        without any launch in the window are omitted.

    Raises:
        ValueError: if ``time_range`` is not given in hours or days.
    """
    if time_range.endswith('h'):
        p = {'hours': int(time_range.split('h')[0])}
    elif time_range.endswith('d'):
        p = {'days': int(time_range.split('d')[0])}
    else:
        raise ValueError('Time range must be in hours or days.')

    if now is None:
        # The timeline datetimes are naive UTC, so the window must be computed
        # in UTC as well; local time would shift it by the UTC offset.
        now = datetime.utcnow()
    start_dt = now - timedelta(**p)

    popular_repos = []
    for row in launch_data:
        timeline = row[3]
        # cumulative count reached just before the window starts
        first_value = 0
        for dt, launch_count in timeline:
            if dt < start_dt:
                first_value = launch_count
        # an empty timeline means no launches at all
        last_value = timeline[-1][1] if timeline else 0

        launches = last_value - first_value
        if launches != 0:
            # build a new row instead of overwriting the caller's data
            counted_row = list(row)
            counted_row[3] = launches
            popular_repos.append(counted_row)

    total = sum(row[3] for row in popular_repos)
    print(f"Total number of launches: {total} in {time_range}")
    return popular_repos

In [9]:
def display_popular_repos(data):
    """Render the popular-repo rows as a table sorted by launches, descending.

    Args:
        data: rows ``[repo, org, provider, launches, repo_url, binder_url]``
            with ``launches`` being a number.

    The two URL columns are shown as links. The URLs are HTML-escaped before
    being placed into the ``href`` attribute, since they come from external
    data and must not be able to break out of the attribute.
    """
    def link_formatter(label):
        # returns a per-cell formatter that wraps the cell's URL in an anchor tag
        def fmt(url):
            safe_url = html.escape(str(url), quote=True)
            return f'<a target="_blank" href="{safe_url}">{label}</a>'
        return fmt

    df = pd.DataFrame(data,
                      columns=['repo', 'org', 'provider', 'launches', 'repo_url', 'binder_url'])
    df = df.sort_values('launches', ascending=False).reset_index(drop=True)
    styled = df.style.format({'repo_url': link_formatter('repo url'),
                              'binder_url': link_formatter('binder url')})
    display(styled)

In [10]:
from copy import deepcopy

## Popular repositories in the last hour

In [11]:
# Work on a copy so the shared launch_data stays untouched for the other windows.
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, time_range='1h')
display_popular_repos(data=popular_repos)

Total number of launches: 0 in 1h


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url


## Popular repositories in the last 24 hours

In [12]:
# Work on a copy so the shared launch_data stays untouched for the other windows.
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, time_range='24h')
display_popular_repos(data=popular_repos)

Total number of launches: 33 in 24h


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,4,repo url,binder url
1,simple-binder-repo,bitnik,GitHub,2,repo url,binder url
2,3778422,,gist.github.com,1,repo url,binder url
3,convert2geojson,computational-antiquity,GitHub,1,repo url,binder url
4,sage-binder-env,sagemath,GitHub,1,repo url,binder url
5,showntell,psychemedia,GitHub,1,repo url,binder url
6,ligo-binder,minrk,GitHub,1,repo url,binder url
7,dsfe,matthew-brett,GitHub,1,repo url,binder url
8,jupyterlab-latex,jupyterlab,GitHub,1,repo url,binder url
9,jupyter-renderers,jupyterlab,GitHub,1,repo url,binder url


## Popular repositories in the last week

In [13]:
# Work on a copy so the shared launch_data stays untouched for the other windows.
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, time_range='7d')
display_popular_repos(data=popular_repos)

Total number of launches: 63 in 7d


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,5,repo url,binder url
1,workshop_girls_day,gesiscss,GitHub,3,repo url,binder url
2,simple-binder-repo,bitnik,GitHub,3,repo url,binder url
3,convert2geojson,computational-antiquity,GitHub,2,repo url,binder url
4,conda,binder-examples,GitHub,2,repo url,binder url
5,pangeo-example-notebooks,pangeo-data,GitHub,2,repo url,binder url
6,jupyter-renderers,jupyterlab,GitHub,2,repo url,binder url
7,gwapps,gwastro,GitHub,2,repo url,binder url
8,requirements,binder-examples,GitHub,2,repo url,binder url
9,tidyverse,tidyverse,GitHub,2,repo url,binder url


## Most popular repositories in the last 30 days 

In [14]:
# Work on a copy so the shared launch_data stays untouched for the other windows.
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, time_range='30d')
display_popular_repos(data=popular_repos)

Total number of launches: 63 in 30d


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,5,repo url,binder url
1,workshop_girls_day,gesiscss,GitHub,3,repo url,binder url
2,simple-binder-repo,bitnik,GitHub,3,repo url,binder url
3,convert2geojson,computational-antiquity,GitHub,2,repo url,binder url
4,conda,binder-examples,GitHub,2,repo url,binder url
5,pangeo-example-notebooks,pangeo-data,GitHub,2,repo url,binder url
6,jupyter-renderers,jupyterlab,GitHub,2,repo url,binder url
7,gwapps,gwastro,GitHub,2,repo url,binder url
8,requirements,binder-examples,GitHub,2,repo url,binder url
9,tidyverse,tidyverse,GitHub,2,repo url,binder url


## Most popular repositories in the last 60 days 

In [15]:
# Work on a copy so the shared launch_data stays untouched for the other windows.
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, time_range='60d')
display_popular_repos(data=popular_repos)

Total number of launches: 63 in 60d


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,5,repo url,binder url
1,workshop_girls_day,gesiscss,GitHub,3,repo url,binder url
2,simple-binder-repo,bitnik,GitHub,3,repo url,binder url
3,convert2geojson,computational-antiquity,GitHub,2,repo url,binder url
4,conda,binder-examples,GitHub,2,repo url,binder url
5,pangeo-example-notebooks,pangeo-data,GitHub,2,repo url,binder url
6,jupyter-renderers,jupyterlab,GitHub,2,repo url,binder url
7,gwapps,gwastro,GitHub,2,repo url,binder url
8,requirements,binder-examples,GitHub,2,repo url,binder url
9,tidyverse,tidyverse,GitHub,2,repo url,binder url
