In [1]:
import pandas as pd
import requests
from datetime import datetime, timedelta, timezone

In [2]:
PROVIDER_PREFIXES = {
    'Git': 'git',
    'GitHub': 'gh',
    'GitLab': 'gl',
    # 'gist.github.com': 'gist',
}

In [3]:
def binder_url(provider, org, repo):
    #  url in last column in gallery to launch repository we call in d - list
    provider_prefix = PROVIDER_PREFIXES.get(provider, '')
    if provider_prefix:
        return f'https://notebooks.gesis.org/binder/v2/{provider_prefix}/{org}/{repo}/master'
    return ''

In [4]:
def ts_to_dt(ts):
    # timestamp to datetime 1544621843= 12.12.2018 14:26:34
    return datetime.utcfromtimestamp(ts)

# Launch data

In [5]:
REPOS_TO_FILTER = [
    # 'gesiscss/binder-stats'
]

In [6]:
def get_launch_data(time_range='90d', filter_="{status='success'}"):
    # query in promotheus to take all data (90d)
    # NOTE: increase() doesn't return first +1 (0->1)
    # query = f"increase(binderhub_launch_count_total{filter_}[{time_range}])"
    query = f"binderhub_launch_count_total{filter_}[{time_range}]"
    print(query)
    resp = requests.get('https://notebooks.gesis.org/prometheus/api/v1/query',
                        params={'query': query})
    data = resp.json()['data']['result']
    return data

In [7]:
def process_launch_data(data):
    d = {}  # {repo_url: [repo, org, provider, [launches], repo_url, binder_url]}
    for container in data:
        # first get meta data
        # e.g row in data: provider="GitHub",repo="https://github.com/gesiscss/ptm" , status="success"}
        repo_url = container['metric']['repo']
        provider = container['metric']['provider']
        provider_org_repo = repo_url.replace('https://', '').rstrip('.git').rsplit('/', 2)
        if len(provider_org_repo) == 2:
            # some repo urls (e.g. gist) don't contain org/user info
            provider, repo = provider_org_repo
            org = ''
        else:
            provider_, org, repo = provider_org_repo
        if f'{org}/{repo}' in REPOS_TO_FILTER:
            continue
        # detect changes of launch count
        # get ts of each increase of launch count
        launch_count_increases = []
        launch_count_prev = 0

        # e.g of container['value'] (1542986164.636, 1)
        for ts, launch_count in container['values']:
            launch_count = int(launch_count)
            if launch_count != launch_count_prev:
                # assert launch_count > launch_count_prev
                launch_count_increases.append((ts, launch_count))
            launch_count_prev = launch_count

        if repo_url not in d:
            d[repo_url] = [repo, org, provider, [launch_count_increases], repo_url, binder_url(provider, org, repo)]
        else:
            # same repo can be launched on different instances (after a new deployment/update)
            d[repo_url][3].append(launch_count_increases)

    # sort and flatten launch_count_increases
    for _, data in d.items():
        launch_count_increases_ = []
        count_prev = 0
        launch_count_increases = data[3]
        # sort with timestamp (of first element of each sub-list)
        launch_count_increases.sort(key=lambda x: x[0][0])
        for increase in launch_count_increases:
            for i in increase:
                launch_count_increases_.append((ts_to_dt(i[0]), i[1] + count_prev))
            count_prev = launch_count_increases_[-1][1]
        data[3] = launch_count_increases_

    #     print('Since', ts_to_dt(min(value_times)), 'in UTC')
    d = list(d.values())
    return d

In [8]:
launch_data = get_launch_data()
launch_data = process_launch_data(launch_data)

binderhub_launch_count_total{status='success'}[90d]


# Popular Repos

In [9]:
def get_popular_repos(launch_data, time_range):
    if time_range.endswith('h'):
        p = {'hours': int(time_range.split('h')[0])}
    elif time_range.endswith('d'):
        p = {'days': int(time_range.split('d')[0])}
    else:
        raise ValueError('Time range must be in hours or days.')
    popular_repos = []
    time_delta = timedelta(**p)
    start_dt = datetime.utcnow() - time_delta
    for data in launch_data:
        # get the launch count just before time_range
        first_value = 0
        for dt, launch_count in data[3]:
            if dt < start_dt:
                first_value = launch_count
        # increase in launch count during time_range
        data[3] = launch_count - first_value
        if data[3] != 0:
            popular_repos.append(data)
    # sort according to launch count
    popular_repos.sort(key=lambda x: x[3], reverse=True)
    print("Total number of launches: " + str(sum([i[3] for i in popular_repos])) + " in " + str(time_range))
    print("Number of repos launched: " + str(len(popular_repos)))
    return popular_repos

In [10]:
def display_popular_repos(data):
    df = pd.DataFrame(data, 
                      columns = ['repo', 'org', 'provider', 'launches', 'repo_url', 'binder_url'])
    df = df.sort_values('launches', ascending=False).reset_index(drop=True)
    df = df.style.format({'repo_url':lambda x: f'<a target="_blank" href="{x}">repo url</a>', 
                          'binder_url': lambda x: f'<a target="_blank" href="{x}">binder url</a>'})
    display(df)

In [11]:
from copy import deepcopy

## Popular repositories in last 1 hour

In [12]:
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, '1h')
display_popular_repos(popular_repos)

Total number of launches: 1 in 1h
Number of repos launched: 1


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,binder-stats,gesiscss,GitHub,1,repo url,binder url


## Popular repositories in last 24 hours

In [13]:
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, '24h')
display_popular_repos(popular_repos)

Total number of launches: 19 in 24h
Number of repos launched: 5


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PyStan-Binder,arnim,GitHub,9,repo url,binder url
1,binder-stats,gesiscss,GitHub,4,repo url,binder url
2,RStan-Binder,arnim,GitHub,3,repo url,binder url
3,stmdemo,arnim,GitHub,2,repo url,binder url
4,PythonDataScienceHandbook,jakevdp,GitHub,1,repo url,binder url


## Popular repositories in last week

In [14]:
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, '7d')
display_popular_repos(popular_repos)

Total number of launches: 181 in 7d
Number of repos launched: 44


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,gesis_dataday_19,jobreu,GitHub,76,repo url,binder url
1,PyStan-Binder,arnim,GitHub,18,repo url,binder url
2,wikiwho_demo,gesiscss,GitHub,10,repo url,binder url
3,RStan-Binder,arnim,GitHub,7,repo url,binder url
4,binder-stats,gesiscss,GitHub,7,repo url,binder url
5,GitHub_traffic_crawler,gesiscss,GitHub,6,repo url,binder url
6,PythonDataScienceHandbook,jakevdp,GitHub,6,repo url,binder url
7,wikiwho_tutorial,gesiscss,GitHub,4,repo url,binder url
8,ligo-binder,minrk,GitHub,4,repo url,binder url
9,Tweet_sentimen,UdovenkoVolodymyr,GitHub,3,repo url,binder url


## Popular repositories in the last 30 days 

In [15]:
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, '30d')
display_popular_repos(popular_repos)

Total number of launches: 375 in 30d
Number of repos launched: 66


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,gesis_dataday_19,jobreu,GitHub,92,repo url,binder url
1,PythonDataScienceHandbook,jakevdp,GitHub,34,repo url,binder url
2,wikiwho_demo,gesiscss,GitHub,29,repo url,binder url
3,PyStan-Binder,arnim,GitHub,18,repo url,binder url
4,ptm,gesiscss,GitHub,16,repo url,binder url
5,wikiwho_tutorial,gesiscss,GitHub,11,repo url,binder url
6,RStan-Binder,arnim,GitHub,10,repo url,binder url
7,binder-stats,gesiscss,GitHub,10,repo url,binder url
8,BIGSSS,JuKo007,GitHub,9,repo url,binder url
9,pandas-cookbook,jvns,GitHub,9,repo url,binder url


## Popular repositories in the last 60 days 

In [16]:
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, '60d')
display_popular_repos(popular_repos)

Total number of launches: 2189 in 60d
Number of repos launched: 311


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,180,repo url,binder url
1,wikiwho_demo,gesiscss,GitHub,110,repo url,binder url
2,iris_python,bitnik,GitHub,101,repo url,binder url
3,gesis_dataday_19,jobreu,GitHub,92,repo url,binder url
4,ptm,gesiscss,GitHub,85,repo url,binder url
5,ipython-in-depth,ipython,GitHub,76,repo url,binder url
6,RStan-Binder,arnim,GitHub,75,repo url,binder url
7,iris_r,bitnik,GitHub,67,repo url,binder url
8,wikiwho_tutorial,gesiscss,GitHub,65,repo url,binder url
9,gesis-meta-analysis-2018,berndweiss,GitHub,58,repo url,binder url


## Popular repositories in the last 90 days 

In [17]:
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos(launch_data_, '90d')
display_popular_repos(popular_repos)

Total number of launches: 2899 in 90d
Number of repos launched: 373


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,311,repo url,binder url
1,ptm,gesiscss,GitHub,126,repo url,binder url
2,binder-stats,gesiscss,GitHub,113,repo url,binder url
3,wikiwho_demo,gesiscss,GitHub,110,repo url,binder url
4,iris_python,bitnik,GitHub,101,repo url,binder url
5,gesis_dataday_19,jobreu,GitHub,92,repo url,binder url
6,ipython-in-depth,ipython,GitHub,86,repo url,binder url
7,wikiwho_tutorial,gesiscss,GitHub,81,repo url,binder url
8,RStan-Binder,arnim,GitHub,79,repo url,binder url
9,workshop_girls_day,gesiscss,GitHub,73,repo url,binder url


## Get launched repositories in a period

In [18]:
def get_popular_repos_in_period(launch_data, from_dt, to_dt):
    """
    from and to datetimes must be in CET (UTC +1), they will be converted into UTC and then calculations are done.
    For example to get launched repos in December 2018
    from_dt="2018-12-01 00:00:00" (included)
    to_dt="2019-01-01 00:00:00" (not included)   
    """
    popular_repos = []
    start_dt = datetime.strptime(from_dt, '%Y-%m-%d %H:%M:%S') - timedelta(hours=1)
    end_dt = datetime.strptime(to_dt, '%Y-%m-%d %H:%M:%S') - timedelta(hours=1)
    #print(start_dt, end_dt)
    for data in launch_data:
        # get the launch count just before the of the period
        # and just after the end of the period
        first_value = 0
        last_value = 0
        for dt, launch_count in data[3]:
            if dt < start_dt:
                first_value = launch_count
                #print(1, data[0], dt, start_dt, first_value)
            elif dt < end_dt:
                last_value = launch_count
                #print(2, data[0], dt, end_dt, last_value)
            else:
                break
        # increase in launch count in a period
        data[3] = last_value - first_value
        if data[3] > 0:
            popular_repos.append(data)
    # sort according to launch count
    popular_repos.sort(key=lambda x: x[3], reverse=True)
    print(f"Total number of launches: {sum([i[3] for i in popular_repos])} between {from_dt} and {to_dt}")
    print(f"Number of repos launched: {len(popular_repos)}")
    return popular_repos

In [19]:
# get number of launches in January 2019
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos_in_period(launch_data_, "2019-01-01 00:00:00", "2019-02-01 00:00:00")
display_popular_repos(popular_repos)

Total number of launches: 335 between 2019-01-01 00:00:00 and 2019-02-01 00:00:00
Number of repos launched: 65


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,gesis_dataday_19,jobreu,GitHub,92,repo url,binder url
1,PythonDataScienceHandbook,jakevdp,GitHub,25,repo url,binder url
2,PyStan-Binder,arnim,GitHub,18,repo url,binder url
3,wikiwho_demo,gesiscss,GitHub,17,repo url,binder url
4,ptm,gesiscss,GitHub,13,repo url,binder url
5,RStan-Binder,arnim,GitHub,10,repo url,binder url
6,binder-stats,gesiscss,GitHub,10,repo url,binder url
7,wikiwho_tutorial,gesiscss,GitHub,10,repo url,binder url
8,pandas-cookbook,jvns,GitHub,8,repo url,binder url
9,GitHub_traffic_crawler,gesiscss,GitHub,7,repo url,binder url


In [20]:
# get number of launches in December 2018
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos_in_period(launch_data_, "2018-12-01 00:00:00", "2019-01-01 00:00:00")
display_popular_repos(popular_repos)

Total number of launches: 1543 between 2018-12-01 00:00:00 and 2019-01-01 00:00:00
Number of repos launched: 262


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,161,repo url,binder url
1,iris_python,bitnik,GitHub,93,repo url,binder url
2,wikiwho_demo,gesiscss,GitHub,67,repo url,binder url
3,iris_r,bitnik,GitHub,65,repo url,binder url
4,ipython-in-depth,ipython,GitHub,64,repo url,binder url
5,RStan-Binder,arnim,GitHub,59,repo url,binder url
6,ptm,gesiscss,GitHub,48,repo url,binder url
7,gesis-meta-analysis-2018,berndweiss,GitHub,45,repo url,binder url
8,wikiwho_tutorial,gesiscss,GitHub,29,repo url,binder url
9,workshop_girls_day,gesiscss,GitHub,28,repo url,binder url


In [21]:
# get number of launches in November 2018
launch_data_ = deepcopy(launch_data)
popular_repos = get_popular_repos_in_period(launch_data_, "2018-11-01 00:00:00", "2018-12-01 00:00:00")
display_popular_repos(popular_repos)

Total number of launches: 1021 between 2018-11-01 00:00:00 and 2018-12-01 00:00:00
Number of repos launched: 202


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PythonDataScienceHandbook,gesiscss,GitHub,150,repo url,binder url
1,binder-stats,gesiscss,GitHub,81,repo url,binder url
2,ptm,gesiscss,GitHub,65,repo url,binder url
3,wikiwho_tutorial,gesiscss,GitHub,42,repo url,binder url
4,workshop_girls_day,gesiscss,GitHub,42,repo url,binder url
5,pymc3,pymc-devs,GitHub,27,repo url,binder url
6,textbook,DS-100,GitHub,26,repo url,binder url
7,wikiwho_demo,gesiscss,GitHub,26,repo url,binder url
8,bokeh-notebooks,bokeh,GitHub,25,repo url,binder url
9,gesis-meta-analysis-2018,berndweiss,GitHub,22,repo url,binder url


In [22]:
# get number of launches in last 24 hours
launch_data_ = deepcopy(launch_data)
# datetime.now() returns UTC+2
dt1 = datetime.utcnow() + timedelta(hours=1, minutes=1)
dt2 = datetime.utcnow() - timedelta(hours=23)
popular_repos = get_popular_repos_in_period(launch_data_, dt2.strftime('%Y-%m-%d %H:%M:%S'), dt1.strftime('%Y-%m-%d %H:%M:%S'))
display_popular_repos(popular_repos)

Total number of launches: 19 between 2019-01-22 11:06:03 and 2019-01-23 11:07:03
Number of repos launched: 5


Unnamed: 0,repo,org,provider,launches,repo_url,binder_url
0,PyStan-Binder,arnim,GitHub,9,repo url,binder url
1,binder-stats,gesiscss,GitHub,4,repo url,binder url
2,RStan-Binder,arnim,GitHub,3,repo url,binder url
3,stmdemo,arnim,GitHub,2,repo url,binder url
4,PythonDataScienceHandbook,jakevdp,GitHub,1,repo url,binder url
