In [None]:
import pandas as pd
import requests
from time import sleep
from datetime import datetime, timedelta

# First day to check (inclusive).
start_date = datetime(2020, 2, 6)
# Last day to check (inclusive): midnight at the start of today, as a naive
# local datetime.
end_date = datetime.combine(datetime.today().date(), datetime.min.time())

# The comparison advances one calendar day per iteration.
delta = timedelta(days=1)
current_date = start_date
# Base name shared by the "error_" and "success_" result files; it carries the
# time this run started so that separate runs do not append to the same files.
output_file = f'output_{datetime.now().isoformat()}.txt'
# Host of the GESIS binder gallery API; also one of the origins that is
# excluded from the comparison below.
domain = 'notebooks.gesis.org'


def _launch_counts(frame):
    """Count launches per (origin, provider, spec, version), largest first."""
    return (
        frame.drop(columns=['schema', 'status'])
        .groupby(["origin", "provider", "spec", "version"])
        .count()
        .sort_values("timestamp", ascending=False)
    )


# For every day from current_date up to and including end_date, compare the
# launch events of that day in the events archive with the launches returned
# by the binder gallery API. Each day appends one line to either the
# "error_..." or the "success_..." result file.
while current_date <= end_date:
    a_name = f"events-{str(current_date.date())}.jsonl"
    archive_url = f"https://archive.analytics.mybinder.org/{a_name}"
    # the API is queried for the whole day: 00:00:00 up to 23:59:59
    from_dt = current_date.isoformat()
    to_dt = (current_date+delta-timedelta(seconds=1)).isoformat()
    api_url = f'https://{domain}/gallery/api/v1.0/launches/{from_dt}/{to_dt}'

    print(a_name, from_dt, to_dt)
    print(archive_url)
    print(api_url)

    # first read events from archive
    df = pd.read_json(archive_url, lines=True)
    # handle exceptions in the events archive
    # events before 12.06.2019 have no origin value
    if 'origin' not in df.columns:
        df["origin"] = "mybinder.org"
    # events-2019-06-12.jsonl has mixed rows: with and without origin value
    if a_name == "events-2019-06-12.jsonl":
        # assign the result back: a chained in-place fillna on a column
        # selection is not guaranteed to modify df itself
        df['origin'] = df['origin'].fillna('mybinder.org')
    # in some archives Gist launches have wrong provider (GitHub)
    elif a_name == "events-2018-11-25.jsonl":
        # compute the row mask once, before the spec values are rewritten
        wrong_gist = df['spec'] == "https%3A%2F%2Fgist.github.com%2Fjakevdp/256c3ad937af9ec7d4c65a29e5b6d454"
        df.loc[wrong_gist, "provider"] = "Gist"
        df.loc[wrong_gist, "spec"] = "jakevdp/256c3ad937af9ec7d4c65a29e5b6d454"
    elif a_name == "events-2019-01-28.jsonl":
        df.loc[df['spec'] == "loicmarie/ade5ea460444ea0ff72d5c94daa14500",
               "provider"] = "Gist"
    elif a_name == "events-2019-02-22.jsonl":
        df.loc[df['spec'] == "minrk/6d61e5edfa4d2947b0ee8c1be8e79154",
               "provider"] = "Gist"

    # filter out gesis launches, because archive is updated periodically,
    # but binder gallery is updated instantly for gesis launches
    df = df.loc[~df['origin'].isin([domain, "gesis.mybinder.org"])]

    # now read launch events from binder gallery api
    launches = []
    # because of pagination the api gives 100 results per page
    # so for analysis you have to take data in all pages
    next_page = 0
    while next_page is not None:
        api_query_url = api_url
        if next_page:
            api_query_url = f"{api_url}?page={next_page}"
        # a timeout keeps a stalled connection from blocking the run forever
        r = requests.get(api_query_url, timeout=60)
        response = r.json()
        # check the limit of queries per second/minute
        message = response.get("message", "")
        if message in ["2 per 1 second", "100 per 1 minute"]:
            # rate limit hit: wait, then request the same page again
            sleep(1)
            continue
        launches.extend(response['launches'])
        next_page = response['next_page']
    data = pd.DataFrame(launches)
    if data.empty:
        # no launches returned: the frame has no columns to filter or convert
        df2 = data
    else:
        # filter out launches without origin and gesis launches, because
        # archive is updated periodically, but binder gallery is updated
        # instantly for gesis launches
        keep = ~data['origin'].isin(["", domain, "gesis.mybinder.org"])
        # copy so that the column assignment below acts on an independent frame
        df2 = data.loc[keep].copy()
        # convert string timestamp to pandas timestamp
        df2['timestamp'] = pd.to_datetime(df2['timestamp'])

    # do the comparison
    if df.empty or df2.empty:
        # at least one side has nothing to group; they agree only if both are empty
        is_equal = df.empty and df2.empty
    else:
        a = _launch_counts(df)
        b = _launch_counts(df2)
        is_equal = a.equals(b)
    if len(df) != len(df2) or not is_equal:
        with open("error_"+output_file, 'a') as f:
            f.write(f"{current_date.isoformat()}, archives len: {len(df)}, "
                    f"gallery api len: {len(df2)}, diff: {len(df)-len(df2)}, is_equal: {is_equal}\n")
        print("-------> error")
        # debugging aid: uncomment to see which origins differ
        #print(df.origin.unique())
        #print(df2.origin.unique())
        #print(set(df.origin.unique())-set(df2.origin.unique()))
    else:
        with open("success_"+output_file, 'a') as f:
            f.write(f"{current_date.isoformat()}\n")

    current_date += delta
    print("\n")