Analyse launches that contain "corona" or "covid" in `spec`.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from urllib.parse import unquote
%matplotlib inline

In [None]:
# load the launch records from the CSV export; judging by the file name they
# presumably cover 2019-12-01 to 2020-09-10 -- TODO confirm
df = pd.read_csv("covid_binder_launches_2019_12_01_2020_09_10.csv")

In [None]:
# print a summary of the frame: columns, dtypes and non-null counts
df.info()

In [None]:
# Parse the raw timestamps into a datetime "date" column, keep only the
# columns the analysis needs, and use the date as the index.
df = (
    df.assign(date=pd.to_datetime(df["timestamp"]))
    [["date", "provider", "spec"]]
    .set_index("date")
)

In [None]:
# preview the first rows of the prepared frame
df.head()

## Launch analysis

### Number of launches

In [None]:
# every row is one launch, so the row count is the number of launches
df.shape[0]

### Number of launches per day

In [None]:
# count the rows per calendar day of the index; .size() yields a Series,
# so name it and turn it into a one-column dataframe
df_launch = df.groupby([df.index.date]).size().rename("launches").to_frame()

In [None]:
ax = df_launch.plot.bar(y="launches", use_index=True, figsize=(20, 5))

# label only the Mondays (isoweekday() == 1); labelling every day would
# make the axis unreadable
x = ["" if day.isoweekday() != 1 else day.strftime('%b %d') for day in df_launch.index]
# set_xticklabels returns the list of labels; bind it to a throwaway name
# so the notebook does not print it
_ = ax.set_xticklabels(x, rotation=0)

## Repo analysis

In [None]:
# distinct provider names that occur in the launches
df["provider"].unique()

In [None]:
def unique_repo_info(provider, spec):
    """
    Strips out the ref info and returns the unique repo info from provider and spec.

    Parameters
    ----------
    provider : str
        Provider name. Only 'GitHub', 'GitLab' and 'Git' are parsed.
    spec : str
        Repo part of the launch, optionally followed by '/<ref>':

        - GitHub: '<org>/<repo>[/<ref>]'
        - GitLab: '<url-quoted namespace>[/<ref>]'
        - Git:    '<url-quoted repo url>[/<ref>]'

    Returns
    -------
    str
        '<prefix>/<namespace>', where the prefix is the short provider code
        ('gh', 'gl', 'git') and a trailing '.git' of the namespace is removed.

    Raises
    ------
    NotImplementedError
        If the provider is not one of the parsed providers.
    ValueError
        If a GitHub spec does not contain at least '<org>/<repo>'.
    """
    prefix = {
        'GitHub': 'gh',
        'Gist': 'gist',
        'GitLab': 'gl',
        'Git': 'git',
        'Zenodo': 'zenodo',
        'Figshare': 'figshare',
        'Hydroshare': 'hydroshare',
        'Dataverse': 'dataverse',
    }

    if provider == 'GitHub':
        # maxsplit=2 keeps refs that contain '/' (e.g. 'feature/x') in one piece
        parts = spec.split('/', 2)
        if len(parts) < 2:
            raise ValueError(f"GitHub spec {spec!r} is not of the form 'org/repo[/ref]'")
        namespace = f"{parts[0]}/{parts[1]}"
    elif provider == 'GitLab':
        # the namespace is url-quoted, so the first '/' starts the ref;
        # a spec without a ref is taken as the namespace itself
        namespace = unquote(spec.split('/', 1)[0])
    elif provider == 'Git':
        # the repo url is url-quoted; everything after the last '/' is the ref
        namespace = unquote(spec.rsplit('/', 1)[0])
    else:
        raise NotImplementedError(f"parsing {provider} is not implemented")

    # 'repo' and 'repo.git' refer to the same repository
    if namespace.endswith(".git"):
        namespace = namespace[:-len(".git")]

    return f'{prefix[provider]}/{namespace}'

In [None]:
def _repo_for_launch(launch):
    """Return the ref-less repo identifier for one launch row."""
    return unique_repo_info(launch["provider"], launch["spec"])


# add the repo identifier of every launch as a new column
df["repo"] = df.apply(_repo_for_launch, axis=1)

In [None]:
# frame holding only the repo column (same index as df), used for the repo statistics
df_repo = df[["repo"]]

In [None]:
# preview the first rows of the repo frame
df_repo.head()

### Number of unique repos

In [None]:
# number of distinct repos; dropna=False counts a missing value like unique() does
df_repo["repo"].nunique(dropna=False)

### Popular repos

In [None]:
# launches per repo, most launched first, top ten only
(
    df_repo.groupby(["repo"])
    .size()
    .reset_index(name="launches")
    .sort_values(by="launches", ascending=False)
    .iloc[:10]
)

### Number of launched repos per day

In [None]:
# number of distinct repos launched on each calendar day
# nunique: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.DataFrameGroupBy.nunique.html
df_repo_unique = (
    df_repo.groupby([df_repo.index.date])
    .nunique()
    .rename(columns={"repo": "repos"})
)

In [None]:
ax = df_repo_unique.plot.bar(y="repos", use_index=True, figsize=(20, 5))

# label only the Mondays (isoweekday() == 1); labelling every day would
# make the axis unreadable
x = ["" if day.isoweekday() != 1 else day.strftime('%b %d') for day in df_repo_unique.index]
# set_xticklabels returns the list of labels; bind it to a throwaway name
# so the notebook does not print it
_ = ax.set_xticklabels(x, rotation=90)