In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path
from collections import defaultdict

In [None]:
# Directory holding the unpacked database dump; edit this to point at a local copy.
path_to_mavedb_data = Path("/path/to/mavedb-dump.20241114101443")
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale encoding (which is not UTF-8 on every system).
with open(path_to_mavedb_data / "main.json", encoding="utf-8") as handle:
    api_data = json.load(handle)

In [None]:
# Display how many experiment sets the loaded dump contains.
len(api_data['experimentSets'])

In [None]:
# Keep only the text before the first 'T' of the dump's 'asOf' stamp;
# it is reused below to name the saved figure files.
dump_date = api_data['asOf'].partition('T')[0]
dump_date

In [None]:
# Count every experiment across all experiment sets in one pass.
total_experiments = sum(
    len(experiment_set['experiments'])
    for experiment_set in api_data['experimentSets']
)
total_experiments

In [None]:
# Tally, per publication year, the number of experiments and the number of
# variants across each experiment's score sets.
experiments_by_year = defaultdict(int)
variants_by_year = defaultdict(int)
for eset in api_data['experimentSets']:
    for exp in eset['experiments']:
        # The year is the first '-'-separated field of 'publishedDate'.
        year = int(exp['publishedDate'].split('-')[0])
        experiments_by_year[year] += 1
        # Adding a sum (0 when there are no score sets) registers the year in
        # both tallies, so the two dicts always share the same set of keys.
        variants_by_year[year] += sum(ss['numVariants'] for ss in exp['scoreSets'])

# Re-key both tallies chronologically: the cumulative sums and the plot below
# follow dict order, which would otherwise be the order years were encountered.
experiments_by_year = defaultdict(int, sorted(experiments_by_year.items()))
variants_by_year = defaultdict(int, sorted(variants_by_year.items()))

In [None]:
# Display the per-year experiment counts.
experiments_by_year

In [None]:
# Running total of experiments by year. The years are sorted first so the
# accumulation is chronological regardless of the order in which the tally was
# filled; .tolist() turns the numpy scalars back into plain Python ints.
experiments_by_year_cumulative = dict(
    zip(
        sorted(experiments_by_year),
        np.cumsum(
            [count for _year, count in sorted(experiments_by_year.items())]
        ).tolist(),
    )
)
experiments_by_year_cumulative

In [None]:
# Display the per-year variant counts.
variants_by_year

In [None]:
# Running total of variants by year, accumulated chronologically. Every year
# seen in either tally is included (contributing 0 variants when absent from
# variants_by_year) so this series lines up year-for-year with the experiments.
# .get() is used so the lookup does not insert keys into the defaultdict.
variant_years = sorted(set(experiments_by_year) | set(variants_by_year))
variants_by_year_cumulative = dict(
    zip(
        variant_years,
        np.cumsum([variants_by_year.get(year, 0) for year in variant_years]).tolist(),
    )
)
variants_by_year_cumulative

In [None]:
# set the font
# NOTE(review): assumes the Lato font is installed locally - confirm, otherwise
# matplotlib will substitute another font.
font = {'family': 'Lato',
        'weight': 'normal',
        'size' : 15,}
mpl.rc('font', **font)

# Default top of the variant axis, in millions of variants. The axis is only
# extended beyond this when the data would otherwise be clipped.
DEFAULT_VARIANT_AXIS_MAX = 8.0

# create the figure
fig, ax = plt.subplots(figsize=(10, 6))

# Years are drawn as evenly spaced categories; remember each year's slot so
# the variant line can be placed on the bar of the same year.
years = list(experiments_by_year_cumulative.keys())
year_positions = {year: position for position, year in enumerate(years)}

# make the barplot of experiments
bars = ax.bar(range(len(years)), list(experiments_by_year_cumulative.values()), align="center", zorder=10, color="tab:blue")
ax.set_xticks(range(len(years)), years)
ax.set_ylabel("Cumulative Datasets")
ax.set_xlabel("Data Release Year")
ax.grid(which="major", axis="y", zorder=0)
# End the dataset axis exactly on its last tick so both axes can share tick heights.
ax.set_ylim(0, ax.get_yticks()[-1])

# make the lineplot of variants
ax2 = ax.twinx()
# Look positions up by year rather than assuming both series have the same
# length and order; a KeyError here means a variant year has no matching bar.
variant_x = [year_positions[year] for year in variants_by_year_cumulative]
variant_millions = [count / 1e6 for count in variants_by_year_cumulative.values()]
lines = ax2.plot(variant_x, variant_millions, marker='s', color="tab:green", zorder=20)[0]
ax2.set_ylabel("Cumulative Variants (Millions)")

# rescale the variant y-axis so that the ticks line up
n_ticks = len(ax.get_yticks())
peak_millions = max(variant_millions, default=0.0)
if peak_millions <= DEFAULT_VARIANT_AXIS_MAX:
    variant_axis_max = DEFAULT_VARIANT_AXIS_MAX
else:
    # Grow the axis so the line is not clipped, rounding the tick spacing up
    # to the next half million to keep the tick labels tidy.
    n_intervals = max(n_ticks - 1, 1)
    variant_axis_max = float(np.ceil(peak_millions / n_intervals * 2) / 2 * n_intervals)
ax2.set_ylim(0, variant_axis_max)
ax2.set_yticks(np.linspace(0, variant_axis_max, n_ticks))
ax.legend((bars, lines), ("Datasets", "Variants"))

# Save vector and raster copies, named after the dump date.
fig.savefig(f"mavedb_growth_{dump_date}.pdf")
fig.savefig(f"mavedb_growth_{dump_date}.png")