## Imports

In [None]:
import sfacts as sf

In [None]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy as sp
import pyro
import pyro.distributions as dist
import torch
from functools import partial
from tqdm import tqdm
import xarray as xr
import warnings
from torch.jit import TracerWarning

In [None]:
warnings.filterwarnings(
    "ignore",
    message="torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.",
    category=TracerWarning,
#     module="trace_elbo",  # FIXME: What is the correct regex for module?
#     lineno=5,
)

## Experiments

### Experiment 0: Average and variation in fitting accuracy

In [None]:
# Experiment 0: keep every setting fixed and vary only the two seeds, calling
# sf.workflow.simulate_fit_and_evaluate once per (seed_fit, seed) pair
# (10 fit seeds x 5 simulation seeds = 50 runs).
result_columns = ['seed_fit', 'seed', 'generr', 'comperr', 'scounterr', 'entropy', 'runtime']
results = []
for seed_fit in range(10):
    for seed in [0, 1, 3, 4, 5]:
        # Keyword arguments forwarded as `sim_kwargs`; rebuilt on every
        # iteration so no dict object is shared between calls.
        sim_kwargs = {
            'data': {'alpha_hyper_mean': 100.0},
            'gamma_hyper': 0.01,
            'delta_hyper_temp': 0.01,
            'delta_hyper_p': 0.7,
            'pi_hyper': 0.5,
            'rho_hyper': 2.0,
            'mu_hyper_mean': 1.0,
            'mu_hyper_scale': 0.5,
            'm_hyper_r': 10.0,
            'alpha_hyper_scale': 0.5,
            'epsilon_hyper_alpha': 1.5,
            'epsilon_hyper_beta': 1.5 / 0.01,
            'device': 'cuda',
        }
        # Keyword arguments forwarded as `fit_kwargs`.
        fit_kwargs = {
            's': 200,
            'gamma_hyper': 0.01,
            'pi_hyper': 0.01,
            'rho_hyper': 0.5,
            'mu_hyper_mean': 5,
            'mu_hyper_scale': 5.0,
            'm_hyper_r': 10.0,
            'delta_hyper_temp': 0.1,
            'delta_hyper_p': 0.9,
            'alpha_hyper_hyper_mean': 100.0,
            'alpha_hyper_hyper_scale': 10.0,
            'alpha_hyper_scale': 0.5,
            'epsilon_hyper_alpha': 1.5,
            'epsilon_hyper_beta': 1.5 / 0.01,
            'device': 'cuda',
            'lag': 10,
            'lr': 0.1,
            'progress': True,
        }
        # `sim` and `fit` are not used below; they are kept as notebook-level
        # names holding the objects from the most recent run.
        (generr, comperr, scounter, entropy, runtime, sim, fit) = sf.workflow.simulate_fit_and_evaluate(
            s_sim=200,
            n_sim=1000,
            g_sim=1000,
            n_fit=1000,
            g_fit=1000,
            sim_kwargs=sim_kwargs,
            preclust=False,
            fit_kwargs=fit_kwargs,
            postclust_kwargs={'thresh': 0.1},
            seed_sim=seed,
            seed_fit=seed_fit,
            quiet=True,
        )
        # One record per run; the same tuple is stored and echoed tab-separated.
        row = (seed_fit, seed, generr, comperr, scounter, entropy, runtime)
        results.append(row)
        print(*row, sep='\t')

# Note: the `scounter` value is stored under the column name 'scounterr'.
results0 = pd.DataFrame(results, columns=result_columns)

In [None]:
# One panel per statistic from results0. The grid is 3x2 (six axes) but only
# five statistics are plotted, so the last panel stays empty.
panel_stats = ['generr', 'comperr', 'scounterr', 'entropy', 'runtime']
fig, axs = plt.subplots(3, 2)

# Index once by (seed_fit, seed); unstacking a single statistic then puts
# seed_fit on the x-axis and draws one line per seed.
indexed = results0.set_index(['seed_fit', 'seed'])
for ax, stat in zip(axs.flatten(), panel_stats):
    indexed[stat].unstack().plot(ax=ax)
    ax.set_title(stat)
    ax.legend_.set_visible(False)
fig.tight_layout()

### Experiment 1: Sensitivity to strain misspecification

In [None]:
# Experiment 1: simulate with s_sim=200 and vary the `s` value passed in
# `fit_kwargs`, calling sf.workflow.simulate_fit_and_evaluate once per
# (s_fit, seed) pair. The same seed is used for simulation and fitting.
result_columns = ['s_fit', 'seed', 'generr', 'comperr', 'scounterr', 'entropy', 'runtime']
results = []
for s_fit in [100, 175, 200, 225, 300, 500]:
    for seed in [0, 1, 3, 4, 5]:
        # Keyword arguments forwarded as `sim_kwargs`; rebuilt on every
        # iteration so no dict object is shared between calls.
        sim_kwargs = {
            'data': {'alpha_hyper_mean': 100.0},
            'gamma_hyper': 0.01,
            'delta_hyper_temp': 0.01,
            'delta_hyper_p': 0.7,
            'pi_hyper': 0.5,
            'rho_hyper': 2.0,
            'mu_hyper_mean': 1.0,
            'mu_hyper_scale': 0.5,
            'm_hyper_r': 10.0,
            'alpha_hyper_scale': 0.5,
            'epsilon_hyper_alpha': 1.5,
            'epsilon_hyper_beta': 1.5 / 0.01,
            'device': 'cuda',
        }
        # Keyword arguments forwarded as `fit_kwargs`; `s` is the swept value.
        fit_kwargs = {
            's': s_fit,
            'gamma_hyper': 0.01,
            'pi_hyper': 0.01,
            'rho_hyper': 0.5,
            'mu_hyper_mean': 5,
            'mu_hyper_scale': 5.0,
            'm_hyper_r': 10.0,
            'delta_hyper_temp': 0.1,
            'delta_hyper_p': 0.9,
            'alpha_hyper_hyper_mean': 100.0,
            'alpha_hyper_hyper_scale': 10.0,
            'alpha_hyper_scale': 0.5,
            'epsilon_hyper_alpha': 1.5,
            'epsilon_hyper_beta': 1.5 / 0.01,
            'device': 'cuda',
            'lag': 10,
            'lr': 0.1,
            'progress': True,
        }
        # `sim` and `fit` are not used below; they are kept as notebook-level
        # names holding the objects from the most recent run.
        (generr, comperr, scounter, entropy, runtime, sim, fit) = sf.workflow.simulate_fit_and_evaluate(
            s_sim=200,
            n_sim=1000,
            g_sim=1000,
            n_fit=1000,
            g_fit=1000,
            sim_kwargs=sim_kwargs,
            preclust=False,
            fit_kwargs=fit_kwargs,
            postclust_kwargs={'thresh': 0.1},
            seed_sim=seed,
            seed_fit=seed,
            quiet=True,
        )
        # One record per run; the same tuple is stored and echoed tab-separated.
        row = (s_fit, seed, generr, comperr, scounter, entropy, runtime)
        results.append(row)
        print(*row, sep='\t')

# Note: the `scounter` value is stored under the column name 'scounterr'.
results1 = pd.DataFrame(results, columns=result_columns)

In [None]:
# One panel per statistic from results1. The grid is 3x2 (six axes) but only
# five statistics are plotted, so the last panel stays empty.
panel_stats = ['generr', 'comperr', 'scounterr', 'entropy', 'runtime']
fig, axs = plt.subplots(3, 2)

# Index once by (s_fit, seed); unstacking a single statistic then puts s_fit
# on the x-axis and draws one line per seed.
indexed = results1.set_index(['s_fit', 'seed'])
for ax, stat in zip(axs.flatten(), panel_stats):
    indexed[stat].unstack().plot(ax=ax)
    ax.set_title(stat)
    ax.legend_.set_visible(False)
fig.tight_layout()

### Experiment 2: Benefits of increasing sample data

In [None]:
# Experiment 2: simulate with n_sim=4000 and vary `n_fit`, calling
# sf.workflow.simulate_fit_and_evaluate once per (n_fit, seed) pair.
# The same seed is used for simulation and fitting.
result_columns = ['n_fit', 'seed', 'generr', 'comperr', 'scounterr', 'entropy', 'runtime']
results = []
for n_fit in [500, 1500, 2500, 4000]:
    for seed in [0, 1, 3, 4, 5]:
        # Keyword arguments forwarded as `sim_kwargs`; rebuilt on every
        # iteration so no dict object is shared between calls.
        sim_kwargs = {
            'data': {'alpha_hyper_mean': 100.0},
            'gamma_hyper': 0.01,
            'delta_hyper_temp': 0.01,
            'delta_hyper_p': 0.7,
            'pi_hyper': 0.5,
            'rho_hyper': 2.0,
            'mu_hyper_mean': 1.0,
            'mu_hyper_scale': 0.5,
            'm_hyper_r': 10.0,
            'alpha_hyper_scale': 0.5,
            'epsilon_hyper_alpha': 1.5,
            'epsilon_hyper_beta': 1.5 / 0.01,
            'device': 'cuda',
        }
        # Keyword arguments forwarded as `fit_kwargs`.
        fit_kwargs = {
            's': 200,
            'gamma_hyper': 0.01,
            'pi_hyper': 0.01,
            'rho_hyper': 0.5,
            'mu_hyper_mean': 5,
            'mu_hyper_scale': 5.0,
            'm_hyper_r': 10.0,
            'delta_hyper_temp': 0.1,
            'delta_hyper_p': 0.9,
            'alpha_hyper_hyper_mean': 100.0,
            'alpha_hyper_hyper_scale': 10.0,
            'alpha_hyper_scale': 0.5,
            'epsilon_hyper_alpha': 1.5,
            'epsilon_hyper_beta': 1.5 / 0.01,
            'device': 'cuda',
            'lag': 10,
            'lr': 0.1,
            'progress': True,
        }
        # `sim` and `fit` are not used below; they are kept as notebook-level
        # names holding the objects from the most recent run.
        (generr, comperr, scounter, entropy, runtime, sim, fit) = sf.workflow.simulate_fit_and_evaluate(
            s_sim=200,
            n_sim=4000,
            g_sim=1000,
            n_fit=n_fit,
            g_fit=1000,
            sim_kwargs=sim_kwargs,
            preclust=False,
            fit_kwargs=fit_kwargs,
            postclust_kwargs={'thresh': 0.1},
            seed_sim=seed,
            seed_fit=seed,
            quiet=True,
        )
        # One record per run; the same tuple is stored and echoed tab-separated.
        row = (n_fit, seed, generr, comperr, scounter, entropy, runtime)
        results.append(row)
        print(*row, sep='\t')

# Note: the `scounter` value is stored under the column name 'scounterr'.
results2 = pd.DataFrame(results, columns=result_columns)

In [None]:
# One panel per statistic from results2. The grid is 3x2 (six axes) but only
# five statistics are plotted, so the last panel stays empty.
panel_stats = ['generr', 'comperr', 'scounterr', 'entropy', 'runtime']
fig, axs = plt.subplots(3, 2)

# Index once by (n_fit, seed); unstacking a single statistic then puts n_fit
# on the x-axis and draws one line per seed.
indexed = results2.set_index(['n_fit', 'seed'])
for ax, stat in zip(axs.flatten(), panel_stats):
    indexed[stat].unstack().plot(ax=ax)
    ax.set_title(stat)
    ax.legend_.set_visible(False)
    # Only the 'generr' panel uses a logarithmic y-axis.
    if stat == 'generr':
        ax.set_yscale('log')
fig.tight_layout()

### Experiment 3: Benefits of increasing genotype data

In [None]:
# Experiment 3: simulate with g_sim=2000 and vary `g_fit`, calling
# sf.workflow.simulate_fit_and_evaluate once per (g_fit, seed) pair.
# The same seed is used for simulation and fitting.
result_columns = ['g_fit', 'seed', 'generr', 'comperr', 'scounterr', 'entropy', 'runtime']
results = []
for g_fit in [100, 250, 500, 1000, 2000]:
    for seed in [0, 1, 3, 4, 5]:
        # Keyword arguments forwarded as `sim_kwargs`; rebuilt on every
        # iteration so no dict object is shared between calls.
        sim_kwargs = {
            'data': {'alpha_hyper_mean': 100.0},
            'gamma_hyper': 0.01,
            'delta_hyper_temp': 0.01,
            'delta_hyper_p': 0.7,
            'pi_hyper': 0.5,
            'rho_hyper': 2.0,
            'mu_hyper_mean': 1.0,
            'mu_hyper_scale': 0.5,
            'm_hyper_r': 10.0,
            'alpha_hyper_scale': 0.5,
            'epsilon_hyper_alpha': 1.5,
            'epsilon_hyper_beta': 1.5 / 0.01,
            'device': 'cuda',
        }
        # Keyword arguments forwarded as `fit_kwargs`.
        fit_kwargs = {
            's': 200,
            'gamma_hyper': 0.01,
            'pi_hyper': 0.01,
            'rho_hyper': 0.5,
            'mu_hyper_mean': 5,
            'mu_hyper_scale': 5.0,
            'm_hyper_r': 10.0,
            'delta_hyper_temp': 0.1,
            'delta_hyper_p': 0.9,
            'alpha_hyper_hyper_mean': 100.0,
            'alpha_hyper_hyper_scale': 10.0,
            'alpha_hyper_scale': 0.5,
            'epsilon_hyper_alpha': 1.5,
            'epsilon_hyper_beta': 1.5 / 0.01,
            'device': 'cuda',
            'lag': 10,
            'lr': 0.1,
            'progress': True,
        }
        # `sim` and `fit` are not used below; they are kept as notebook-level
        # names holding the objects from the most recent run.
        (generr, comperr, scounter, entropy, runtime, sim, fit) = sf.workflow.simulate_fit_and_evaluate(
            s_sim=200,
            n_sim=1000,
            g_sim=2000,
            n_fit=1000,
            g_fit=g_fit,
            sim_kwargs=sim_kwargs,
            preclust=False,
            fit_kwargs=fit_kwargs,
            postclust_kwargs={'thresh': 0.1},
            seed_sim=seed,
            seed_fit=seed,
            quiet=True,
        )
        # One record per run; the same tuple is stored and echoed tab-separated.
        row = (g_fit, seed, generr, comperr, scounter, entropy, runtime)
        results.append(row)
        print(*row, sep='\t')

# Note: the `scounter` value is stored under the column name 'scounterr'.
results3 = pd.DataFrame(results, columns=result_columns)

In [None]:
# One panel per statistic from results3. The grid is 3x2 (six axes) but only
# five statistics are plotted, so the last panel stays empty.
panel_stats = ['generr', 'comperr', 'scounterr', 'entropy', 'runtime']
fig, axs = plt.subplots(3, 2)

# Index once by (g_fit, seed); unstacking a single statistic then puts g_fit
# on the x-axis and draws one line per seed.
indexed = results3.set_index(['g_fit', 'seed'])
for ax, stat in zip(axs.flatten(), panel_stats):
    indexed[stat].unstack().plot(ax=ax)
    ax.set_title(stat)
    ax.legend_.set_visible(False)
    # Only the 'generr' panel uses a logarithmic y-axis.
    if stat == 'generr':
        ax.set_yscale('log')
fig.tight_layout()

### Experiment 4: Benefits of increasing depth

In [None]:
# Experiment 4: vary the `mu_hyper_mean` value passed in `sim_kwargs` while the
# fit settings stay fixed, calling sf.workflow.simulate_fit_and_evaluate once
# per (mu_hyper_mean_sim, seed) pair. The same seed is used for simulation and
# fitting.
result_columns = ['mu_hyper_mean_sim', 'seed', 'generr', 'comperr', 'scounterr', 'entropy', 'runtime']
results = []
for mu_hyper_mean_sim in [0.5, 1.0, 2.0, 5.0, 15.0]:
    for seed in [0, 1, 3, 4, 5]:
        # Keyword arguments forwarded as `sim_kwargs`; rebuilt on every
        # iteration so no dict object is shared between calls.
        # `mu_hyper_mean` is the swept value.
        sim_kwargs = {
            'data': {'alpha_hyper_mean': 100.0},
            'gamma_hyper': 0.01,
            'delta_hyper_temp': 0.01,
            'delta_hyper_p': 0.7,
            'pi_hyper': 0.5,
            'rho_hyper': 2.0,
            'mu_hyper_mean': mu_hyper_mean_sim,
            'mu_hyper_scale': 0.5,
            'm_hyper_r': 10.0,
            'alpha_hyper_scale': 0.5,
            'epsilon_hyper_alpha': 1.5,
            'epsilon_hyper_beta': 1.5 / 0.01,
            'device': 'cuda',
        }
        # Keyword arguments forwarded as `fit_kwargs`.
        fit_kwargs = {
            's': 200,
            'gamma_hyper': 0.01,
            'pi_hyper': 0.01,
            'rho_hyper': 0.5,
            'mu_hyper_mean': 5,
            'mu_hyper_scale': 5.0,
            'm_hyper_r': 10.0,
            'delta_hyper_temp': 0.1,
            'delta_hyper_p': 0.9,
            'alpha_hyper_hyper_mean': 100.0,
            'alpha_hyper_hyper_scale': 10.0,
            'alpha_hyper_scale': 0.5,
            'epsilon_hyper_alpha': 1.5,
            'epsilon_hyper_beta': 1.5 / 0.01,
            'device': 'cuda',
            'lag': 10,
            'lr': 0.1,
            'progress': True,
        }
        # `sim` and `fit` are not used below; they are kept as notebook-level
        # names holding the objects from the most recent run.
        (generr, comperr, scounter, entropy, runtime, sim, fit) = sf.workflow.simulate_fit_and_evaluate(
            s_sim=200,
            n_sim=1000,
            g_sim=1000,
            n_fit=1000,
            g_fit=1000,
            sim_kwargs=sim_kwargs,
            preclust=False,
            fit_kwargs=fit_kwargs,
            postclust_kwargs={'thresh': 0.1},
            seed_sim=seed,
            seed_fit=seed,
            quiet=True,
        )
        # One record per run; the same tuple is stored and echoed tab-separated.
        row = (mu_hyper_mean_sim, seed, generr, comperr, scounter, entropy, runtime)
        results.append(row)
        print(*row, sep='\t')

# Note: the `scounter` value is stored under the column name 'scounterr'.
results4 = pd.DataFrame(results, columns=result_columns)

In [None]:
# One panel per statistic from results4. The grid is 3x2 (six axes) but only
# five statistics are plotted, so the last panel stays empty.
panel_stats = ['generr', 'comperr', 'scounterr', 'entropy', 'runtime']
fig, axs = plt.subplots(3, 2)

# Index once by (mu_hyper_mean_sim, seed); unstacking a single statistic then
# puts mu_hyper_mean_sim on the x-axis and draws one line per seed.
indexed = results4.set_index(['mu_hyper_mean_sim', 'seed'])
for ax, stat in zip(axs.flatten(), panel_stats):
    indexed[stat].unstack().plot(ax=ax)
    ax.set_title(stat)
    ax.legend_.set_visible(False)
    # Only the 'generr' panel uses a logarithmic y-axis.
    if stat == 'generr':
        ax.set_yscale('log')
fig.tight_layout()

### Visualize all

In [None]:
# Summary grid: one column of panels per experiment, one row per statistic.
# TODO: Big matrix plot.

# Each entry: (results frame, swept column used as x-axis, x-axis scale, panel title).
# The swept column must exist in the frame: results1 has 's_fit' and results2
# has 'n_fit' (see the `columns=` lists where those frames are built).
all_results = [
    (results1, 's_fit', 'linear', 1),
    (results2, 'n_fit', 'log', 2),
]

# Each entry: (statistic column, y-axis scale).
all_stats = [
    ('generr', 'log'),
    ('comperr', 'log'),
    ('scounterr', 'symlog'),
    ('entropy', 'linear'),
    ('runtime', 'log')
]

nres = len(all_results)
nstat = len(all_stats)

# squeeze=False keeps `axs` a 2-D (nstat, nres) array even when there is only
# one row or one column, so the nested iteration below always works.
fig, axs = plt.subplots(
    nstat,
    nres,
    figsize=(2 * nres, 2 * nstat),
    sharex='col',
    sharey='row',
    squeeze=False,
)

for (stat, scale_y), row in zip(all_stats, axs):
    # `frame` (rather than `results`) so the loop does not rebind the
    # notebook-level `results` list used by the experiment cells.
    for (frame, indexer, scale_x, title), ax in zip(all_results, row):
        # Rows are the swept value, one column (line) per seed.
        frame.set_index([indexer, 'seed'])[stat].unstack().plot(ax=ax)
        ax.set_ylabel(stat)
        ax.set_xlabel(indexer)
        ax.legend_.set_visible(False)
        ax.set_xscale(scale_x)
        ax.set_yscale(scale_y)
        ax.set_title(title)

fig.tight_layout()