In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Start from seaborn's "ticks" theme. The overrides below are applied afterwards,
# so they win over whatever the theme sets for the same keys.
sns.set_style("ticks")

mpl.rcParams.update({
    # Hide the top and right spines.
    "axes.spines.top": False,
    "axes.spines.right": False,

    # Higher resolution for on-screen display and for saved figures
    # (presentations and manuscripts).
    "savefig.dpi": 300,
    "figure.dpi": 120,

    # Text sizes large enough for presentations and manuscripts.
    "font.weight": "normal",
    "axes.labelweight": "normal",
    "font.size": 14,
    "axes.labelsize": 14,
    "legend.fontsize": 10,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    "axes.titlesize": 14,

    # Do not route text rendering through LaTeX.
    "text.usetex": False,
})

## Load data

In [None]:
# Read the accuracies table from the path provided by the Snakemake workflow.
df = pd.read_csv(filepath_or_buffer=snakemake.input.accuracies)

In [None]:
# Preview the first rows as a quick sanity check of the loaded table.
df.head(n=5)

## Plot accuracies by method, sequences per group, and replicate

In [None]:
# Strip plot of normalized VI against sequences per group, with one dodged,
# colored strip per method inside each x-axis category.
# NOTE(review): each point is presumably one replicate -- confirm against the
# workflow that produces the accuracies table.
fig, ax = plt.subplots(figsize=(8, 4))

# stripplot draws onto the axes passed in, so its return value (the same axes)
# does not need to be rebound.
sns.stripplot(
    data=df,
    x="sequences_per_group",
    y="normalized_vi",
    hue="method",
    dodge=True,
    alpha=0.25,  # translucent markers so overlapping points stay visible
    ax=ax,
)

# Anchor the y-axis at zero and leave the upper limit automatic.
ax.set_ylim(bottom=0)
ax.legend(title="Method", frameon=False)

ax.set_xlabel("Number of sequences per group in subsampling")
ax.set_ylabel("Accuracy of clusters vs.\nNextstrain clades (normalized VI)")

# Lay out and save the figure created above explicitly, instead of relying on
# pyplot's implicit "current figure", so this cell always writes the figure it
# just drew.
fig.tight_layout()
fig.savefig(snakemake.output.accuracies)

In [None]:
# Smallest and largest normalized VI for every (method, sequences_per_group)
# combination. The group keys are dropped from the result, leaving only the
# two statistics as columns (min first, then max).
min_max = (
    df.groupby(by=["method", "sequences_per_group"])
    .agg({"normalized_vi": ["min", "max"]})
    .reset_index(drop=True)
)

In [None]:
# Tally how often each max-minus-min spread of normalized VI occurs across the
# (method, sequences_per_group) combinations summarized in `min_max`.
(
    min_max[("normalized_vi", "max")]
    .sub(min_max[("normalized_vi", "min")])
    .value_counts()
)

In [None]:
# Standard deviation of normalized VI within each
# (method, sequences_per_group) combination.
(
    df.groupby(by=["method", "sequences_per_group"])
    .agg({"normalized_vi": ["std"]})
)