### Preamble

#### Project Template

In [None]:
%load_ext autoreload

In [None]:
# Run the notebook from the project root so the relative paths used below resolve.
# PROJECT_ROOT must be present in the environment; a missing variable raises KeyError.
import os as _os

_os.chdir(_os.environ["PROJECT_ROOT"])
# Last expression of the cell: displays the resolved working directory.
_os.path.realpath(_os.path.curdir)

#### Imports

In [None]:
import os
import subprocess
import sys
import time
from datetime import datetime
from glob import glob
from itertools import chain, product
from tempfile import mkstemp

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import xarray as xr
from mpl_toolkits.axes_grid1 import make_axes_locatable
from statsmodels.stats.multitest import fdrcorrection
from tqdm import tqdm

import lib.plot
from lib.pandas_util import align_indexes, aligned_index, idxwhere, invert_mapping

In [None]:
import lib.thisproject.data

#### Set Style

In [None]:
# Global plot styling: seaborn's "talk" context and a figure DPI of 50.
sns.set_context("talk")
plt.rcParams["figure.dpi"] = 50

## Load Data

In [None]:
# Species taxonomy table, loaded through the project helper.
# Later cells join it onto other tables by species and read its `p__` and
# `taxonomy_string` columns.
species_taxonomy_inpath = "ref/gtpro/species_taxonomy_ext.tsv"

species_taxonomy = lib.thisproject.data.load_species_taxonomy(species_taxonomy_inpath)
species_taxonomy

In [None]:
# COG category code -> display label. The file has no header row, so column
# names are supplied here and the category code becomes the index.
cog_category_description = pd.read_table(
    "ref/cog-20.categories.tsv",
    names=["cog_category", "description"],
    index_col="cog_category",
)
# Prefix every description with its own category code ("<code>: <description>").
cog_category_description = cog_category_description.assign(
    description=cog_category_description.index
    + ": "
    + cog_category_description.description
)
# Extra row used for genes that carry no COG category.
cog_category_description.loc["no_category", "description"] = "-: No Annotation"
cog_category_description

In [None]:
# Column names of the per-gene COG category indicators: one per letter A-Z,
# followed by the catch-all "no_category" column.
cog_category_columns = [
    "cog_category_" + _letter for _letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
] + ["cog_category_no_category"]

# The same categories with the "cog_category_" prefix stripped, in the same order.
cog_category_order = [
    _column[len("cog_category_") :] for _column in cog_category_columns
]
# Display order of the gene prevalence classes.
prevalence_class_order = ["core", "shell", "cloud"]

In [None]:
# Gather the per-species gene statistics tables into one frame indexed by gene_id.
_gene_stats_prefix = "data/group/xjin_ucfmt_hmp2/species/sp-"
_gene_stats_suffix = "/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-30_thresh-corr100-depth250.gene_stats.tsv"
_gene_stats_paths = glob(
    "data/group/xjin_ucfmt_hmp2/species/sp-*/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-30_thresh-corr100-depth250.gene_stats.tsv"
)

gene_stats = []
for path in tqdm(_gene_stats_paths):
    # The species identifier is the part of the path matched by the glob wildcard.
    species = path[len(_gene_stats_prefix) : -len(_gene_stats_suffix)]
    gene_stats.append(pd.read_table(path).assign(species=species))
gene_stats = pd.concat(gene_stats).set_index("gene_id")

# A missing COG indicator is treated as "gene is not in this category".
gene_stats[cog_category_columns] = gene_stats[cog_category_columns].fillna(False)

In [None]:
# Gather the per-species SPGC strain statistics and attach the species taxonomy.
_spgc_stats_prefix = "data/group/xjin_ucfmt_hmp2/species/sp-"
_spgc_stats_suffix = "/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-30_thresh-corr100-depth250.spgc_strain_stats.tsv"
_spgc_stats_paths = glob(
    "data/group/xjin_ucfmt_hmp2/species/sp-*/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-30_thresh-corr100-depth250.spgc_strain_stats.tsv"
)

spgc_strain_stats = []
for path in tqdm(_spgc_stats_paths):
    # The species identifier is the part of the path matched by the glob wildcard.
    species = path[len(_spgc_stats_prefix) : -len(_spgc_stats_suffix)]
    spgc_strain_stats.append(pd.read_table(path).assign(species=species))
spgc_strain_stats = pd.concat(spgc_strain_stats).join(species_taxonomy, on='species')

In [None]:
# Gather the per-species reference strain statistics and attach the species taxonomy.
_ref_stats_prefix = "data/group/xjin_ucfmt_hmp2/species/sp-"
_ref_stats_suffix = "/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-30_thresh-corr100-depth250.ref_strain_stats.tsv"
_ref_stats_paths = glob(
    "data/group/xjin_ucfmt_hmp2/species/sp-*/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-30_thresh-corr100-depth250.ref_strain_stats.tsv"
)

ref_strain_stats = []
for path in tqdm(_ref_stats_paths):
    # The species identifier is the part of the path matched by the glob wildcard.
    species = path[len(_ref_stats_prefix) : -len(_ref_stats_suffix)]
    ref_strain_stats.append(pd.read_table(path).assign(species=species))
ref_strain_stats = pd.concat(ref_strain_stats).join(species_taxonomy, on='species')

In [None]:
# Palette keyed by phylum, built from the sorted unique `p__` values of the
# reference strain table; the per-phylum plots further down index into it.
phylum_palette = lib.plot.construct_ordered_palette(sorted(ref_strain_stats.p__.unique()))

In [None]:
# Log-spaced p-value bin edges from 1e-7 to 1.
bins = np.logspace(-7, 0, num=50)
# bins = np.linspace(0, 1, num=51)

# IBD MWAS p-values of genes with prevalence strictly between 0.1 and 0.9 and
# more than 20 MWAS subjects.
_keep = (
    (gene_stats.prevalence_spgc > 0.1)
    & (gene_stats.prevalence_spgc < 0.9)
    & (gene_stats.num_mwas_subject > 20)
)
x = gene_stats[_keep].ibd_mwas_pvalue.sort_values()

plt.hist(x, bins=bins, alpha=0.5)

# Reference curve: bin width times the number of p-values, plotted at each
# bin's right edge (the per-bin count expected for uniformly distributed p-values).
_bin_widths = np.diff(bins)
plt.plot(bins[1:], _bin_widths * x.shape)

plt.xscale("log")
plt.yscale("log")

In [None]:
# Odds ratio vs. p-value for the IBD MWAS, restricted to genes with prevalence
# strictly between 0.1 and 0.9 and more than 20 MWAS subjects.
_keep = (
    (gene_stats.prevalence_spgc > 0.1)
    & (gene_stats.prevalence_spgc < 0.9)
    & (gene_stats.num_mwas_subject > 20)
)
plt.scatter(
    "ibd_mwas_oddsratio_pc",
    "ibd_mwas_pvalue",
    data=gene_stats[_keep],
    s=4,
    alpha=0.5,
)
plt.xscale("log")
plt.yscale("log")
# Invert the y-axis so that smaller p-values are drawn higher up.
ax = plt.gca()
ax.invert_yaxis()
plt.xlabel("Odds-Ratio (w/ pseudo-counts)")
plt.ylabel("P-value")

In [None]:
# IBD MWAS volcano plot: a 2D histogram of odds ratio vs. p-value for genes with
# prevalence strictly between 0.1 and 0.9 and more than 40 MWAS subjects, with
# the FDR < 0.2 hits circled on top.
d0 = (
    gene_stats[
        lambda x: (x.prevalence_spgc > 0.1) & (x.prevalence_spgc < 0.9) & (x.num_mwas_subject > 40)
    ]
    # FDR-adjusted p-values are computed over the filtered genes only.
    .assign(fdr=lambda x: fdrcorrection(x.ibd_mwas_pvalue)[1])
    .join(species_taxonomy, on="species")
    .sort_values("ibd_mwas_pvalue")
)

plt.hist2d(
    "ibd_mwas_oddsratio_pc",
    "ibd_mwas_pvalue",
    data=d0,
    bins=(np.logspace(-3, 3, num=100), np.logspace(-7, 0, num=100)),
    norm=mpl.colors.SymLogNorm(linthresh=2, linscale=3, vmin=1, vmax=1e5),
    cmin=1,
    cmap="copper_r",
)

plt.colorbar()

plt.scatter(
    "ibd_mwas_oddsratio_pc",
    "ibd_mwas_pvalue",
    data=d0[(d0.fdr < 0.2)],
    s=20,
    marker='o',
    edgecolor='k',
    facecolor='none',
    lw=1,
    alpha=0.5,
)
plt.yscale("log")
plt.xscale("log")
# Invert the y-axis so that smaller p-values are drawn higher up.
plt.gca().invert_yaxis()
# The x-axis carries the odds ratio itself on a log-scaled axis (not its
# logarithm), so it is labelled like the scatter plot of the same column.
plt.xlabel("Odds-Ratio (w/ pseudo-counts)")
plt.ylabel("P-value")

# d1 = d0[lambda x: (x.gene == 'UHGG000638_01150') & (x.pvalue < 1e-3)]
# for _, d2 in d1.iterrows():
#     plt.annotate('', xy=(d2.log_logratio_pc, d2.pvalue), ha='left', va='bottom', xytext=(10, 10), textcoords="offset points", arrowprops=dict())
# d1

In [None]:
# Count the FDR < 0.2 IBD hits per species, split by direction of association.
_keep = (
    (gene_stats.prevalence_spgc > 0.1)
    & (gene_stats.prevalence_spgc < 0.9)
    & (gene_stats.num_mwas_subject > 40)
)
d = gene_stats[_keep]
d = (
    d.assign(fdr=fdrcorrection(d.ibd_mwas_pvalue)[1])
    .join(species_taxonomy, on="species")
    .sort_values("ibd_mwas_pvalue")
)

(
    d[d.fdr < 0.2]
    # "+" when the odds ratio is above 1, "-" otherwise.
    .assign(
        log_oddsratio_sign=lambda x: (x.ibd_mwas_oddsratio_pc > 1).map(
            {True: '+', False: '-'}
        )
    )[['species', 'log_oddsratio_sign']]
    .value_counts()
    .unstack('log_oddsratio_sign', fill_value=0)
    # Guarantee both sign columns exist even if one direction has no hits.
    .reindex(columns=['-', '+'], fill_value=0)
    .assign(total=lambda x: x['+'] + x['-'])
    .sort_values(['total', '+'], ascending=False)
    # Replace species identifiers in the index with their taxonomy strings.
    .rename(species_taxonomy.taxonomy_string)
)

In [None]:
# Count the FDR < 0.2 IBD hits per COG category, split by direction of association.
d = gene_stats[
    lambda x: (x.prevalence_spgc > 0.1) & (x.prevalence_spgc < 0.9) & (x.num_mwas_subject > 40)
]
d = (
    d.assign(fdr=fdrcorrection(d.ibd_mwas_pvalue)[1])
    .sort_values("ibd_mwas_pvalue")
    # "+" when the odds ratio is above 1, "-" otherwise.
    .assign(log_oddsratio_sign=lambda x: (x.ibd_mwas_oddsratio_pc > 1).map({True: '+', False: '-'}))
)

# The set of significant hits does not depend on the category; select it once.
_hits = d[d.fdr < 0.2]

_results = {}
for _col in cog_category_columns:
    _results[_col] = (
        _hits[[_col, 'log_oddsratio_sign']]
        .value_counts()
        .unstack('log_oddsratio_sign')
        # Keep only the "gene is in this category" row, and always provide both
        # sign columns. Without the column reindex, hits that all share one sign
        # leave the other column missing and the `total` computation below
        # raises a KeyError.
        .reindex(index=[True], columns=['-', '+'], fill_value=0)
        .loc[True]
    )

_results = (
    pd.DataFrame(_results)
    .fillna(0)
    .T.assign(total=lambda x: x['+'] + x['-'])
    .sort_values(['total', '+'], ascending=False)
)
# Show only the categories with at least one significant hit.
_results[_results.total > 0].astype(int)

In [None]:
# List the FDR < 0.2 IBD hits with their annotation, taxonomy and statistics.
_keep = (
    (gene_stats.prevalence_spgc > 0.1)
    & (gene_stats.prevalence_spgc < 0.9)
    & (gene_stats.num_mwas_subject > 40)
)
d0 = (
    gene_stats[_keep]
    .assign(fdr=lambda x: fdrcorrection(x.ibd_mwas_pvalue)[1])
    .join(species_taxonomy, on="species")
    .sort_values("ibd_mwas_pvalue")
)

_report_columns = [
    "nlength",
    "description",
    "species",
    "f__",
    "g__",
    "s__",
    "ibd_mwas_pvalue",
    "ibd_mwas_oddsratio_pc",
    "prevalence_spgc",
    "prevalence_ref",
    "phylogenetic_r_spgc",
]
d0.loc[d0.fdr < 0.2, _report_columns]

In [None]:
# CD MWAS volcano plot: a 2D histogram of odds ratio vs. p-value for genes with
# prevalence strictly between 0.1 and 0.9 and more than 20 MWAS subjects, with
# the FDR < 0.1 hits circled on top.
d0 = (
    gene_stats[
        lambda x: (x.prevalence_spgc > 0.1) & (x.prevalence_spgc < 0.9) & (x.num_mwas_subject > 20)
    ]
    # FDR-adjusted p-values are computed over the filtered genes only.
    .assign(fdr=lambda x: fdrcorrection(x.cd_mwas_pvalue)[1])
    .join(species_taxonomy, on="species")
    .sort_values("cd_mwas_pvalue")
)

plt.hist2d(
    "cd_mwas_oddsratio_pc",
    "cd_mwas_pvalue",
    data=d0,
    bins=(np.logspace(-3, 3, num=100), np.logspace(-7, 0, num=100)),
    norm=mpl.colors.SymLogNorm(linthresh=2, linscale=3, vmin=1, vmax=1e5),
    cmin=1,
    cmap="copper_r",
)

plt.colorbar()

plt.scatter(
    "cd_mwas_oddsratio_pc",
    "cd_mwas_pvalue",
    data=d0[(d0.fdr < 0.1)],
    s=20,
    marker='o',
    edgecolor='k',
    facecolor='none',
    lw=1,
    alpha=0.5,
)
plt.yscale("log")
plt.xscale("log")
# Invert the y-axis so that smaller p-values are drawn higher up.
plt.gca().invert_yaxis()
# The x-axis carries the odds ratio itself on a log-scaled axis (not its
# logarithm), so it is labelled like the scatter plot of the IBD odds ratio.
plt.xlabel("Odds-Ratio (w/ pseudo-counts)")
plt.ylabel("P-value")

# d1 = d0[lambda x: (x.gene == 'UHGG000638_01150') & (x.pvalue < 1e-3)]
# for _, d2 in d1.iterrows():
#     plt.annotate('', xy=(d2.log_logratio_pc, d2.pvalue), ha='left', va='bottom', xytext=(10, 10), textcoords="offset points", arrowprops=dict())
# d1

In [None]:
# UC MWAS volcano plot: a 2D histogram of odds ratio vs. p-value for genes with
# prevalence strictly between 0.1 and 0.9 and more than 20 MWAS subjects, with
# the FDR < 0.1 hits circled on top.
d0 = (
    gene_stats[
        lambda x: (x.prevalence_spgc > 0.1) & (x.prevalence_spgc < 0.9) & (x.num_mwas_subject > 20)
    ]
    # FDR-adjusted p-values are computed over the filtered genes only.
    .assign(fdr=lambda x: fdrcorrection(x.uc_mwas_pvalue)[1])
    .join(species_taxonomy, on="species")
    .sort_values("uc_mwas_pvalue")
)

plt.hist2d(
    "uc_mwas_oddsratio_pc",
    "uc_mwas_pvalue",
    data=d0,
    bins=(np.logspace(-3, 3, num=100), np.logspace(-7, 0, num=100)),
    norm=mpl.colors.SymLogNorm(linthresh=2, linscale=3, vmin=1, vmax=1e5),
    cmin=1,
    cmap="copper_r",
)

plt.colorbar()

plt.scatter(
    "uc_mwas_oddsratio_pc",
    "uc_mwas_pvalue",
    data=d0[(d0.fdr < 0.1)],
    s=20,
    marker='o',
    edgecolor='k',
    facecolor='none',
    lw=1,
    alpha=0.5,
)
plt.yscale("log")
plt.xscale("log")
# Invert the y-axis so that smaller p-values are drawn higher up.
plt.gca().invert_yaxis()
# The x-axis carries the odds ratio itself on a log-scaled axis (not its
# logarithm), so it is labelled like the scatter plot of the IBD odds ratio.
plt.xlabel("Odds-Ratio (w/ pseudo-counts)")
plt.ylabel("P-value")

# d1 = d0[lambda x: (x.gene == 'UHGG000638_01150') & (x.pvalue < 1e-3)]
# for _, d2 in d1.iterrows():
#     plt.annotate('', xy=(d2.log_logratio_pc, d2.pvalue), ha='left', va='bottom', xytext=(10, 10), textcoords="offset points", arrowprops=dict())
# d1

In [None]:
def _assign_prevalence_class(p):
    """Classify a gene by its prevalence ``p``.

    Returns:
        "core"  if p > 0.95,
        "shell" if 0.1 < p <= 0.95,
        "cloud" if p <= 0.1,
        None    if p compares False against every threshold (e.g. NaN).
    """
    if p > 0.95:
        return "core"
    elif p > 0.1:
        return "shell"
    elif p <= 0.1:
        # `<=` rather than `<`: a prevalence of exactly 0.1 previously matched
        # no branch and silently produced None.
        return "cloud"
    return None

In [None]:
# Fisher's exact test for every (prevalence class, COG category) pair: is
# membership in the category associated with membership in the class?
d = gene_stats.dropna(subset=['prevalence_spgc']).assign(
    prevalence_class=lambda x: x.prevalence_spgc.dropna().map(_assign_prevalence_class)
)

# x: prevalence class of each gene; y: boolean category indicators with the
# "cog_category_" prefix stripped from the column names.
x = d['prevalence_class']
y = (
    d.loc[:, cog_category_columns]
    .rename(columns=lambda name: name[len('cog_category_'):])
    .fillna(False)
)

_records = []
_pairs = list(product(prevalence_class_order, cog_category_order))
for _prevalence_class, _cog_category in tqdm(_pairs):
    _membership = pd.DataFrame(
        {
            "is_prev_class": x == _prevalence_class,
            "is_cog_category": y[_cog_category],
        }
    )
    # 2x2 table (rows: in class?, columns: in category?); combinations that
    # never occur are filled with a count of 0.
    contingency_table = (
        _membership.value_counts()
        .unstack()
        .reindex(index=[False, True], columns=[False, True])
        .fillna(0)
    )
    _odds_ratio, _pvalue = sp.stats.fisher_exact(contingency_table)
    _records.append(
        {
            "prevalence_class": _prevalence_class,
            "cog_category": _cog_category,
            "statistic": _odds_ratio,
            "pvalue": _pvalue,
            # Number of genes that are both in the class and in the category.
            "gene_count": contingency_table.loc[True, True],
        }
    )

cog_category_gene_class_enrichment_test = (
    pd.DataFrame(
        _records,
        columns=[
            "prevalence_class",
            "cog_category",
            "statistic",
            "pvalue",
            "gene_count",
        ],
    )
    .set_index(["prevalence_class", "cog_category"])
    .assign(
        negative_log10_pvalue=lambda x: -np.log10(x.pvalue),
        log2_odds_ratio=lambda x: np.log2(x.statistic),
    )
)

In [None]:
# Heatmap matrix: log2 odds ratio with one row per COG category (labelled by
# its description) and one column per prevalence class.
x = (
    cog_category_gene_class_enrichment_test["log2_odds_ratio"]
    .unstack("prevalence_class")
    # Infinite log odds ratios are blanked here and become 0 after the final fillna.
    .replace([np.inf, -np.inf], np.nan)
    .join(cog_category_description)
    .set_index("description")
    .loc[:, prevalence_class_order]
    .fillna(0)
)


def _assign_significance_marker(pvalue):
    """Return "*" for p-values strictly below 0.0001, otherwise an empty string."""
    return "*" if pvalue < 0.0001 else ""

# Alternative annotation (disabled): significance marker plus gene count.
# annot = (cog_category_gene_class_enrichment_test.pvalue.map(_assign_significance_marker) + '|' + cog_category_gene_class_enrichment_test.gene_count.astype(int).astype(str)).unstack('prevalence_class')[prevalence_class_order]
# Cell annotations: "*" where the test p-value is below the marker threshold,
# laid out like the heatmap matrix `x` (rows by description, columns by class).
annot = (
    cog_category_gene_class_enrichment_test.pvalue.map(_assign_significance_marker)
    .unstack("prevalence_class")
    .join(cog_category_description)
    .set_index("description")[prevalence_class_order]
)
# Alternative annotation (disabled): raw gene counts.
# annot = cog_category_gene_class_enrichment_test.gene_count.unstack('prevalence_class')[prevalence_class_order].astype(int)

# Rows are ordered by decreasing log2 odds ratio in the "core" column.
_row_order = x["core"].sort_values(ascending=False).index
# x, annot = lib.pandas_util.align_indexes(x, annot)

fig, ax = plt.subplots(figsize=(5, 12))
ax = sns.heatmap(
    x.reindex(_row_order),
    annot=annot.reindex(_row_order),
    # Empty format string: the annotations are already strings.
    fmt="",
    cmap="coolwarm",
    center=0,
    # Color scale is clipped to +/-3; the colorbar is drawn with extended ends.
    vmin=-3,
    vmax=3,
    cbar_kws=dict(
        use_gridspec=True, location="left", label="log2(odds ratio)", extend="both"
    ),
    ax=ax,
    # Label every row and every column.
    yticklabels=1,
    xticklabels=1,
    annot_kws=dict(va="center"),
    # norm=mpl.colors.SymLogNorm(linthresh=1e1),
    # center=0,
)

# Colorbar sits on the left, so the row labels go on the right.
ax.yaxis.set_ticks_position("right")
ax.set_ylabel("")
lib.plot.rotate_yticklabels(rotation=-0, va="center")

In [None]:
# Quick look at which columns the combined gene_stats table provides.
gene_stats.columns

In [None]:
# Compare phylogenetic_r between the reference and SPGC estimates, using only
# genes for which both values are present.
_r_ref = "phylogenetic_r_ref"
_r_spgc = "phylogenetic_r_spgc"
d = gene_stats.dropna(subset=[_r_ref, _r_spgc])

plt.hist2d(
    _r_ref,
    _r_spgc,
    data=d,
    bins=50,
    norm=mpl.colors.SymLogNorm(linthresh=1),
)
# Pearson correlation between the two columns, printed under the figure.
print(sp.stats.pearsonr(d[_r_ref], d[_r_spgc]))

In [None]:
# Sorted phylogenetic_r_spgc values (plotted against rank) for two nested
# prevalence windows: (0.1, 0.9) first, then the narrower (0.2, 0.8).
plt.plot(gene_stats[lambda x: (x.prevalence_spgc > 0.1) & (x.prevalence_spgc < 0.9)].phylogenetic_r_spgc.sort_values().values)
plt.plot(gene_stats[lambda x: (x.prevalence_spgc > 0.2) & (x.prevalence_spgc < 0.8)].phylogenetic_r_spgc.sort_values().values)

In [None]:
# phylogenetic_r_ref of genes with reference prevalence strictly between 0.1
# and 0.9 that belong to the COG category column named by `_col`.
# NOTE(review): `_col` is not assigned in this cell; it is the value left behind
# by an earlier `for _col in cog_category_columns` loop, so the result depends on
# which cell ran last. Consider naming the intended column explicitly.
gene_stats[
        lambda x: (x.prevalence_ref > 0.1)
        & (x.prevalence_ref < 0.9)
        & (x[_col])
    ].phylogenetic_r_ref

In [None]:
# Median phylogenetic_r per COG category for species '102506', computed once
# from the reference columns and once from the SPGC columns. Only genes whose
# prevalence (in the respective source) is strictly between 0.1 and 0.9 count.
_is_target_species = gene_stats.species.isin(['102506'])
_ref_window = (gene_stats.prevalence_ref > 0.1) & (gene_stats.prevalence_ref < 0.9)
_spgc_window = (gene_stats.prevalence_spgc > 0.1) & (gene_stats.prevalence_spgc < 0.9)

cog_category_median_phylogenetic_r_ref = pd.Series(
    {
        _col: gene_stats[
            _ref_window & (gene_stats[_col]) & _is_target_species
        ].phylogenetic_r_ref.median()
        for _col in cog_category_columns
    }
)
cog_category_median_phylogenetic_r_spgc = pd.Series(
    {
        _col: gene_stats[
            _spgc_window & (gene_stats[_col]) & _is_target_species
        ].phylogenetic_r_spgc.median()
        for _col in cog_category_columns
    }
)

In [None]:
# Side-by-side table of the per-category medians: column 0 holds the SPGC
# medians, column 1 the reference medians; rows are sorted by the SPGC column.
pd.DataFrame([cog_category_median_phylogenetic_r_spgc, cog_category_median_phylogenetic_r_ref]).T.sort_values(0)

In [None]:
# Distribution of phylogenetic_r_spgc for genes with vs. without a COG category
# (x is the `cog_category_no_category` indicator), restricted to genes with
# SPGC prevalence strictly between 0.1 and 0.9.
sns.violinplot(data=gene_stats[lambda x: (x.prevalence_spgc > 0.1) & (x.prevalence_spgc < 0.9)], x='cog_category_no_category', y='phylogenetic_r_spgc')

In [None]:
# Compare gene prevalence between the reference and SPGC estimates. A missing
# prevalence is treated as 0, and genes that are 0 in both are dropped.
d = gene_stats.assign(
    prevalence_ref=gene_stats.prevalence_ref.fillna(0),
    prevalence_spgc=gene_stats.prevalence_spgc.fillna(0),
)
d = d[(d.prevalence_ref > 0) | (d.prevalence_spgc > 0)]

plt.hist2d(
    'prevalence_ref',
    'prevalence_spgc',
    data=d,
    bins=np.linspace(0, 1, num=26),
    norm=mpl.colors.PowerNorm(1/5),
)
# Pearson correlation between the two prevalence columns, printed under the figure.
print(sp.stats.pearsonr(d['prevalence_ref'], d['prevalence_spgc']))

In [None]:
# SPGC strains: gene-content dissimilarity to the nearest reference vs. SNP
# profile dissimilarity (left), and its distribution per phylum (right).
d0 = spgc_strain_stats.sort_values('taxonomy_string')

# OLS of gene dissimilarity on log10 SNP dissimilarity with a sum-coded species term.
# NOTE(review): np.log10 maps a dissimilarity of exactly 0 to -inf; confirm that
# no such rows reach the model.
_slope_term = 'np.log10(nearest_ref_geno_diss)'
fit = smf.ols("nearest_ref_gene_raw_diss ~ np.log10(nearest_ref_geno_diss) + C(species, Sum)", data=d0).fit()

fig, axs = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

ax = axs[0]
for p__, d1 in d0.groupby("p__"):
    ax.scatter("nearest_ref_geno_diss", "nearest_ref_gene_raw_diss", data=d1, label='__nolegend__', color=phylum_palette[p__], s=5, alpha=0.5)
    # Empty proxy scatter carrying the phylum label, for use by a legend.
    ax.scatter([], [], label=p__, color=phylum_palette[p__], s=10)

# Fitted line drawn from the model's Intercept and slope coefficients only.
xx = np.logspace(-3, 0)
ax.plot(xx, np.log10(xx) * fit.params[_slope_term] + fit.params['Intercept'], color='k')
p_slope = fit.pvalues[_slope_term]
slope = fit.params[_slope_term]  # TODO
r2_adj = fit.rsquared_adj
ax.annotate(f'$\\beta_{{\\mathrm{{slope}}}}$={slope:0.3f}\n$P_{{\\mathrm{{slope}}}}$={p_slope:0.0e}\n$R^2_{{\\mathrm{{adj}}}}$={r2_adj:0.2%}', xy=(0.1, 0.9), xycoords='axes fraction', va='top')

# ax.plot("nearest_ref_geno_diss", "gene_dist_predict", data=d2, label="__nolegend__")
# ax.legend(bbox_to_anchor=(1, 1), markerscale=4)
ax.set_xscale("symlog", linthresh=1e-4)

ax.set_xlabel('SNP Profile Dissimilarity')
ax.set_ylabel('Gene Content Dissimilarity\n(filtered, Jaccard)')

ax = axs[1]
sns.boxplot(x='p__', y='nearest_ref_gene_raw_diss', data=d0, ax=ax, palette=phylum_palette)
lib.plot.rotate_xticklabels(ax=ax)
ax.set_ylabel('')

ax.set_ylim(-0.005, 0.605)

In [None]:
# Reference strains: gene-content dissimilarity to the nearest reference vs. SNP
# profile dissimilarity (left), and its distribution per phylum (right).
d0 = ref_strain_stats.sort_values('taxonomy_string')

# OLS of gene dissimilarity on log10 SNP dissimilarity with a sum-coded species term.
# NOTE(review): np.log10 maps a dissimilarity of exactly 0 to -inf; confirm that
# no such rows reach the model.
_slope_term = 'np.log10(nearest_ref_geno_diss)'
fit = smf.ols("nearest_ref_gene_raw_diss ~ np.log10(nearest_ref_geno_diss) + C(species, Sum)", data=d0).fit()

fig, axs = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

ax = axs[0]
for p__, d1 in d0.groupby("p__"):
    ax.scatter("nearest_ref_geno_diss", "nearest_ref_gene_raw_diss", data=d1, label='__nolegend__', color=phylum_palette[p__], s=5, alpha=0.5)
    # Empty proxy scatter carrying the phylum label, for use by a legend.
    ax.scatter([], [], label=p__, color=phylum_palette[p__], s=10)

# Fitted line drawn from the model's Intercept and slope coefficients only.
xx = np.logspace(-3, 0)
ax.plot(xx, np.log10(xx) * fit.params[_slope_term] + fit.params['Intercept'], color='k')
p_slope = fit.pvalues[_slope_term]
slope = fit.params[_slope_term]  # TODO
r2_adj = fit.rsquared_adj
ax.annotate(f'$\\beta_{{\\mathrm{{slope}}}}$={slope:0.3f}\n$P_{{\\mathrm{{slope}}}}$={p_slope:0.0e}\n$R^2_{{\\mathrm{{adj}}}}$={r2_adj:0.2%}', xy=(0.1, 0.9), xycoords='axes fraction', va='top')

# ax.plot("nearest_ref_geno_diss", "gene_dist_predict", data=d2, label="__nolegend__")
# ax.legend(bbox_to_anchor=(1, 1), markerscale=4)
ax.set_xscale("symlog", linthresh=1e-4)

ax.set_xlabel('SNP Profile Dissimilarity')
ax.set_ylabel('Gene Content Dissimilarity\n(filtered, Jaccard)')

ax = axs[1]
sns.violinplot(x='p__', y='nearest_ref_gene_raw_diss', data=d0, ax=ax, palette=phylum_palette)
lib.plot.rotate_xticklabels(ax=ax)
ax.set_ylabel('')

ax.set_ylim(-0.005, 0.605)

In [None]:
# Full regression summary for `fit` as last assigned, i.e. by whichever
# regression cell was executed most recently.
fit.summary()

In [None]:
# Density histograms of the nearest-reference dissimilarities, SPGC vs.
# reference strains: SNP dissimilarity on top, gene dissimilarity below.
bins = np.linspace(0, 1)

fig, axs = plt.subplots(2, sharex=True)
_panels = [
    ('nearest_ref_geno_diss', 'Closest Ref. SNP Dissimilarity'),
    ('nearest_ref_gene_raw_diss', 'Closest Ref. Gene Dissimilarity (Jaccard)'),
]
for ax, (_column, _xlabel) in zip(axs, _panels):
    ax.hist(spgc_strain_stats[_column], bins=bins, alpha=0.5, density=True, label='SPGC', color='tab:purple')
    ax.hist(ref_strain_stats[_column], bins=bins, alpha=0.5, density=True, label='Reference', color='tab:green')
    ax.set_xlabel(_xlabel)
    ax.set_ylabel('Density')

# Only the top panel carries the legend.
axs[0].legend()

fig.tight_layout()

In [None]:
# Density histogram of the nearest-reference gene dissimilarity, SPGC vs. reference.
bins = np.linspace(0, 1)
for _stats, _label, _color in [
    (spgc_strain_stats, 'SPGC', 'tab:purple'),
    (ref_strain_stats, 'Reference', 'tab:green'),
]:
    plt.hist(_stats.nearest_ref_gene_raw_diss, bins=bins, alpha=0.5, density=True, label=_label, color=_color)
plt.xlabel('Closest Ref. Gene Dissimilarity (Jaccard)')
plt.legend()

In [None]:
# Nearest-reference SNP dissimilarity of the SPGC strains on log-spaced bins
# from 1e-4 to 1 (values outside that range are not counted).
plt.hist(spgc_strain_stats.nearest_ref_geno_diss, bins=np.logspace(-4, 0))

In [None]:
# Per-strain share of each tally (core / shell / cloud) among the SPGC strains.
_tally_columns = ['spgc_core_gene_tally', 'spgc_shell_gene_tally', 'spgc_cloud_gene_tally']
_tallies = spgc_strain_stats.reset_index().set_index(['species', 'strain'])[_tally_columns]
# Divide every row by its own total (vectorized form of a row-wise x / x.sum()).
d = _tallies.div(_tallies.sum(axis=1), axis=0)
plt.hist(d.spgc_core_gene_tally, label='core', alpha=0.7)
plt.hist(d.spgc_shell_gene_tally, label='shell', alpha=0.7)
plt.hist(d.spgc_cloud_gene_tally, label='cloud', alpha=0.7)
# The labels above are only visible once a legend is drawn.
plt.legend()

In [None]:
# Per-strain share of each tally (core / shell / cloud) among the reference strains.
_tally_columns = ['ref_core_gene_tally', 'ref_shell_gene_tally', 'ref_cloud_gene_tally']
_tallies = ref_strain_stats.reset_index().set_index(['species', 'strain'])[_tally_columns]
# Divide every row by its own total (vectorized form of a row-wise x / x.sum()).
d = _tallies.div(_tallies.sum(axis=1), axis=0)
plt.hist(d.ref_core_gene_tally, label='core', alpha=0.7)
plt.hist(d.ref_shell_gene_tally, label='shell', alpha=0.7)
plt.hist(d.ref_cloud_gene_tally, label='cloud', alpha=0.7)
# The labels above are only visible once a legend is drawn.
plt.legend()

In [None]:
# Species-level median of the per-strain tally shares (core / shell / cloud)
# among the SPGC strains.
_tally_columns = ["spgc_core_gene_tally", "spgc_shell_gene_tally", "spgc_cloud_gene_tally"]
_tallies = spgc_strain_stats.reset_index().set_index(["species", "strain"])[_tally_columns]
d = (
    # Divide every row by its own total (vectorized form of a row-wise x / x.sum()).
    _tallies.div(_tallies.sum(axis=1), axis=0)
    .groupby(level='species').median()
)


plt.hist(d.spgc_core_gene_tally, label='core', alpha=0.7)
plt.hist(d.spgc_shell_gene_tally, label='shell', alpha=0.7)
plt.hist(d.spgc_cloud_gene_tally, label='cloud', alpha=0.7)
# The labels above are only visible once a legend is drawn.
plt.legend()

In [None]:
# Species-level median of the per-strain tally shares (core / shell / cloud)
# among the reference strains.
_tally_columns = ["ref_core_gene_tally", "ref_shell_gene_tally", "ref_cloud_gene_tally"]
_tallies = ref_strain_stats.reset_index().set_index(["species", "strain"])[_tally_columns]
d = (
    # Divide every row by its own total (vectorized form of a row-wise x / x.sum()).
    _tallies.div(_tallies.sum(axis=1), axis=0)
    .groupby(level='species').median()
)
plt.hist(d.ref_core_gene_tally, label='core', alpha=0.7)
plt.hist(d.ref_shell_gene_tally, label='shell', alpha=0.7)
plt.hist(d.ref_cloud_gene_tally, label='cloud', alpha=0.7)
# The labels above are only visible once a legend is drawn.
plt.legend()

In [None]:
# Per-phylum KDEs of the species-median tally shares among the SPGC strains,
# one panel per tally (core, shell, cloud).
_partitions = ["spgc_core_gene_tally", "spgc_shell_gene_tally", "spgc_cloud_gene_tally"]
_strain_fractions = (
    spgc_strain_stats.reset_index()
    .set_index(["species", "strain"])[_partitions]
    # Each strain's tallies as fractions of that strain's total.
    .apply(lambda x: x / x.sum(), axis=1)
)
d0 = _strain_fractions.groupby(level='species').median().join(species_taxonomy)
# NOTE(review): _phylum_list and bins are assigned but not used in this cell.
_phylum_list = d0.p__.unique()

fig, axs = plt.subplots(3, sharex=True)
bins = np.linspace(0, 1, num=21)
for ax, partition in zip(axs, _partitions):
    for p__, d1 in d0.groupby('p__'):
        sns.kdeplot(d1[partition], color=phylum_palette[p__], ax=ax, label='__nolegend__')
    # Panel label is the tally name without its "spgc_" prefix and "_gene_tally" suffix.
    ax.set_ylabel(partition[len('spgc_'):-len('_gene_tally')])

# Empty proxy lines give the legend exactly one entry per phylum.
for p__, _ in d0.groupby('p__'):
    axs[0].plot([], [], color=phylum_palette[p__], label=p__)
axs[0].legend(bbox_to_anchor=(1, 1))
    


# plt.hist(d.spgc_core_gene_tally, label='core', alpha=0.7)
# plt.hist(d.spgc_shell_gene_tally, label='shell', alpha=0.7)
# plt.hist(d.spgc_cloud_gene_tally, label='cloud', alpha=0.7)

In [None]:
# Per-phylum KDEs of the species-median tally shares among the reference
# strains, one panel per tally (core, shell, cloud).
_partitions = ["ref_core_gene_tally", "ref_shell_gene_tally", "ref_cloud_gene_tally"]
_strain_fractions = (
    ref_strain_stats.reset_index()
    .set_index(["species", "strain"])[_partitions]
    # Each strain's tallies as fractions of that strain's total.
    .apply(lambda x: x / x.sum(), axis=1)
)
d0 = _strain_fractions.groupby(level='species').median().join(species_taxonomy)
# NOTE(review): _phylum_list and bins are assigned but not used in this cell.
_phylum_list = d0.p__.unique()

fig, axs = plt.subplots(3, sharex=True)
bins = np.linspace(0, 1, num=21)
for ax, partition in zip(axs, _partitions):
    for p__, d1 in d0.groupby('p__'):
        sns.kdeplot(d1[partition], color=phylum_palette[p__], ax=ax, label='__nolegend__')
    # Panel label is the tally name without its "ref_" prefix and "_gene_tally" suffix.
    ax.set_ylabel(partition[len('ref_'):-len('_gene_tally')])

# Empty proxy lines give the legend exactly one entry per phylum.
for p__, _ in d0.groupby('p__'):
    axs[0].plot([], [], color=phylum_palette[p__], label=p__)
axs[0].legend(bbox_to_anchor=(1, 1))
    


# plt.hist(d.ref_core_gene_tally, label='core', alpha=0.7)
# plt.hist(d.ref_shell_gene_tally, label='shell', alpha=0.7)
# plt.hist(d.ref_cloud_gene_tally, label='cloud', alpha=0.7)