# Preamble

In [None]:
%load_ext autoreload

In [None]:
# Run the notebook from the project root so that the relative paths used
# below ("ref/...", "meta/...", "data/...") resolve.
# Raises KeyError if the PROJECT_ROOT environment variable is not set.
import os as _os

_os.chdir(_os.environ["PROJECT_ROOT"])
# Last expression of the cell: the resolved working directory.
_os.path.realpath(_os.path.curdir)

## Imports

In [None]:
import os
import subprocess
import time
from itertools import chain, product
from tempfile import mkstemp
from warnings import filterwarnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import sfacts as sf
import statsmodels.formula.api as smf
import xarray as xr
from mpl_toolkits.axes_grid1 import make_axes_locatable

# from fastcluster import linkage
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist, squareform
from statsmodels.graphics.regressionplots import influence_plot
from statsmodels.stats.multitest import fdrcorrection
from tqdm import tqdm

import lib.plot
import lib.thisproject.data
from lib.pandas_util import align_indexes, aligned_index, idxwhere, invert_mapping

In [None]:
# Global plot styling: seaborn "talk" context and a figure DPI of 100.
sns.set_context("talk")
plt.rcParams["figure.dpi"] = 100

In [None]:
def _calculate_2tailed_pvalue_from_perm(obs, perms):
    """Two-tailed permutation p-value for an observed statistic.

    Parameters
    ----------
    obs : scalar
        Observed value of the test statistic.
    perms : numpy.ndarray
        Values of the statistic under permutation. Must support elementwise
        comparison against ``obs`` (a plain list does not).

    Returns
    -------
    float
        Twice the smaller one-tailed p-value, capped at 1. Each one-tailed
        p-value counts the permutations strictly beyond ``obs`` on that side
        and adds one to both numerator and denominator.
    """
    n_total = len(perms) + 1
    p_upper = ((perms > obs).sum() + 1) / n_total
    p_lower = ((perms < obs).sum() + 1) / n_total
    # Doubling the smaller tail exceeds 1 whenever ``obs`` sits near the middle
    # of the permutation distribution; a p-value cannot be larger than 1.
    return np.minimum(np.minimum(p_upper, p_lower) * 2, 1.0)

In [None]:
def linkage_order(linkage, labels):
    """Return ``labels`` reordered to follow the leaves of a linkage tree.

    ``linkage`` is a SciPy linkage matrix; it is converted to a tree and its
    leaf ids are collected by pre-order traversal. ``labels`` is then indexed
    with that list of ids, so it must support list-based positional indexing
    (e.g. a NumPy array).
    """
    root = sp.cluster.hierarchy.to_tree(linkage)
    leaf_ids = root.pre_order(lambda node: node.id)
    return labels[leaf_ids]


def is_prime(n):
    """Return True if ``n`` is prime, using trial division up to sqrt(n).

    Values less than or equal to 1 are never prime.
    """
    if n <= 1:
        return False
    limit = int(n**0.5) + 1
    return all(n % divisor != 0 for divisor in range(2, limit))


def iterate_primes_up_to(n, return_index=False):
    """Yield the primes below ``ceil(n)`` in increasing order.

    The upper bound is exclusive: candidates are ``range(int(ceil(n)))``.
    With ``return_index=True`` each item is a ``(position, prime)`` tuple,
    where ``position`` counts the primes yielded so far starting from 0.
    """
    upper = int(np.ceil(n))
    primes = (candidate for candidate in range(upper) if is_prime(candidate))
    if return_index:
        yield from enumerate(primes)
    else:
        yield from primes


def maximally_shuffled_order(sorted_order):
    """Reorder ``sorted_order`` by the residues of each position modulo small primes.

    Every position ``0..n-1`` is tagged with its residue modulo each prime
    yielded by ``iterate_primes_up_to(sqrt(n))``. The positions are sorted by
    those residues, and the resulting sequence of positions is assigned back,
    row by row, as the new rank of each item.

    Parameters
    ----------
    sorted_order : sequence
        Labels in their original order; used as the index of a DataFrame.

    Returns
    -------
    list
        The same labels in the shuffled order. Empty input returns ``[]``.
        When no primes are available (very short input) the original order
        is returned unchanged.
    """
    n = len(sorted_order)
    if n == 0:
        # Nothing to shuffle.
        return []
    primes_list = list(iterate_primes_up_to(np.sqrt(n)))
    table = pd.DataFrame(np.arange(n), index=sorted_order, columns=["original_order"])
    if not primes_list:
        # No sort keys: avoid calling sort_values with an empty ``by`` list.
        return table.index.to_list()
    for prime in primes_list:
        table[prime] = table.original_order % prime
    shuffled_positions = table.sort_values(primes_list).original_order.values
    table = table.assign(new_order=shuffled_positions)
    return table.sort_values("new_order").index.to_list()

# Construct Metadata

In [None]:
# COG reference table, indexed by COG id. Column names are supplied here
# rather than read from the file.
cog_meta = pd.read_table(
    "ref/cog-20.meta.tsv",
    encoding="latin10",  # Python codec alias for ISO-8859-16
    names=["cog", "categories", "description", "gene_name", "pathway", "_5", "color"],
    index_col="cog",
)
cog_meta

In [None]:
# COG category reference table (color and description), indexed by category.
# Column names are supplied here rather than read from the file.
cog_category_meta = pd.read_table(
    "ref/cog-20.categories.tsv",
    names=["category", "color", "description"],
    index_col="category",
)
cog_category_meta

In [None]:
# Colors keyed by sample-pair type.
pair_type_palette = dict(EEN="teal", PostEEN="mediumblue", Transition="blueviolet")

# Colors keyed by diet / growth medium.
diet_palette = dict(
    EEN="lightgreen",
    PostEEN="lightblue",
    InVitro="plum",
    PreEEN="lightpink",
)

# Display order of the single-letter subject ids.
subject_order = list("ABHCDEFGKLMNOPQRSTU")

# NOTE: Padded with dummy names because the palette should be built from
# exactly 20 items.
subject_palette = lib.plot.construct_ordered_palette(
    [*subject_order, *(f"dummy{i}" for i in range(20 - len(subject_order)))],
    cm="tab20",
)
subject_palette["X"] = "black"

# Order, marker and line style used for each pair type.
pair_type_order = ["EEN", "Transition", "PostEEN"]
pair_type_marker_palette = dict(EEN="s", Transition=">", PostEEN="o")
pair_type_linestyle_palette = dict(EEN=":", Transition="-.", PostEEN="-")

In [None]:
def _label_experiment_sample(x):
    """Build a human-readable label for one row of the sample table.

    Parameters
    ----------
    x : row-like
        Object exposing the sample fields as attributes (``sample_type``,
        ``subject_id``, ``sample_id``, ``diet_or_media`` and, depending on the
        sample type, ``collection_date_relative_een_end``, ``source_samples``,
        ``mouse_genotype`` and ``status_mouse_inflamed``).

    Returns
    -------
    str
        Label whose layout depends on ``sample_type``.

    Raises
    ------
    ValueError
        If ``sample_type`` is not one of the handled types, or if a mouse
        sample has an unrecognised ``status_mouse_inflamed``.
    """
    if x.sample_type == "human":
        return f"{x.subject_id} [{x.sample_id}] {x.collection_date_relative_een_end} {x.diet_or_media}"
    if x.sample_type in ["Fermenter_inoculum"]:
        return (
            f"{x.subject_id} [{x.sample_id}] {x.source_samples} inoc {x.diet_or_media}"
        )
    if x.sample_type in ["Fermenter"]:
        return (
            f"{x.subject_id} [{x.sample_id}] {x.source_samples} frmnt {x.diet_or_media}"
        )
    if x.sample_type in ["mouse"]:
        if x.status_mouse_inflamed == "Inflamed":
            return f"{x.subject_id} [{x.sample_id}] {x.source_samples} 🐭 {x.mouse_genotype} {x.diet_or_media} inflam"
        if x.status_mouse_inflamed == "not_Inflamed":
            return f"{x.subject_id} [{x.sample_id}] {x.source_samples} 🐭 {x.mouse_genotype} {x.diet_or_media} not_inf"
        # The offending field here is the inflammation status, not the sample type.
        raise ValueError(
            f"mouse inflammation status {x.status_mouse_inflamed} not understood"
        )
    raise ValueError(f"sample type {x.sample_type} not understood")

In [None]:
# Sample metadata, indexed by sample_id, with two derived label columns:
#   label        - tuple of (subject, relative collection date, diet/media, sample id)
#   fuller_label - string built per row by _label_experiment_sample
sample = (
    pd.read_table("meta/een-mgen/sample.tsv")
    .assign(
        label=lambda x: x[
            [
                "subject_id",
                "collection_date_relative_een_end",
                "diet_or_media",
                "sample_id",
            ]
        ].apply(tuple, axis=1),
        fuller_label=lambda d: d.apply(_label_experiment_sample, axis=1),
    )
    .set_index("sample_id")
)
# Subject metadata, indexed by subject_id.
subject = pd.read_table("meta/een-mgen/subject.tsv", index_col="subject_id")

# Load Data

## Species Depth

In [None]:
# Species depth as a sample-by-species table; missing combinations become 0.
# Sample names are rewritten to "CF_<int>" from the second "_"-separated field,
# and the CF_11 / CF_15 labels are exchanged.
motu_depth = (
    pd.read_table(
        "data/group/een/r.proc.gtpro.species_depth.tsv",
        index_col=["sample", "species_id"],
    )
    .depth.unstack(fill_value=0)
    .rename(
        columns=str, index=lambda x: "CF_" + str(int(x.split("_")[1]))
    )  # Normalize names
    .rename({"CF_15": "CF_11", "CF_11": "CF_15"})  # Sample swap
)
# Relative abundance: each sample's depths divided by that sample's total.
motu_rabund = motu_depth.divide(motu_depth.sum(1), axis=0)

motu_rabund

## Genes

In [None]:
# COG depths per species.
# Takes up to 7 minutes to compile everything.
# ``cog_depth`` starts as a dict of per-species Series and is rebound to a
# single (cog, sample, species) xarray object after the loop.
cog_depth = {}

for species in tqdm(motu_rabund.columns):
    gene_x_cog_inpath = (
        f"data/species/sp-{species}/pangenome.centroids.emapper.gene_x_cog.tsv"
    )
    gene_depth_inpath = (
        f"data/group/een/species/sp-{species}/r.proc.gene99_new-v22-agg75.depth2.nc"
    )
    # Gene -> COG assignment for this species, indexed by gene_id.
    _gene_x_cog = (
        pd.read_table(gene_x_cog_inpath)
        .drop_duplicates()
        .set_index("gene_id")
        .squeeze()
    )
    # Attach the COG assignment to the gene depths and sum depth within each COG.
    _cog_depth = (
        xr.load_dataarray(gene_depth_inpath)
        .to_pandas()
        .T.join(_gene_x_cog)
        .groupby("cog")
        .sum()
    )
    cog_depth[species] = _cog_depth.stack()

# Combine the species into one array; combinations that are absent become 0.
cog_depth = (
    pd.DataFrame(cog_depth)
    .stack()
    .rename_axis(["cog", "sample", "species"])
    .to_xarray()
    .fillna(0)
)

# Normalize sample names and swap the mislabeled samples.
cog_depth["sample"] = (
    cog_depth.sample.to_series()
    .map(lambda x: "CF_" + str(int(x.split("_")[1])))
    .replace({"CF_15": "CF_11", "CF_11": "CF_15"})
    .values
)

In [None]:
# Marker COGs whose depth is used below to estimate per-species depth
# (motu_depth2) and total genome depth.
# NOTE(review): "schg" presumably stands for single-copy housekeeping genes - confirm.
schg_cog_list = [
    "COG0012",
    "COG0016",
    "COG0048",
    "COG0049",
    "COG0052",
    "COG0080",
    "COG0081",
    "COG0085",
    "COG0087",
    "COG0088",
    "COG0090",
    "COG0091",
    "COG0092",
    "COG0093",
    "COG0094",
    "COG0096",
    "COG0097",
    "COG0098",
    "COG0099",
    "COG0100",
    "COG0102",
    "COG0103",
    "COG0124",
    "COG0184",
    "COG0185",
    "COG0186",
    "COG0197",
    "COG0200",
    "COG0201",
    "COG0256",
    "COG0495",
    "COG0522",
    "COG0525",
    "COG0533",
    # "COG0542",  # This one is a depth outlier...
]

# Number of marker COGs in use.
len(schg_cog_list)

In [None]:
# Clustered heatmap (cosine metric) of the marker-COG depths summed over
# species; every row gets a tick label.
sns.clustermap(cog_depth.sum("species").sel(cog=schg_cog_list).to_pandas(), metric='cosine', yticklabels=1)

In [None]:
# Alternative species depth estimate: mean depth of the marker COGs for each
# sample and species, and the corresponding within-sample relative abundance.
motu_depth2 = cog_depth.sel(cog=schg_cog_list).mean("cog")
motu_rabund2 = motu_depth2 / motu_depth2.sum("species")

In [None]:
# Spot check: for a few species/sample pairs, histogram the per-COG depths and
# mark the two species depth estimates (motu_depth in orange, motu_depth2 in black).
_species_list = ["100003", "102506", "100022"]
_sample_list = ["CF_94", "CF_93"]

fig, axs = plt.subplots(
    len(_species_list),
    len(_sample_list),
    figsize=(5 * len(_sample_list), 3 * len(_species_list)),
    sharex=True,
    sharey=True,
)
# Force a 2-D grid of axes with one row per species and one column per sample.
axs = np.asanyarray(axs).reshape((len(_species_list), len(_sample_list)))

bins = np.logspace(-3, 4, num=100)

for (_species, _sample), ax in zip(product(_species_list, _sample_list), axs.flatten()):
    ax.hist(cog_depth.sel(sample=_sample, species=_species).to_pandas(), bins=bins)
    ax.axvline(motu_depth.loc[_sample, _species], color="tab:orange")
    ax.axvline(motu_depth2.loc[_sample, _species], color="black")
    ax.set_title((_species, _sample))

# Set on the last axes only; the x-axis is shared across the grid (sharex=True).
ax.set_xscale("log")
# plt.xlim(0, 200)

In [None]:
# Per-COG detection limit: the smallest non-zero depth seen in any sample and
# species (zeros are replaced by inf before taking the minimum, so a COG that
# is never detected ends up with inf).
cog_detection_limit = cog_depth.where(lambda x: x != 0, np.inf).min(
    ("sample", "species")
)
undetected_cogs_list = idxwhere((cog_detection_limit == np.inf).to_series())

# Replace zero depths with the COG's detection limit and drop the COGs that
# were never detected.
cog_depth_or_detection_limit = cog_depth.where(
    lambda x: x != 0, cog_detection_limit
).drop_sel(cog=undetected_cogs_list)
cog_depth_or_detection_limit

In [None]:
# Total genome depth per sample: median marker-COG depth per species, summed
# over species.
total_genome_depth = (
    cog_depth_or_detection_limit.sel(cog=schg_cog_list).median("cog").sum("species")  # NOTE: Mean or Median? Does it matter?
)
# COG depth summed over species, relative to the total genome depth.
normalized_cog_depth_by_sample = (
    cog_depth_or_detection_limit.sum("species") / total_genome_depth
)
# Median per subject and diet, restricted to EEN / PostEEN samples. The
# unstack -> dropna -> stack round trip drops rows containing a missing value
# after unstacking (e.g. a subject lacking one of the two diets).
normalized_cog_depth_by_subject_and_type = (
    normalized_cog_depth_by_sample.to_pandas()
    .T.join(sample[["subject_id", "diet_or_media"]])[
        lambda x: x.diet_or_media.isin(["EEN", "PostEEN"])
    ]
    .groupby(["subject_id", "diet_or_media"])
    .median()  # NOTE: Mean or Median?
    .unstack("diet_or_media")
    .dropna()
    .stack("diet_or_media")
)
normalized_cog_depth_by_subject_and_type

In [None]:
# Paired EEN vs. PostEEN comparison for every COG: per-diet means, mean and
# median log2(PostEEN / EEN) across subjects, and a Wilcoxon signed-rank
# p-value (NaN when the test raises ValueError).
_stat_names = ("mean_een", "mean_post", "mean_log_ratio", "median_log_ratio", "pval")

pairwise_test_results = {}
for cog in tqdm(normalized_cog_depth_by_subject_and_type.columns):
    d = normalized_cog_depth_by_subject_and_type[cog].unstack()
    log_ratio = np.log2(d.PostEEN / d.EEN)
    try:
        pval = sp.stats.wilcoxon(d.EEN, d.PostEEN).pvalue
    except ValueError:
        pval = np.nan
    pairwise_test_results[cog] = (
        d.EEN.mean(),
        d.PostEEN.mean(),
        log_ratio.mean(),
        log_ratio.median(),
        pval,
    )

# One row per COG, one column per statistic.
pairwise_test_results = pd.DataFrame(pairwise_test_results, index=_stat_names).T

In [None]:
# Here is where I define filters on COGs:
#    They must have a mean depth during one of the two time-periods of > 0.01


def _fdr_ignoring_nan(pval):
    """FDR-correct the non-missing p-values; missing p-values stay NaN.

    The paired-test cell stores NaN when the Wilcoxon test raises. Passing a
    NaN to ``fdrcorrection`` turns every corrected value into NaN, which would
    silently remove all hits, so the correction runs on the tested COGs only.
    """
    fdr = pd.Series(np.nan, index=pval.index, dtype=float)
    tested = pval.notna()
    if tested.any():
        fdr[tested] = fdrcorrection(pval[tested])[1]
    return fdr


# A COG is a "hit" when FDR < 0.1 and |mean log2 ratio| > 0.2.
pairwise_test_results_filt_with_fdr = (
    pairwise_test_results[lambda x: (x.mean_een > 0.01) | (x.mean_post > 0.01)]
    .assign(
        fdr=lambda x: _fdr_ignoring_nan(x.pval),
        hit=lambda x: (x.fdr < 0.1) & (np.abs(x.mean_log_ratio) > 0.2),
    )
    .sort_values("fdr")
)
pairwise_test_results_filt_with_fdr[lambda x: x.hit]

In [None]:
# Volcano-style plot: mean log2 ratio against p-value, hits in red.
d = pairwise_test_results_filt_with_fdr.sort_values("pval").join(cog_meta)

fig, ax = plt.subplots()
ax.scatter("mean_log_ratio", "pval", data=d[d.hit], color="r", s=5)
ax.scatter("mean_log_ratio", "pval", data=d[~d.hit], color="grey", s=5)
# Inverted log axis so that small p-values appear at the top.
ax.invert_yaxis()
ax.set_yscale("log")
# Guide lines at the effect-size cutoff (+/- 0.2) and at p = 0.05.
# NOTE(review): hits are defined by FDR < 0.1, not by p < 0.05.
ax.axvline(0.2, color="black", lw=1, linestyle="--")
ax.axvline(-0.2, color="black", lw=1, linestyle="--")
ax.axhline(0.05, color="black", lw=1, linestyle="--")
# NOTE(review): the plotted value is the mean log2(PostEEN / EEN) depth ratio;
# confirm that "OR" in the label is intended.
ax.set_xlabel("Mean Log2(OR)")
ax.set_ylabel("P-value")

In [None]:
# Show the full filtered results table (sorted by FDR when it was built).
pairwise_test_results_filt_with_fdr

In [None]:
# Boolean COG-by-category membership matrix. Each character of the
# ``categories`` string is treated as one category, so a COG can belong to
# several categories.
cog_x_cog_category_matrix = (
    cog_meta.categories.map(tuple)
    .explode()
    .rename("category")
    .reset_index()
    .assign(in_category=True)
    .set_index(["cog", "category"])
    .in_category.unstack(fill_value=False)
)

In [None]:
# Test each COG category for enrichment among the hit COGs.
d = pairwise_test_results_filt_with_fdr[["hit"]].join(cog_x_cog_category_matrix)

enrichment_test = []
for category in cog_x_cog_category_matrix.columns:
    # 2x2 table: hit status (rows) by category membership (columns). All four
    # cells are forced to exist so that the table is always complete.
    contingency = (
        d[["hit", category]]
        .value_counts()
        .reindex(
            [(True, True), (True, False), (False, True), (False, False)], fill_value=0
        )
        .unstack()
    )
    # A pseudocount of one keeps the odds ratio finite when a cell is empty.
    contingency_pc = contingency + 1
    log2_odds_ratio_pc = np.log2(
        (contingency_pc.loc[True, True] / contingency_pc.loc[True, False])
        / (contingency_pc.loc[False, True] / contingency_pc.loc[False, False])
    )
    # Report the observed number of hits in the category; reading it from the
    # pseudocounted table overstated every count by one.
    num_hit = contingency.loc[True, True]
    # Fisher's exact test uses the raw counts, not the pseudocounted ones.
    enrichment_test.append(
        [category, num_hit, log2_odds_ratio_pc, *sp.stats.fisher_exact(contingency)]
    )
enrichment_test = pd.DataFrame(
    enrichment_test,
    columns=["category", "num_hit", "log2_odds_ratio_pc", "stat", "pvalue"],
).set_index("category")

In [None]:
# Categories ranked from most to least enriched among hits, with their metadata.
enrichment_test.sort_values("log2_odds_ratio_pc", ascending=False).join(cog_category_meta)

In [None]:
# Number of hits with a positive mean log ratio (higher in PostEEN) per category.
cog_x_cog_category_matrix.reindex(
    idxwhere(
        pairwise_test_results_filt_with_fdr.hit
        & (pairwise_test_results_filt_with_fdr.mean_log_ratio > 0)
    )
).sum()

In [None]:
# def _species_specificity_statistics(x):
#     x = x.sort_values(ascending=False) / x.sum()
#     ratio_to_top = x / x[0]
#     ratio_to_second = x / x[1]
#     return pd.DataFrame(dict(fraction=x, ratio_to_top=ratio_to_top, ratio_to_second=ratio_to_second)).stack()
#     # return pd.DataFrame([[1, 2, 3], [4, 5, 6]])

# d = cog_depth.mean("sample").to_pandas().T

# Fraction of each COG's depth contributed by each species within a sample,
# averaged over samples.
cog_species_fraction = (cog_depth / cog_depth.sum("species")).mean("sample")

## Select by Function

### COG Category I

In [None]:
# Hit COGs annotated with category "I", largest mean log ratio first.
# COGs without metadata get an empty category string so the filter stays boolean.
pairwise_test_results_filt_with_fdr.join(cog_meta).fillna({"categories": ""})[
    lambda x: x.hit & x.categories.str.contains("I")
].sort_values("mean_log_ratio", ascending=False)

In [None]:
# Same selection as the previous cell, showing only the COG descriptions.
pairwise_test_results_filt_with_fdr.join(cog_meta).fillna({"categories": ""})[
    lambda x: x.hit & x.categories.str.contains("I")
].sort_values("mean_log_ratio", ascending=False).description.values

In [None]:
# The five species contributing the largest mean fraction of COG3255 depth.
cog_species_fraction.sel(cog="COG3255").to_series().sort_values(ascending=False).head()

In [None]:
# The five species contributing the largest mean fraction of COG4981 depth.
cog_species_fraction.sel(cog="COG4981").to_series().sort_values(ascending=False).head()

In [None]:
# The five species contributing the largest mean fraction of COG3675 depth.
cog_species_fraction.sel(cog="COG3675").to_series().sort_values(ascending=False).head()

In [None]:
# The five species contributing the largest mean fraction of COG4667 depth.
cog_species_fraction.sel(cog="COG4667").to_series().sort_values(ascending=False).head()

In [None]:
# COGs that are *DE-enriched* in PostEEN are frequently dominated by sequences from E. coli.
# For every COG with mean log2 ratio < -1 (not restricted to hits), find the
# species with the largest mean fraction and tally how often each species wins.
cog_species_fraction.sel(
    cog=idxwhere(pairwise_test_results_filt_with_fdr.mean_log_ratio < -1)
).to_pandas().idxmax(1).value_counts().head(20)

In [None]:
# COGs that are *enriched* in PostEEN (mean log2 ratio > 0.2, not restricted to
# hits): find the species with the largest mean fraction for each COG and tally
# how often each species wins.
# NOTE(review): the cutoff here (0.2) is not symmetric with the -1 used for the
# de-enriched COGs in the previous cell - confirm this is intended.
cog_species_fraction.sel(
    cog=idxwhere(pairwise_test_results_filt_with_fdr.mean_log_ratio > 0.2)
).to_pandas().idxmax(1).value_counts().head(20)

### COG Category G

In [None]:
# Hit COGs annotated with category "G", largest mean log ratio first.
pairwise_test_results_filt_with_fdr.join(cog_meta).fillna({"categories": ""})[
    lambda x: x.hit & x.categories.str.contains("G")
].sort_values("mean_log_ratio", ascending=False)

In [None]:
# The five species contributing the largest mean fraction of COG3429 depth.
cog_species_fraction.sel(cog="COG3429").to_series().sort_values(ascending=False).head()

In [None]:
# The five species contributing the largest mean fraction of COG4813 depth.
cog_species_fraction.sel(cog="COG4813").to_series().sort_values(ascending=False).head()

In [None]:
# The five species contributing the largest mean fraction of COG3867 depth.
cog_species_fraction.sel(cog="COG3867").to_series().sort_values(ascending=False).head()

In [None]:
# The five species contributing the largest mean fraction of COG3957 depth.
cog_species_fraction.sel(cog="COG3957").to_series().sort_values(ascending=False).head()

In [None]:
# The five species contributing the largest mean fraction of COG1621 depth.
cog_species_fraction.sel(cog="COG1621").to_series().sort_values(ascending=False).head()

In [None]:
# Hit COGs annotated with category "X", largest mean log ratio first.
# (This cell sits under the "COG Category G" heading but selects category X.)
pairwise_test_results_filt_with_fdr.join(cog_meta).fillna({"categories": ""})[
    lambda x: x.hit & x.categories.str.contains("X")
].sort_values("mean_log_ratio", ascending=False)

In [None]:
# The five species contributing the largest mean fraction of COG3636 depth.
cog_species_fraction.sel(cog="COG3636").to_series().sort_values(ascending=False).head()

## Select by Species

In [None]:
# Look up the description of COG4466.
cog_meta.loc["COG4466"].description

In [None]:
# COG depth within each species and sample, relative to that species' mean
# marker-COG depth in the same sample.
species_specific_normalized_cog_depth_by_sample = (
    cog_depth_or_detection_limit
    / cog_depth_or_detection_limit.sel(cog=schg_cog_list).mean("cog")
)
# Sanity check: the normalization must not produce any NaN.
assert np.isnan(species_specific_normalized_cog_depth_by_sample.values).sum() == 0

### sp-102506 (E. coli)

In [None]:
# Species analysed in this section (labelled E. coli in the heading above);
# used in the file paths and ``.sel(species=...)`` calls below.
species_id = "102506"

In [None]:
# Seed NumPy's global RNG before the random position subsample below.
np.random.seed(0)

# Strain-deconvolution result for this species. Sample names are normalized to
# "CF_<int>", the CF_11 / CF_15 labels are exchanged, and strains are filtered
# with a 0.01 abundance threshold.
world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.world.nc"
    )
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples from subjects A, B and H that are present in the world.
# NOTE(review): the list comes from a set intersection, so its order is not
# guaranteed to be the same between sessions.
subject_abh_sample_list = list(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subsample of 1000 positions, used for the metagenotype plot.
position_ss = world.random_sample(position=1000).position

# Linkages computed once here and passed to the plotting calls below.
sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colors keyed by the fuller sample label, colored by subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# Community composition for subjects A/B/H, with samples shown by fuller label.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

# Metagenotype at the subsampled positions for the same samples.
sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Per-strain metadata, indexed by genome_id (index labels converted to str).
spgc_strain_meta = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
).rename(str)
# Gene-by-strain table, indexed by gene_id; later cells test its entries
# against 1 to decide whether a strain carries a gene.
spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)
# Column names applied to the eggNOG-mapper annotation file read below.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
# Gene annotations, indexed by gene_id; "-" entries become NaN and lines
# starting with "#" are skipped.
gene_meta = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
        comment="#",
        names=eggnog_column_names,
        index_col="query",
    )
    .rename_axis(index="gene_id")
    .replace({"-": np.nan})
)
# Gene -> COG assignment for this species, indexed by gene_id.
gene_x_cog = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
    )
    .drop_duplicates()
    .set_index("gene_id")
    .squeeze()
)
# Gene-by-strain depth ratio and correlation tables.
spgc_depth_ratio = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
).depth.unstack()
spgc_corr = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
).correlation.unstack()
# Gene depth table; index labels are normalized to "CF_<int>" and the
# CF_11 / CF_15 labels are exchanged, as for the other tables.
gene_depth = (
    xr.load_dataarray(
        f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
    )
    .to_pandas()
    .rename(lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename({"CF_11": "CF_15", "CF_15": "CF_11"})
)
# Prevalence of eggNOG ids in reference genomes; cog_id is the part of the
# eggNOG id before the first "@".
eggnog_prevalence_in_refs = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
).assign(cog_id=lambda x: x.eggnog_id.str.split("@").str[0])

# Strains kept for gene-content calls: sum_depth > 1 and species_gene_frac > 0.9.
high_quality_strain_list = idxwhere(
    (spgc_strain_meta.sum_depth > 1) & (spgc_strain_meta.species_gene_frac > 0.9)
)

spgc_strain_meta

In [None]:
# Reference prevalence per COG: keep ids starting with "COG" and take the
# maximum prevalence among the entries sharing a COG id.
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs[lambda x: x.cog_id.str.startswith("COG")][
        ["cog_id", "prevalence"]
    ]
    .drop_duplicates()
    .groupby("cog_id")
    .prevalence.max()
)

In [None]:
# Look up the description of COG2747.
cog_meta.loc["COG2747"].description

In [None]:
# Candidate COGs for this species: mean species fraction > 0.05, mean log2
# ratio > 0.5 and raw p-value < 0.05, listed by decreasing species fraction.
# The leading ``True &`` only makes the individual conditions easy to toggle.
pairwise_test_results_filt_with_fdr.assign(
    species_frac=cog_species_fraction.sel(species=species_id).to_series()
).join(cog_prevalence_in_refs).join(cog_meta)[
    lambda x: True
    & (x.species_frac > 0.05)
    & (x.mean_log_ratio > 0.5)
    # & (x.prevalence < 0.9)
    & (x.pval < 0.05)
].sort_values(
    "species_frac", ascending=False
)

In [None]:
# Descriptions of the candidate COGs, using a stricter species-fraction
# cutoff (> 0.1) than the previous cell.
pairwise_test_results_filt_with_fdr.assign(
    species_frac=cog_species_fraction.sel(species=species_id).to_series()
).join(cog_prevalence_in_refs).join(cog_meta)[
    lambda x: True
    & (x.species_frac > 0.1)
    & (x.mean_log_ratio > 0.5)
    # & (x.prevalence < 0.9)
    & (x.pval < 0.05)
].sort_values(
    "species_frac", ascending=False
).description.values

#### COG2888

In [None]:
# COG examined by the following cells of this subsection.
cog_id = "COG2888"

In [None]:
# Species-normalized COG depths for the focal species: one row per sample,
# one column per COG.
d = (
    species_specific_normalized_cog_depth_by_sample.sel(species=species_id)
    .to_pandas()
    .T
)

# Overlay the per-COG mean across samples (all COGs) with the per-sample
# values of the focal COG, on shared log-spaced bins.
bins = np.logspace(-3, 4)
plt.hist(d.mean(), bins=bins)
plt.hist(d[cog_id], bins=bins)
plt.xscale("log")

In [None]:
# Depth of the focal COG in the focal species, relative to the species'
# marker-based depth (motu_depth2), per sample.
normalized_cog_depth = cog_depth_or_detection_limit.sel(
    species=species_id, cog=cog_id
) / motu_depth2.sel(species=species_id)

# Species depth against COG depth on log axes, colored by their ratio.
plt.scatter(
    motu_depth2.sel(species=species_id),
    cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id),
    c=normalized_cog_depth,
)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Strain-by-gene rows for the genes assigned to the focal COG (genes absent
# from ``spgc`` are dropped).
spgc.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Split the high-quality strains by whether any gene assigned to the focal
# COG has the value 1 in ``spgc``.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
_strain_has_cog = (spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1).any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

# Strain fractions per sample, with string column labels.
frac = world.community.to_pandas().rename(columns=str)

# Every remaining strain has unknown gene content; "-1" always comes first.
strain_gene_unknown_list = ["-1"] + list(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - set(["-1"])
)

# Plotting order: with gene, without gene, unknown.
strain_list = [
    *strain_with_gene_list,
    *strain_without_gene_list,
    *strain_gene_unknown_list,
]

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette for all strains, built in three steps with a different colormap
# per group: "autumn" for strains with the gene, "winter" for strains without
# it, "Greys" for strains of unknown gene content. Each later call receives
# the palette built so far through ``extend``.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_without_gene_list, cm="winter", extend=strain_by_cog_palette
)
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_gene_unknown_list, cm="Greys", extend=strain_by_cog_palette
)

In [None]:
# Strain composition, normalized COG depth and species relative abundance for
# the first three subjects in ``subject_order``.
_meta = sample.loc[motu_depth2.sample]
# Samples missing from the strain table get zero for every strain ...
_frac = frac.reindex(_meta.index, fill_value=0)
# ... and the "-1" column is recomputed as whatever the other strains leave over.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(1)
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(
        [
            "subject_id",
            "collection_date_relative_een_end",
            "sample_type",
            "diet_or_media",
            "mouse_genotype",
            "status_mouse_inflamed",
        ]
    )
)

_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[lambda x: x.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars: strain fractions per sample, colored by gene-content group.
    d1[strain_list].plot(
        kind="bar",
        width=0.95,
        stacked=True,
        color=strain_by_cog_palette,
        edgecolor="k",
        ax=ax,
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Second y-axis: normalized depth of the focal COG.
    ax2 = plt.twinx(ax)
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Third y-axis: species relative abundance; its spine is shifted outwards
    # so it does not overlap the second axis.
    ax3 = plt.twinx(ax)
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Same figure as the previous cell, for every subject in ``subject_order`` and
# restricted to EEN / PostEEN samples.
_meta = sample.loc[motu_depth2.sample][
    lambda x: x.diet_or_media.isin(["EEN", "PostEEN"])
]
# Samples missing from the strain table get zero for every strain ...
_frac = frac.reindex(_meta.index, fill_value=0)
# ... and the "-1" column is recomputed as whatever the other strains leave over.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(1)
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(
        [
            "subject_id",
            "collection_date_relative_een_end",
            "sample_type",
            "diet_or_media",
            "mouse_genotype",
            "status_mouse_inflamed",
        ]
    )
)

_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[lambda x: x.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars: strain fractions per sample, colored by gene-content group.
    d1[strain_list].plot(
        kind="bar",
        width=0.95,
        stacked=True,
        color=strain_by_cog_palette,
        edgecolor="k",
        ax=ax,
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Second y-axis: normalized depth of the focal COG.
    ax2 = plt.twinx(ax)
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Third y-axis: species relative abundance; its spine is shifted outwards
    # so it does not overlap the second axis.
    ax3 = plt.twinx(ax)
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes assigned to the focal COG, one row per seed ortholog.
gene_meta.reindex(gene_x_cog[lambda x: x == cog_id].index).drop_duplicates(
    subset=["seed_ortholog"]
)

In [None]:
# Per-strain depth ratios of the genes assigned to the focal COG.
spgc_depth_ratio.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Per-strain correlations of the genes assigned to the focal COG.
spgc_corr.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Gene depth relative to the marker-based depth of the focal species.
normalized_gene_depth = gene_depth.stack().to_xarray() / motu_depth2.sel(
    species=species_id
)

In [None]:
# Per-sample summary: summed fraction of strains with / without the focal COG,
# summed depth of the COG's genes, and the marker-based species depth.
_community = world.community.to_pandas().rename(columns=str)
_cog_gene_depth = (
    gene_depth.stack()
    .to_xarray()
    .sel(
        gene_id=list(
            set(gene_x_cog[lambda x: x == cog_id].index)
            & set(normalized_gene_depth.gene_id.values)
        )
    )
    .to_pandas()
    .sum(1)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(1),
        "gene_depth": _cog_gene_depth,
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})

# One scatter per fraction column: species depth against gene depth, colored
# by the fraction, with the 1:1 line for reference. Points are drawn in order
# of increasing fraction so that the highest values end up on top.
for _fraction_column, _colorbar_label in [
    ("with_gene_fraction", "Strain With Gene Fraction"),
    ("without_gene_fraction", "Strain Without Gene Fraction"),
]:
    fig = plt.figure()
    plt.scatter(
        "species_depth",
        "gene_depth",
        c=_fraction_column,
        data=d.sort_values(_fraction_column),
        norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
    )
    plt.plot([1, 1000], [1, 1000])
    plt.colorbar(label=_colorbar_label)
    plt.yscale("symlog", linthresh=1e-1)
    plt.xscale("symlog", linthresh=1e-1)
    plt.ylabel("gene depth")
    plt.xlabel("species depth")

#### COG2747

In [None]:
# COG examined by the following cells of this subsection.
cog_id = "COG2747"

In [None]:
# Species-normalized COG depths for the focal species: one row per sample,
# one column per COG.
d = (
    species_specific_normalized_cog_depth_by_sample.sel(species=species_id)
    .to_pandas()
    .T
)

# Overlay the per-COG mean across samples (all COGs) with the per-sample
# values of the focal COG, on shared log-spaced bins.
bins = np.logspace(-3, 4)
plt.hist(d.mean(), bins=bins)
plt.hist(d[cog_id], bins=bins)
plt.xscale("log")

In [None]:
# Depth of the focal COG in the focal species, relative to the species'
# marker-based depth (motu_depth2), per sample.
normalized_cog_depth = cog_depth_or_detection_limit.sel(
    species=species_id, cog=cog_id
) / motu_depth2.sel(species=species_id)

# Species depth against COG depth on log axes, colored by their ratio.
plt.scatter(
    motu_depth2.sel(species=species_id),
    cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id),
    c=normalized_cog_depth,
)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Strain-by-gene rows for the genes assigned to the focal COG (genes absent
# from ``spgc`` are dropped).
spgc.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Split the high-quality strains by whether any gene assigned to the focal
# COG has the value 1 in ``spgc``.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
_strain_has_cog = (spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1).any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

# Strain fractions per sample, with string column labels.
frac = world.community.to_pandas().rename(columns=str)

# Every remaining strain has unknown gene content; "-1" always comes first.
strain_gene_unknown_list = ["-1"] + list(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - set(["-1"])
)

# Plotting order: with gene, without gene, unknown.
strain_list = [
    *strain_with_gene_list,
    *strain_without_gene_list,
    *strain_gene_unknown_list,
]

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette for all strains, built in three steps with a different colormap
# per group: "autumn" for strains with the gene, "winter" for strains without
# it, "Greys" for strains of unknown gene content. Each later call receives
# the palette built so far through ``extend``.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_without_gene_list, cm="winter", extend=strain_by_cog_palette
)
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_gene_unknown_list, cm="Greys", extend=strain_by_cog_palette
)

In [None]:
# Strain composition, normalized COG depth and species relative abundance for
# the first three subjects in ``subject_order``.
_meta = sample.loc[motu_depth2.sample]
# Samples missing from the strain table get zero for every strain ...
_frac = frac.reindex(_meta.index, fill_value=0)
# ... and the "-1" column is recomputed as whatever the other strains leave over.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(1)
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(
        [
            "subject_id",
            "collection_date_relative_een_end",
            "sample_type",
            "diet_or_media",
            "mouse_genotype",
            "status_mouse_inflamed",
        ]
    )
)

_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[lambda x: x.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars: strain fractions per sample, colored by gene-content group.
    d1[strain_list].plot(
        kind="bar",
        width=0.95,
        stacked=True,
        color=strain_by_cog_palette,
        edgecolor="k",
        ax=ax,
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Second y-axis: normalized depth of the focal COG.
    ax2 = plt.twinx(ax)
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Third y-axis: species relative abundance; its spine is shifted outwards
    # so it does not overlap the second axis.
    ax3 = plt.twinx(ax)
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Same figure as the previous cell, for every subject in ``subject_order`` and
# restricted to EEN / PostEEN samples.
_meta = sample.loc[motu_depth2.sample][
    lambda x: x.diet_or_media.isin(["EEN", "PostEEN"])
]
# Samples missing from the strain table get zero for every strain ...
_frac = frac.reindex(_meta.index, fill_value=0)
# ... and the "-1" column is recomputed as whatever the other strains leave over.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(1)
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(
        [
            "subject_id",
            "collection_date_relative_een_end",
            "sample_type",
            "diet_or_media",
            "mouse_genotype",
            "status_mouse_inflamed",
        ]
    )
)

_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[lambda x: x.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars: strain fractions per sample, colored by gene-content group.
    d1[strain_list].plot(
        kind="bar",
        width=0.95,
        stacked=True,
        color=strain_by_cog_palette,
        edgecolor="k",
        ax=ax,
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Second y-axis: normalized depth of the focal COG.
    ax2 = plt.twinx(ax)
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Third y-axis: species relative abundance; its spine is shifted outwards
    # so it does not overlap the second axis.
    ax3 = plt.twinx(ax)
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes assigned to `cog_id`, one row per seed ortholog.
gene_meta.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).drop_duplicates(
    subset="seed_ortholog"
)

In [None]:
# Depth-ratio rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc_depth_ratio.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Correlation rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc_corr.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Gene depth table converted to an xarray object and divided by the depth of
# the current species.
normalized_gene_depth = (
    gene_depth.stack().to_xarray() / motu_depth2.sel({"species": species_id})
)

In [None]:
# Summary table: summed fraction of strains with / without the COG, summed
# depth of the COG's genes, and species depth.
d = pd.DataFrame(
    {
        "with_gene_fraction": (
            world.community.to_pandas()
            .rename(columns=str)[strain_with_gene_list]
            .sum(axis=1)
        ),
        "without_gene_fraction": (
            world.community.to_pandas()
            .rename(columns=str)[strain_without_gene_list]
            .sum(axis=1)
        ),
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(
                gene_id=list(
                    set(gene_x_cog[gene_x_cog == cog_id].index)
                    & set(normalized_gene_depth.gene_id.values)
                )
            )
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
)
# Rows missing from the community table get a fraction of 0.
d = d.fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, coloured by the with-gene fraction.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # y = x reference line
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, coloured by the without-gene fraction.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # y = x reference line
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

#### COG0630

In [None]:
# COG examined by the cells that follow; they read the global `cog_id`.
cog_id = "COG0630"

In [None]:
# Normalized COG depth table for this species, transposed so that COGs are
# the columns (`d[cog_id]` is selected below).
d = species_specific_normalized_cog_depth_by_sample.sel(species=species_id)
d = d.to_pandas().transpose()

# Column means of the table overlaid with the values for `cog_id`.
bins = np.logspace(-3, 4)
plt.hist(d.mean(axis=0), bins=bins)
plt.hist(d[cog_id], bins=bins)
plt.xscale("log")

In [None]:
# Depth of `cog_id` in this species divided by the depth of the species.
normalized_cog_depth = cog_depth_or_detection_limit.sel(
    {"species": species_id, "cog": cog_id}
) / motu_depth2.sel({"species": species_id})

# COG depth against species depth, coloured by their ratio.
plt.scatter(
    x=motu_depth2.sel(species=species_id),
    y=cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id),
    c=normalized_cog_depth,
)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Strain-by-gene rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Split the high-quality strains by whether any gene of `cog_id` has the
# value 1 in `spgc`. The presence mask is computed once and negated instead
# of rebuilding the reindexed table for each list.
_strain_has_cog = (
    spgc[high_quality_strain_list].reindex(gene_x_cog[lambda x: x == cog_id].index)
    == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

frac = world.community.to_pandas().rename(columns=str)
# Community strains that are in neither list, with "-1" first. Sorted so the
# order (and hence stacking order and palette assignment) does not depend on
# set iteration order, which varies between interpreter sessions for strings.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plot order: with-gene strains, then without-gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# Build one palette in three passes: with-gene strains from "autumn", then
# extend it with without-gene strains ("winter") and unknown strains ("Greys").
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# One row per sample: metadata, strain fractions, species relative abundance
# and the normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# "-1" is reset to the remainder once every other strain column is summed.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One wide panel for each of the first three entries of `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        continue  # nothing to draw for this subject
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1.norm_cog_depth, color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin axis: species relative abundance, spine pushed outward.
    ax3 = ax.twinx()
    ax3.plot(d1.species_rabund, color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# One row per EEN / PostEEN sample: metadata, strain fractions, species
# relative abundance and the normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# "-1" is reset to the remainder once every other strain column is summed.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject in `subject_order`.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        continue  # nothing to draw for this subject
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1.norm_cog_depth, color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin axis: species relative abundance, spine pushed outward.
    ax3 = ax.twinx()
    ax3.plot(d1.species_rabund, color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes assigned to `cog_id`, one row per seed ortholog.
gene_meta.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).drop_duplicates(
    subset="seed_ortholog"
)

In [None]:
# Depth-ratio rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc_depth_ratio.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Correlation rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc_corr.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Gene depth table converted to an xarray object and divided by the depth of
# the current species.
normalized_gene_depth = (
    gene_depth.stack().to_xarray() / motu_depth2.sel({"species": species_id})
)

In [None]:
# Summary table: summed fraction of strains with / without the COG, summed
# depth of the COG's genes, and species depth.
d = pd.DataFrame(
    {
        "with_gene_fraction": (
            world.community.to_pandas()
            .rename(columns=str)[strain_with_gene_list]
            .sum(axis=1)
        ),
        "without_gene_fraction": (
            world.community.to_pandas()
            .rename(columns=str)[strain_without_gene_list]
            .sum(axis=1)
        ),
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(
                gene_id=list(
                    set(gene_x_cog[gene_x_cog == cog_id].index)
                    & set(normalized_gene_depth.gene_id.values)
                )
            )
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
)
# Rows missing from the community table get a fraction of 0.
d = d.fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, coloured by the with-gene fraction.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # y = x reference line
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, coloured by the without-gene fraction.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # y = x reference line
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

#### COG4667

In [None]:
# COG examined by the cells that follow; they read the global `cog_id`.
cog_id = "COG4667"

In [None]:
# Normalized COG depth table for this species, transposed so that COGs are
# the columns (`d[cog_id]` is selected below).
d = species_specific_normalized_cog_depth_by_sample.sel(species=species_id)
d = d.to_pandas().transpose()

# Column means of the table overlaid with the values for `cog_id`.
bins = np.logspace(-3, 4)
plt.hist(d.mean(axis=0), bins=bins)
plt.hist(d[cog_id], bins=bins)
plt.xscale("log")

In [None]:
# Depth of `cog_id` in this species divided by the depth of the species.
normalized_cog_depth = cog_depth_or_detection_limit.sel(
    {"species": species_id, "cog": cog_id}
) / motu_depth2.sel({"species": species_id})

# COG depth against species depth, coloured by their ratio.
plt.scatter(
    x=motu_depth2.sel(species=species_id),
    y=cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id),
    c=normalized_cog_depth,
)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Strain-by-gene rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Split the high-quality strains by whether any gene of `cog_id` has the
# value 1 in `spgc`. The presence mask is computed once and negated instead
# of rebuilding the reindexed table for each list.
_strain_has_cog = (
    spgc[high_quality_strain_list].reindex(gene_x_cog[lambda x: x == cog_id].index)
    == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

frac = world.community.to_pandas().rename(columns=str)
# Community strains that are in neither list, with "-1" first. Sorted so the
# order (and hence stacking order and palette assignment) does not depend on
# set iteration order, which varies between interpreter sessions for strings.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plot order: with-gene strains, then without-gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# Build one palette in three passes: with-gene strains from "autumn", then
# extend it with without-gene strains ("winter") and unknown strains ("Greys").
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# One row per sample: metadata, strain fractions, species relative abundance
# and the normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# "-1" is reset to the remainder once every other strain column is summed.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One wide panel for each of the first three entries of `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        continue  # nothing to draw for this subject
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1.norm_cog_depth, color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin axis: species relative abundance, spine pushed outward.
    ax3 = ax.twinx()
    ax3.plot(d1.species_rabund, color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# One row per EEN / PostEEN sample: metadata, strain fractions, species
# relative abundance and the normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# "-1" is reset to the remainder once every other strain column is summed.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject in `subject_order`.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        continue  # nothing to draw for this subject
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1.norm_cog_depth, color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin axis: species relative abundance, spine pushed outward.
    ax3 = ax.twinx()
    ax3.plot(d1.species_rabund, color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes assigned to `cog_id`, one row per seed ortholog.
gene_meta.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).drop_duplicates(
    subset="seed_ortholog"
)

In [None]:
# Depth-ratio rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc_depth_ratio.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Correlation rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc_corr.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Gene depth table converted to an xarray object and divided by the depth of
# the current species.
normalized_gene_depth = (
    gene_depth.stack().to_xarray() / motu_depth2.sel({"species": species_id})
)

In [None]:
# Summary table: summed fraction of strains with / without the COG, summed
# depth of the COG's genes, and species depth.
d = pd.DataFrame(
    {
        "with_gene_fraction": (
            world.community.to_pandas()
            .rename(columns=str)[strain_with_gene_list]
            .sum(axis=1)
        ),
        "without_gene_fraction": (
            world.community.to_pandas()
            .rename(columns=str)[strain_without_gene_list]
            .sum(axis=1)
        ),
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(
                gene_id=list(
                    set(gene_x_cog[gene_x_cog == cog_id].index)
                    & set(normalized_gene_depth.gene_id.values)
                )
            )
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
)
# Rows missing from the community table get a fraction of 0.
d = d.fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, coloured by the with-gene fraction.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # y = x reference line
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, coloured by the without-gene fraction.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # y = x reference line
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

#### COG1857

In [None]:
# COG examined by the cells that follow; they read the global `cog_id`.
cog_id = "COG1857"

In [None]:
# Normalized COG depth table for this species, transposed so that COGs are
# the columns (`d[cog_id]` is selected below).
d = species_specific_normalized_cog_depth_by_sample.sel(species=species_id)
d = d.to_pandas().transpose()

# Column means of the table overlaid with the values for `cog_id`.
bins = np.logspace(-3, 4)
plt.hist(d.mean(axis=0), bins=bins)
plt.hist(d[cog_id], bins=bins)
plt.xscale("log")

In [None]:
# Depth of `cog_id` in this species divided by the depth of the species.
normalized_cog_depth = cog_depth_or_detection_limit.sel(
    {"species": species_id, "cog": cog_id}
) / motu_depth2.sel({"species": species_id})

# COG depth against species depth, coloured by their ratio.
plt.scatter(
    x=motu_depth2.sel(species=species_id),
    y=cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id),
    c=normalized_cog_depth,
)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Strain-by-gene rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Split the high-quality strains by whether any gene of `cog_id` has the
# value 1 in `spgc`. The presence mask is computed once and negated instead
# of rebuilding the reindexed table for each list.
_strain_has_cog = (
    spgc[high_quality_strain_list].reindex(gene_x_cog[lambda x: x == cog_id].index)
    == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

frac = world.community.to_pandas().rename(columns=str)
# Community strains that are in neither list, with "-1" first. Sorted so the
# order (and hence stacking order and palette assignment) does not depend on
# set iteration order, which varies between interpreter sessions for strings.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plot order: with-gene strains, then without-gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# Build one palette in three passes: with-gene strains from "autumn", then
# extend it with without-gene strains ("winter") and unknown strains ("Greys").
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# One row per sample: metadata, strain fractions, species relative abundance
# and the normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# "-1" is reset to the remainder once every other strain column is summed.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One wide panel for each of the first three entries of `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        continue  # nothing to draw for this subject
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    lib.plot.rotate_xticklabels(ax=ax)
    ax.get_legend().set_visible(False)
    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1.norm_cog_depth, color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin axis: species relative abundance, spine pushed outward.
    ax3 = ax.twinx()
    ax3.plot(d1.species_rabund, color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# One row per EEN / PostEEN sample: metadata, strain fractions, species
# relative abundance and the normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# "-1" is reset to the remainder once every other strain column is summed.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject in `subject_order`.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        continue  # nothing to draw for this subject
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1.norm_cog_depth, color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin axis: species relative abundance, spine pushed outward.
    ax3 = ax.twinx()
    ax3.plot(d1.species_rabund, color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes assigned to `cog_id`, one row per seed ortholog.
gene_meta.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).drop_duplicates(
    subset="seed_ortholog"
)

In [None]:
# Depth-ratio rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc_depth_ratio.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Correlation rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc_corr.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Gene depth table converted to an xarray object and divided by the depth of
# the current species.
normalized_gene_depth = (
    gene_depth.stack().to_xarray() / motu_depth2.sel({"species": species_id})
)

In [None]:
# Summary table: summed fraction of strains with / without the COG, summed
# depth of the COG's genes, and species depth.
d = pd.DataFrame(
    {
        "with_gene_fraction": (
            world.community.to_pandas()
            .rename(columns=str)[strain_with_gene_list]
            .sum(axis=1)
        ),
        "without_gene_fraction": (
            world.community.to_pandas()
            .rename(columns=str)[strain_without_gene_list]
            .sum(axis=1)
        ),
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(
                gene_id=list(
                    set(gene_x_cog[gene_x_cog == cog_id].index)
                    & set(normalized_gene_depth.gene_id.values)
                )
            )
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
)
# Rows missing from the community table get a fraction of 0.
d = d.fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, coloured by the with-gene fraction.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # y = x reference line
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, coloured by the without-gene fraction.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # y = x reference line
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

### sp-102544 (E. lenta)

In [None]:
# Species analysed by the cells that follow: it is interpolated into the data
# paths and used to select along the `species` dimension.
species_id = "102544"

In [None]:
# Seed NumPy's global RNG before the random position subsample below
# (presumably `random_sample` draws from it — TODO confirm).
np.random.seed(0)

# Load the sfacts world for this species, rewrite sample names as "CF_<int>",
# exchange the CF_11 / CF_15 labels, then apply
# drop_low_abundance_strains(0.01).
# NOTE(review): the reason for the CF_11 <-> CF_15 exchange is not visible
# here; the same exchange is applied to `gene_depth` when it is loaded.
world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.world.nc"
    )
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples from subjects A, B and H that are also present in the world.
# NOTE(review): built from a set intersection, so the sample order is not
# fixed between interpreter sessions.
subject_abh_sample_list = list(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
# Sub-world restricted to those samples, with a 0.05 threshold passed to
# drop_low_abundance_strains.
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Positions of a 1000-position random subsample of the full world.
position_ss = world.random_sample(position=1000).position

# Linkages reused as fixed row/column orderings by the plots below.
sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours: subject of each sample, keyed by `fuller_label`.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# Community plot for the A/B/H sub-world; the lambdas ignore their argument
# and return the precomputed linkages.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

# Metagenotype plot for the same samples, restricted to the subsampled
# positions.
sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Per-strain SPGC metadata; index labels are converted to str.
spgc_strain_meta = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
).rename(index=str)
# Strain-by-gene table, indexed by gene.
spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)

# Column names for the header-less emapper annotations file.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
# Gene annotations: "#" lines are skipped and "-" entries become NaN.
gene_meta = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
    comment="#",
    names=eggnog_column_names,
    index_col="query",
)
gene_meta = gene_meta.rename_axis(index="gene_id").replace({"-": np.nan})

# Gene-to-COG assignments, deduplicated and squeezed after indexing by gene.
gene_x_cog = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
)
gene_x_cog = gene_x_cog.drop_duplicates().set_index("gene_id").squeeze()

# Long (gene, strain) tables pivoted to gene-by-strain.
spgc_depth_ratio = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
)["depth"].unstack()
spgc_corr = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
)["correlation"].unstack()

# Gene depth with the same index renaming as the world's samples:
# "CF_<int>" names, then the CF_11 / CF_15 exchange.
gene_depth = xr.load_dataarray(
    f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
).to_pandas()
gene_depth = gene_depth.rename(index=lambda s: "CF_{}".format(int(s.split("_")[1])))
gene_depth = gene_depth.rename(index={"CF_11": "CF_15", "CF_15": "CF_11"})

# Prevalence table; `cog_id` is the part of `eggnog_id` before the first "@".
eggnog_prevalence_in_refs = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
)
eggnog_prevalence_in_refs = eggnog_prevalence_in_refs.assign(
    cog_id=eggnog_prevalence_in_refs.eggnog_id.str.split("@").str[0]
)

# Strains with sum_depth > 1 and species_gene_frac > 0.9.
high_quality_strain_list = idxwhere(
    spgc_strain_meta.sum_depth.gt(1) & spgc_strain_meta.species_gene_frac.gt(0.9)
)

spgc_strain_meta

In [None]:
# Maximum prevalence per COG, keeping only ids that start with "COG".
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs.loc[
        eggnog_prevalence_in_refs.cog_id.str.startswith("COG"),
        ["cog_id", "prevalence"],
    ]
    .drop_duplicates()
    .groupby("cog_id")["prevalence"]
    .max()
)

In [None]:
# Test results for COGs with species_frac > 0.1, mean_log_ratio > 0.5 and
# pval < 0.05, highest species fraction first. The prevalence < 0.9 filter
# is currently disabled.
(
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
    .loc[
        lambda x: x.species_frac.gt(0.1)
        & x.mean_log_ratio.gt(0.5)
        & x.pval.lt(0.05)
    ]
    .sort_values(by="species_frac", ascending=False)
)

In [None]:
# Descriptions of COGs with species_frac > 0.1, mean_log_ratio > 0.5 and
# pval < 0.05, highest species fraction first. The prevalence < 0.9 filter
# is currently disabled.
(
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
    .loc[
        lambda x: x.species_frac.gt(0.1)
        & x.mean_log_ratio.gt(0.5)
        & x.pval.lt(0.05)
    ]
    .sort_values(by="species_frac", ascending=False)
    .description.values
)

#### COG4981

In [None]:
# COG examined by the cells that follow; they read the global `cog_id`.
cog_id = "COG4981"

In [None]:
# Normalized COG depth table for this species, transposed so that COGs are
# the columns (`d[cog_id]` is selected below).
d = species_specific_normalized_cog_depth_by_sample.sel(species=species_id)
d = d.to_pandas().transpose()

# Column means of the table overlaid with the values for `cog_id`.
bins = np.logspace(-3, 4)
plt.hist(d.mean(axis=0), bins=bins)
plt.hist(d[cog_id], bins=bins)
plt.xscale("log")

In [None]:
# Depth of `cog_id` in this species divided by the depth of the species.
normalized_cog_depth = cog_depth_or_detection_limit.sel(
    {"species": species_id, "cog": cog_id}
) / motu_depth2.sel({"species": species_id})

# COG depth against species depth, coloured by their ratio.
plt.scatter(
    x=motu_depth2.sel(species=species_id),
    y=cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id),
    c=normalized_cog_depth,
)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Strain-by-gene rows for the genes of `cog_id`; rows containing NaN are dropped.
spgc.reindex(index=gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Split the high-quality strains by whether any gene of `cog_id` has the
# value 1 in `spgc`. The presence mask is computed once and negated instead
# of rebuilding the reindexed table for each list.
_strain_has_cog = (
    spgc[high_quality_strain_list].reindex(gene_x_cog[lambda x: x == cog_id].index)
    == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

frac = world.community.to_pandas().rename(columns=str)
# Community strains that are in neither list, with "-1" first. Sorted so the
# order (and hence stacking order and palette assignment) does not depend on
# set iteration order, which varies between interpreter sessions for strings.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plot order: with-gene strains, then without-gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# Build one palette in three passes: with-gene strains from "autumn", then
# extend it with without-gene strains ("winter") and unknown strains ("Greys").
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# One row per sample: metadata, strain fractions, species relative abundance
# and the normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# "-1" is reset to the remainder once every other strain column is summed.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One wide panel for each of the first three entries of `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        continue  # nothing to draw for this subject
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1.norm_cog_depth, color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin axis: species relative abundance, spine pushed outward.
    ax3 = ax.twinx()
    ax3.plot(d1.species_rabund, color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Stacked-bar strain composition per subject, restricted to samples whose
# diet_or_media is "EEN" or "PostEEN".
_meta = sample.loc[motu_depth2.sample][
    lambda x: x.diet_or_media.isin(["EEN", "PostEEN"])
]
# Align strain fractions to the selected samples; samples absent from `frac`
# are filled with 0.
_frac = frac.reindex(_meta.index, fill_value=0)
# Recompute the "-1" column as the remainder so that each row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(1)
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(
        [
            "subject_id",
            "collection_date_relative_een_end",
            "sample_type",
            "diet_or_media",
            "mouse_genotype",
            "status_mouse_inflamed",
        ]
    )
)

_subject_list = subject_order
# NOTE(review): the positional arguments are presumably (ncols, naxes) --
# confirm against lib.plot.subplots_grid.
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[lambda x: x.subject_id == subject_id]
    if d1.empty:
        # No rows for this subject after filtering; its axes stay blank.
        continue
    d1[strain_list].plot(
        kind="bar",
        width=0.95,
        stacked=True,
        color=strain_by_cog_palette,
        edgecolor="k",
        ax=ax,
        lw=0.5,
    )
    ax.set_title(subject_id)
    # Hide the legend that pandas attaches to the bar plot.
    ax.legend_.set_visible(False)
    # First twin y-axis: normalized COG depth on a symlog scale.
    ax2 = plt.twinx(ax)
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin y-axis: species relative abundance; its right spine is
    # moved outward to axes coordinate 1.15.
    ax3 = plt.twinx(ax)
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotation rows for the genes assigned to `cog_id`, keeping the first row
# for each distinct seed ortholog.
(
    gene_meta
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .drop_duplicates(subset=["seed_ortholog"])
)

In [None]:
# Depth-ratio rows for the genes assigned to `cog_id`; rows containing any
# NaN (including genes absent from the table) are dropped.
(
    spgc_depth_ratio
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Correlation rows for the genes assigned to `cog_id`; rows containing any
# NaN (including genes absent from the table) are dropped.
(
    spgc_corr
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Gene depth divided by the depth of the current species.
_gene_depth_xr = gene_depth.stack().to_xarray()
_species_depth = motu_depth2.sel(species=species_id)
normalized_gene_depth = _gene_depth_xr / _species_depth

In [None]:
# Per-sample table for the current COG: summed fraction of strains with and
# without the gene, summed depth of the COG's genes, and species depth.
d = pd.DataFrame(
    dict(
        with_gene_fraction=world.community.to_pandas()
        .rename(columns=str)[strain_with_gene_list]
        .sum(1),
        without_gene_fraction=world.community.to_pandas()
        .rename(columns=str)[strain_without_gene_list]
        .sum(1),
        gene_depth=gene_depth.stack()
        .to_xarray()
        .sel(
            # Restrict to genes of this COG that also occur in the depth data.
            gene_id=list(
                set(gene_x_cog[lambda x: x == cog_id].index)
                & set(normalized_gene_depth.gene_id.values)
            )
        )
        .to_pandas()
        .sum(1),
        species_depth=motu_depth2.sel(species=species_id).to_series(),
    )
    # NaN strain fractions (e.g. from index alignment) are set to 0.
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, colored by the with-gene strain fraction.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    # Ascending sort: rows with the highest fraction come last in the data.
    data=d.sort_values("with_gene_fraction"),
    # gamma=1, i.e. a linear color scale fixed to the range [0, 1].
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
# Reference line y = x from 1 to 1000.
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, colored by the without-gene strain fraction instead.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=d.sort_values("without_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

#### COG3675

In [None]:
# COG examined by the cells that follow (they all read `cog_id`).
cog_id = "COG3675"

In [None]:
# Normalized COG depth for the current species, transposed so that COGs can
# be selected as columns.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().T

# 50 log-spaced bins from 1e-3 to 1e4, shared by both histograms.
bins = np.logspace(-3, 4, num=50)
# Column means of `d` first, then the values for the COG of interest on top.
for _values in (d.mean(), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# COG depth (or detection limit) relative to the depth of the current species.
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
_species_depth = motu_depth2.sel(species=species_id)
normalized_cog_depth = _cog_depth / _species_depth

# Species depth (x) against COG depth (y), colored by their ratio, log-log.
plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# `spgc` rows for the genes assigned to `cog_id`; rows containing any NaN
# (including genes absent from `spgc`) are dropped.
(
    spgc
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Gene IDs annotated with the COG under study.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
# Per high-quality strain: True when at least one of those genes has value 1
# in `spgc` (presumably "gene present" -- confirm against the SPGC output).
# Computed once so the with/without lists are exact complements.
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# Strain columns in neither list, with "-1" placed first. The rest are sorted:
# a bare set difference iterates in hash order, which for strings changes
# between interpreter sessions and would reshuffle bar stacking and colors.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: with gene, without gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

# Displayed as the cell output for a quick sanity check.
strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# Build the strain palette in three passes, one colormap per strain group:
# "autumn" for strains with the gene, "winter" for strains without it and
# "Greys" for strains with no call.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    # Each later pass is handed the palette accumulated so far via `extend`.
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Assemble one row per sample: metadata, strain fractions, species relative
# abundance and normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(_meta.index, fill_value=0)
# The "-1" column is recomputed as the remainder, so every row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_columns)

# Panels for the first three entries of subject_order.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue

    # Stacked bars: strain composition of each sample.
    d1[strain_list].plot(
        kind="bar",
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)

    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance, with its right spine
    # moved outward to axes coordinate 1.04.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Stacked-bar strain composition per subject, restricted to samples whose
# diet_or_media is "EEN" or "PostEEN".
_meta = sample.loc[motu_depth2.sample][
    lambda x: x.diet_or_media.isin(["EEN", "PostEEN"])
]
# Align strain fractions to the selected samples; samples absent from `frac`
# are filled with 0.
_frac = frac.reindex(_meta.index, fill_value=0)
# Recompute the "-1" column as the remainder so that each row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(1)
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(
        [
            "subject_id",
            "collection_date_relative_een_end",
            "sample_type",
            "diet_or_media",
            "mouse_genotype",
            "status_mouse_inflamed",
        ]
    )
)

_subject_list = subject_order
# NOTE(review): the positional arguments are presumably (ncols, naxes) --
# confirm against lib.plot.subplots_grid.
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[lambda x: x.subject_id == subject_id]
    if d1.empty:
        # No rows for this subject after filtering; its axes stay blank.
        continue
    d1[strain_list].plot(
        kind="bar",
        width=0.95,
        stacked=True,
        color=strain_by_cog_palette,
        edgecolor="k",
        ax=ax,
        lw=0.5,
    )
    ax.set_title(subject_id)
    # Hide the legend that pandas attaches to the bar plot.
    ax.legend_.set_visible(False)
    # First twin y-axis: normalized COG depth on a symlog scale.
    ax2 = plt.twinx(ax)
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin y-axis: species relative abundance; its right spine is
    # moved outward to axes coordinate 1.15.
    ax3 = plt.twinx(ax)
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotation rows for the genes assigned to `cog_id`, keeping the first row
# for each distinct seed ortholog.
(
    gene_meta
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .drop_duplicates(subset=["seed_ortholog"])
)

In [None]:
# Depth-ratio rows for the genes assigned to `cog_id`; rows containing any
# NaN (including genes absent from the table) are dropped.
(
    spgc_depth_ratio
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Correlation rows for the genes assigned to `cog_id`; rows containing any
# NaN (including genes absent from the table) are dropped.
(
    spgc_corr
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Gene depth divided by the depth of the current species.
_gene_depth_xr = gene_depth.stack().to_xarray()
_species_depth = motu_depth2.sel(species=species_id)
normalized_gene_depth = _gene_depth_xr / _species_depth

In [None]:
# Per-sample table for the current COG: summed fraction of strains with and
# without the gene, summed depth of the COG's genes, and species depth.
d = pd.DataFrame(
    dict(
        with_gene_fraction=world.community.to_pandas()
        .rename(columns=str)[strain_with_gene_list]
        .sum(1),
        without_gene_fraction=world.community.to_pandas()
        .rename(columns=str)[strain_without_gene_list]
        .sum(1),
        gene_depth=gene_depth.stack()
        .to_xarray()
        .sel(
            # Restrict to genes of this COG that also occur in the depth data.
            gene_id=list(
                set(gene_x_cog[lambda x: x == cog_id].index)
                & set(normalized_gene_depth.gene_id.values)
            )
        )
        .to_pandas()
        .sum(1),
        species_depth=motu_depth2.sel(species=species_id).to_series(),
    )
    # NaN strain fractions (e.g. from index alignment) are set to 0.
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, colored by the with-gene strain fraction.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    # Ascending sort: rows with the highest fraction come last in the data.
    data=d.sort_values("with_gene_fraction"),
    # gamma=1, i.e. a linear color scale fixed to the range [0, 1].
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
# Reference line y = x from 1 to 1000.
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, colored by the without-gene strain fraction instead.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=d.sort_values("without_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

#### COG5617

In [None]:
# COG examined by the cells that follow (they all read `cog_id`).
cog_id = "COG5617"

In [None]:
# Normalized COG depth for the current species, transposed so that COGs can
# be selected as columns.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().T

# 50 log-spaced bins from 1e-3 to 1e4, shared by both histograms.
bins = np.logspace(-3, 4, num=50)
# Column means of `d` first, then the values for the COG of interest on top.
for _values in (d.mean(), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# COG depth (or detection limit) relative to the depth of the current species.
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
_species_depth = motu_depth2.sel(species=species_id)
normalized_cog_depth = _cog_depth / _species_depth

# Species depth (x) against COG depth (y), colored by their ratio, log-log.
plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# `spgc` rows for the genes assigned to `cog_id`; rows containing any NaN
# (including genes absent from `spgc`) are dropped.
(
    spgc
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Gene IDs annotated with the COG under study.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
# Per high-quality strain: True when at least one of those genes has value 1
# in `spgc` (presumably "gene present" -- confirm against the SPGC output).
# Computed once so the with/without lists are exact complements.
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# Strain columns in neither list, with "-1" placed first. The rest are sorted:
# a bare set difference iterates in hash order, which for strings changes
# between interpreter sessions and would reshuffle bar stacking and colors.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: with gene, without gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

# Displayed as the cell output for a quick sanity check.
strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# Build the strain palette in three passes, one colormap per strain group:
# "autumn" for strains with the gene, "winter" for strains without it and
# "Greys" for strains with no call.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    # Each later pass is handed the palette accumulated so far via `extend`.
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Assemble one row per sample: metadata, strain fractions, species relative
# abundance and normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(_meta.index, fill_value=0)
# The "-1" column is recomputed as the remainder, so every row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_columns)

# Panels for the first three entries of subject_order.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue

    # Stacked bars: strain composition of each sample.
    d1[strain_list].plot(
        kind="bar",
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)

    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance, with its right spine
    # moved outward to axes coordinate 1.04.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Stacked-bar strain composition per subject, restricted to samples whose
# diet_or_media is "EEN" or "PostEEN".
_meta = sample.loc[motu_depth2.sample][
    lambda x: x.diet_or_media.isin(["EEN", "PostEEN"])
]
# Align strain fractions to the selected samples; samples absent from `frac`
# are filled with 0.
_frac = frac.reindex(_meta.index, fill_value=0)
# Recompute the "-1" column as the remainder so that each row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(1)
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(
        [
            "subject_id",
            "collection_date_relative_een_end",
            "sample_type",
            "diet_or_media",
            "mouse_genotype",
            "status_mouse_inflamed",
        ]
    )
)

_subject_list = subject_order
# NOTE(review): the positional arguments are presumably (ncols, naxes) --
# confirm against lib.plot.subplots_grid.
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[lambda x: x.subject_id == subject_id]
    if d1.empty:
        # No rows for this subject after filtering; its axes stay blank.
        continue
    d1[strain_list].plot(
        kind="bar",
        width=0.95,
        stacked=True,
        color=strain_by_cog_palette,
        edgecolor="k",
        ax=ax,
        lw=0.5,
    )
    ax.set_title(subject_id)
    # Hide the legend that pandas attaches to the bar plot.
    ax.legend_.set_visible(False)
    # First twin y-axis: normalized COG depth on a symlog scale.
    ax2 = plt.twinx(ax)
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin y-axis: species relative abundance; its right spine is
    # moved outward to axes coordinate 1.15.
    ax3 = plt.twinx(ax)
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotation rows for the genes assigned to `cog_id`, keeping the first row
# for each distinct seed ortholog.
(
    gene_meta
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .drop_duplicates(subset=["seed_ortholog"])
)

In [None]:
# Depth-ratio rows for the genes assigned to `cog_id`; rows containing any
# NaN (including genes absent from the table) are dropped.
(
    spgc_depth_ratio
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Correlation rows for the genes assigned to `cog_id`; rows containing any
# NaN (including genes absent from the table) are dropped.
(
    spgc_corr
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Gene depth divided by the depth of the current species.
_gene_depth_xr = gene_depth.stack().to_xarray()
_species_depth = motu_depth2.sel(species=species_id)
normalized_gene_depth = _gene_depth_xr / _species_depth

In [None]:
# Per-sample table for the current COG: summed fraction of strains with and
# without the gene, summed depth of the COG's genes, and species depth.
d = pd.DataFrame(
    dict(
        with_gene_fraction=world.community.to_pandas()
        .rename(columns=str)[strain_with_gene_list]
        .sum(1),
        without_gene_fraction=world.community.to_pandas()
        .rename(columns=str)[strain_without_gene_list]
        .sum(1),
        gene_depth=gene_depth.stack()
        .to_xarray()
        .sel(
            # Restrict to genes of this COG that also occur in the depth data.
            gene_id=list(
                set(gene_x_cog[lambda x: x == cog_id].index)
                & set(normalized_gene_depth.gene_id.values)
            )
        )
        .to_pandas()
        .sum(1),
        species_depth=motu_depth2.sel(species=species_id).to_series(),
    )
    # NaN strain fractions (e.g. from index alignment) are set to 0.
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, colored by the with-gene strain fraction.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    # Ascending sort: rows with the highest fraction come last in the data.
    data=d.sort_values("with_gene_fraction"),
    # gamma=1, i.e. a linear color scale fixed to the range [0, 1].
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
# Reference line y = x from 1 to 1000.
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, colored by the without-gene strain fraction instead.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=d.sort_values("without_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

### sp-100099 (F. plautii)

In [None]:
# Species examined by the cells that follow; it is interpolated into the
# data paths and used to select from the species-indexed arrays.
species_id = "100099"

In [None]:
# Load the fitted strain model ("world") for the current species and plot
# community composition and metagenotypes for subjects A, B and H.
# Fixed seed, presumably so that the random position subsample below is
# reproducible -- confirm that world.random_sample draws from np.random.
np.random.seed(0)

world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.world.nc"
    )
    # Normalize sample labels to "CF_<int>" (drops zero padding, if any).
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    # NOTE(review): swaps the labels CF_11 and CF_15 -- presumably corrects a
    # known sample mix-up; confirm.
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples of subjects A, B and H that are present in the world.
subject_abh_sample_list = list(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subset of 1000 positions, used for the metagenotype plot.
position_ss = world.random_sample(position=1000).position

# Linkages for ordering rows and columns in the plots below.
sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column color per sample label, taken from the subject palette.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    # The lambdas ignore their argument and return the precomputed linkages.
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Load the per-species tables used by the COG cells: strain metadata, strain
# gene calls, gene annotations, gene-to-COG assignments, depth ratios,
# correlations, gene depths and reference prevalences.

# Strain metadata indexed by genome_id; index labels are cast to str.
spgc_strain_meta = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
).rename(str)
# Gene-by-strain table indexed by gene_id.
spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)
# Column names supplied explicitly because lines starting with "#" are
# skipped when the annotation file is read below.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
# Gene annotations indexed by gene_id; "-" placeholders become NaN.
gene_meta = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
        comment="#",
        names=eggnog_column_names,
        index_col="query",
    )
    .rename_axis(index="gene_id")
    .replace({"-": np.nan})
)
# Deduplicated gene -> COG assignments, squeezed to a Series indexed by
# gene_id (assumes a single remaining column -- TODO confirm).
gene_x_cog = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
    )
    .drop_duplicates()
    .set_index("gene_id")
    .squeeze()
)
# Long (gene_id, strain) tables pivoted to gene_id x strain.
spgc_depth_ratio = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
).depth.unstack()
spgc_corr = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
).correlation.unstack()
# Gene depth table; row labels are normalized to "CF_<int>" and the labels
# CF_11 / CF_15 are swapped.
# NOTE(review): the swap presumably corrects a sample mix-up -- confirm.
gene_depth = (
    xr.load_dataarray(
        f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
    )
    .to_pandas()
    .rename(lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename({"CF_11": "CF_15", "CF_15": "CF_11"})
)
# Prevalence per eggNOG ID; cog_id is the part of eggnog_id before "@".
eggnog_prevalence_in_refs = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
).assign(cog_id=lambda x: x.eggnog_id.str.split("@").str[0])

# Strains with sum_depth > 1 and species_gene_frac > 0.9.
high_quality_strain_list = idxwhere(
    (spgc_strain_meta.sum_depth > 1) & (spgc_strain_meta.species_gene_frac > 0.9)
)

# Displayed as the cell output.
spgc_strain_meta

In [None]:
# Maximum prevalence per COG, using only the rows whose cog_id starts with
# "COG".
_is_cog_row = eggnog_prevalence_in_refs.cog_id.str.startswith("COG")
_cog_rows = eggnog_prevalence_in_refs[_is_cog_row][["cog_id", "prevalence"]]
cog_prevalence_in_refs = (
    _cog_rows.drop_duplicates().groupby("cog_id")["prevalence"].max()
)

In [None]:
# Test results annotated with the COG's fraction in the current species, its
# prevalence in reference strains and the COG metadata.
_cog_table = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
# Keep hits with species_frac > 0.1, mean_log_ratio > 0.5 and
# prevalence < 0.9, listed by decreasing species_frac.
_cog_table[
    (_cog_table.species_frac > 0.1)
    & (_cog_table.mean_log_ratio > 0.5)
    & (_cog_table.prevalence < 0.9)
    & _cog_table.hit
].sort_values("species_frac", ascending=False)

In [None]:
# Show the annotated test-result row for one COG. The ID is hard-coded
# because `cog_id` is not yet set for this species at this point in the
# notebook.
pairwise_test_results_filt_with_fdr.assign(
    species_frac=cog_species_fraction.sel(species=species_id).to_series()
).join(cog_prevalence_in_refs).join(cog_meta).loc["COG3957"]

#### COG5585

In [None]:
# COG examined by the cells that follow (they all read `cog_id`).
cog_id = "COG5585"

In [None]:
# Normalized COG depth for the current species, transposed so that COGs can
# be selected as columns.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().T

# 50 log-spaced bins from 1e-3 to 1e4, shared by both histograms.
bins = np.logspace(-3, 4, num=50)
# Column means of `d` first, then the values for the COG of interest on top.
for _values in (d.mean(), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# COG depth (or detection limit) relative to the depth of the current species.
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
_species_depth = motu_depth2.sel(species=species_id)
normalized_cog_depth = _cog_depth / _species_depth

# Species depth (x) against COG depth (y), colored by their ratio, log-log.
plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# `spgc` rows for the genes assigned to `cog_id`; rows containing any NaN
# (including genes absent from `spgc`) are dropped.
(
    spgc
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Gene IDs annotated with the COG under study.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
# Per high-quality strain: True when at least one of those genes has value 1
# in `spgc` (presumably "gene present" -- confirm against the SPGC output).
# Computed once so the with/without lists are exact complements.
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# Strain columns in neither list, with "-1" placed first. The rest are sorted:
# a bare set difference iterates in hash order, which for strings changes
# between interpreter sessions and would reshuffle bar stacking and colors.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: with gene, without gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

# Displayed as the cell output for a quick sanity check.
strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# Build the strain palette in three passes, one colormap per strain group:
# "autumn" for strains with the gene, "winter" for strains without it and
# "Greys" for strains with no call.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    # Each later pass is handed the palette accumulated so far via `extend`.
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Assemble one row per sample: metadata, strain fractions, species relative
# abundance and normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(_meta.index, fill_value=0)
# The "-1" column is recomputed as the remainder, so every row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_columns)

# Panels for the first three entries of subject_order.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue

    # Stacked bars: strain composition of each sample.
    d1[strain_list].plot(
        kind="bar",
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)

    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance, with its right spine
    # moved outward to axes coordinate 1.04.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Stacked-bar strain composition per subject, restricted to samples whose
# diet_or_media is "EEN" or "PostEEN".
_meta = sample.loc[motu_depth2.sample][
    lambda x: x.diet_or_media.isin(["EEN", "PostEEN"])
]
# Align strain fractions to the selected samples; samples absent from `frac`
# are filled with 0.
_frac = frac.reindex(_meta.index, fill_value=0)
# Recompute the "-1" column as the remainder so that each row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(1)
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(
        [
            "subject_id",
            "collection_date_relative_een_end",
            "sample_type",
            "diet_or_media",
            "mouse_genotype",
            "status_mouse_inflamed",
        ]
    )
)

_subject_list = subject_order
# NOTE(review): the positional arguments are presumably (ncols, naxes) --
# confirm against lib.plot.subplots_grid.
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[lambda x: x.subject_id == subject_id]
    if d1.empty:
        # No rows for this subject after filtering; its axes stay blank.
        continue
    d1[strain_list].plot(
        kind="bar",
        width=0.95,
        stacked=True,
        color=strain_by_cog_palette,
        edgecolor="k",
        ax=ax,
        lw=0.5,
    )
    ax.set_title(subject_id)
    # Hide the legend that pandas attaches to the bar plot.
    ax.legend_.set_visible(False)
    # First twin y-axis: normalized COG depth on a symlog scale.
    ax2 = plt.twinx(ax)
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Second twin y-axis: species relative abundance; its right spine is
    # moved outward to axes coordinate 1.15.
    ax3 = plt.twinx(ax)
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotation rows for the genes assigned to `cog_id`, keeping the first row
# for each distinct seed ortholog.
(
    gene_meta
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .drop_duplicates(subset=["seed_ortholog"])
)

In [None]:
# Depth-ratio rows for the genes assigned to `cog_id`; rows containing any
# NaN (including genes absent from the table) are dropped.
(
    spgc_depth_ratio
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Correlation rows for the genes assigned to `cog_id`; rows containing any
# NaN (including genes absent from the table) are dropped.
(
    spgc_corr
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Gene depth divided by the depth of the current species.
_gene_depth_xr = gene_depth.stack().to_xarray()
_species_depth = motu_depth2.sel(species=species_id)
normalized_gene_depth = _gene_depth_xr / _species_depth

In [None]:
# Per-sample table for the current COG: summed fraction of strains with and
# without the gene, summed depth of the COG's genes, and species depth.
d = pd.DataFrame(
    dict(
        with_gene_fraction=world.community.to_pandas()
        .rename(columns=str)[strain_with_gene_list]
        .sum(1),
        without_gene_fraction=world.community.to_pandas()
        .rename(columns=str)[strain_without_gene_list]
        .sum(1),
        gene_depth=gene_depth.stack()
        .to_xarray()
        .sel(
            # Restrict to genes of this COG that also occur in the depth data.
            gene_id=list(
                set(gene_x_cog[lambda x: x == cog_id].index)
                & set(normalized_gene_depth.gene_id.values)
            )
        )
        .to_pandas()
        .sum(1),
        species_depth=motu_depth2.sel(species=species_id).to_series(),
    )
    # NaN strain fractions (e.g. from index alignment) are set to 0.
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, colored by the with-gene strain fraction.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    # Ascending sort: rows with the highest fraction come last in the data.
    data=d.sort_values("with_gene_fraction"),
    # gamma=1, i.e. a linear color scale fixed to the range [0, 1].
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
# Reference line y = x from 1 to 1000.
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, colored by the without-gene strain fraction instead.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=d.sort_values("without_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

#### COG3957

In [None]:
# COG examined by the cells that follow (they all read `cog_id`).
cog_id = "COG3957"

In [None]:
# Normalized COG depth for the current species, transposed so that COGs can
# be selected as columns.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().T

# 50 log-spaced bins from 1e-3 to 1e4, shared by both histograms.
bins = np.logspace(-3, 4, num=50)
# Column means of `d` first, then the values for the COG of interest on top.
for _values in (d.mean(), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# COG depth (or detection limit) relative to the depth of the current species.
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
_species_depth = motu_depth2.sel(species=species_id)
normalized_cog_depth = _cog_depth / _species_depth

# Species depth (x) against COG depth (y), colored by their ratio, log-log.
plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# `spgc` rows for the genes assigned to `cog_id`; rows containing any NaN
# (including genes absent from `spgc`) are dropped.
(
    spgc
    .reindex(gene_x_cog.loc[gene_x_cog == cog_id].index)
    .dropna()
)

In [None]:
# Gene IDs annotated with the COG under study.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
# Per high-quality strain: True when at least one of those genes has value 1
# in `spgc` (presumably "gene present" -- confirm against the SPGC output).
# Computed once so the with/without lists are exact complements.
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# Strain columns in neither list, with "-1" placed first. The rest are sorted:
# a bare set difference iterates in hash order, which for strings changes
# between interpreter sessions and would reshuffle bar stacking and colors.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: with gene, without gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

# Displayed as the cell output for a quick sanity check.
strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# Build the strain palette in three passes, one colormap per strain group:
# "autumn" for strains with the gene, "winter" for strains without it and
# "Greys" for strains with no call.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    # Each later pass is handed the palette accumulated so far via `extend`.
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Assemble one row per sample: metadata, strain fractions, species relative
# abundance and normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(_meta.index, fill_value=0)
# The "-1" column is recomputed as the remainder, so every row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_columns)

# Panels for the first three entries of subject_order.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue

    # Stacked bars: strain composition of each sample.
    d1[strain_list].plot(
        kind="bar",
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)

    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance, with its right spine
    # moved outward to axes coordinate 1.04.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Same stacked-bar view as the three-subject figure, restricted to EEN and
# PostEEN samples and drawn for every subject in `subject_order`.
_sort_keys = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]

_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
# Samples without a row in `frac` are filled with zeros.
_frac = frac.reindex(_meta.index, fill_value=0)
# "-1" becomes whatever fraction the other strain columns do not account for.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

d0 = _meta.join(_frac)
# Both series are aligned on the sample index, so this must precede set_index.
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_keys)

_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance, with its spine pushed
    # outwards so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# eggNOG annotations for the genes assigned to `cog_id`, keeping the first row
# for each distinct seed ortholog.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
gene_meta.reindex(_cog_gene_ids).drop_duplicates(subset=["seed_ortholog"])

In [None]:
# Strain depth ratios for the `cog_id` genes; rows with any missing value are dropped.
spgc_depth_ratio.reindex(gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Strain correlations for the `cog_id` genes; rows with any missing value are dropped.
spgc_corr.reindex(gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Gene depth divided by the depth of this species in `motu_depth2`.
# NOTE(review): assumes `gene_depth` is a (sample x gene_id) frame so that the
# stacked array shares its sample dimension with `motu_depth2` — confirm.
normalized_gene_depth = gene_depth.stack().to_xarray() / motu_depth2.sel(
    species=species_id
)

In [None]:
# Per-sample table of strain-group fractions, summed depth of the `cog_id`
# genes, and species depth, followed by two gene-depth vs species-depth plots.
_community = world.community.to_pandas().rename(columns=str)
_cog_genes_with_depth = list(
    set(gene_x_cog[lambda x: x == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
_cog_gene_depth = (
    gene_depth.stack().to_xarray().sel(gene_id=_cog_genes_with_depth).to_pandas()
)

d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        "gene_depth": _cog_gene_depth.sum(axis=1),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
)
# Rows with no strain fractions get 0 for both fraction columns.
d = d.fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Coloured by the summed fraction of strains with the gene; sorting makes the
# highest fractions the last points drawn.
fig = plt.figure()
_by_with = d.sort_values("with_gene_fraction")
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    data=_by_with,
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
_diagonal = [1, 1000]  # y = x reference line
plt.plot(_diagonal, _diagonal)
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same plot coloured by the summed fraction of strains without the gene.
fig = plt.figure()
_by_without = d.sort_values("without_gene_fraction")
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=_by_without,
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot(_diagonal, _diagonal)
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

### sp-101303 (D. scindens)

In [None]:
# Species identifier interpolated into the data paths and used for the
# `species=` selections in the cells that follow.
species_id = "101303"

In [None]:
# Load the fitted world for `species_id`, relabel its samples, subset to
# subjects A/B/H, and plot the community and a position subsample of the
# metagenotype for that subset.
np.random.seed(0)  # fixed seed ahead of the random position subsample below

world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    # Sample labels become "CF_<int>", taking the integer after the first "_".
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    # CF_11 and CF_15 trade labels.
    # NOTE(review): presumably corrects a known sample swap — confirm.
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


subject_abh_sample_list = list(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Cap the subsample at the number of positions actually present, as the
# sp-101338 section of this notebook already does, so that a world with fewer
# than 1000 positions is not asked for more positions than it has.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours keyed by the plotted sample label, one colour per subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# The linkage callbacks ignore their argument and return the precomputed
# linkages, so both plots share the same sample ordering.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Per-species inputs for the COG cells: SPGC strain calls and diagnostics,
# eggNOG annotations, gene depths, and reference prevalences.
spgc_strain_meta = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
)
spgc_strain_meta = spgc_strain_meta.rename(index=str)  # index labels as str

spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)

# Column names supplied by hand; "#" lines in the annotations file are skipped.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
gene_meta = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
    comment="#",
    names=eggnog_column_names,
    index_col="query",
)
# "-" entries are turned into missing values.
gene_meta = gene_meta.rename_axis(index="gene_id").replace({"-": np.nan})

gene_x_cog = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
)
gene_x_cog = gene_x_cog.drop_duplicates().set_index("gene_id").squeeze()

# Long (gene_id, strain) tables pivoted to gene_id rows by strain columns.
spgc_depth_ratio = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
)
spgc_depth_ratio = spgc_depth_ratio["depth"].unstack()
spgc_corr = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
)
spgc_corr = spgc_corr["correlation"].unstack()

# Gene depths get the same relabelling as the world's samples: "CF_<int>",
# then CF_11 and CF_15 trade labels.
gene_depth = xr.load_dataarray(
    f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
).to_pandas()
gene_depth = gene_depth.rename(lambda s: "CF_{}".format(int(s.split("_")[1])))
gene_depth = gene_depth.rename({"CF_11": "CF_15", "CF_15": "CF_11"})

eggnog_prevalence_in_refs = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
)
# cog_id is the part of eggnog_id before the first "@".
eggnog_prevalence_in_refs = eggnog_prevalence_in_refs.assign(
    cog_id=lambda x: x["eggnog_id"].str.split("@").str[0]
)

# Strains passing both thresholds: sum_depth > 1 and species_gene_frac > 0.9.
_deep_enough = spgc_strain_meta["sum_depth"] > 1
_mostly_complete = spgc_strain_meta["species_gene_frac"] > 0.9
high_quality_strain_list = idxwhere(_deep_enough & _mostly_complete)

spgc_strain_meta

In [None]:
# Highest reference prevalence recorded for each COG (ids starting with "COG").
_is_cog = eggnog_prevalence_in_refs["cog_id"].str.startswith("COG")
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs.loc[_is_cog, ["cog_id", "prevalence"]]
    .drop_duplicates()
    .groupby("cog_id")["prevalence"]
    .max()
)

In [None]:
# Hits with species_frac > 0.1, mean_log_ratio > 0.5 and reference
# prevalence < 0.9, ordered by descending species_frac.
_annotated_results = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
_keep = (
    (_annotated_results.species_frac > 0.1)
    & (_annotated_results.mean_log_ratio > 0.5)
    & (_annotated_results.prevalence < 0.9)
    & _annotated_results.hit
)
_annotated_results[_keep].sort_values("species_frac", ascending=False)

In [None]:
# The COG3255 row of the test results, joined with this species' COG
# fraction, the reference prevalence, and the COG metadata.
_annotated_results = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
_annotated_results.loc["COG3255"]

### sp-101493 (s__Clostridium_M bolteae)

In [None]:
# Species identifier interpolated into the data paths and used for the
# `species=` selections in the cells that follow.
species_id = "101493"

In [None]:
# Load the fitted world for `species_id`, relabel its samples, subset to
# subjects A/B/H, and plot the community and a position subsample of the
# metagenotype for that subset.
np.random.seed(0)  # fixed seed ahead of the random position subsample below

world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    # Sample labels become "CF_<int>", taking the integer after the first "_".
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    # CF_11 and CF_15 trade labels.
    # NOTE(review): presumably corrects a known sample swap — confirm.
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


subject_abh_sample_list = list(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Cap the subsample at the number of positions actually present, as the
# sp-101338 section of this notebook already does, so that a world with fewer
# than 1000 positions is not asked for more positions than it has.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours keyed by the plotted sample label, one colour per subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# The linkage callbacks ignore their argument and return the precomputed
# linkages, so both plots share the same sample ordering.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Per-species inputs for the COG cells: SPGC strain calls and diagnostics,
# eggNOG annotations, gene depths, and reference prevalences.
spgc_strain_meta = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
)
spgc_strain_meta = spgc_strain_meta.rename(index=str)  # index labels as str

spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)

# Column names supplied by hand; "#" lines in the annotations file are skipped.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
gene_meta = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
    comment="#",
    names=eggnog_column_names,
    index_col="query",
)
# "-" entries are turned into missing values.
gene_meta = gene_meta.rename_axis(index="gene_id").replace({"-": np.nan})

gene_x_cog = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
)
gene_x_cog = gene_x_cog.drop_duplicates().set_index("gene_id").squeeze()

# Long (gene_id, strain) tables pivoted to gene_id rows by strain columns.
spgc_depth_ratio = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
)
spgc_depth_ratio = spgc_depth_ratio["depth"].unstack()
spgc_corr = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
)
spgc_corr = spgc_corr["correlation"].unstack()

# Gene depths get the same relabelling as the world's samples: "CF_<int>",
# then CF_11 and CF_15 trade labels.
gene_depth = xr.load_dataarray(
    f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
).to_pandas()
gene_depth = gene_depth.rename(lambda s: "CF_{}".format(int(s.split("_")[1])))
gene_depth = gene_depth.rename({"CF_11": "CF_15", "CF_15": "CF_11"})

eggnog_prevalence_in_refs = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
)
# cog_id is the part of eggnog_id before the first "@".
eggnog_prevalence_in_refs = eggnog_prevalence_in_refs.assign(
    cog_id=lambda x: x["eggnog_id"].str.split("@").str[0]
)

# Strains passing both thresholds: sum_depth > 1 and species_gene_frac > 0.9.
_deep_enough = spgc_strain_meta["sum_depth"] > 1
_mostly_complete = spgc_strain_meta["species_gene_frac"] > 0.9
high_quality_strain_list = idxwhere(_deep_enough & _mostly_complete)

spgc_strain_meta

In [None]:
# Highest reference prevalence recorded for each COG (ids starting with "COG").
_is_cog = eggnog_prevalence_in_refs["cog_id"].str.startswith("COG")
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs.loc[_is_cog, ["cog_id", "prevalence"]]
    .drop_duplicates()
    .groupby("cog_id")["prevalence"]
    .max()
)

In [None]:
# Hits with species_frac > 0.1, mean_log_ratio > 0.5 and reference
# prevalence < 0.9, ordered by descending species_frac.
_annotated_results = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
_keep = (
    (_annotated_results.species_frac > 0.1)
    & (_annotated_results.mean_log_ratio > 0.5)
    & (_annotated_results.prevalence < 0.9)
    & _annotated_results.hit
)
_annotated_results[_keep].sort_values("species_frac", ascending=False)

In [None]:
# The COG3255 row of the test results, joined with this species' COG
# fraction, the reference prevalence, and the COG metadata.
_annotated_results = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
_annotated_results.loc["COG3255"]

#### COG3255

In [None]:
# COG selected by the `cog=` / `== cog_id` lookups in the cells that follow.
cog_id = "COG3255"

In [None]:
# Two histograms on shared log-spaced bins: the column means of the
# transposed depth table, then the values of the `cog_id` column.
d = (
    species_specific_normalized_cog_depth_by_sample.sel(species=species_id)
    .to_pandas()
    .transpose()
)

bins = np.logspace(-3, 4)
for _values in (d.mean(), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# COG depth (or detection limit) relative to the species depth, plotted as
# COG depth against species depth on log-log axes and coloured by the ratio.
_species_depth = motu_depth2.sel(species=species_id)
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _cog_depth / _species_depth

plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Strain-by-gene calls for the `cog_id` genes; rows with any missing value are dropped.
spgc.reindex(gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Split the high-quality strains by whether any gene assigned to `cog_id` has
# the value 1 in the strain-by-gene table.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
# Computed once and reused for both lists.  Genes missing from `spgc` reindex
# to NaN and therefore never compare equal to 1.
_strain_has_gene = (spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1).any()
strain_with_gene_list = idxwhere(_strain_has_gene)
strain_without_gene_list = idxwhere(~_strain_has_gene)

# Community fractions with the strain labels cast to str.
frac = world.community.to_pandas().rename(columns=str)
# Every strain that is in neither gene list, led by "-1".  sorted() replaces
# list(set(...)): set iteration order for strings changes between interpreter
# sessions (hash randomisation), which reshuffled the stacking order and the
# palette assignment of these strains from run to run.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Column order used by the stacked bar plots: with gene, without gene, unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette across the three strain groups: "autumn" for strains with the
# gene, "winter" for strains without it, "Greys" for the rest.  Each later
# call receives the palette built so far through `extend`.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _group, _cmap_name in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _group, cm=_cmap_name, extend=strain_by_cog_palette
    )

In [None]:
# Stacked bars of strain fractions for the first three subjects, with the
# normalized COG depth and the species relative abundance on twin y-axes.
_sort_keys = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]

_meta = sample.loc[motu_depth2.sample]
# Samples without a row in `frac` are filled with zeros.
_frac = frac.reindex(_meta.index, fill_value=0)
# "-1" becomes whatever fraction the other strain columns do not account for.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

d0 = _meta.join(_frac)
# Both series are aligned on the sample index, so this must precede set_index.
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_keys)

_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance, with its spine pushed
    # outwards so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Same stacked-bar view as the three-subject figure, restricted to EEN and
# PostEEN samples and drawn for every subject in `subject_order`.
_sort_keys = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]

_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
# Samples without a row in `frac` are filled with zeros.
_frac = frac.reindex(_meta.index, fill_value=0)
# "-1" becomes whatever fraction the other strain columns do not account for.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

d0 = _meta.join(_frac)
# Both series are aligned on the sample index, so this must precede set_index.
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_keys)

_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance, with its spine pushed
    # outwards so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# eggNOG annotations for the genes assigned to `cog_id`, keeping the first row
# for each distinct seed ortholog.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
gene_meta.reindex(_cog_gene_ids).drop_duplicates(subset=["seed_ortholog"])

In [None]:
# Strain depth ratios for the `cog_id` genes; rows with any missing value are dropped.
spgc_depth_ratio.reindex(gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Strain correlations for the `cog_id` genes; rows with any missing value are dropped.
spgc_corr.reindex(gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Gene depth divided by the depth of this species in `motu_depth2`.
# NOTE(review): assumes `gene_depth` is a (sample x gene_id) frame so that the
# stacked array shares its sample dimension with `motu_depth2` — confirm.
normalized_gene_depth = gene_depth.stack().to_xarray() / motu_depth2.sel(
    species=species_id
)

In [None]:
# Per-sample table of strain-group fractions, summed depth of the `cog_id`
# genes, and species depth, followed by two gene-depth vs species-depth plots.
_community = world.community.to_pandas().rename(columns=str)
_cog_genes_with_depth = list(
    set(gene_x_cog[lambda x: x == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
_cog_gene_depth = (
    gene_depth.stack().to_xarray().sel(gene_id=_cog_genes_with_depth).to_pandas()
)

d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        "gene_depth": _cog_gene_depth.sum(axis=1),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
)
# Rows with no strain fractions get 0 for both fraction columns.
d = d.fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Coloured by the summed fraction of strains with the gene; sorting makes the
# highest fractions the last points drawn.
fig = plt.figure()
_by_with = d.sort_values("with_gene_fraction")
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    data=_by_with,
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
_diagonal = [1, 1000]  # y = x reference line
plt.plot(_diagonal, _diagonal)
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same plot coloured by the summed fraction of strains without the gene.
fig = plt.figure()
_by_without = d.sort_values("without_gene_fraction")
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=_by_without,
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot(_diagonal, _diagonal)
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

#### COG5585

In [None]:
# COG selected by the `cog=` / `== cog_id` lookups in the cells that follow.
cog_id = "COG5585"

In [None]:
# Two histograms on shared log-spaced bins: the column means of the
# transposed depth table, then the values of the `cog_id` column.
d = (
    species_specific_normalized_cog_depth_by_sample.sel(species=species_id)
    .to_pandas()
    .transpose()
)

bins = np.logspace(-3, 4)
for _values in (d.mean(), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# COG depth (or detection limit) relative to the species depth, plotted as
# COG depth against species depth on log-log axes and coloured by the ratio.
_species_depth = motu_depth2.sel(species=species_id)
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _cog_depth / _species_depth

plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Strain-by-gene calls for the `cog_id` genes; rows with any missing value are dropped.
spgc.reindex(gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Split the high-quality strains by whether any gene assigned to `cog_id` has
# the value 1 in the strain-by-gene table.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
# Computed once and reused for both lists.  Genes missing from `spgc` reindex
# to NaN and therefore never compare equal to 1.
_strain_has_gene = (spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1).any()
strain_with_gene_list = idxwhere(_strain_has_gene)
strain_without_gene_list = idxwhere(~_strain_has_gene)

# Community fractions with the strain labels cast to str.
frac = world.community.to_pandas().rename(columns=str)
# Every strain that is in neither gene list, led by "-1".  sorted() replaces
# list(set(...)): set iteration order for strings changes between interpreter
# sessions (hash randomisation), which reshuffled the stacking order and the
# palette assignment of these strains from run to run.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Column order used by the stacked bar plots: with gene, without gene, unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette across the three strain groups: "autumn" for strains with the
# gene, "winter" for strains without it, "Greys" for the rest.  Each later
# call receives the palette built so far through `extend`.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _group, _cmap_name in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _group, cm=_cmap_name, extend=strain_by_cog_palette
    )

In [None]:
# Stacked bars of strain fractions for the first three subjects, with the
# normalized COG depth and the species relative abundance on twin y-axes.
_sort_keys = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]

_meta = sample.loc[motu_depth2.sample]
# Samples without a row in `frac` are filled with zeros.
_frac = frac.reindex(_meta.index, fill_value=0)
# "-1" becomes whatever fraction the other strain columns do not account for.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

d0 = _meta.join(_frac)
# Both series are aligned on the sample index, so this must precede set_index.
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_keys)

_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance, with its spine pushed
    # outwards so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Same stacked-bar view as the three-subject figure, restricted to EEN and
# PostEEN samples and drawn for every subject in `subject_order`.
_sort_keys = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]

_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
# Samples without a row in `frac` are filled with zeros.
_frac = frac.reindex(_meta.index, fill_value=0)
# "-1" becomes whatever fraction the other strain columns do not account for.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

d0 = _meta.join(_frac)
# Both series are aligned on the sample index, so this must precede set_index.
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_keys)

_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance, with its spine pushed
    # outwards so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# eggNOG annotations for the genes assigned to `cog_id`, keeping the first row
# for each distinct seed ortholog.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
gene_meta.reindex(_cog_gene_ids).drop_duplicates(subset=["seed_ortholog"])

In [None]:
# Strain depth ratios for the `cog_id` genes; rows with any missing value are dropped.
spgc_depth_ratio.reindex(gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Strain correlations for the `cog_id` genes; rows with any missing value are dropped.
spgc_corr.reindex(gene_x_cog[gene_x_cog == cog_id].index).dropna()

In [None]:
# Gene depth divided by the depth of this species in `motu_depth2`.
# NOTE(review): assumes `gene_depth` is a (sample x gene_id) frame so that the
# stacked array shares its sample dimension with `motu_depth2` — confirm.
normalized_gene_depth = gene_depth.stack().to_xarray() / motu_depth2.sel(
    species=species_id
)

In [None]:
# Per-sample table of strain-group fractions, summed depth of the `cog_id`
# genes, and species depth, followed by two gene-depth vs species-depth plots.
_community = world.community.to_pandas().rename(columns=str)
_cog_genes_with_depth = list(
    set(gene_x_cog[lambda x: x == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
_cog_gene_depth = (
    gene_depth.stack().to_xarray().sel(gene_id=_cog_genes_with_depth).to_pandas()
)

d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        "gene_depth": _cog_gene_depth.sum(axis=1),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
)
# Rows with no strain fractions get 0 for both fraction columns.
d = d.fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Coloured by the summed fraction of strains with the gene; sorting makes the
# highest fractions the last points drawn.
fig = plt.figure()
_by_with = d.sort_values("with_gene_fraction")
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    data=_by_with,
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
_diagonal = [1, 1000]  # y = x reference line
plt.plot(_diagonal, _diagonal)
plt.colorbar(label="Strain With Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same plot coloured by the summed fraction of strains without the gene.
fig = plt.figure()
_by_without = d.sort_values("without_gene_fraction")
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=_by_without,
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot(_diagonal, _diagonal)
plt.colorbar(label="Strain Without Gene Fraction")
plt.yscale("symlog", linthresh=1e-1)
plt.xscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

### sp-101338 (s__Blautia_A wexlerae)

In [None]:
# Species identifier interpolated into the data paths and used for the
# `species=` selections in the cells that follow.
species_id = "101338"

In [None]:
# Load the fitted world for `species_id`, relabel its samples, subset to
# subjects A/B/H, and plot the community and a position subsample of the
# metagenotype for that subset.
np.random.seed(0)

_world_path = f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
world = sf.data.World.load(_world_path)
# Sample labels become "CF_<int>" (the integer after the first "_"), then
# CF_11 and CF_15 trade labels.
world = world.rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
world = world.rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
world = world.drop_low_abundance_strains(0.01)

_abh_samples = idxwhere(sample.subject_id.isin(["A", "B", "H"]))
subject_abh_sample_list = list(set(_abh_samples) & set(world.sample.values))
world_subject_abh = world.sel(sample=subject_abh_sample_list)
world_subject_abh = world_subject_abh.drop_low_abundance_strains(0.05)

# At most 1000 positions, fewer when the world has fewer.
_n_positions = min(1000, world.sizes["position"])
position_ss = world.random_sample(position=_n_positions).position

sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
_abh_position_ss = world_subject_abh.sel(position=position_ss)
subject_abh_position_ss_linkage = _abh_position_ss.genotype.linkage("position")

# Column colours keyed by the plotted sample label, one colour per subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)

# The linkage callbacks ignore their argument and return the precomputed
# linkages, so both plots share the same sample ordering.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_colors=_col_colors,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_colors=_col_colors,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
)

In [None]:
# Per-species inputs for the COG cells: SPGC strain calls and diagnostics,
# eggNOG annotations, gene depths, and reference prevalences.
spgc_strain_meta = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
)
spgc_strain_meta = spgc_strain_meta.rename(index=str)  # index labels as str

spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)

# Column names supplied by hand; "#" lines in the annotations file are skipped.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
gene_meta = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
    comment="#",
    names=eggnog_column_names,
    index_col="query",
)
# "-" entries are turned into missing values.
gene_meta = gene_meta.rename_axis(index="gene_id").replace({"-": np.nan})

gene_x_cog = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
)
gene_x_cog = gene_x_cog.drop_duplicates().set_index("gene_id").squeeze()

# Long (gene_id, strain) tables pivoted to gene_id rows by strain columns.
spgc_depth_ratio = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
)
spgc_depth_ratio = spgc_depth_ratio["depth"].unstack()
spgc_corr = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
)
spgc_corr = spgc_corr["correlation"].unstack()

# Gene depths get the same relabelling as the world's samples: "CF_<int>",
# then CF_11 and CF_15 trade labels.
gene_depth = xr.load_dataarray(
    f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
).to_pandas()
gene_depth = gene_depth.rename(lambda s: "CF_{}".format(int(s.split("_")[1])))
gene_depth = gene_depth.rename({"CF_11": "CF_15", "CF_15": "CF_11"})

eggnog_prevalence_in_refs = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
)
# cog_id is the part of eggnog_id before the first "@".
eggnog_prevalence_in_refs = eggnog_prevalence_in_refs.assign(
    cog_id=lambda x: x["eggnog_id"].str.split("@").str[0]
)

# Strains passing both thresholds: sum_depth > 1 and species_gene_frac > 0.9.
_deep_enough = spgc_strain_meta["sum_depth"] > 1
_mostly_complete = spgc_strain_meta["species_gene_frac"] > 0.9
high_quality_strain_list = idxwhere(_deep_enough & _mostly_complete)

spgc_strain_meta

In [None]:
# Highest reference prevalence recorded for each COG, using only the rows
# whose derived `cog_id` starts with "COG".
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs.loc[
        lambda x: x.cog_id.str.startswith("COG"), ["cog_id", "prevalence"]
    ]
    .drop_duplicates()
    .groupby("cog_id")["prevalence"]
    .max()
)

In [None]:
# Candidate COGs for this species: test results joined with reference
# prevalence and COG metadata, filtered on the thresholds below and listed by
# descending species fraction.
(
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
    .loc[
        lambda x: (x.species_frac > 0.1)
        & (x.mean_log_ratio > 0.5)
        & (x.prevalence < 0.9)
        & x.hit
    ]
    .sort_values(by="species_frac", ascending=False)
)

#### COG4878

In [None]:
# COG examined by the remaining cells of this subsection.
cog_id = "COG4878"

In [None]:
# Normalized COG depths for the current species, transposed after conversion
# to pandas so that `d[cog_id]` selects the column for a single COG.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().transpose()

# Two histograms on shared log-spaced bins: the column means of `d`, then the
# values for the COG of interest.
bins = np.logspace(start=-3, stop=4)
plt.hist(x=d.mean(), bins=bins)
plt.hist(x=d[cog_id], bins=bins)
plt.xscale("log")

In [None]:
# Depth of the selected COG divided by the depth of the species.
normalized_cog_depth = (
    cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
    / motu_depth2.sel(species=species_id)
)

# Species depth (x) against COG depth (y) on log-log axes, colored by the ratio.
plt.scatter(
    x=motu_depth2.sel(species=species_id),
    y=cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id),
    c=normalized_cog_depth,
)
plt.yscale("log")
plt.xscale("log")

In [None]:
# Rows of `spgc` for the genes assigned to this COG; genes absent from `spgc`
# (all-NaN after the reindex) and any other row containing NaN are dropped.
spgc.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# For each high-quality strain: does `spgc` equal 1 for at least one gene
# assigned to `cog_id`? Computed once and reused for both lists below.
_cog_gene_index = gene_x_cog[lambda x: x == cog_id].index
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_index) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# Community strains outside the high-quality set (gene content not assessed),
# with the "-1" column listed first. The remainder is sorted because set
# iteration order for strings changes between interpreter sessions, which
# would otherwise reshuffle the stacking order and palette on every restart.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plot order: strains with the gene, strains without it, then the unknowns.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette spanning the three strain groups: "autumn" colors for strains
# with the gene, then extended with "winter" for strains without it and
# "Greys" for the strains of unknown status.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Metadata for the samples indexed by `motu_depth2`, with strain fractions
# reindexed to match (samples absent from `frac` get zeros).
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# Reset "-1" to whatever is left after summing every other strain column, so
# that each row of `_frac` totals 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

# One row per sample (labelled by `fuller_label`) carrying metadata, strain
# fractions, species relative abundance and the normalized COG depth.
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject, for the first three entries of `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars of strain fractions, one bar per sample.
    d1[strain_list].plot.bar(
        ax=ax,
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Normalized COG depth on a second y-axis.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Species relative abundance on a third y-axis, with its right spine moved
    # outward to x = 1.04 in axes coordinates.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Metadata for the samples indexed by `motu_depth2`, restricted to the "EEN"
# and "PostEEN" values of `diet_or_media`, with strain fractions reindexed to
# match (samples absent from `frac` get zeros).
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# Reset "-1" to whatever is left after summing every other strain column, so
# that each row of `_frac` totals 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

# One row per sample (labelled by `fuller_label`) carrying metadata, strain
# fractions, species relative abundance and the normalized COG depth.
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject, for every entry of `subject_order`.
# NOTE(review): the meaning of the first `subplots_grid` argument (6) is not
# visible here — confirm against lib.plot.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars of strain fractions, one bar per sample.
    d1[strain_list].plot.bar(
        ax=ax,
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Normalized COG depth on a second y-axis.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Species relative abundance on a third y-axis, with its right spine moved
    # outward to x = 1.15 in axes coordinates.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# eggNOG annotations for this COG's genes, keeping the first row for each
# distinct `seed_ortholog` value.
gene_meta.reindex(gene_x_cog[lambda x: x == cog_id].index).drop_duplicates(
    subset=["seed_ortholog"]
)

In [None]:
# Gene-by-strain depth ratios for this COG's genes; rows containing any NaN
# (including genes absent from the table) are dropped.
spgc_depth_ratio.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Gene-by-strain correlations for this COG's genes; rows containing any NaN
# (including genes absent from the table) are dropped.
spgc_corr.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Depth of every gene (stacked and converted to an xarray array) divided by
# the depth of the current species.
normalized_gene_depth = gene_depth.stack().to_xarray() / motu_depth2.sel(
    species=species_id
)

In [None]:
# Per-sample table: summed community fraction of the strains with / without
# the gene, summed depth over this COG's genes, and depth of the species.
# Missing fraction values are filled with 0.
_community = world.community.to_pandas().rename(columns=str)
_cog_genes_with_depth = list(
    set(gene_x_cog[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(gene_id=_cog_genes_with_depth)
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Species depth vs. gene depth, colored by the fraction of strains with the
# gene; rows are sorted by that fraction before plotting and the line marks
# gene depth == species depth.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain With Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, colored by the fraction of strains without the gene.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain Without Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

#### COG4267

In [None]:
# COG examined by the remaining cells of this subsection.
cog_id = "COG4267"

In [None]:
# Normalized COG depths for the current species, transposed after conversion
# to pandas so that `d[cog_id]` selects the column for a single COG.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().transpose()

# Two histograms on shared log-spaced bins: the column means of `d`, then the
# values for the COG of interest.
bins = np.logspace(start=-3, stop=4)
plt.hist(x=d.mean(), bins=bins)
plt.hist(x=d[cog_id], bins=bins)
plt.xscale("log")

In [None]:
# Depth of the selected COG divided by the depth of the species.
normalized_cog_depth = (
    cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
    / motu_depth2.sel(species=species_id)
)

# Species depth (x) against COG depth (y) on log-log axes, colored by the ratio.
plt.scatter(
    x=motu_depth2.sel(species=species_id),
    y=cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id),
    c=normalized_cog_depth,
)
plt.yscale("log")
plt.xscale("log")

In [None]:
# Rows of `spgc` for the genes assigned to this COG; genes absent from `spgc`
# (all-NaN after the reindex) and any other row containing NaN are dropped.
spgc.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# For each high-quality strain: does `spgc` equal 1 for at least one gene
# assigned to `cog_id`? Computed once and reused for both lists below.
_cog_gene_index = gene_x_cog[lambda x: x == cog_id].index
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_index) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# Community strains outside the high-quality set (gene content not assessed),
# with the "-1" column listed first. The remainder is sorted because set
# iteration order for strings changes between interpreter sessions, which
# would otherwise reshuffle the stacking order and palette on every restart.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plot order: strains with the gene, strains without it, then the unknowns.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette spanning the three strain groups: "autumn" colors for strains
# with the gene, then extended with "winter" for strains without it and
# "Greys" for the strains of unknown status.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Metadata for the samples indexed by `motu_depth2`, with strain fractions
# reindexed to match (samples absent from `frac` get zeros).
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# Reset "-1" to whatever is left after summing every other strain column, so
# that each row of `_frac` totals 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

# One row per sample (labelled by `fuller_label`) carrying metadata, strain
# fractions, species relative abundance and the normalized COG depth.
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject, for the first three entries of `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars of strain fractions, one bar per sample.
    d1[strain_list].plot.bar(
        ax=ax,
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Normalized COG depth on a second y-axis.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Species relative abundance on a third y-axis, with its right spine moved
    # outward to x = 1.04 in axes coordinates.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Metadata for the samples indexed by `motu_depth2`, restricted to the "EEN"
# and "PostEEN" values of `diet_or_media`, with strain fractions reindexed to
# match (samples absent from `frac` get zeros).
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# Reset "-1" to whatever is left after summing every other strain column, so
# that each row of `_frac` totals 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

# One row per sample (labelled by `fuller_label`) carrying metadata, strain
# fractions, species relative abundance and the normalized COG depth.
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject, for every entry of `subject_order`.
# NOTE(review): the meaning of the first `subplots_grid` argument (6) is not
# visible here — confirm against lib.plot.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars of strain fractions, one bar per sample.
    d1[strain_list].plot.bar(
        ax=ax,
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Normalized COG depth on a second y-axis.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Species relative abundance on a third y-axis, with its right spine moved
    # outward to x = 1.15 in axes coordinates.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# eggNOG annotations for this COG's genes, keeping the first row for each
# distinct `seed_ortholog` value.
gene_meta.reindex(gene_x_cog[lambda x: x == cog_id].index).drop_duplicates(
    subset=["seed_ortholog"]
)

In [None]:
# Gene-by-strain depth ratios for this COG's genes; rows containing any NaN
# (including genes absent from the table) are dropped.
spgc_depth_ratio.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Gene-by-strain correlations for this COG's genes; rows containing any NaN
# (including genes absent from the table) are dropped.
spgc_corr.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Depth of every gene (stacked and converted to an xarray array) divided by
# the depth of the current species.
normalized_gene_depth = gene_depth.stack().to_xarray() / motu_depth2.sel(
    species=species_id
)

In [None]:
# Per-sample table: summed community fraction of the strains with / without
# the gene, summed depth over this COG's genes, and depth of the species.
# Missing fraction values are filled with 0.
_community = world.community.to_pandas().rename(columns=str)
_cog_genes_with_depth = list(
    set(gene_x_cog[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(gene_id=_cog_genes_with_depth)
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Species depth vs. gene depth, colored by the fraction of strains with the
# gene; rows are sorted by that fraction before plotting and the line marks
# gene depth == species depth.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain With Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, colored by the fraction of strains without the gene.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain Without Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

#### COG5585

In [None]:
# COG examined by the remaining cells of this subsection.
cog_id = "COG5585"

In [None]:
# Normalized COG depths for the current species, transposed after conversion
# to pandas so that `d[cog_id]` selects the column for a single COG.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().transpose()

# Two histograms on shared log-spaced bins: the column means of `d`, then the
# values for the COG of interest.
bins = np.logspace(start=-3, stop=4)
plt.hist(x=d.mean(), bins=bins)
plt.hist(x=d[cog_id], bins=bins)
plt.xscale("log")

In [None]:
# Depth of the selected COG divided by the depth of the species.
normalized_cog_depth = (
    cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
    / motu_depth2.sel(species=species_id)
)

# Species depth (x) against COG depth (y) on log-log axes, colored by the ratio.
plt.scatter(
    x=motu_depth2.sel(species=species_id),
    y=cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id),
    c=normalized_cog_depth,
)
plt.yscale("log")
plt.xscale("log")

In [None]:
# Rows of `spgc` for the genes assigned to this COG; genes absent from `spgc`
# (all-NaN after the reindex) and any other row containing NaN are dropped.
spgc.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# For each high-quality strain: does `spgc` equal 1 for at least one gene
# assigned to `cog_id`? Computed once and reused for both lists below.
_cog_gene_index = gene_x_cog[lambda x: x == cog_id].index
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_index) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# Community strains outside the high-quality set (gene content not assessed),
# with the "-1" column listed first. The remainder is sorted because set
# iteration order for strings changes between interpreter sessions, which
# would otherwise reshuffle the stacking order and palette on every restart.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plot order: strains with the gene, strains without it, then the unknowns.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette spanning the three strain groups: "autumn" colors for strains
# with the gene, then extended with "winter" for strains without it and
# "Greys" for the strains of unknown status.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Metadata for the samples indexed by `motu_depth2`, with strain fractions
# reindexed to match (samples absent from `frac` get zeros).
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# Reset "-1" to whatever is left after summing every other strain column, so
# that each row of `_frac` totals 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

# One row per sample (labelled by `fuller_label`) carrying metadata, strain
# fractions, species relative abundance and the normalized COG depth.
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject, for the first three entries of `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars of strain fractions, one bar per sample.
    d1[strain_list].plot.bar(
        ax=ax,
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Normalized COG depth on a second y-axis.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Species relative abundance on a third y-axis, with its right spine moved
    # outward to x = 1.04 in axes coordinates.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Metadata for the samples indexed by `motu_depth2`, restricted to the "EEN"
# and "PostEEN" values of `diet_or_media`, with strain fractions reindexed to
# match (samples absent from `frac` get zeros).
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# Reset "-1" to whatever is left after summing every other strain column, so
# that each row of `_frac` totals 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

# One row per sample (labelled by `fuller_label`) carrying metadata, strain
# fractions, species relative abundance and the normalized COG depth.
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject, for every entry of `subject_order`.
# NOTE(review): the meaning of the first `subplots_grid` argument (6) is not
# visible here — confirm against lib.plot.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars of strain fractions, one bar per sample.
    d1[strain_list].plot.bar(
        ax=ax,
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Normalized COG depth on a second y-axis.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Species relative abundance on a third y-axis, with its right spine moved
    # outward to x = 1.15 in axes coordinates.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# eggNOG annotations for this COG's genes, keeping the first row for each
# distinct `seed_ortholog` value.
gene_meta.reindex(gene_x_cog[lambda x: x == cog_id].index).drop_duplicates(
    subset=["seed_ortholog"]
)

In [None]:
# Gene-by-strain depth ratios for this COG's genes; rows containing any NaN
# (including genes absent from the table) are dropped.
spgc_depth_ratio.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Gene-by-strain correlations for this COG's genes; rows containing any NaN
# (including genes absent from the table) are dropped.
spgc_corr.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Depth of every gene (stacked and converted to an xarray array) divided by
# the depth of the current species.
normalized_gene_depth = gene_depth.stack().to_xarray() / motu_depth2.sel(
    species=species_id
)

In [None]:
# Per-sample table: summed community fraction of the strains with / without
# the gene, summed depth over this COG's genes, and depth of the species.
# Missing fraction values are filled with 0.
_community = world.community.to_pandas().rename(columns=str)
_cog_genes_with_depth = list(
    set(gene_x_cog[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(gene_id=_cog_genes_with_depth)
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Species depth vs. gene depth, colored by the fraction of strains with the
# gene; rows are sorted by that fraction before plotting and the line marks
# gene depth == species depth.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain With Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, colored by the fraction of strains without the gene.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain Without Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

### sp-101367 (s__Clostridium_Q symbiosum)

In [None]:
# Species examined by the cells that follow; it is interpolated into the data
# paths and used for the `.sel(species=...)` lookups.
species_id = "101367"

In [None]:
# Seed NumPy's global RNG; presumably this makes `world.random_sample` below
# repeatable — TODO confirm it draws from np.random.
np.random.seed(0)

world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    # Rewrite sample names as "CF_<int>" from the second "_"-separated field.
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    # Exchange the labels CF_11 and CF_15.
    # NOTE(review): presumably corrects a sample-label swap — confirm.
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples from subjects A, B and H that are present in `world`. Sorted because
# set iteration order for strings changes between interpreter sessions; an
# unsorted list would make the sample order fed to the linkages below differ
# from run to run despite the fixed seed.
subject_abh_sample_list = sorted(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subset of at most 1000 positions, used for the metagenotype plot.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

# Linkages are computed once here and handed to the plotting calls through the
# `*_linkage_func` lambdas, which ignore their argument.
sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colors keyed by `fuller_label`, taken from each sample's subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# SPGC outputs for this species: per-strain metadata (index cast to str) and
# the gene table indexed by gene_id.
spgc_strain_meta = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
)
spgc_strain_meta = spgc_strain_meta.rename(index=str)
spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)

# Column names for the eggNOG-mapper annotation table; they are passed as
# `names=` below, where lines starting with "#" are skipped.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)

# eggNOG annotations indexed by gene_id; "-" placeholders become NaN.
gene_meta = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
    names=eggnog_column_names,
    index_col="query",
    comment="#",
)
gene_meta = gene_meta.rename_axis(index="gene_id").replace({"-": np.nan})

# gene_id -> COG lookup. `squeeze()` is relied on to yield a Series
# (assumes one column remains besides gene_id — TODO confirm).
gene_x_cog = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
).drop_duplicates()
gene_x_cog = gene_x_cog.set_index("gene_id").squeeze()

# Long-format (gene_id, strain) tables pivoted to gene-by-strain matrices.
spgc_depth_ratio = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
)["depth"].unstack()
spgc_corr = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
)["correlation"].unstack()

# Gene depth table; row labels are rewritten as "CF_<int>" and the labels
# CF_11 / CF_15 are exchanged.
# NOTE(review): the exchange presumably corrects a sample-label swap — confirm.
gene_depth = xr.load_dataarray(
    f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
).to_pandas()
gene_depth = gene_depth.rename(index=lambda s: "CF_{}".format(int(s.split("_")[1])))
gene_depth = gene_depth.rename(index={"CF_11": "CF_15", "CF_15": "CF_11"})

# Reference prevalence per eggNOG id; `cog_id` is the part before the first "@".
eggnog_prevalence_in_refs = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
)
eggnog_prevalence_in_refs = eggnog_prevalence_in_refs.assign(
    cog_id=lambda x: x.eggnog_id.str.split("@").str[0]
)

# Strains passing both QC thresholds: sum_depth > 1 and species_gene_frac > 0.9.
high_quality_strain_list = idxwhere(
    (spgc_strain_meta["sum_depth"] > 1) & (spgc_strain_meta["species_gene_frac"] > 0.9)
)

spgc_strain_meta

In [None]:
# Highest reference prevalence recorded for each COG, using only the rows
# whose derived `cog_id` starts with "COG".
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs.loc[
        lambda x: x.cog_id.str.startswith("COG"), ["cog_id", "prevalence"]
    ]
    .drop_duplicates()
    .groupby("cog_id")["prevalence"]
    .max()
)

In [None]:
# Candidate COGs for this species: test results joined with reference
# prevalence and COG metadata, filtered on species fraction, mean log ratio
# and p-value, and listed by descending species fraction. The reference
# prevalence filter is deliberately left disabled in this view.
(
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
    .loc[
        lambda x: (x.species_frac > 0.1)
        & (x.mean_log_ratio > 0.5)
        # & (x.prevalence < 0.9)
        & (x.pval < 0.05)
    ]
    .sort_values(by="species_frac", ascending=False)
)

In [None]:
# Descriptions of the candidate COGs that also have reference prevalence
# below 0.9, ordered by descending species fraction.
(
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
    .loc[
        lambda x: (x.species_frac > 0.1)
        & (x.mean_log_ratio > 0.5)
        & (x.prevalence < 0.9)
        & (x.pval < 0.05)
    ]
    .sort_values(by="species_frac", ascending=False)
    .description.values
)

#### COG1353

In [None]:
# COG examined by the remaining cells of this subsection.
cog_id = "COG1353"

In [None]:
# Normalized COG depths for the current species, transposed after conversion
# to pandas so that `d[cog_id]` selects the column for a single COG.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().transpose()

# Two histograms on shared log-spaced bins: the column means of `d`, then the
# values for the COG of interest.
bins = np.logspace(start=-3, stop=4)
plt.hist(x=d.mean(), bins=bins)
plt.hist(x=d[cog_id], bins=bins)
plt.xscale("log")

In [None]:
# Depth of the selected COG divided by the depth of the species.
normalized_cog_depth = (
    cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
    / motu_depth2.sel(species=species_id)
)

# Species depth (x) against COG depth (y) on log-log axes, colored by the ratio.
plt.scatter(
    x=motu_depth2.sel(species=species_id),
    y=cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id),
    c=normalized_cog_depth,
)
plt.yscale("log")
plt.xscale("log")

In [None]:
# Rows of `spgc` for the genes assigned to this COG; genes absent from `spgc`
# (all-NaN after the reindex) and any other row containing NaN are dropped.
spgc.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# For each high-quality strain: does `spgc` equal 1 for at least one gene
# assigned to `cog_id`? Computed once and reused for both lists below.
_cog_gene_index = gene_x_cog[lambda x: x == cog_id].index
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_index) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# Community strains outside the high-quality set (gene content not assessed),
# with the "-1" column listed first. The remainder is sorted because set
# iteration order for strings changes between interpreter sessions, which
# would otherwise reshuffle the stacking order and palette on every restart.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plot order: strains with the gene, strains without it, then the unknowns.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette spanning the three strain groups: "autumn" colors for strains
# with the gene, then extended with "winter" for strains without it and
# "Greys" for the strains of unknown status.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Metadata for the samples indexed by `motu_depth2`, with strain fractions
# reindexed to match (samples absent from `frac` get zeros).
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# Reset "-1" to whatever is left after summing every other strain column, so
# that each row of `_frac` totals 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

# One row per sample (labelled by `fuller_label`) carrying metadata, strain
# fractions, species relative abundance and the normalized COG depth.
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject, for the first three entries of `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars of strain fractions, one bar per sample.
    d1[strain_list].plot.bar(
        ax=ax,
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Normalized COG depth on a second y-axis.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Species relative abundance on a third y-axis, with its right spine moved
    # outward to x = 1.04 in axes coordinates.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Metadata for the samples indexed by `motu_depth2`, restricted to the "EEN"
# and "PostEEN" values of `diet_or_media`, with strain fractions reindexed to
# match (samples absent from `frac` get zeros).
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# Reset "-1" to whatever is left after summing every other strain column, so
# that each row of `_frac` totals 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

# One row per sample (labelled by `fuller_label`) carrying metadata, strain
# fractions, species relative abundance and the normalized COG depth.
d0 = _meta.join(_frac)
d0 = d0.assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# One panel per subject, for every entry of `subject_order`.
# NOTE(review): the meaning of the first `subplots_grid` argument (6) is not
# visible here — confirm against lib.plot.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bars of strain fractions, one bar per sample.
    d1[strain_list].plot.bar(
        ax=ax,
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.legend_.set_visible(False)
    # Normalized COG depth on a second y-axis.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Species relative abundance on a third y-axis, with its right spine moved
    # outward to x = 1.15 in axes coordinates.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# eggNOG annotations for this COG's genes, keeping the first row for each
# distinct `seed_ortholog` value.
gene_meta.reindex(gene_x_cog[lambda x: x == cog_id].index).drop_duplicates(
    subset=["seed_ortholog"]
)

In [None]:
# Gene-by-strain depth ratios for this COG's genes; rows containing any NaN
# (including genes absent from the table) are dropped.
spgc_depth_ratio.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Gene-by-strain correlations for this COG's genes; rows containing any NaN
# (including genes absent from the table) are dropped.
spgc_corr.reindex(gene_x_cog[lambda x: x == cog_id].index).dropna()

In [None]:
# Depth of every gene (stacked and converted to an xarray array) divided by
# the depth of the current species.
normalized_gene_depth = gene_depth.stack().to_xarray() / motu_depth2.sel(
    species=species_id
)

In [None]:
# Per-sample table: summed community fraction of the strains with / without
# the gene, summed depth over this COG's genes, and depth of the species.
# Missing fraction values are filled with 0.
_community = world.community.to_pandas().rename(columns=str)
_cog_genes_with_depth = list(
    set(gene_x_cog[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(gene_id=_cog_genes_with_depth)
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Species depth vs. gene depth, colored by the fraction of strains with the
# gene; rows are sorted by that fraction before plotting and the line marks
# gene depth == species depth.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain With Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same axes, colored by the fraction of strains without the gene.
fig = plt.figure()
plt.scatter(
    "species_depth",
    "gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])
plt.colorbar(label="Strain Without Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

### sp-101386 (s__Clostridium_M clostridioforme)

In [None]:
# Species analysed by the cells that follow; interpolated into the data paths
# as sp-{species_id}.
species_id = "101386"

In [None]:
# Load the fitted world for the current species, derive the subject A/B/H
# subset, and draw its community and metagenotype heatmaps.
np.random.seed(0)  # presumably makes world.random_sample() below reproducible

world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    # Rewrite sample names as "CF_<n>", with <n> the integer parsed from the
    # second "_"-separated field.
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    # Swap the CF_11 and CF_15 labels.
    # NOTE(review): presumably corrects a known sample mix-up -- confirm.
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples of subjects A, B and H that are present in the world.  Sorted so the
# sample order (and anything order-sensitive downstream, e.g. the linkages)
# does not depend on set iteration order, which changes between interpreter
# sessions because of string hash randomisation.
subject_abh_sample_list = sorted(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subset of at most 1000 positions, used for the metagenotype heatmap.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours keyed by fuller_label, one colour per subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# The linkage callbacks ignore their argument and return the linkages
# precomputed above.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# --- SPGC strain-level outputs for the current species ---
_strain_meta_raw = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
)
spgc_strain_meta = _strain_meta_raw.rename(index=str)  # string index labels
spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)

# --- eggNOG-mapper annotations of the pangenome centroids ---
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split()
_gene_meta_raw = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
    comment="#",
    names=eggnog_column_names,
    index_col="query",
)
# "-" entries are turned into NaN.
gene_meta = _gene_meta_raw.rename_axis(index="gene_id").replace({"-": np.nan})

# Gene -> COG assignment, indexed by gene_id.
_gene_x_cog_raw = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
)
gene_x_cog = _gene_x_cog_raw.drop_duplicates().set_index("gene_id").squeeze()

# --- Per-(gene, strain) statistics, reshaped to gene rows x strain columns ---
_depth_ratio_long = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
)
spgc_depth_ratio = _depth_ratio_long["depth"].unstack()
_corr_long = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
)
spgc_corr = _corr_long["correlation"].unstack()

# --- Gene depth, with the same sample relabelling as the world ---
_gene_depth_raw = xr.load_dataarray(
    f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
).to_pandas()
gene_depth = _gene_depth_raw.rename(
    lambda s: f"CF_{int(s.split('_')[1])}"
).rename({"CF_11": "CF_15", "CF_15": "CF_11"})

# --- eggNOG prevalence in references; cog_id is the part before "@" ---
_prevalence_raw = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
)
eggnog_prevalence_in_refs = _prevalence_raw.assign(
    cog_id=lambda x: x.eggnog_id.str.split("@").str[0]
)

# Strains with sum_depth > 1 and species_gene_frac > 0.9.
_deep_enough = spgc_strain_meta.sum_depth > 1
_mostly_complete = spgc_strain_meta.species_gene_frac > 0.9
high_quality_strain_list = idxwhere(_deep_enough & _mostly_complete)

spgc_strain_meta

In [None]:
# Maximum reference prevalence per COG, using only ids that start with "COG".
_is_cog = eggnog_prevalence_in_refs.cog_id.str.startswith("COG")
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs.loc[_is_cog, ["cog_id", "prevalence"]]
    .drop_duplicates()
    .groupby("cog_id")["prevalence"]
    .max()
)

In [None]:
# Test results annotated with this species' COG fraction, the reference
# prevalence and the COG metadata.
_annotated = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
# Keep rows flagged as hits that pass all three thresholds.
_keep = (
    (_annotated.species_frac > 0.1)
    & (_annotated.mean_log_ratio > 0.5)
    & (_annotated.prevalence < 0.9)
    & _annotated.hit
)
_annotated[_keep].sort_values("species_frac", ascending=False)

### sp-100032 (s__Hungatella effluvii)

In [None]:
# Species analysed by the cells that follow; interpolated into the data paths
# as sp-{species_id}.
species_id = "100032"

In [None]:
# Load the fitted world for the current species, derive the subject A/B/H
# subset, and draw its community and metagenotype heatmaps.
np.random.seed(0)  # presumably makes world.random_sample() below reproducible

world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    # Rewrite sample names as "CF_<n>", with <n> the integer parsed from the
    # second "_"-separated field.
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    # Swap the CF_11 and CF_15 labels.
    # NOTE(review): presumably corrects a known sample mix-up -- confirm.
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples of subjects A, B and H that are present in the world.  Sorted so the
# sample order (and anything order-sensitive downstream, e.g. the linkages)
# does not depend on set iteration order, which changes between interpreter
# sessions because of string hash randomisation.
subject_abh_sample_list = sorted(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subset of at most 1000 positions, used for the metagenotype heatmap.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours keyed by fuller_label, one colour per subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# The linkage callbacks ignore their argument and return the linkages
# precomputed above.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# --- SPGC strain-level outputs for the current species ---
_strain_meta_raw = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
)
spgc_strain_meta = _strain_meta_raw.rename(index=str)  # string index labels
spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)

# --- eggNOG-mapper annotations of the pangenome centroids ---
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split()
_gene_meta_raw = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
    comment="#",
    names=eggnog_column_names,
    index_col="query",
)
# "-" entries are turned into NaN.
gene_meta = _gene_meta_raw.rename_axis(index="gene_id").replace({"-": np.nan})

# Gene -> COG assignment, indexed by gene_id.
_gene_x_cog_raw = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
)
gene_x_cog = _gene_x_cog_raw.drop_duplicates().set_index("gene_id").squeeze()

# --- Per-(gene, strain) statistics, reshaped to gene rows x strain columns ---
_depth_ratio_long = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
)
spgc_depth_ratio = _depth_ratio_long["depth"].unstack()
_corr_long = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
)
spgc_corr = _corr_long["correlation"].unstack()

# --- Gene depth, with the same sample relabelling as the world ---
_gene_depth_raw = xr.load_dataarray(
    f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
).to_pandas()
gene_depth = _gene_depth_raw.rename(
    lambda s: f"CF_{int(s.split('_')[1])}"
).rename({"CF_11": "CF_15", "CF_15": "CF_11"})

# --- eggNOG prevalence in references; cog_id is the part before "@" ---
_prevalence_raw = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
)
eggnog_prevalence_in_refs = _prevalence_raw.assign(
    cog_id=lambda x: x.eggnog_id.str.split("@").str[0]
)

# Strains with sum_depth > 1 and species_gene_frac > 0.9.
_deep_enough = spgc_strain_meta.sum_depth > 1
_mostly_complete = spgc_strain_meta.species_gene_frac > 0.9
high_quality_strain_list = idxwhere(_deep_enough & _mostly_complete)

spgc_strain_meta

In [None]:
# Maximum reference prevalence per COG, using only ids that start with "COG".
_is_cog = eggnog_prevalence_in_refs.cog_id.str.startswith("COG")
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs.loc[_is_cog, ["cog_id", "prevalence"]]
    .drop_duplicates()
    .groupby("cog_id")["prevalence"]
    .max()
)

In [None]:
# Test results annotated with this species' COG fraction, the reference
# prevalence and the COG metadata.
_annotated = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
# Keep rows flagged as hits that pass the thresholds.
_keep = (
    (_annotated.species_frac > 0.1)
    & (_annotated.mean_log_ratio > 0.3)
    # & (_annotated.prevalence < 0.9)  # prevalence filter disabled in this cell
    & _annotated.hit
)
_annotated[_keep].sort_values("species_frac", ascending=False)

### sp-100179 (g__Clostridium_M)

In [None]:
# Species analysed by the cells that follow; interpolated into the data paths
# as sp-{species_id}.
species_id = "100179"

In [None]:
# Load the fitted world for the current species, derive the subject A/B/H
# subset, and draw its community and metagenotype heatmaps.
np.random.seed(0)  # presumably makes world.random_sample() below reproducible

world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    # Rewrite sample names as "CF_<n>", with <n> the integer parsed from the
    # second "_"-separated field.
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    # Swap the CF_11 and CF_15 labels.
    # NOTE(review): presumably corrects a known sample mix-up -- confirm.
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples of subjects A, B and H that are present in the world.  Sorted so the
# sample order (and anything order-sensitive downstream, e.g. the linkages)
# does not depend on set iteration order, which changes between interpreter
# sessions because of string hash randomisation.
subject_abh_sample_list = sorted(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subset of at most 1000 positions, used for the metagenotype heatmap.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours keyed by fuller_label, one colour per subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# The linkage callbacks ignore their argument and return the linkages
# precomputed above.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# --- SPGC strain-level outputs for the current species ---
_strain_meta_raw = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
)
spgc_strain_meta = _strain_meta_raw.rename(index=str)  # string index labels
spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)

# --- eggNOG-mapper annotations of the pangenome centroids ---
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split()
_gene_meta_raw = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
    comment="#",
    names=eggnog_column_names,
    index_col="query",
)
# "-" entries are turned into NaN.
gene_meta = _gene_meta_raw.rename_axis(index="gene_id").replace({"-": np.nan})

# Gene -> COG assignment, indexed by gene_id.
_gene_x_cog_raw = pd.read_table(
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
)
gene_x_cog = _gene_x_cog_raw.drop_duplicates().set_index("gene_id").squeeze()

# --- Per-(gene, strain) statistics, reshaped to gene rows x strain columns ---
_depth_ratio_long = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
)
spgc_depth_ratio = _depth_ratio_long["depth"].unstack()
_corr_long = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
)
spgc_corr = _corr_long["correlation"].unstack()

# --- Gene depth, with the same sample relabelling as the world ---
_gene_depth_raw = xr.load_dataarray(
    f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
).to_pandas()
gene_depth = _gene_depth_raw.rename(
    lambda s: f"CF_{int(s.split('_')[1])}"
).rename({"CF_11": "CF_15", "CF_15": "CF_11"})

# --- eggNOG prevalence in references; cog_id is the part before "@" ---
_prevalence_raw = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
)
eggnog_prevalence_in_refs = _prevalence_raw.assign(
    cog_id=lambda x: x.eggnog_id.str.split("@").str[0]
)

# Strains with sum_depth > 1 and species_gene_frac > 0.9.
_deep_enough = spgc_strain_meta.sum_depth > 1
_mostly_complete = spgc_strain_meta.species_gene_frac > 0.9
high_quality_strain_list = idxwhere(_deep_enough & _mostly_complete)

spgc_strain_meta

In [None]:
# Maximum reference prevalence per COG, using only ids that start with "COG".
_is_cog = eggnog_prevalence_in_refs.cog_id.str.startswith("COG")
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs.loc[_is_cog, ["cog_id", "prevalence"]]
    .drop_duplicates()
    .groupby("cog_id")["prevalence"]
    .max()
)

In [None]:
# Test results annotated with this species' COG fraction, the reference
# prevalence and the COG metadata.
_annotated = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
# Keep rows flagged as hits that pass all three thresholds.
_keep = (
    (_annotated.species_frac > 0.05)
    & (_annotated.mean_log_ratio > 0.5)
    & (_annotated.prevalence < 0.9)
    & _annotated.hit
)
_annotated[_keep].sort_values("mean_log_ratio", ascending=False)

In [None]:
# The annotated test-result row for COG3255.
_annotated = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
_annotated.loc["COG3255"]

#### COG3255

In [None]:
# COG examined by the cells that follow (used to select its genes and depths).
cog_id = "COG3255"

In [None]:
# Transposed normalized COG depth table for the current species; it has one
# column per COG, since `cog_id` is looked up as a column below.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().transpose()

# Log-spaced bin edges from 1e-3 to 1e4 (numpy's default of 50 points).
bins = np.logspace(-3, 4)
# First the distribution of per-column means, then the values of `cog_id`.
for _values in (d.mean(), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# Depth of the current COG relative to the depth of the current species.
_species_depth = motu_depth2.sel(species=species_id)
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _cog_depth / _species_depth

# COG depth against species depth on log-log axes, coloured by their ratio.
plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.yscale("log")
plt.xscale("log")

In [None]:
# Rows of `spgc` for the genes assigned to `cog_id`; rows with any missing
# value are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc.reindex(_cog_gene_ids).dropna()

In [None]:
# Classify strains by whether `spgc` marks any gene of the current COG as
# present (== 1) in them.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
# One boolean per high-quality strain.  Genes missing from `spgc` reindex to
# NaN, which never equals 1 and therefore counts as absent.
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# "-1" first, then every other community strain that is in neither list above.
# Sorted because a bare list(set(...)) has an order that changes between
# interpreter sessions (string hash randomisation), which would reshuffle the
# stacking order of the bars and, presumably, the palette assignment.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: with gene, without gene, unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette for all strains: "autumn" for strains with the gene, then
# "winter" for strains without it and "Greys" for the unknown ones, each
# later call extending the palette built so far.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in [
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
]:
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Per-sample table: sample metadata joined with the strain fractions, plus the
# species relative abundance and the normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(_meta.index, fill_value=0)
# Strain "-1" receives whatever fraction the other strains leave over.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)
_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(_sort_columns)
)

# One panel for each of the first three subjects in `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue  # no samples for this subject: leave the panel untouched
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # Second y-axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Third y-axis: species relative abundance, with its spine moved outward
    # so that it does not sit on top of the second axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Per-sample table restricted to EEN / PostEEN samples: sample metadata joined
# with the strain fractions, plus the species relative abundance and the
# normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
_frac = frac.reindex(_meta.index, fill_value=0)
# Strain "-1" receives whatever fraction the other strains leave over.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)
_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(_sort_columns)
)

# One panel per subject in `subject_order`.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue  # no samples for this subject: leave the panel untouched
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # Second y-axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Third y-axis: species relative abundance, with its spine moved outward
    # so that it does not sit on top of the second axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# eggNOG annotation rows for the genes assigned to `cog_id`, keeping the first
# row for each distinct `seed_ortholog` value.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
gene_meta.reindex(_cog_gene_ids).drop_duplicates(subset=["seed_ortholog"])

In [None]:
# Gene-by-strain depth ratios for the genes assigned to `cog_id`; rows with
# any missing value are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_depth_ratio.reindex(_cog_gene_ids).dropna()

In [None]:
# Gene-by-strain correlations for the genes assigned to `cog_id`; rows with
# any missing value are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_corr.reindex(_cog_gene_ids).dropna()

In [None]:
# Gene depth divided by the depth of the current species in `motu_depth2`.
_gene_depth_xr = gene_depth.stack().to_xarray()
_species_depth = motu_depth2.sel(species=species_id)
normalized_gene_depth = _gene_depth_xr / _species_depth

In [None]:
# Per-sample summary for the current COG: summed community fraction of the
# strains with / without the gene, summed depth of the COG's genes, and the
# species depth.
_community = world.community.to_pandas().rename(columns=str)
_cog_gene_ids = list(
    set(gene_x_cog[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
_cog_gene_depth = gene_depth.stack().to_xarray().sel(gene_id=_cog_gene_ids)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(1),
        "gene_depth": _cog_gene_depth.to_pandas().sum(1),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})

# One figure per fraction: gene depth against species depth, coloured by the
# fraction and drawn in ascending order of it.
for _column, _label in [
    ("with_gene_fraction", "Strain With Gene Fraction"),
    ("without_gene_fraction", "Strain Without Gene Fraction"),
]:
    fig = plt.figure()
    plt.scatter(
        "species_depth",
        "gene_depth",
        c=_column,
        data=d.sort_values(_column),
        norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
    )
    plt.plot([1, 1000], [1, 1000])  # y = x reference line
    plt.colorbar(label=_label)
    plt.xscale("symlog", linthresh=1e-1)
    plt.yscale("symlog", linthresh=1e-1)
    plt.xlabel("species depth")
    plt.ylabel("gene depth")

#### COG4938

In [None]:
# COG examined by the cells that follow (used to select its genes and depths).
cog_id = "COG4938"

In [None]:
# Transposed normalized COG depth table for the current species; it has one
# column per COG, since `cog_id` is looked up as a column below.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().transpose()

# Log-spaced bin edges from 1e-3 to 1e4 (numpy's default of 50 points).
bins = np.logspace(-3, 4)
# First the distribution of per-column means, then the values of `cog_id`.
for _values in (d.mean(), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# Depth of the current COG relative to the depth of the current species.
_species_depth = motu_depth2.sel(species=species_id)
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _cog_depth / _species_depth

# COG depth against species depth on log-log axes, coloured by their ratio.
plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.yscale("log")
plt.xscale("log")

In [None]:
# Rows of `spgc` for the genes assigned to `cog_id`; rows with any missing
# value are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc.reindex(_cog_gene_ids).dropna()

In [None]:
# Classify strains by whether `spgc` marks any gene of the current COG as
# present (== 1) in them.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
# One boolean per high-quality strain.  Genes missing from `spgc` reindex to
# NaN, which never equals 1 and therefore counts as absent.
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# "-1" first, then every other community strain that is in neither list above.
# Sorted because a bare list(set(...)) has an order that changes between
# interpreter sessions (string hash randomisation), which would reshuffle the
# stacking order of the bars and, presumably, the palette assignment.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: with gene, without gene, unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette for all strains: "autumn" for strains with the gene, then
# "winter" for strains without it and "Greys" for the unknown ones, each
# later call extending the palette built so far.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in [
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
]:
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Per-sample table: sample metadata joined with the strain fractions, plus the
# species relative abundance and the normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(_meta.index, fill_value=0)
# Strain "-1" receives whatever fraction the other strains leave over.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)
_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(_sort_columns)
)

# One panel for each of the first three subjects in `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue  # no samples for this subject: leave the panel untouched
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # Second y-axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Third y-axis: species relative abundance, with its spine moved outward
    # so that it does not sit on top of the second axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Per-sample table restricted to EEN / PostEEN samples: sample metadata joined
# with the strain fractions, plus the species relative abundance and the
# normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
_frac = frac.reindex(_meta.index, fill_value=0)
# Strain "-1" receives whatever fraction the other strains leave over.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)
_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(_sort_columns)
)

# One panel per subject in `subject_order`.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue  # no samples for this subject: leave the panel untouched
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # Second y-axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Third y-axis: species relative abundance, with its spine moved outward
    # so that it does not sit on top of the second axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# eggNOG annotation rows for the genes assigned to `cog_id`, keeping the first
# row for each distinct `seed_ortholog` value.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
gene_meta.reindex(_cog_gene_ids).drop_duplicates(subset=["seed_ortholog"])

In [None]:
# Gene-by-strain depth ratios for the genes assigned to `cog_id`; rows with
# any missing value are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_depth_ratio.reindex(_cog_gene_ids).dropna()

In [None]:
# Gene-by-strain correlations for the genes assigned to `cog_id`; rows with
# any missing value are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_corr.reindex(_cog_gene_ids).dropna()

In [None]:
# Gene depth divided by the depth of the current species in `motu_depth2`.
_gene_depth_xr = gene_depth.stack().to_xarray()
_species_depth = motu_depth2.sel(species=species_id)
normalized_gene_depth = _gene_depth_xr / _species_depth

In [None]:
# Per-sample summary for the current COG: summed community fraction of the
# strains with / without the gene, summed depth of the COG's genes, and the
# species depth.
_community = world.community.to_pandas().rename(columns=str)
_cog_gene_ids = list(
    set(gene_x_cog[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
_cog_gene_depth = gene_depth.stack().to_xarray().sel(gene_id=_cog_gene_ids)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(1),
        "gene_depth": _cog_gene_depth.to_pandas().sum(1),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})

# One figure per fraction: gene depth against species depth, coloured by the
# fraction and drawn in ascending order of it.
for _column, _label in [
    ("with_gene_fraction", "Strain With Gene Fraction"),
    ("without_gene_fraction", "Strain Without Gene Fraction"),
]:
    fig = plt.figure()
    plt.scatter(
        "species_depth",
        "gene_depth",
        c=_column,
        data=d.sort_values(_column),
        norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
    )
    plt.plot([1, 1000], [1, 1000])  # y = x reference line
    plt.colorbar(label=_label)
    plt.xscale("symlog", linthresh=1e-1)
    plt.yscale("symlog", linthresh=1e-1)
    plt.xlabel("species depth")
    plt.ylabel("gene depth")

#### COG0146

In [None]:
# COG examined by the cells that follow (used to select its genes and depths).
cog_id = "COG0146"

In [None]:
# Transposed normalized COG depth table for the current species; it has one
# column per COG, since `cog_id` is looked up as a column below.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().transpose()

# Log-spaced bin edges from 1e-3 to 1e4 (numpy's default of 50 points).
bins = np.logspace(-3, 4)
# First the distribution of per-column means, then the values of `cog_id`.
for _values in (d.mean(), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# Depth of the current COG relative to the depth of the current species.
_species_depth = motu_depth2.sel(species=species_id)
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _cog_depth / _species_depth

# COG depth against species depth on log-log axes, coloured by their ratio.
plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.yscale("log")
plt.xscale("log")

In [None]:
# Rows of `spgc` for the genes assigned to `cog_id`; rows with any missing
# value are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc.reindex(_cog_gene_ids).dropna()

In [None]:
# Classify strains by whether `spgc` marks any gene of the current COG as
# present (== 1) in them.
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index
# One boolean per high-quality strain.  Genes missing from `spgc` reindex to
# NaN, which never equals 1 and therefore counts as absent.
_strain_has_cog_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog_gene)
strain_without_gene_list = idxwhere(~_strain_has_cog_gene)

frac = world.community.to_pandas().rename(columns=str)
# "-1" first, then every other community strain that is in neither list above.
# Sorted because a bare list(set(...)) has an order that changes between
# interpreter sessions (string hash randomisation), which would reshuffle the
# stacking order of the bars and, presumably, the palette assignment.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: with gene, without gene, unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette for all strains: "autumn" for strains with the gene, then
# "winter" for strains without it and "Greys" for the unknown ones, each
# later call extending the palette built so far.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in [
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
]:
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Per-sample table: sample metadata joined with the strain fractions, plus the
# species relative abundance and the normalized depth of the current COG.
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(_meta.index, fill_value=0)
# Strain "-1" receives whatever fraction the other strains leave over.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)
_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = (
    _meta.join(_frac)
    .assign(
        species_rabund=motu_rabund2.sel(species=species_id).to_series(),
        norm_cog_depth=normalized_cog_depth.to_series(),
    )
    .set_index("fuller_label")
    .sort_values(_sort_columns)
)

# One panel for each of the first three subjects in `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue  # no samples for this subject: leave the panel untouched
    # Stacked strain-fraction bars, one bar per sample.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # Second y-axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # Third y-axis: species relative abundance, with its spine moved outward
    # so that it does not sit on top of the second axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Sample metadata for samples listed in motu_depth2, restricted to the
# "EEN" and "PostEEN" values of `diet_or_media`.
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
# Strain fractions aligned to those samples; samples absent from `frac`
# become all-zero rows.
_frac = frac.reindex(_meta.index, fill_value=0)
# Recompute "-1" as the remainder so each row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = _meta.join(_frac).assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_columns)

# One panel per subject, for every entry of `subject_order`.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bar per sample, one segment per strain, in `strain_list` order.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # `norm_cog_depth` on a twin y-axis with a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # `species_rabund` on a second twin axis whose right spine is pushed
    # outward so it does not sit on top of ax2's.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes whose `gene_x_cog` entry equals `cog_id`, keeping
# only the first row for each distinct `seed_ortholog` value.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
gene_meta.reindex(_cog_gene_ids).drop_duplicates(subset=["seed_ortholog"])

In [None]:
# Per-strain depth ratios of this COG's genes; rows containing any NaN
# (including genes missing from the table) are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_depth_ratio.reindex(_cog_gene_ids).dropna()

In [None]:
# Per-strain correlations of this COG's genes; rows containing any NaN
# (including genes missing from the table) are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_corr.reindex(_cog_gene_ids).dropna()

In [None]:
# Gene depth converted to a DataArray and divided by the depth of the
# current species.
_gene_depth_xr = gene_depth.stack().to_xarray()
_species_depth = motu_depth2.sel(species=species_id)
normalized_gene_depth = _gene_depth_xr / _species_depth

In [None]:
# Summary table with one row per sample: total fraction of strains called
# with / without the gene, summed depth of the COG's genes, and species depth.
_community = world.community.to_pandas().rename(columns=str)
# Only genes of this COG that also occur in normalized_gene_depth are selected.
_cog_gene_ids = list(
    set(gene_x_cog[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
_cog_gene_depth = (
    gene_depth.stack().to_xarray().sel(gene_id=_cog_gene_ids).to_pandas().sum(axis=1)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        "gene_depth": _cog_gene_depth,
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# One figure per fraction column: gene depth against species depth, coloured
# by the fraction. Rows are sorted so the highest fractions are drawn last.
for _fraction_column, _colorbar_label in (
    ("with_gene_fraction", "Strain With Gene Fraction"),
    ("without_gene_fraction", "Strain Without Gene Fraction"),
):
    fig = plt.figure()
    plt.scatter(
        "species_depth",
        "gene_depth",
        c=_fraction_column,
        data=d.sort_values(_fraction_column),
        norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
    )
    plt.plot([1, 1000], [1, 1000])  # y = x reference line
    plt.colorbar(label=_colorbar_label)
    plt.yscale("symlog", linthresh=1e-1)
    plt.xscale("symlog", linthresh=1e-1)
    plt.ylabel("gene depth")
    _last_xlabel = plt.xlabel("species depth")

# The cell evaluates to the x-axis label object of the last figure.
_last_xlabel

### 102549

In [None]:
# Species analysed by the cells below (used in file paths and `.sel(species=...)`).
species_id = "102549"

In [None]:
# Seed NumPy's global RNG before the random position subsample below.
# NOTE(review): assumes world.random_sample draws from NumPy's global state -- confirm.
np.random.seed(0)

# Load the fitted sfacts world for this species, then:
#   * rewrite sample names as "CF_<int>", parsing the integer from the token
#     after the first "_";
#   * swap the labels CF_11 and CF_15 (NOTE(review): presumably corrects a
#     known sample mix-up -- confirm);
#   * apply drop_low_abundance_strains with a 0.01 threshold.
world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples from subjects A, B and H that are present in this world.
subject_abh_sample_list = list(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
# Sub-world for those samples, with a 0.05 strain threshold.
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subset of at most 1000 positions, used for the metagenotype plot.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

# Linkages are computed once here; the A/B/H ones are handed to the plotting
# calls through constant lambdas.
# NOTE(review): sample_linkage and strain_linkage are not used in this cell.
sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours: each sample's subject mapped through `subject_palette`,
# keyed by `fuller_label`.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# Community plot for the A/B/H sub-world, samples relabelled with `fuller_label`.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

# Metagenotype plot over the subsampled positions, using the same sample linkage.
sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Per-strain summary table; `.rename(str)` turns the genome_id index labels
# into strings.
spgc_strain_meta = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
).rename(str)
# Table indexed by gene_id; its columns are selected with strain ids elsewhere.
spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)
# Column names supplied explicitly when reading the annotations file below.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
# Gene annotations: "#" marks comments in the file, the `query` column becomes
# the index (renamed to gene_id), and "-" placeholders become NaN.
gene_meta = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
        comment="#",
        names=eggnog_column_names,
        index_col="query",
    )
    .rename_axis(index="gene_id")
    .replace({"-": np.nan})
)
# Gene-to-COG assignments: deduplicated, indexed by gene_id, then squeezed.
# NOTE(review): yields a Series only if a single column remains -- confirm.
gene_x_cog = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
    )
    .drop_duplicates()
    .set_index("gene_id")
    .squeeze()
)
# Long (gene_id, strain) tables pivoted to gene_id rows x strain columns.
spgc_depth_ratio = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
).depth.unstack()
spgc_corr = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
).correlation.unstack()
# Gene depth table; row labels are rewritten to "CF_<int>" and the labels
# CF_11 and CF_15 are swapped.
gene_depth = (
    xr.load_dataarray(
        f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
    )
    .to_pandas()
    .rename(lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename({"CF_11": "CF_15", "CF_15": "CF_11"})
)
# Prevalence table; cog_id is the part of eggnog_id before the first "@".
eggnog_prevalence_in_refs = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
).assign(cog_id=lambda x: x.eggnog_id.str.split("@").str[0])

# Strains with sum_depth > 1 and species_gene_frac > 0.9.
high_quality_strain_list = idxwhere(
    (spgc_strain_meta.sum_depth > 1) & (spgc_strain_meta.species_gene_frac > 0.9)
)

# Display the strain table as the cell output.
spgc_strain_meta

In [None]:
# Maximum prevalence per COG, considering only ids that start with "COG".
_is_cog = eggnog_prevalence_in_refs.cog_id.str.startswith("COG")
_cog_rows = eggnog_prevalence_in_refs.loc[_is_cog, ["cog_id", "prevalence"]]
cog_prevalence_in_refs = (
    _cog_rows.drop_duplicates().groupby("cog_id")["prevalence"].max()
)

In [None]:
# Test results joined with this species' COG fraction, the COG prevalence
# and the COG metadata.
_cog_table = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
# Keep hits with species_frac > 0.1, |mean_log_ratio| > 0.5 and prevalence < 0.9.
_keep = (
    (_cog_table.species_frac > 0.1)
    & (np.abs(_cog_table.mean_log_ratio) > 0.5)
    & (_cog_table.prevalence < 0.9)
    & _cog_table.hit
)
_cog_table[_keep].sort_values("mean_log_ratio", ascending=False)

### 103682

In [None]:
# Species analysed by the cells below (used in file paths and `.sel(species=...)`).
species_id = "103682"

In [None]:
# Seed NumPy's global RNG before the random position subsample below.
# NOTE(review): assumes world.random_sample draws from NumPy's global state -- confirm.
np.random.seed(0)

# Load the fitted sfacts world for this species, then:
#   * rewrite sample names as "CF_<int>", parsing the integer from the token
#     after the first "_";
#   * swap the labels CF_11 and CF_15 (NOTE(review): presumably corrects a
#     known sample mix-up -- confirm);
#   * apply drop_low_abundance_strains with a 0.01 threshold.
world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples from subjects A, B and H that are present in this world.
subject_abh_sample_list = list(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
# Sub-world for those samples, with a 0.05 strain threshold.
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subset of at most 1000 positions, used for the metagenotype plot.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

# Linkages are computed once here; the A/B/H ones are handed to the plotting
# calls through constant lambdas.
# NOTE(review): sample_linkage and strain_linkage are not used in this cell.
sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours: each sample's subject mapped through `subject_palette`,
# keyed by `fuller_label`.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# Community plot for the A/B/H sub-world, samples relabelled with `fuller_label`.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

# Metagenotype plot over the subsampled positions, using the same sample linkage.
sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Per-strain summary table; `.rename(str)` turns the genome_id index labels
# into strings.
spgc_strain_meta = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
).rename(str)
# Table indexed by gene_id; its columns are selected with strain ids elsewhere.
spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)
# Column names supplied explicitly when reading the annotations file below.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
# Gene annotations: "#" marks comments in the file, the `query` column becomes
# the index (renamed to gene_id), and "-" placeholders become NaN.
gene_meta = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
        comment="#",
        names=eggnog_column_names,
        index_col="query",
    )
    .rename_axis(index="gene_id")
    .replace({"-": np.nan})
)
# Gene-to-COG assignments: deduplicated, indexed by gene_id, then squeezed.
# NOTE(review): yields a Series only if a single column remains -- confirm.
gene_x_cog = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
    )
    .drop_duplicates()
    .set_index("gene_id")
    .squeeze()
)
# Long (gene_id, strain) tables pivoted to gene_id rows x strain columns.
spgc_depth_ratio = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
).depth.unstack()
spgc_corr = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
).correlation.unstack()
# Gene depth table; row labels are rewritten to "CF_<int>" and the labels
# CF_11 and CF_15 are swapped.
gene_depth = (
    xr.load_dataarray(
        f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
    )
    .to_pandas()
    .rename(lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename({"CF_11": "CF_15", "CF_15": "CF_11"})
)
# Prevalence table; cog_id is the part of eggnog_id before the first "@".
eggnog_prevalence_in_refs = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
).assign(cog_id=lambda x: x.eggnog_id.str.split("@").str[0])

# Strains with sum_depth > 1 and species_gene_frac > 0.9.
high_quality_strain_list = idxwhere(
    (spgc_strain_meta.sum_depth > 1) & (spgc_strain_meta.species_gene_frac > 0.9)
)

# Display the strain table as the cell output.
spgc_strain_meta

In [None]:
# Maximum prevalence per COG, considering only ids that start with "COG".
_is_cog = eggnog_prevalence_in_refs.cog_id.str.startswith("COG")
_cog_rows = eggnog_prevalence_in_refs.loc[_is_cog, ["cog_id", "prevalence"]]
cog_prevalence_in_refs = (
    _cog_rows.drop_duplicates().groupby("cog_id")["prevalence"].max()
)

In [None]:
# Test results joined with this species' COG fraction, the COG prevalence
# and the COG metadata.
_cog_table = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
# Keep hits with species_frac > 0.1, |mean_log_ratio| > 0.5 and prevalence < 0.9.
_keep = (
    (_cog_table.species_frac > 0.1)
    & (np.abs(_cog_table.mean_log_ratio) > 0.5)
    & (_cog_table.prevalence < 0.9)
    & _cog_table.hit
)
_cog_table[_keep].sort_values("mean_log_ratio", ascending=False)

#### COG4396

In [None]:
# COG examined in this subsection; the following cells read `cog_id` to pick
# out its genes, depths and carrier strains.
cog_id = "COG4396"

In [None]:
# Normalized COG depths for the current species, transposed so that each COG
# is a column (the `d[cog_id]` lookup below relies on that).
_species_cog_depth = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
)
d = _species_cog_depth.to_pandas().T

# Log-spaced bin edges from 1e-3 to 1e4, shared by both histograms.
bins = np.logspace(-3, 4)
_hist_ax = plt.gca()
_hist_ax.hist(d.mean(), bins=bins)  # column means: one value per COG
_hist_ax.hist(d[cog_id], bins=bins)  # individual values of the focal COG
_hist_ax.set_xscale("log")

In [None]:
# Depth of the focal COG relative to the depth of the species itself.
_species_depth = motu_depth2.sel(species=species_id)
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _cog_depth / _species_depth

# COG depth against species depth on log-log axes, coloured by their ratio.
plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# `spgc` rows for the genes whose `gene_x_cog` entry equals `cog_id`; rows
# containing any NaN (including genes missing from `spgc`) are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc.reindex(_cog_gene_ids).dropna()

In [None]:
# Presence calls (== 1) of this COG's genes in each high-quality strain.
# Genes missing from `spgc` reindex to NaN, which never equals 1, so they
# count as absent.
_cog_gene_present = (
    spgc[high_quality_strain_list].reindex(gene_x_cog[lambda x: x == cog_id].index)
    == 1
)
# A strain "has" the COG when at least one of the COG's genes is called present.
_strain_has_cog = _cog_gene_present.any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

frac = world.community.to_pandas().rename(columns=str)
# Strains in the community table that are not high-quality, so their gene
# content is unknown. "-1" always comes first; the remainder is sorted so that
# stacking order and palette assignment do not depend on set iteration order,
# which can differ between interpreter sessions for strings.
# NOTE(review): "-1" is presumably the pooled low-abundance strain -- confirm.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: carriers, non-carriers, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# Build one palette covering all strains: carriers from "autumn", then extend
# it with non-carriers from "winter" and unknown strains from "Greys".
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap_name in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap_name, extend=strain_by_cog_palette
    )

In [None]:
# Sample metadata for every sample listed in motu_depth2's sample coordinate.
_meta = sample.loc[motu_depth2.sample]
# Strain fractions aligned to those samples; samples absent from `frac`
# become all-zero rows.
_frac = frac.reindex(_meta.index, fill_value=0)
# Recompute "-1" as the remainder so each row sums to 1 (all-zero rows
# therefore end up entirely in "-1").
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = _meta.join(_frac).assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_columns)

# One panel for each of the first three entries of `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bar per sample, one segment per strain, in `strain_list` order.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # `norm_cog_depth` on a twin y-axis with a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # `species_rabund` on a second twin axis whose right spine is pushed
    # outward so it does not sit on top of ax2's.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Sample metadata for samples listed in motu_depth2, restricted to the
# "EEN" and "PostEEN" values of `diet_or_media`.
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
# Strain fractions aligned to those samples; samples absent from `frac`
# become all-zero rows.
_frac = frac.reindex(_meta.index, fill_value=0)
# Recompute "-1" as the remainder so each row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = _meta.join(_frac).assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_columns)

# One panel per subject, for every entry of `subject_order`.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bar per sample, one segment per strain, in `strain_list` order.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # `norm_cog_depth` on a twin y-axis with a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # `species_rabund` on a second twin axis whose right spine is pushed
    # outward so it does not sit on top of ax2's.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes whose `gene_x_cog` entry equals `cog_id`, keeping
# only the first row for each distinct `seed_ortholog` value.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
gene_meta.reindex(_cog_gene_ids).drop_duplicates(subset=["seed_ortholog"])

In [None]:
# Per-strain depth ratios of this COG's genes; rows containing any NaN
# (including genes missing from the table) are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_depth_ratio.reindex(_cog_gene_ids).dropna()

In [None]:
# Per-strain correlations of this COG's genes; rows containing any NaN
# (including genes missing from the table) are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_corr.reindex(_cog_gene_ids).dropna()

In [None]:
# Gene depth converted to a DataArray and divided by the depth of the
# current species.
_gene_depth_xr = gene_depth.stack().to_xarray()
_species_depth = motu_depth2.sel(species=species_id)
normalized_gene_depth = _gene_depth_xr / _species_depth

In [None]:
# Summary table with one row per sample: total fraction of strains called
# with / without the gene, summed depth of the COG's genes, and species depth.
_community = world.community.to_pandas().rename(columns=str)
# Only genes of this COG that also occur in normalized_gene_depth are selected.
_cog_gene_ids = list(
    set(gene_x_cog[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
_cog_gene_depth = (
    gene_depth.stack().to_xarray().sel(gene_id=_cog_gene_ids).to_pandas().sum(axis=1)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        "gene_depth": _cog_gene_depth,
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# One figure per fraction column: gene depth against species depth, coloured
# by the fraction. Rows are sorted so the highest fractions are drawn last.
for _fraction_column, _colorbar_label in (
    ("with_gene_fraction", "Strain With Gene Fraction"),
    ("without_gene_fraction", "Strain Without Gene Fraction"),
):
    fig = plt.figure()
    plt.scatter(
        "species_depth",
        "gene_depth",
        c=_fraction_column,
        data=d.sort_values(_fraction_column),
        norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
    )
    plt.plot([1, 1000], [1, 1000])  # y = x reference line
    plt.colorbar(label=_colorbar_label)
    plt.yscale("symlog", linthresh=1e-1)
    plt.xscale("symlog", linthresh=1e-1)
    plt.ylabel("gene depth")
    _last_xlabel = plt.xlabel("species depth")

# The cell evaluates to the x-axis label object of the last figure.
_last_xlabel

### 101400

In [None]:
# Species analysed by the cells below (used in file paths and `.sel(species=...)`).
species_id = "101400"

In [None]:
# Seed NumPy's global RNG before the random position subsample below.
# NOTE(review): assumes world.random_sample draws from NumPy's global state -- confirm.
np.random.seed(0)

# Load the fitted sfacts world for this species, then:
#   * rewrite sample names as "CF_<int>", parsing the integer from the token
#     after the first "_";
#   * swap the labels CF_11 and CF_15 (NOTE(review): presumably corrects a
#     known sample mix-up -- confirm);
#   * apply drop_low_abundance_strains with a 0.01 threshold.
world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples from subjects A, B and H that are present in this world.
subject_abh_sample_list = list(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
# Sub-world for those samples, with a 0.05 strain threshold.
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subset of at most 1000 positions, used for the metagenotype plot.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

# Linkages are computed once here; the A/B/H ones are handed to the plotting
# calls through constant lambdas.
# NOTE(review): sample_linkage and strain_linkage are not used in this cell.
sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours: each sample's subject mapped through `subject_palette`,
# keyed by `fuller_label`.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# Community plot for the A/B/H sub-world, samples relabelled with `fuller_label`.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

# Metagenotype plot over the subsampled positions, using the same sample linkage.
sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Per-strain summary table; `.rename(str)` turns the genome_id index labels
# into strings.
spgc_strain_meta = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    index_col="genome_id",
).rename(str)
# Table indexed by gene_id; its columns are selected with strain ids elsewhere.
spgc = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    index_col="gene_id",
)
# Column names supplied explicitly when reading the annotations file below.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
# Gene annotations: "#" marks comments in the file, the `query` column becomes
# the index (renamed to gene_id), and "-" placeholders become NaN.
gene_meta = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
        comment="#",
        names=eggnog_column_names,
        index_col="query",
    )
    .rename_axis(index="gene_id")
    .replace({"-": np.nan})
)
# Gene-to-COG assignments: deduplicated, indexed by gene_id, then squeezed.
# NOTE(review): yields a Series only if a single column remains -- confirm.
gene_x_cog = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
    )
    .drop_duplicates()
    .set_index("gene_id")
    .squeeze()
)
# Long (gene_id, strain) tables pivoted to gene_id rows x strain columns.
spgc_depth_ratio = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    index_col=["gene_id", "strain"],
).depth.unstack()
spgc_corr = pd.read_table(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    index_col=["gene_id", "strain"],
).correlation.unstack()
# Gene depth table; row labels are rewritten to "CF_<int>" and the labels
# CF_11 and CF_15 are swapped.
gene_depth = (
    xr.load_dataarray(
        f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
    )
    .to_pandas()
    .rename(lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename({"CF_11": "CF_15", "CF_15": "CF_11"})
)
# Prevalence table; cog_id is the part of eggnog_id before the first "@".
eggnog_prevalence_in_refs = pd.read_table(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    names=["eggnog_id", "prevalence"],
).assign(cog_id=lambda x: x.eggnog_id.str.split("@").str[0])

# Strains with sum_depth > 1 and species_gene_frac > 0.9.
high_quality_strain_list = idxwhere(
    (spgc_strain_meta.sum_depth > 1) & (spgc_strain_meta.species_gene_frac > 0.9)
)

# Display the strain table as the cell output.
spgc_strain_meta

In [None]:
# Maximum prevalence per COG, considering only ids that start with "COG".
_is_cog = eggnog_prevalence_in_refs.cog_id.str.startswith("COG")
_cog_rows = eggnog_prevalence_in_refs.loc[_is_cog, ["cog_id", "prevalence"]]
cog_prevalence_in_refs = (
    _cog_rows.drop_duplicates().groupby("cog_id")["prevalence"].max()
)

In [None]:
# Test results joined with this species' COG fraction, the COG prevalence
# and the COG metadata.
_cog_table = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)
# Keep hits with species_frac > 0.1, |mean_log_ratio| > 0.5 and prevalence < 0.9.
_keep = (
    (_cog_table.species_frac > 0.1)
    & (np.abs(_cog_table.mean_log_ratio) > 0.5)
    & (_cog_table.prevalence < 0.9)
    & _cog_table.hit
)
_cog_table[_keep].sort_values("mean_log_ratio", ascending=False)

#### COG5599

In [None]:
# COG examined in this subsection; the following cells read `cog_id` to pick
# out its genes, depths and carrier strains.
cog_id = "COG5599"

In [None]:
# Normalized COG depths for the current species, transposed so that each COG
# is a column (the `d[cog_id]` lookup below relies on that).
_species_cog_depth = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
)
d = _species_cog_depth.to_pandas().T

# Log-spaced bin edges from 1e-3 to 1e4, shared by both histograms.
bins = np.logspace(-3, 4)
_hist_ax = plt.gca()
_hist_ax.hist(d.mean(), bins=bins)  # column means: one value per COG
_hist_ax.hist(d[cog_id], bins=bins)  # individual values of the focal COG
_hist_ax.set_xscale("log")

In [None]:
# Depth of the focal COG relative to the depth of the species itself.
_species_depth = motu_depth2.sel(species=species_id)
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _cog_depth / _species_depth

# COG depth against species depth on log-log axes, coloured by their ratio.
plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# `spgc` rows for the genes whose `gene_x_cog` entry equals `cog_id`; rows
# containing any NaN (including genes missing from `spgc`) are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc.reindex(_cog_gene_ids).dropna()

In [None]:
# Presence calls (== 1) of this COG's genes in each high-quality strain.
# Genes missing from `spgc` reindex to NaN, which never equals 1, so they
# count as absent.
_cog_gene_present = (
    spgc[high_quality_strain_list].reindex(gene_x_cog[lambda x: x == cog_id].index)
    == 1
)
# A strain "has" the COG when at least one of the COG's genes is called present.
_strain_has_cog = _cog_gene_present.any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

frac = world.community.to_pandas().rename(columns=str)
# Strains in the community table that are not high-quality, so their gene
# content is unknown. "-1" always comes first; the remainder is sorted so that
# stacking order and palette assignment do not depend on set iteration order,
# which can differ between interpreter sessions for strings.
# NOTE(review): "-1" is presumably the pooled low-abundance strain -- confirm.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: carriers, non-carriers, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# Build one palette covering all strains: carriers from "autumn", then extend
# it with non-carriers from "winter" and unknown strains from "Greys".
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap_name in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap_name, extend=strain_by_cog_palette
    )

In [None]:
# Sample metadata for every sample listed in motu_depth2's sample coordinate.
_meta = sample.loc[motu_depth2.sample]
# Strain fractions aligned to those samples; samples absent from `frac`
# become all-zero rows.
_frac = frac.reindex(_meta.index, fill_value=0)
# Recompute "-1" as the remainder so each row sums to 1 (all-zero rows
# therefore end up entirely in "-1").
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = _meta.join(_frac).assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_columns)

# One panel for each of the first three entries of `subject_order`.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bar per sample, one segment per strain, in `strain_list` order.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # `norm_cog_depth` on a twin y-axis with a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # `species_rabund` on a second twin axis whose right spine is pushed
    # outward so it does not sit on top of ax2's.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Sample metadata for samples listed in motu_depth2, restricted to the
# "EEN" and "PostEEN" values of `diet_or_media`.
_meta = sample.loc[motu_depth2.sample]
_meta = _meta[_meta.diet_or_media.isin(["EEN", "PostEEN"])]
# Strain fractions aligned to those samples; samples absent from `frac`
# become all-zero rows.
_frac = frac.reindex(_meta.index, fill_value=0)
# Recompute "-1" as the remainder so each row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(axis=1)

_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]
d0 = _meta.join(_frac).assign(
    species_rabund=motu_rabund2.sel(species=species_id).to_series(),
    norm_cog_depth=normalized_cog_depth.to_series(),
)
d0 = d0.set_index("fuller_label").sort_values(_sort_columns)

# One panel per subject, for every entry of `subject_order`.
_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        continue
    # Stacked bar per sample, one segment per strain, in `strain_list` order.
    d1[strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)
    # `norm_cog_depth` on a twin y-axis with a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)
    # `species_rabund` on a second twin axis whose right spine is pushed
    # outward so it does not sit on top of ax2's.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines.right.set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes whose `gene_x_cog` entry equals `cog_id`, keeping
# only the first row for each distinct `seed_ortholog` value.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
gene_meta.reindex(_cog_gene_ids).drop_duplicates(subset=["seed_ortholog"])

In [None]:
# Per-strain depth ratios of this COG's genes; rows containing any NaN
# (including genes missing from the table) are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_depth_ratio.reindex(_cog_gene_ids).dropna()

In [None]:
# Per-strain correlations of this COG's genes; rows containing any NaN
# (including genes missing from the table) are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_corr.reindex(_cog_gene_ids).dropna()

In [None]:
# Gene depth converted to a DataArray and divided by the depth of the
# current species.
_gene_depth_xr = gene_depth.stack().to_xarray()
_species_depth = motu_depth2.sel(species=species_id)
normalized_gene_depth = _gene_depth_xr / _species_depth

In [None]:
# Summary table with one row per sample: total fraction of strains called
# with / without the gene, summed depth of the COG's genes, and species depth.
_community = world.community.to_pandas().rename(columns=str)
# Only genes of this COG that also occur in normalized_gene_depth are selected.
_cog_gene_ids = list(
    set(gene_x_cog[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
_cog_gene_depth = (
    gene_depth.stack().to_xarray().sel(gene_id=_cog_gene_ids).to_pandas().sum(axis=1)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        "gene_depth": _cog_gene_depth,
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# One figure per fraction column: gene depth against species depth, coloured
# by the fraction. Rows are sorted so the highest fractions are drawn last.
for _fraction_column, _colorbar_label in (
    ("with_gene_fraction", "Strain With Gene Fraction"),
    ("without_gene_fraction", "Strain Without Gene Fraction"),
):
    fig = plt.figure()
    plt.scatter(
        "species_depth",
        "gene_depth",
        c=_fraction_column,
        data=d.sort_values(_fraction_column),
        norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
    )
    plt.plot([1, 1000], [1, 1000])  # y = x reference line
    plt.colorbar(label=_colorbar_label)
    plt.yscale("symlog", linthresh=1e-1)
    plt.xscale("symlog", linthresh=1e-1)
    plt.ylabel("gene depth")
    _last_xlabel = plt.xlabel("species depth")

# The cell evaluates to the x-axis label object of the last figure.
_last_xlabel

### 102478

In [None]:
# Species analysed in this section; interpolated into the data paths and used
# to select the `species` coordinate in the following cells.
species_id = "102478"

In [None]:
# Seed NumPy's global RNG; presumably consumed by `world.random_sample` below
# so the position subsample is repeatable (confirm against sfacts).
np.random.seed(0)

# Load the fitted world, normalise sample names to "CF_<int>", swap the labels
# of CF_11 and CF_15 (NOTE(review): presumably a known label mix-up - confirm),
# and drop strains below the 0.01 abundance threshold.
world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples from subjects A, B and H that are present in the world. Sorted
# because the iteration order of a set of strings changes between interpreter
# sessions (hash randomization), which would otherwise make the sample order
# fed to the linkage and plotting code differ from run to run.
subject_abh_sample_list = sorted(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subsample of at most 1000 positions, used for the metagenotype plot.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

# Linkages for the full world and for the A/B/H subset.
sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours keyed by `fuller_label`, one colour per subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# The plots reuse the precomputed linkages instead of re-clustering.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Per-strain SPGC summary table; index labels are cast to str.
spgc_strain_meta = pd.read_csv(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    sep="\t",
    index_col="genome_id",
).rename(index=str)
# Gene-by-strain table indexed by gene_id.
spgc = pd.read_csv(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    sep="\t",
    index_col="gene_id",
)
# Column names supplied to the annotation reader below ('#' lines are skipped).
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
# Gene annotations indexed by gene_id, with "-" placeholders turned into NaN.
gene_meta = (
    pd.read_csv(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
        sep="\t",
        comment="#",
        names=eggnog_column_names,
        index_col="query",
    )
    .rename_axis(index="gene_id")
    .replace("-", np.nan)
)
# Gene-to-COG mapping, de-duplicated and squeezed after indexing by gene_id.
gene_x_cog = (
    pd.read_csv(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv",
        sep="\t",
    )
    .drop_duplicates()
    .set_index("gene_id")
    .squeeze()
)
# Long (gene_id, strain) tables pivoted to gene x strain.
spgc_depth_ratio = pd.read_csv(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    sep="\t",
    index_col=["gene_id", "strain"],
)["depth"].unstack(level="strain")
spgc_corr = pd.read_csv(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    sep="\t",
    index_col=["gene_id", "strain"],
)["correlation"].unstack(level="strain")
# Gene depth table with row labels normalised to "CF_<int>" and the
# CF_11/CF_15 labels swapped, mirroring the renaming applied to `world`.
gene_depth = (
    xr.load_dataarray(
        f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
    )
    .to_pandas()
    .rename(index=lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename(index={"CF_11": "CF_15", "CF_15": "CF_11"})
)
# eggNOG prevalence table; cog_id is the part of eggnog_id before the first "@".
eggnog_prevalence_in_refs = pd.read_csv(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    sep="\t",
    names=["eggnog_id", "prevalence"],
).assign(cog_id=lambda x: x.eggnog_id.str.split("@").str[0])

# Strains passing both quality thresholds.
high_quality_strain_list = idxwhere(
    (spgc_strain_meta["sum_depth"] > 1)
    & (spgc_strain_meta["species_gene_frac"] > 0.9)
)

spgc_strain_meta

In [None]:
# Maximum prevalence per COG, restricted to ids that start with "COG".
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs.loc[
        lambda x: x.cog_id.str.startswith("COG"), ["cog_id", "prevalence"]
    ]
    .drop_duplicates()
    .groupby("cog_id")["prevalence"]
    .max()
)

In [None]:
# Test results annotated with the species fraction, reference prevalence and
# COG metadata, filtered on the thresholds below and ordered by effect size.
(
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
    .loc[
        lambda x: (x.species_frac > 0.1)
        & (x.mean_log_ratio.abs() > 0.5)
        & (x.prevalence < 0.95)
        & (x.pval < 0.1)
    ]
    .sort_values(by="mean_log_ratio", ascending=False)
)

#### COG4938

In [None]:
# Focal COG for the cells that follow in this subsection.
cog_id = "COG4938"

In [None]:
# Normalized COG depth for this species, transposed so COGs are the columns.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().transpose()

# Per-COG means, overlaid with the focal COG's values, on shared log-spaced bins.
bins = np.logspace(-3, 4, num=50)
for _values in (d.mean(axis=0), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# Depth of the current species and of the focal COG within it.
_species_depth = motu_depth2.sel(species=species_id)
_focal_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _focal_cog_depth / _species_depth

# COG depth against species depth on log-log axes, coloured by their ratio.
plt.scatter(x=_species_depth, y=_focal_cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Gene-by-strain rows for the focal COG's genes; rows with any missing value
# (including genes absent from the table) are dropped.
spgc.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).dropna()

In [None]:
# For each high-quality strain: does any gene of the focal COG have value 1?
# Computed once and reused so the two lists are exact complements.
_strain_has_cog = (
    spgc[high_quality_strain_list].reindex(gene_x_cog[lambda x: x == cog_id].index)
    == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

frac = world.community.to_pandas().rename(columns=str)
# Community strains that fall in neither list above. "-1" always comes first;
# the rest are sorted because the iteration order of a set of strings changes
# between interpreter sessions, which would otherwise reshuffle the stacking
# order and palette assignment of these strains on every run.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: with gene, without gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette across all strains: "autumn" for strains with the gene, then
# extended with "winter" (without) and "Greys" (unknown).
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Metadata for the samples in `motu_depth2`, and strain fractions aligned to
# them (samples missing from `frac` are filled with 0).
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# The "-1" column takes whatever the other strain columns do not sum to.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)

# One row per sample, re-indexed by `fuller_label` and ordered by the design
# columns listed below.
d0 = _meta.join(_frac)
d0["species_rabund"] = motu_rabund2.sel(species=species_id).to_series()
d0["norm_cog_depth"] = normalized_cog_depth.to_series()
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# Only the first three subjects, one wide panel each.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        # No samples for this subject: leave its panel untouched.
        continue

    # Stacked bars: strain composition of each sample.
    d1.loc[:, strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth on a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance; its right spine is moved
    # outward so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Metadata for the samples in `motu_depth2` whose diet_or_media is EEN or
# PostEEN, and strain fractions aligned to them (missing samples filled with 0).
_meta = sample.loc[motu_depth2.sample].loc[
    lambda x: x.diet_or_media.isin(["EEN", "PostEEN"])
]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# The "-1" column takes whatever the other strain columns do not sum to.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)

# One row per sample, re-indexed by `fuller_label` and ordered by the design
# columns listed below.
d0 = _meta.join(_frac)
d0["species_rabund"] = motu_rabund2.sel(species=species_id).to_series()
d0["norm_cog_depth"] = normalized_cog_depth.to_series()
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        # No samples for this subject: leave its panel untouched.
        continue

    # Stacked bars: strain composition of each sample.
    d1.loc[:, strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth on a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance; its right spine is moved
    # outward so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes assigned to the focal COG, keeping the first row
# for each distinct seed ortholog.
gene_meta.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).drop_duplicates(
    subset="seed_ortholog"
)

In [None]:
# Per-strain depth ratios for the focal COG's genes; rows with any missing
# value (including genes absent from the table) are dropped.
spgc_depth_ratio.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).dropna()

In [None]:
# Per-strain correlations for the focal COG's genes; rows with any missing
# value (including genes absent from the table) are dropped.
spgc_corr.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).dropna()

In [None]:
# Long-format gene depth converted to a DataArray and divided by the depth of
# the current species (xarray aligns the operands on shared coordinates).
normalized_gene_depth = xr.DataArray.from_series(
    gene_depth.stack()
) / motu_depth2.sel(species=species_id)

In [None]:
# Strain fractions per sample, with strain labels cast to str so they match
# the strain lists used for column selection below.
_community = world.community.to_pandas().rename(columns=str)
# Genes of the focal COG that also appear in the normalized gene-depth array.
_cog_genes_with_depth = list(
    set(gene_x_cog.loc[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        # Depth summed over the selected genes.
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(gene_id=_cog_genes_with_depth)
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, coloured by the summed fraction of strains
# called as carrying the gene. Rows are sorted so high fractions are drawn last.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # 1:1 reference line
plt.colorbar(label="Strain With Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same plot, coloured by the summed fraction of strains called as lacking it.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # 1:1 reference line
plt.colorbar(label="Strain Without Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

### 101292

In [None]:
# Species analysed in this section; interpolated into the data paths and used
# to select the `species` coordinate in the following cells.
species_id = "101292"

In [None]:
# Seed NumPy's global RNG; presumably consumed by `world.random_sample` below
# so the position subsample is repeatable (confirm against sfacts).
np.random.seed(0)

# Load the fitted world, normalise sample names to "CF_<int>", swap the labels
# of CF_11 and CF_15 (NOTE(review): presumably a known label mix-up - confirm),
# and drop strains below the 0.01 abundance threshold.
world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.world.nc"
    )
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples from subjects A, B and H that are present in the world. Sorted
# because the iteration order of a set of strings changes between interpreter
# sessions (hash randomization), which would otherwise make the sample order
# fed to the linkage and plotting code differ from run to run.
subject_abh_sample_list = sorted(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subsample of at most 1000 positions, used for the metagenotype plot.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

# Linkages for the full world and for the A/B/H subset.
sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours keyed by `fuller_label`, one colour per subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# The plots reuse the precomputed linkages instead of re-clustering.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Per-strain SPGC summary table; index labels are cast to str.
spgc_strain_meta = pd.read_csv(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv",
    sep="\t",
    index_col="genome_id",
).rename(index=str)
# Gene-by-strain table indexed by gene_id.
spgc = pd.read_csv(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv",
    sep="\t",
    index_col="gene_id",
)
# Column names supplied to the annotation reader below ('#' lines are skipped).
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
# Gene annotations indexed by gene_id, with "-" placeholders turned into NaN.
gene_meta = (
    pd.read_csv(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
        sep="\t",
        comment="#",
        names=eggnog_column_names,
        index_col="query",
    )
    .rename_axis(index="gene_id")
    .replace("-", np.nan)
)
# Gene-to-COG mapping, de-duplicated and squeezed after indexing by gene_id.
gene_x_cog = (
    pd.read_csv(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv",
        sep="\t",
    )
    .drop_duplicates()
    .set_index("gene_id")
    .squeeze()
)
# Long (gene_id, strain) tables pivoted to gene x strain.
spgc_depth_ratio = pd.read_csv(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv",
    sep="\t",
    index_col=["gene_id", "strain"],
)["depth"].unstack(level="strain")
spgc_corr = pd.read_csv(
    f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.clean-m10-e20-c10.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv",
    sep="\t",
    index_col=["gene_id", "strain"],
)["correlation"].unstack(level="strain")
# Gene depth table with row labels normalised to "CF_<int>" and the
# CF_11/CF_15 labels swapped, mirroring the renaming applied to `world`.
gene_depth = (
    xr.load_dataarray(
        f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
    )
    .to_pandas()
    .rename(index=lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename(index={"CF_11": "CF_15", "CF_15": "CF_11"})
)
# eggNOG prevalence table; cog_id is the part of eggnog_id before the first "@".
eggnog_prevalence_in_refs = pd.read_csv(
    f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv",
    sep="\t",
    names=["eggnog_id", "prevalence"],
).assign(cog_id=lambda x: x.eggnog_id.str.split("@").str[0])

# Strains passing both quality thresholds.
high_quality_strain_list = idxwhere(
    (spgc_strain_meta["sum_depth"] > 1)
    & (spgc_strain_meta["species_gene_frac"] > 0.9)
)

spgc_strain_meta

In [None]:
# Maximum prevalence per COG, restricted to ids that start with "COG".
cog_prevalence_in_refs = (
    eggnog_prevalence_in_refs.loc[
        lambda x: x.cog_id.str.startswith("COG"), ["cog_id", "prevalence"]
    ]
    .drop_duplicates()
    .groupby("cog_id")["prevalence"]
    .max()
)

In [None]:
# Test results annotated with the species fraction, reference prevalence and
# COG metadata, filtered on the thresholds below and ordered by effect size.
(
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
    .loc[
        lambda x: (x.species_frac > 0.1)
        & (x.mean_log_ratio.abs() > 0.5)
        & (x.prevalence < 0.9)
        & x.hit
    ]
    .sort_values(by="mean_log_ratio", ascending=False)
)

#### COG2920

In [None]:
# Focal COG for the cells that follow in this subsection.
cog_id = "COG2920"

In [None]:
# Normalized COG depth for this species, transposed so COGs are the columns.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().transpose()

# Per-COG means, overlaid with the focal COG's values, on shared log-spaced bins.
bins = np.logspace(-3, 4, num=50)
for _values in (d.mean(axis=0), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# Depth of the current species and of the focal COG within it.
_species_depth = motu_depth2.sel(species=species_id)
_focal_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _focal_cog_depth / _species_depth

# COG depth against species depth on log-log axes, coloured by their ratio.
plt.scatter(x=_species_depth, y=_focal_cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Gene-by-strain rows for the focal COG's genes; rows with any missing value
# (including genes absent from the table) are dropped.
spgc.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).dropna()

In [None]:
# For each high-quality strain: does any gene of the focal COG have value 1?
# Computed once and reused so the two lists are exact complements.
_strain_has_cog = (
    spgc[high_quality_strain_list].reindex(gene_x_cog[lambda x: x == cog_id].index)
    == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

frac = world.community.to_pandas().rename(columns=str)
# Community strains that fall in neither list above. "-1" always comes first;
# the rest are sorted because the iteration order of a set of strings changes
# between interpreter sessions, which would otherwise reshuffle the stacking
# order and palette assignment of these strains on every run.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: with gene, without gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette across all strains: "autumn" for strains with the gene, then
# extended with "winter" (without) and "Greys" (unknown).
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Metadata for the samples in `motu_depth2`, and strain fractions aligned to
# them (samples missing from `frac` are filled with 0).
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# The "-1" column takes whatever the other strain columns do not sum to.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)

# One row per sample, re-indexed by `fuller_label` and ordered by the design
# columns listed below.
d0 = _meta.join(_frac)
d0["species_rabund"] = motu_rabund2.sel(species=species_id).to_series()
d0["norm_cog_depth"] = normalized_cog_depth.to_series()
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# Only the first three subjects, one wide panel each.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        # No samples for this subject: leave its panel untouched.
        continue

    # Stacked bars: strain composition of each sample.
    d1.loc[:, strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth on a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance; its right spine is moved
    # outward so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Metadata for the samples in `motu_depth2` whose diet_or_media is EEN or
# PostEEN, and strain fractions aligned to them (missing samples filled with 0).
_meta = sample.loc[motu_depth2.sample].loc[
    lambda x: x.diet_or_media.isin(["EEN", "PostEEN"])
]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# The "-1" column takes whatever the other strain columns do not sum to.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)

# One row per sample, re-indexed by `fuller_label` and ordered by the design
# columns listed below.
d0 = _meta.join(_frac)
d0["species_rabund"] = motu_rabund2.sel(species=species_id).to_series()
d0["norm_cog_depth"] = normalized_cog_depth.to_series()
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        # No samples for this subject: leave its panel untouched.
        continue

    # Stacked bars: strain composition of each sample.
    d1.loc[:, strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth on a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance; its right spine is moved
    # outward so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes assigned to the focal COG, keeping the first row
# for each distinct seed ortholog.
gene_meta.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).drop_duplicates(
    subset="seed_ortholog"
)

In [None]:
# Per-strain depth ratios for the focal COG's genes; rows with any missing
# value (including genes absent from the table) are dropped.
spgc_depth_ratio.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).dropna()

In [None]:
# Per-strain correlations for the focal COG's genes; rows with any missing
# value (including genes absent from the table) are dropped.
spgc_corr.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).dropna()

In [None]:
# Long-format gene depth converted to a DataArray and divided by the depth of
# the current species (xarray aligns the operands on shared coordinates).
normalized_gene_depth = xr.DataArray.from_series(
    gene_depth.stack()
) / motu_depth2.sel(species=species_id)

In [None]:
# Strain fractions per sample, with strain labels cast to str so they match
# the strain lists used for column selection below.
_community = world.community.to_pandas().rename(columns=str)
# Genes of the focal COG that also appear in the normalized gene-depth array.
_cog_genes_with_depth = list(
    set(gene_x_cog.loc[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        # Depth summed over the selected genes.
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(gene_id=_cog_genes_with_depth)
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, coloured by the summed fraction of strains
# called as carrying the gene. Rows are sorted so high fractions are drawn last.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # 1:1 reference line
plt.colorbar(label="Strain With Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same plot, coloured by the summed fraction of strains called as lacking it.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # 1:1 reference line
plt.colorbar(label="Strain Without Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

#### COG4981

In [None]:
# Focal COG for the cells that follow in this subsection.
cog_id = "COG4981"

In [None]:
# Normalized COG depth for this species, transposed so COGs are the columns.
d = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
).to_pandas().transpose()

# Per-COG means, overlaid with the focal COG's values, on shared log-spaced bins.
bins = np.logspace(-3, 4, num=50)
for _values in (d.mean(axis=0), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# Depth of the current species and of the focal COG within it.
_species_depth = motu_depth2.sel(species=species_id)
_focal_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _focal_cog_depth / _species_depth

# COG depth against species depth on log-log axes, coloured by their ratio.
plt.scatter(x=_species_depth, y=_focal_cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Gene-by-strain rows for the focal COG's genes; rows with any missing value
# (including genes absent from the table) are dropped.
spgc.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).dropna()

In [None]:
# For each high-quality strain: does any gene of the focal COG have value 1?
# Computed once and reused so the two lists are exact complements.
_strain_has_cog = (
    spgc[high_quality_strain_list].reindex(gene_x_cog[lambda x: x == cog_id].index)
    == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_cog)
strain_without_gene_list = idxwhere(~_strain_has_cog)

frac = world.community.to_pandas().rename(columns=str)
# Community strains that fall in neither list above. "-1" always comes first;
# the rest are sorted because the iteration order of a set of strings changes
# between interpreter sessions, which would otherwise reshuffle the stacking
# order and palette assignment of these strains on every run.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

# Plotting order: with gene, without gene, then unknown.
strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette across all strains: "autumn" for strains with the gene, then
# extended with "winter" (without) and "Greys" (unknown).
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _cmap in (
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
):
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_cmap, extend=strain_by_cog_palette
    )

In [None]:
# Metadata for the samples in `motu_depth2`, and strain fractions aligned to
# them (samples missing from `frac` are filled with 0).
_meta = sample.loc[motu_depth2.sample]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# The "-1" column takes whatever the other strain columns do not sum to.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)

# One row per sample, re-indexed by `fuller_label` and ordered by the design
# columns listed below.
d0 = _meta.join(_frac)
d0["species_rabund"] = motu_rabund2.sel(species=species_id).to_series()
d0["norm_cog_depth"] = normalized_cog_depth.to_series()
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

# Only the first three subjects, one wide panel each.
_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        # No samples for this subject: leave its panel untouched.
        continue

    # Stacked bars: strain composition of each sample.
    d1.loc[:, strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth on a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance; its right spine is moved
    # outward so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Metadata for the samples in `motu_depth2` whose diet_or_media is EEN or
# PostEEN, and strain fractions aligned to them (missing samples filled with 0).
_meta = sample.loc[motu_depth2.sample].loc[
    lambda x: x.diet_or_media.isin(["EEN", "PostEEN"])
]
_frac = frac.reindex(index=_meta.index, fill_value=0)
# The "-1" column takes whatever the other strain columns do not sum to.
_frac["-1"] = 1 - _frac.drop(columns="-1").sum(axis=1)

# One row per sample, re-indexed by `fuller_label` and ordered by the design
# columns listed below.
d0 = _meta.join(_frac)
d0["species_rabund"] = motu_rabund2.sel(species=species_id).to_series()
d0["norm_cog_depth"] = normalized_cog_depth.to_series()
d0 = d0.set_index("fuller_label").sort_values(
    by=[
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
)

_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0.loc[d0.subject_id == subject_id]
    if d1.empty:
        # No samples for this subject: leave its panel untouched.
        continue

    # Stacked bars: strain composition of each sample.
    d1.loc[:, strain_list].plot.bar(
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
        ax=ax,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # First twin axis: normalized COG depth on a symlog scale.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Second twin axis: species relative abundance; its right spine is moved
    # outward so it does not sit on top of the first twin axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotations of the genes assigned to the focal COG, keeping the first row
# for each distinct seed ortholog.
gene_meta.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).drop_duplicates(
    subset="seed_ortholog"
)

In [None]:
# Per-strain depth ratios for the focal COG's genes; rows with any missing
# value (including genes absent from the table) are dropped.
spgc_depth_ratio.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).dropna()

In [None]:
# Per-strain correlations for the focal COG's genes; rows with any missing
# value (including genes absent from the table) are dropped.
spgc_corr.reindex(index=gene_x_cog.loc[gene_x_cog == cog_id].index).dropna()

In [None]:
# Long-format gene depth converted to a DataArray and divided by the depth of
# the current species (xarray aligns the operands on shared coordinates).
normalized_gene_depth = xr.DataArray.from_series(
    gene_depth.stack()
) / motu_depth2.sel(species=species_id)

In [None]:
# Strain fractions per sample, with strain labels cast to str so they match
# the strain lists used for column selection below.
_community = world.community.to_pandas().rename(columns=str)
# Genes of the focal COG that also appear in the normalized gene-depth array.
_cog_genes_with_depth = list(
    set(gene_x_cog.loc[gene_x_cog == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)
d = pd.DataFrame(
    {
        "with_gene_fraction": _community[strain_with_gene_list].sum(axis=1),
        "without_gene_fraction": _community[strain_without_gene_list].sum(axis=1),
        # Depth summed over the selected genes.
        "gene_depth": (
            gene_depth.stack()
            .to_xarray()
            .sel(gene_id=_cog_genes_with_depth)
            .to_pandas()
            .sum(axis=1)
        ),
        "species_depth": motu_depth2.sel(species=species_id).to_series(),
    }
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})


# Gene depth against species depth, coloured by the summed fraction of strains
# called as carrying the gene. Rows are sorted so high fractions are drawn last.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="with_gene_fraction",
    data=d.sort_values(by="with_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # 1:1 reference line
plt.colorbar(label="Strain With Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

# Same plot, coloured by the summed fraction of strains called as lacking it.
fig = plt.figure()
plt.scatter(
    x="species_depth",
    y="gene_depth",
    c="without_gene_fraction",
    data=d.sort_values(by="without_gene_fraction"),
    norm=mpl.colors.PowerNorm(gamma=1, vmin=0, vmax=1),
)
plt.plot([1, 1000], [1, 1000])  # 1:1 reference line
plt.colorbar(label="Strain Without Gene Fraction")
plt.xscale("symlog", linthresh=1e-1)
plt.yscale("symlog", linthresh=1e-1)
plt.ylabel("gene depth")
plt.xlabel("species depth")

### 101346

In [None]:
# Species analysed in this section; interpolated into the data paths and used
# to select the `species` coordinate in the following cells.
species_id = "101346"

In [None]:
# Seed NumPy's global RNG; presumably consumed by `world.random_sample` below
# so the position subsample is repeatable (confirm against sfacts).
np.random.seed(0)

# Load the fitted world, normalise sample names to "CF_<int>", swap the labels
# of CF_11 and CF_15 (NOTE(review): presumably a known label mix-up - confirm),
# and drop strains below the 0.01 abundance threshold.
world = (
    sf.data.World.load(
        f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.world.nc"
    )
    .rename_coords(sample=lambda s: "CF_{}".format(int(s.split("_")[1])))
    .rename_coords(sample={"CF_11": "CF_15", "CF_15": "CF_11"})
    .drop_low_abundance_strains(0.01)
)


# Samples from subjects A, B and H that are present in the world. Sorted
# because the iteration order of a set of strings changes between interpreter
# sessions (hash randomization), which would otherwise make the sample order
# fed to the linkage and plotting code differ from run to run.
subject_abh_sample_list = sorted(
    set(idxwhere(sample.subject_id.isin(["A", "B", "H"]))) & set(world.sample.values)
)
world_subject_abh = world.sel(
    sample=subject_abh_sample_list
).drop_low_abundance_strains(0.05)
# Random subsample of at most 1000 positions, used for the metagenotype plot.
position_ss = world.random_sample(position=min(1000, world.sizes["position"])).position

# Linkages for the full world and for the A/B/H subset.
sample_linkage = world.unifrac_linkage()
strain_linkage = world.genotype.linkage()
subject_abh_sample_linkage = world_subject_abh.unifrac_linkage()
subject_abh_strain_linkage = world_subject_abh.genotype.linkage()
subject_abh_position_ss_linkage = world_subject_abh.sel(
    position=position_ss
).genotype.linkage("position")

# Column colours keyed by `fuller_label`, one colour per subject.
_col_colors = sample.set_index("fuller_label").subject_id.map(subject_palette)


# The plots reuse the precomputed linkages instead of re-clustering.
sf.plot.plot_community(
    world_subject_abh.rename_coords(sample=sample.fuller_label),
    scalex=0.3,
    scaley=0.7,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_strain_linkage,
    col_colors=_col_colors,
)

sf.plot.plot_metagenotype(
    world_subject_abh.sel(position=position_ss).rename_coords(
        sample=sample.fuller_label
    ),
    scalex=0.3,
    col_linkage_func=lambda w: subject_abh_sample_linkage,
    row_linkage_func=lambda w: subject_abh_position_ss_linkage,
    col_colors=_col_colors,
)

In [None]:
# Load the per-species tables used by the COG-level cells below. All input
# paths are spelled out first; the loading logic follows.
_strain_meta_path = f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.strain_meta.tsv"
_strain_gene_path = f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10_thresh-corr450-depth200.uhgg-strain_gene.tsv"
_emapper_path = f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations"
_gene_x_cog_path = (
    f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_cog.tsv"
)
_depth_ratio_path = f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_depth_ratio.tsv"
_correlation_path = f"data/group/een/species/sp-{species_id}/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts48-s85-seed0.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-10.strain_correlation.tsv"
_gene_depth_path = (
    f"data/group/een/species/sp-{species_id}/r.proc.gene99_new-v22-agg75.depth2.nc"
)
_eggnog_prevalence_path = f"data/species/sp-{species_id}/midasdb.gene75_new.eggnog-strain_gene.prevalence.tsv"

# Strain metadata, indexed by genome id with the index converted to strings.
spgc_strain_meta = pd.read_table(_strain_meta_path, index_col="genome_id").rename(str)

# Gene-by-strain table, indexed by gene id.
spgc = pd.read_table(_strain_gene_path, index_col="gene_id")

# eggNOG-mapper annotations: comment lines are skipped, so the column names
# are supplied here; "-" placeholders become NaN.
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_inpathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(
    " "
)
_raw_annotations = pd.read_table(
    _emapper_path,
    comment="#",
    names=eggnog_column_names,
    index_col="query",
)
gene_meta = _raw_annotations.rename_axis(index="gene_id").replace({"-": np.nan})

# Gene -> COG assignment, de-duplicated and indexed by gene id.
_gene_cog_pairs = pd.read_table(_gene_x_cog_path).drop_duplicates()
gene_x_cog = _gene_cog_pairs.set_index("gene_id").squeeze()

# Long-format (gene, strain) tables pivoted to gene x strain.
spgc_depth_ratio = pd.read_table(_depth_ratio_path, index_col=["gene_id", "strain"])[
    "depth"
].unstack()
spgc_corr = pd.read_table(_correlation_path, index_col=["gene_id", "strain"])[
    "correlation"
].unstack()

# Gene depth table. Row labels are normalized to "CF_<integer>" and the
# CF_11 / CF_15 labels are swapped.
# NOTE(review): the swap presumably corrects a known sample mix-up — confirm.
_gene_depth_raw = xr.load_dataarray(_gene_depth_path).to_pandas()
gene_depth = _gene_depth_raw.rename(
    lambda s: "CF_{}".format(int(s.split("_")[1]))
).rename({"CF_11": "CF_15", "CF_15": "CF_11"})

# Prevalence table; `cog_id` is the part of `eggnog_id` before the first "@".
eggnog_prevalence_in_refs = pd.read_table(
    _eggnog_prevalence_path,
    names=["eggnog_id", "prevalence"],
).assign(cog_id=lambda x: x.eggnog_id.str.split("@").str[0])

# Strains passing both the summed-depth and the species-gene-fraction cutoffs.
_passes_depth = spgc_strain_meta.sum_depth > 1
_passes_species_gene_frac = spgc_strain_meta.species_gene_frac > 0.9
high_quality_strain_list = idxwhere(_passes_depth & _passes_species_gene_frac)

spgc_strain_meta

In [None]:
# Maximum prevalence per COG, keeping only rows whose `cog_id` starts with
# "COG".
_is_cog = eggnog_prevalence_in_refs.cog_id.str.startswith("COG")
_cog_rows = eggnog_prevalence_in_refs[_is_cog][["cog_id", "prevalence"]]
cog_prevalence_in_refs = (
    _cog_rows.drop_duplicates().groupby("cog_id")["prevalence"].max()
)

In [None]:
# Test results annotated with this species' fraction of each COG, the COG's
# prevalence, and COG metadata.
_annotated_results = (
    pairwise_test_results_filt_with_fdr.assign(
        species_frac=cog_species_fraction.sel(species=species_id).to_series()
    )
    .join(cog_prevalence_in_refs)
    .join(cog_meta)
)

# Keep rows passing all four thresholds.
_is_candidate = (
    (_annotated_results.species_frac > 0.1)
    & (np.abs(_annotated_results.mean_log_ratio) > 0.5)
    & (_annotated_results.prevalence < 0.95)
    & (_annotated_results.pval < 0.1)
)

# Displayed with the largest mean log ratio first.
_annotated_results[_is_candidate].sort_values("mean_log_ratio", ascending=False)

#### COG1231

In [None]:
# COG examined in this subsection. The cells below use it to pick genes out
# of `gene_x_cog` and to select along the `cog` dimension of the depth arrays.
cog_id = "COG1231"

In [None]:
# Normalized COG depth for the current species, transposed so that COGs are
# the columns.
_species_cog_depth = species_specific_normalized_cog_depth_by_sample.sel(
    species=species_id
)
d = _species_cog_depth.to_pandas().transpose()

# Two overlaid histograms on shared log-spaced bins: the per-COG means first,
# then the values for the current COG.
bins = np.logspace(-3, 4)
for _values in (d.mean(), d[cog_id]):
    plt.hist(_values, bins=bins)
plt.xscale("log")

In [None]:
# Depth of the current COG relative to the depth of the current species.
_species_depth = motu_depth2.sel(species=species_id)
_cog_depth = cog_depth_or_detection_limit.sel(species=species_id, cog=cog_id)
normalized_cog_depth = _cog_depth / _species_depth

# COG depth against species depth on log-log axes, colored by their ratio.
plt.scatter(_species_depth, _cog_depth, c=normalized_cog_depth)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Rows of the gene-by-strain table `spgc` for the genes assigned to the
# current COG; genes without a complete row in `spgc` are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc.reindex(_cog_gene_ids).dropna()

In [None]:
# Split strains into three groups for the current COG:
#   - with the gene:    high-quality strains where any of the COG's genes
#                       equals 1 in `spgc`
#   - without the gene: the remaining high-quality strains
#   - unknown:          every other strain in the community table, plus "-1"
_cog_gene_ids = gene_x_cog[lambda x: x == cog_id].index

# Presence mask over high-quality strains, computed once and used for both
# the "with" and the "without" list.
_strain_has_gene = (
    spgc[high_quality_strain_list].reindex(_cog_gene_ids) == 1
).any()
strain_with_gene_list = idxwhere(_strain_has_gene)
strain_without_gene_list = idxwhere(~_strain_has_gene)

frac = world.community.to_pandas().rename(columns=str)

# "-1" always comes first. The rest is sorted because the iteration order of
# a set of strings changes between interpreter sessions, and this list fixes
# both the bar stacking order and the palette assignment downstream.
strain_gene_unknown_list = ["-1"] + sorted(
    set(frac.columns)
    - set(strain_with_gene_list)
    - set(strain_without_gene_list)
    - {"-1"}
)

strain_list = (
    strain_with_gene_list + strain_without_gene_list + strain_gene_unknown_list
)

# Displayed as a quick summary of the three groups.
strain_with_gene_list, strain_without_gene_list, len(strain_gene_unknown_list)

In [None]:
# One palette covering all strains, with a separate colormap per group:
# "autumn" for strains with the gene, "winter" for strains without it, and
# "Greys" for the unknown group. Each later call extends the palette built
# so far.
strain_by_cog_palette = lib.plot.construct_ordered_palette(
    strain_with_gene_list, cm="autumn"
)
for _strains, _colormap in [
    (strain_without_gene_list, "winter"),
    (strain_gene_unknown_list, "Greys"),
]:
    strain_by_cog_palette = lib.plot.construct_ordered_palette(
        _strains, cm=_colormap, extend=strain_by_cog_palette
    )

In [None]:
# Stacked strain-fraction bars per sample for the first three subjects in
# `subject_order`, overlaid with normalized COG depth and species relative
# abundance on two extra y-axes.
_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]

_meta = sample.loc[motu_depth2.sample]
# Samples without strain fractions get all-zero rows ...
_frac = frac.reindex(_meta.index, fill_value=0)
# ... and "-1" is then reset to whatever the other strains leave over, so
# every row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(1)

_species_rabund = motu_rabund2.sel(species=species_id).to_series()
_norm_cog_depth = normalized_cog_depth.to_series()
d0 = (
    _meta.join(_frac)
    .assign(species_rabund=_species_rabund, norm_cog_depth=_norm_cog_depth)
    .set_index("fuller_label")
    .sort_values(_sort_columns)
)

_subject_list = subject_order[:3]
fig, axs = lib.plot.subplots_grid(1, len(_subject_list), ax_width=20, ax_height=10)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        # Nothing to draw for this subject; its axes stay blank.
        continue

    # Strain composition, stacked in `strain_list` order.
    d1[strain_list].plot.bar(
        ax=ax,
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # Second y-axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Third y-axis: species relative abundance, with its spine pushed outward
    # so it does not sit on top of the second axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.04))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Same per-subject stacked-bar figure as the previous cell, but restricted to
# samples whose `diet_or_media` is "EEN" or "PostEEN" and drawn for every
# subject in `subject_order`.
_sort_columns = [
    "subject_id",
    "collection_date_relative_een_end",
    "sample_type",
    "diet_or_media",
    "mouse_genotype",
    "status_mouse_inflamed",
]

_all_meta = sample.loc[motu_depth2.sample]
_meta = _all_meta[_all_meta.diet_or_media.isin(["EEN", "PostEEN"])]
# Samples without strain fractions get all-zero rows ...
_frac = frac.reindex(_meta.index, fill_value=0)
# ... and "-1" is then reset to whatever the other strains leave over, so
# every row sums to 1.
_frac["-1"] = 1 - _frac.drop(columns=["-1"]).sum(1)

_species_rabund = motu_rabund2.sel(species=species_id).to_series()
_norm_cog_depth = normalized_cog_depth.to_series()
d0 = (
    _meta.join(_frac)
    .assign(species_rabund=_species_rabund, norm_cog_depth=_norm_cog_depth)
    .set_index("fuller_label")
    .sort_values(_sort_columns)
)

_subject_list = subject_order
fig, axs = lib.plot.subplots_grid(6, len(_subject_list), ax_width=6, ax_height=7)

for subject_id, ax in zip(_subject_list, axs.flatten()):
    d1 = d0[d0.subject_id == subject_id]
    if d1.empty:
        # Nothing to draw for this subject; its axes stay blank.
        continue

    # Strain composition, stacked in `strain_list` order.
    d1[strain_list].plot.bar(
        ax=ax,
        stacked=True,
        width=0.95,
        color=strain_by_cog_palette,
        edgecolor="k",
        lw=0.5,
    )
    ax.set_title(subject_id)
    ax.get_legend().set_visible(False)

    # Second y-axis: normalized COG depth.
    ax2 = ax.twinx()
    ax2.plot(d1["norm_cog_depth"], color="k", marker="o", lw=1, markersize=5)
    ax2.set_yscale("symlog", linthresh=1e-2, linscale=0.1)

    # Third y-axis: species relative abundance, with its spine pushed outward
    # so it does not sit on top of the second axis.
    ax3 = ax.twinx()
    ax3.plot(d1["species_rabund"], color="k", marker="o")
    ax3.set_yscale("symlog", linthresh=1e-4, linscale=0.1)
    ax3.spines["right"].set_position(("axes", 1.15))

fig.tight_layout()

# frac.rename(index=sample.fuller_label).sort_index()

In [None]:
# Annotation rows for the genes assigned to the current COG, keeping only the
# first row for each distinct `seed_ortholog`.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
gene_meta.reindex(_cog_gene_ids).drop_duplicates(subset=["seed_ortholog"])

In [None]:
# Gene x strain depth-ratio rows for the genes assigned to the current COG;
# genes absent from the table or with any missing strain value are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_depth_ratio.reindex(_cog_gene_ids).dropna()

In [None]:
# Gene x strain correlation rows for the genes assigned to the current COG;
# genes absent from the table or with any missing strain value are dropped.
_cog_gene_ids = gene_x_cog[gene_x_cog == cog_id].index
spgc_corr.reindex(_cog_gene_ids).dropna()

In [None]:
# Gene depth divided by the depth of the current species. `gene_depth` is
# reshaped (stack -> xarray) first so that the division is carried out by
# xarray against the species-depth array.
_species_depth = motu_depth2.sel(species=species_id)
normalized_gene_depth = gene_depth.stack().to_xarray() / _species_depth

In [None]:
# Per-sample summary for the current COG: summed community fraction of the
# strains with / without the gene, summed depth of the COG's genes, and the
# depth of the species itself.

# Community table with string strain labels; converted once and reused for
# both strain groups.
_community = world.community.to_pandas().rename(columns=str)

# Genes of this COG that are present in the depth data. Sorted so that the
# selection (and therefore the column order being summed) is reproducible:
# iteration order of a set of strings is not stable across sessions.
_cog_gene_ids = sorted(
    set(gene_x_cog[lambda x: x == cog_id].index)
    & set(normalized_gene_depth.gene_id.values)
)

d = pd.DataFrame(
    dict(
        with_gene_fraction=_community[strain_with_gene_list].sum(1),
        without_gene_fraction=_community[strain_without_gene_list].sum(1),
        gene_depth=gene_depth.stack()
        .to_xarray()
        .sel(gene_id=_cog_gene_ids)
        .to_pandas()
        .sum(1),
        species_depth=motu_depth2.sel(species=species_id).to_series(),
    )
    # Samples missing from the community table get a fraction of 0; missing
    # depths are left as NaN.
).fillna({"with_gene_fraction": 0, "without_gene_fraction": 0})

# One figure per strain group: gene depth against species depth, colored by
# the summed fraction of that group.
for _fraction_column, _colorbar_label in [
    ("with_gene_fraction", "Strain With Gene Fraction"),
    ("without_gene_fraction", "Strain Without Gene Fraction"),
]:
    fig = plt.figure()
    plt.scatter(
        "species_depth",
        "gene_depth",
        c=_fraction_column,
        # Ascending sort, so the samples with the largest fraction are
        # plotted last.
        data=d.sort_values(_fraction_column),
        norm=mpl.colors.PowerNorm(1, vmin=0, vmax=1),
    )
    plt.plot([1, 1000], [1, 1000])  # y = x reference line
    plt.colorbar(label=_colorbar_label)
    plt.yscale("symlog", linthresh=1e-1)
    plt.xscale("symlog", linthresh=1e-1)
    plt.ylabel("gene depth")
    plt.xlabel("species depth")