In [None]:
%load_ext autoreload

In [None]:
# Run the notebook from the project root, taken from the PROJECT_ROOT
# environment variable (a KeyError is raised if it is unset), so that the
# relative data/, ref/ and meta/ paths used below resolve.
import os as _os
_os.chdir(_os.environ['PROJECT_ROOT'])
# Cell output: the resolved working directory, as a sanity check.
_os.path.realpath(_os.path.curdir)

#### Imports

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import xarray as xr
from lib.pandas_util import idxwhere, aligned_index, align_indexes, invert_mapping
import lib.thisproject.data
import matplotlib as mpl
import lib.plot
import statsmodels as sm
from statsmodels.stats.multitest import fdrcorrection
from tqdm import tqdm
import subprocess
from tempfile import mkstemp
import time
import subprocess
from itertools import chain
import os
from itertools import product
from mpl_toolkits.axes_grid1 import make_axes_locatable
import sfacts as sf

In [None]:
# Species analysed by this notebook; interpolated into every data path below.
# The trailing comment keeps an alternative species id for quick switching.
species_id = '102327'  # '100236'

# Load the species taxonomy table and display the row for the chosen species.
species_taxonomy = lib.thisproject.data.load_species_taxonomy("ref/gtpro/species_taxonomy_ext.tsv")
species_taxonomy.loc[species_id]

In [None]:
# Sample -> SPGC strain label (as str), read from the strain_samples table.
sample_to_spgc = (
    pd.read_table(
        f"data/group/xjin_hmp2/species/sp-{species_id}/r.proc.gtpro.sfacts-fit.spgc_ss-all.strain_samples.tsv",
        index_col='sample',
    )
    .strain
    .astype(str)
)

# Palette keyed by strain label; `other` is the light-grey fallback colour.
spgc_palette = lib.plot.construct_ordered_palette(
    sample_to_spgc,
    other=(0.8, 0.8, 0.8, 1.0),
)

In [None]:
# Load the fitted sfacts world for this species and drop low-abundance strains
# (threshold argument 0.5).
sfacts_fit = sf.World.load(f"data/group/xjin_hmp2/species/sp-{species_id}/r.proc.gtpro.sfacts-fit.world.nc").drop_low_abundance_strains(0.5)
# Strains should be str not int.
sfacts_fit.data['strain'] = sfacts_fit.strain.values.astype(str)
print(dict(sfacts_fit.sizes))
# Subsample at most 500 positions for plotting.
# NOTE(review): no seed is set in this cell, so the subsample presumably differs
# between runs -- confirm how sf.World.random_sample draws its sample.
position_ss = sfacts_fit.random_sample(position=min(500, sfacts_fit.sizes['position'])).position

# Pre-calculate shared heatmap decorations
w = sfacts_fit.sel(position=position_ss)
sample_linkage = w.unifrac_linkage()
position_linkage = w.metagenotype.linkage("position")
# Samples without an SPGC strain map to NaN in the first .map(); presumably the
# palette's `other` colour handles those -- verify against lib.plot.
sample_colors = w.sample.to_series().map(sample_to_spgc).map(spgc_palette)
spgc_linkage = w.genotype.linkage()
spgc_colors = w.strain.to_series().map(spgc_palette)

In [None]:
# Pool metagenotype counts over the samples assigned to each SPGC strain, then
# estimate one genotype per strain from the pooled counts (pseudocount 1).
_spgc_pooled_mgtp = (
    sfacts_fit.metagenotype.data
    .sel(sample=sample_to_spgc.index)
    .groupby(sample_to_spgc.to_xarray())
    .sum()
    # sf.Metagenotype is built with a "sample" dimension, so the grouped
    # "strain" dimension is renamed before wrapping.
    .rename(strain="sample")
)
spgc_est_geno = sf.Metagenotype(_spgc_pooled_mgtp).to_estimated_genotype(pseudo=1)

In [None]:
# Plot the pooled-metagenotype estimates ("mgen") next to the sfacts-fitted
# genotypes ("fit") on the subsampled positions.
_geno_mgen_vs_fit = sf.data.Genotype.concat(
    dict(mgen=spgc_est_geno, fit=sfacts_fit.genotype),
    dim='strain',
)
sf.plot.plot_genotype(
    _geno_mgen_vs_fit.sel(position=position_ss),
    transpose=True,
)

In [None]:
# Reference genotypes, estimated from the GT-Pro reference metagenotype file.
_ref_geno_path = f"data/species/sp-{species_id}/gtpro_ref.mgtp.nc"
ref_geno = sf.Metagenotype.load(_ref_geno_path).to_estimated_genotype()
# Relabel strains: replace the first len("GUT_GENOME") characters of each id
# with "UHGG".
ref_geno.data['strain'] = ref_geno.strain.to_series().map(
    lambda genome_id: "UHGG" + genome_id[len("GUT_GENOME"):]
)

In [None]:
# NOTE: Select any ref genotype that is within the top-5 closest distances from an SPGC strain.
_ref_geno_shared = ref_geno.sel(position=spgc_est_geno.position)
spgc_to_ref_geno_cdist = _ref_geno_shared.cdist(spgc_est_geno)
# Per column of the cross-distance table: take the 5 smallest entries, then
# pool the selected row labels across columns and de-duplicate.
ref_list = (
    spgc_to_ref_geno_cdist
    .apply(lambda dist: dist.sort_values().head(5).index)
    .stack()
    .unique()
)

In [None]:
# Per-strain SPGC metadata; index labels are converted with str so they match
# the string strain ids used elsewhere in the notebook.
spgc_meta = (
    pd.read_table(
        f"data/group/xjin_hmp2/species/sp-{species_id}/r.proc.gtpro.sfacts-fit.gene99-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-30_thresh-corr200-depth250.strain_meta.tsv",
        index_col='strain',
    )
    .rename(str)
)
print(spgc_meta.shape)
spgc_meta

In [None]:
# Reference gene copy numbers, reduced to a 0/1 presence table
# (copy number > 0) and transposed after conversion to pandas.
ref_gene_copy_number_uhgg = xr.load_dataarray(
    f"ref/midasdb_uhgg_pangenomes/{species_id}/gene75.reference_copy_number.nc"
)
_ref_gene_present = ref_gene_copy_number_uhgg > 0
ref_gene_uhgg = _ref_gene_present.astype(int).to_pandas().T

In [None]:
# Display the dimensions of the reference gene presence table.
ref_gene_uhgg.shape

In [None]:
# SPGC gene table indexed by gene_id; the column axis is named 'strain'.
spgc_gene_uhgg = (
    pd.read_table(
        f"data/group/xjin_hmp2/species/sp-{species_id}/r.proc.gtpro.sfacts-fit.gene99-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-30_thresh-corr200-depth250.strain_gene.tsv",
        index_col='gene_id',
    )
    .rename_axis(columns='strain')
)

In [None]:
# Column totals of each gene table, i.e. one gene count per column.
ref_num_genes_uhgg = ref_gene_uhgg.sum(axis=0)
spgc_num_genes_uhgg = spgc_gene_uhgg.sum(axis=0)

In [None]:
# Derive gene-count thresholds from the SPGC strains with
# species_gene_frac > 0.9, and compare their gene counts with the references.
x = spgc_meta[lambda meta: meta.species_gene_frac > 0.9].num_genes
y = ref_num_genes_uhgg

# Fit a Student-t with df fixed at 2, then reuse its location and scale for a
# normal distribution.
_df, _loc, _scale = sp.stats.t.fit(x.values, fix_df=2)
_dist0 = sp.stats.t(_df, _loc, _scale)
_dist1 = sp.stats.norm(_loc, _scale)

# Thresholds are the 0.1% and 99.9% quantiles of the normal.
thresh_min_num_uhgg_genes = _dist1.ppf(0.001)
thresh_max_num_uhgg_genes = _dist1.ppf(0.999)

_x_upper = x.max() * 1.5
bins = np.linspace(0, _x_upper, num=50)
xx = np.linspace(0, _x_upper, num=1000)

# Gene-count histograms: SPGC first, then references.
for _num_genes in (x, y):
    plt.hist(_num_genes, density=True, bins=bins, alpha=0.2)

# Solid: fitted t density.  Dashed: normal density and the two thresholds.
plt.plot(xx, _dist0.pdf(xx), color='k')
plt.plot(xx, _dist1.pdf(xx), color='k', linestyle='--')
plt.axvline(thresh_max_num_uhgg_genes, lw=1, linestyle='--', color='k')
plt.axvline(thresh_min_num_uhgg_genes, lw=1, linestyle='--', color='k')

In [None]:
# Display the (min, max) gene-count thresholds derived above.
thresh_min_num_uhgg_genes, thresh_max_num_uhgg_genes

In [None]:
# NOTE: Select SPGC strains that pass various filters
spgc_list = idxwhere(
    (spgc_meta.sum_depth > 5)
    & (spgc_meta.species_gene_frac > 0.9)
    & (spgc_num_genes_uhgg <= thresh_max_num_uhgg_genes)
    & (spgc_num_genes_uhgg >= thresh_min_num_uhgg_genes)
)

print(len(ref_list), len(spgc_list))

In [None]:
# Combine the selected reference genotypes (restricted to the positions of the
# SPGC estimates) with the selected SPGC genotypes along the strain dimension.
# rename=False presumably keeps the original strain labels rather than
# prefixing them with the dict keys -- TODO confirm against sf.Genotype.concat.
# mlift('fillna', 0.5) presumably fills missing genotype values with 0.5.
strain_geno = sf.Genotype.concat(dict(
    ref=ref_geno.sel(strain=ref_list, position=spgc_est_geno.position),
    spgc=spgc_est_geno.sel(strain=spgc_list),
), dim='strain', rename=False).mlift('fillna', 0.5)

# Pairwise strain distances and the matching linkage, reused by later cells.
strain_geno_pdist = strain_geno.pdist()
strain_geno_linkage = strain_geno.linkage()

In [None]:
# Genotype heatmap of the combined ref + SPGC strains on the subsampled
# positions, using the precomputed strain linkage.
sf.plot.plot_genotype(
    strain_geno.sel(position=position_ss),
    transpose=True,
    col_linkage=strain_geno_linkage,
)

In [None]:
# Clustered heatmap of genotype distances; the annotation bars flag membership
# in ref_list / spgc_list (black = member, grey = not).
_strain_ids = strain_geno_pdist.index.to_series()
_membership_colors = {True: 'black', False: 'grey'}
colors = pd.DataFrame(dict(
    is_ref=_strain_ids.isin(ref_list).map(_membership_colors),
    is_spgc=_strain_ids.isin(spgc_list).map(_membership_colors),
))
sns.clustermap(
    strain_geno_pdist,
    row_linkage=strain_geno_linkage,
    col_linkage=strain_geno_linkage,
    row_colors=colors,
    col_colors=colors,
    # figsize=(40, 40),
)

In [None]:
from scipy.spatial.distance import squareform

# Distribution of pairwise genotype distances within and between groups.
# squareform() condenses each square within-group block so every pair counts
# once; the cross-group block is simply flattened.
bins = np.linspace(0, 1)
_step_hist = dict(bins=bins, histtype='step', density=True)
plt.hist(squareform(strain_geno_pdist.loc[ref_list, ref_list]), label='ref-to-ref', **_step_hist)
plt.hist(strain_geno_pdist.loc[ref_list, spgc_list].values.flatten(), label='spgc-to-ref', **_step_hist)
plt.hist(squareform(strain_geno_pdist.loc[spgc_list, spgc_list]), label='spgc-to-spgc', **_step_hist)
plt.legend()

In [None]:
# Cumulative distribution of each strain's nearest-neighbour genotype distance.
bins = np.linspace(0, 1, num=200)

# Adding the identity lifts the zero self-distances so they never win the min.
_pdist = strain_geno_pdist + np.eye(len(strain_geno_pdist))

# (row group, column group, label); .min() is taken down each column.
_min_dist_groups = [
    (ref_list, ref_list, 'ref-to-ref'),
    (ref_list, spgc_list, 'spgc-to-ref'),
    (spgc_list, ref_list, 'ref-to-spgc'),
    (spgc_list, spgc_list, 'spgc-to-spgc'),
]
for _rows, _cols, _label in _min_dist_groups:
    plt.hist(
        _pdist.loc[_rows, _cols].min(),
        bins=bins,
        histtype='step',
        cumulative=True,
        density=True,
        label=_label,
    )

plt.legend()
# plt.axvline(0.43, lw=1, linestyle='--', color='k')
plt.xlabel('minimum distance')
plt.ylabel('cumulative fraction')

In [None]:
# Display the dimensions of the SPGC and reference gene tables side by side.
spgc_gene_uhgg.shape, ref_gene_uhgg.shape

In [None]:
# Gene x strain table for the selected ref and SPGC strains.  Genes missing
# from one source are filled with 0, and only genes with a positive row sum
# are kept.
strain_gene_uhgg = (
    pd.concat([ref_gene_uhgg[ref_list], spgc_gene_uhgg[spgc_list]], axis=1)
    .fillna(0)
    [lambda gene_table: gene_table.sum(axis=1) > 0]
)
strain_gene_uhgg.shape

In [None]:
# Genes per strain: reference vs SPGC.
bins = np.linspace(0, 10000)
for _strains, _label in ((ref_list, 'ref'), (spgc_list, 'spgc')):
    plt.hist(strain_gene_uhgg[_strains].sum(), bins=bins, histtype='step', label=_label, density=True)
plt.legend()

# Suppress the cell's repr output.
None

In [None]:
%%time
# NOTE: This will take ~2 minutes to run for 40,000 genes.
# Condensed cosine distances between genes (rows of strain_gene_uhgg), plus the
# same distances as a labelled square DataFrame.
gene_uhgg_cdmat = sp.spatial.distance.pdist(strain_gene_uhgg, metric='cosine')
gene_uhgg_pdist = pd.DataFrame(sp.spatial.distance.squareform(gene_uhgg_cdmat), index=strain_gene_uhgg.index, columns=strain_gene_uhgg.index)

In [None]:
%%time
# Average-linkage hierarchical clustering of genes from the condensed cosine
# distances; the shape is displayed as a sanity check.
gene_uhgg_linkage = sp.cluster.hierarchy.linkage(gene_uhgg_cdmat, method='average')
gene_uhgg_linkage.shape

In [None]:
# Jaccard distances between strains (columns of strain_gene_uhgg), in condensed
# form and as a labelled square DataFrame.
strain_gene_uhgg_cdmat = sp.spatial.distance.pdist(strain_gene_uhgg.T, metric='jaccard')
strain_gene_uhgg_pdist = pd.DataFrame(sp.spatial.distance.squareform(strain_gene_uhgg_cdmat), index=strain_gene_uhgg.columns, columns=strain_gene_uhgg.columns)

# Later cells index the genotype and gene-content distance matrices with the
# same labels and reuse linkages across them, so the strain order must match.
assert (strain_geno_pdist.index == strain_gene_uhgg_pdist.index).all()

In [None]:
# Average-linkage clustering of strains from their Jaccard gene-content distances.
strain_gene_uhgg_linkage = sp.cluster.hierarchy.linkage(strain_gene_uhgg_cdmat, method='average')

In [None]:
# Gene-content heatmap: columns follow the genotype linkage, rows are
# pre-sorted by the gene dendrogram (row clustering itself is disabled).
_col_linkage = strain_geno_linkage
_row_linkage = gene_uhgg_linkage
# Order x by leaf order.
# See <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.ClusterNode.pre_order.html#scipy.cluster.hierarchy.ClusterNode.pre_order>
_leaf_order = sp.cluster.hierarchy.to_tree(_row_linkage).pre_order(lambda node: node.id)
x = strain_gene_uhgg.iloc[_leaf_order]

# Column annotation bars: black marks membership in ref_list / spgc_list.
_strain_ids = x.columns.to_series()
_membership_colors = {True: 'black', False: 'white'}
colors = pd.DataFrame(dict(
    is_ref=_strain_ids.isin(ref_list).map(_membership_colors),
    is_spgc=_strain_ids.isin(spgc_list).map(_membership_colors),
))

sns.clustermap(x, row_cluster=False, col_linkage=_col_linkage, col_colors=colors)

In [None]:
# Gene-content heatmap: columns follow the gene-content (Jaccard) linkage,
# rows are pre-sorted by the gene dendrogram.
_col_linkage = strain_gene_uhgg_linkage
_row_linkage = gene_uhgg_linkage
# Order x by leaf order.
# See <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.ClusterNode.pre_order.html#scipy.cluster.hierarchy.ClusterNode.pre_order>
_leaf_order = sp.cluster.hierarchy.to_tree(_row_linkage).pre_order(lambda node: node.id)
x = strain_gene_uhgg.iloc[_leaf_order]

# Column annotation bars: black marks membership in ref_list / spgc_list.
_strain_ids = x.columns.to_series()
_membership_colors = {True: 'black', False: 'white'}
colors = pd.DataFrame(dict(
    is_ref=_strain_ids.isin(ref_list).map(_membership_colors),
    is_spgc=_strain_ids.isin(spgc_list).map(_membership_colors),
))

sns.clustermap(x, row_cluster=False, col_linkage=_col_linkage, col_colors=colors)

In [None]:
# Long-format per-(gene, strain) tables pivoted to gene x strain, with the
# strain column labels converted to str.
spgc_gene_uhgg_depth = (
    pd.read_table(
        f"data/group/xjin_hmp2/species/sp-{species_id}/r.proc.gtpro.sfacts-fit.gene99-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-30.strain_depth_ratio.tsv",
        index_col=['gene_id', 'strain'],
    )
    .depth
    .unstack('strain')
    .rename(columns=str)
)
spgc_gene_uhgg_corr = (
    pd.read_table(
        f"data/group/xjin_hmp2/species/sp-{species_id}/r.proc.gtpro.sfacts-fit.gene99-v22-agg75.spgc_specgene-ref-t25-p95_ss-all_t-30.strain_correlation.tsv",
        index_col=['gene_id', 'strain'],
    )
    .correlation
    .unstack('strain')
    .rename(columns=str)
)

In [None]:
# Depth analogue of strain_gene_uhgg: reference presence (0/1) next to the
# SPGC depth values, restricted to the genes kept in strain_gene_uhgg.
_ref = ref_gene_uhgg[ref_list]
_spgc = spgc_gene_uhgg_depth[spgc_list]
strain_gene_uhgg_depth = (
    pd.concat([_ref, _spgc], axis=1)
    .fillna(0)
    .loc[strain_gene_uhgg.index]
)

In [None]:
# Correlation analogue of strain_gene_uhgg: reference presence (0/1) next to
# the SPGC correlation values, restricted to the genes kept in strain_gene_uhgg.
_ref = ref_gene_uhgg[ref_list]
_spgc = spgc_gene_uhgg_corr[spgc_list]
strain_gene_uhgg_corr = (
    pd.concat([_ref, _spgc], axis=1)
    .fillna(0)
    .loc[strain_gene_uhgg.index]
)

In [None]:
# Depth heatmap: columns follow the gene-content linkage, rows are pre-sorted
# by the gene dendrogram; colours use a square-root (power 1/2) scale on [0, 2].
_col_linkage = strain_gene_uhgg_linkage
_row_linkage = gene_uhgg_linkage
# Order x by leaf order.
# See <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.ClusterNode.pre_order.html#scipy.cluster.hierarchy.ClusterNode.pre_order>
_leaf_order = sp.cluster.hierarchy.to_tree(_row_linkage).pre_order(lambda node: node.id)
x = strain_gene_uhgg_depth.iloc[_leaf_order]

# Column annotation bars: black marks membership in ref_list / spgc_list.
_strain_ids = x.columns.to_series()
_membership_colors = {True: 'black', False: 'white'}
colors = pd.DataFrame(dict(
    is_ref=_strain_ids.isin(ref_list).map(_membership_colors),
    is_spgc=_strain_ids.isin(spgc_list).map(_membership_colors),
))

sns.clustermap(
    x,
    row_cluster=False,
    col_linkage=_col_linkage,
    col_colors=colors,
    norm=mpl.colors.PowerNorm(1/2, vmin=0, vmax=2),
)

In [None]:
# Correlation heatmap: columns follow the gene-content linkage, rows are
# pre-sorted by the gene dendrogram.
_col_linkage = strain_gene_uhgg_linkage
_row_linkage = gene_uhgg_linkage
# Order x by leaf order.
# See <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.ClusterNode.pre_order.html#scipy.cluster.hierarchy.ClusterNode.pre_order>
_leaf_order = sp.cluster.hierarchy.to_tree(_row_linkage).pre_order(lambda node: node.id)
x = strain_gene_uhgg_corr.iloc[_leaf_order]

# Column annotation bars: black marks membership in ref_list / spgc_list.
_strain_ids = x.columns.to_series()
_membership_colors = {True: 'black', False: 'white'}
colors = pd.DataFrame(dict(
    is_ref=_strain_ids.isin(ref_list).map(_membership_colors),
    is_spgc=_strain_ids.isin(spgc_list).map(_membership_colors),
))

sns.clustermap(x, row_cluster=False, col_linkage=_col_linkage, col_colors=colors)

In [None]:
# Genotype distance (x) against gene-content distance (y) for every strain
# pair, split by group, with the Pearson correlation printed per group.
_pdistA = strain_geno_pdist
_pdistB = strain_gene_uhgg_pdist

# Within-group blocks are condensed with squareform() so each pair appears
# once; the cross-group block is flattened.
_pairwise = {
    'ref-ref': (
        squareform(_pdistA.loc[ref_list, ref_list]),
        squareform(_pdistB.loc[ref_list, ref_list]),
    ),
    'spgc-spgc': (
        squareform(_pdistA.loc[spgc_list, spgc_list]),
        squareform(_pdistB.loc[spgc_list, spgc_list]),
    ),
    'spgc-ref': (
        _pdistA.loc[spgc_list, ref_list].values.flatten(),
        _pdistB.loc[spgc_list, ref_list].values.flatten(),
    ),
}
for _label, (_geno_diss, _gene_diss) in _pairwise.items():
    plt.scatter(_geno_diss, _gene_diss, s=1, alpha=0.5, label=_label)
    print(_label, sp.stats.pearsonr(_geno_diss, _gene_diss))

plt.legend(markerscale=5)

In [None]:
# Nearest-neighbour genotype distance against nearest-neighbour gene-content
# distance, per strain and per group.
bins = np.linspace(0, 1, num=200)

# Remove the diagonal from "minimum distance".
_pdistA = strain_geno_pdist + np.eye(len(strain_geno_pdist))
_pdistB = strain_gene_uhgg_pdist + np.eye(len(strain_gene_uhgg_pdist))

# (row group, column group, label); .min() is taken down each column.
_min_dist_groups = [
    (ref_list, ref_list, 'ref-to-ref'),
    (spgc_list, spgc_list, 'spgc-to-spgc'),
    (spgc_list, ref_list, 'ref-to-spgc'),
    (ref_list, spgc_list, 'spgc-to-ref'),
]
for _rows, _cols, _label in _min_dist_groups:
    plt.scatter(
        _pdistA.loc[_rows, _cols].min(),
        _pdistB.loc[_rows, _cols].min(),
        s=10,
        alpha=0.5,
        label=_label,
    )
plt.xlabel('minimum_genotype_diss')
plt.ylabel('minimum_gene_diss')

plt.legend()

In [None]:
# Ref-ref pairs in black as background; then, for each SPGC strain, its
# distances to every reference in that strain's palette colour, with the
# per-strain Pearson correlation printed.
_pdistA = strain_geno_pdist
_pdistB = strain_gene_uhgg_pdist

plt.scatter(
    squareform(_pdistA.loc[ref_list, ref_list]),
    squareform(_pdistB.loc[ref_list, ref_list]),
    s=1,
    alpha=0.5,
    color='k',
    label='ref-ref',
)

for spgc_strain_id in spgc_list:
    _geno_diss = _pdistA.loc[spgc_strain_id, ref_list]
    _gene_diss = _pdistB.loc[spgc_strain_id, ref_list]
    # Per-strain series are left unlabelled, so only 'ref-ref' is in the legend.
    plt.scatter(_geno_diss, _gene_diss, s=1, alpha=0.5, color=spgc_palette[spgc_strain_id])
    print(spgc_strain_id, sp.stats.pearsonr(_geno_diss, _gene_diss))

plt.legend()

In [None]:
# Per-gene prevalence: the row mean of the gene table within each strain group.
ref_gene_uhgg_prevalence = strain_gene_uhgg[ref_list].mean(axis=1)
spgc_gene_uhgg_prevalence = strain_gene_uhgg[spgc_list].mean(axis=1)

In [None]:
# Reference vs inferred gene prevalence: full range on top, the 0.1-0.9
# mid-range below.
x = ref_gene_uhgg_prevalence
y = spgc_gene_uhgg_prevalence

print(sp.stats.pearsonr(x, y))

fig, axs = plt.subplots(2, figsize=(5, 10))
_ax_full, _ax_mid = axs

# Cube-root colour scale; the full-range panel is capped at 1e3 counts.
bins0 = np.linspace(0.0, 1.0, num=50)
_ax_full.hist2d(x, y, bins=bins0, norm=mpl.colors.PowerNorm(1/3, vmin=0, vmax=1e3))

bins1 = np.linspace(0.1, 0.9, num=40)
_ax_mid.hist2d(x, y, bins=bins1, norm=mpl.colors.PowerNorm(1/3))
_ax_mid.set_xlabel('reference prevalence')
_ax_mid.set_ylabel('inferred prevalence')
None

In [None]:
# Shell escape: preview the first lines of the raw eggNOG-mapper annotation
# file for this species ({species_id} is interpolated by IPython).
!head data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations

In [None]:
# Gene-to-eggNOG mapping table; later cells join it on 'gene_id' and group by
# its 'eggnog' column.
uhgg_x_eggnog = pd.read_table(f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_eggnog.tsv")

In [None]:
# Gene-to-top-eggNOG mapping table; a later cell joins it on 'gene_id' and
# groups by its 'top_eggnog' column.
uhgg_x_top_eggnog = pd.read_table(f"data/species/sp-{species_id}/pangenome.centroids.emapper.gene_x_top_eggnog.tsv")

In [None]:
# Column names for the headerless annotation table ('#' lines are treated as
# comments and skipped).
eggnog_column_names = "query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs".split(" ")
gene_annotations = (
    pd.read_table(
        f"data/species/sp-{species_id}/pangenome.centroids.emapper.d/proteins.emapper.annotations",
        comment="#",
        names=eggnog_column_names,
        index_col="query",
    )
    .rename_axis(index="gene_id")
    # '-' marks a missing annotation in this file; convert it to NaN.
    .replace({'-': np.nan})
)
gene_annotations.info()

In [None]:
# Gene sets binned by prevalence difference (SPGC minus reference).
_prevalence_diff = spgc_gene_uhgg_prevalence - ref_gene_uhgg_prevalence

spgc_extremely_enriched = idxwhere(_prevalence_diff > 0.9)
spgc_extremely_depleted = idxwhere(_prevalence_diff < -0.9)
spgc_very_enriched = idxwhere(_prevalence_diff > 0.5)
spgc_very_depleted = idxwhere(_prevalence_diff < -0.5)
spgc_enriched = idxwhere(_prevalence_diff > 0.25)
spgc_depleted = idxwhere(_prevalence_diff < -0.25)
# Strictly inside (-0.25, 0.25).
spgc_similar = idxwhere((_prevalence_diff > -0.25) & (_prevalence_diff < 0.25))

In [None]:
# Genes whose prevalence disagrees strongly between the two strain groups:
#   too common: > 0.7 among SPGC strains but < 0.1 among references
#   too rare:   < 0.1 among SPGC strains but > 0.7 among references
spgc_too_common = idxwhere((spgc_gene_uhgg_prevalence > 0.7) & (ref_gene_uhgg_prevalence < 0.1))
spgc_too_rare = idxwhere((spgc_gene_uhgg_prevalence < 0.1) & (ref_gene_uhgg_prevalence > 0.7))

# Total number of flagged genes.
# NOTE(review): `+` is concatenation only if idxwhere returns a list -- confirm
# against lib.pandas_util.
len(spgc_too_common + spgc_too_rare)

In [None]:
# Drop the genes flagged as too common / too rare, then keep only genes whose
# row sum exceeds 1.
_flagged_genes = spgc_too_common + spgc_too_rare
strain_gene_uhgg_filt = (
    strain_gene_uhgg
    .drop(index=_flagged_genes)
    [lambda gene_table: gene_table.sum(axis=1) > 1]
)

In [None]:
# Collapse the gene table to top-eggNOG level: attach the per-strain columns
# to the mapping by gene_id, sum within each 'top_eggnog' group, and mark a
# group present (True) in a strain when the sum is positive.  Groups absent
# from every strain are dropped.
strain_gene_top_eggnog = (
    uhgg_x_top_eggnog
    .join(strain_gene_uhgg, on='gene_id')
    .drop(columns=['gene_id'])
    .groupby('top_eggnog')
    .sum()
    .gt(0)
    [lambda x: x.sum(1) > 0]
)
strain_gene_top_eggnog

In [None]:
# Collapse the gene table to eggNOG level: attach the per-strain columns to
# the mapping by gene_id, sum within each 'eggnog' group, and mark a group
# present (True) in a strain when the sum is positive.  Groups absent from
# every strain are dropped.
strain_gene_eggnog = (
    uhgg_x_eggnog
    .join(strain_gene_uhgg, on='gene_id')
    .drop(columns=['gene_id'])
    .groupby('eggnog')
    .sum()
    .gt(0)
    [lambda x: x.sum(1) > 0]
)
strain_gene_eggnog

In [None]:
# Same eggNOG-level collapse as above, but starting from the filtered gene
# table (strain_gene_uhgg_filt).  Mapping rows whose gene is not in the
# filtered table get NaN from the join, which .sum() skips.
strain_gene_filt_eggnog = (
    uhgg_x_eggnog
    .join(strain_gene_uhgg_filt, on='gene_id')
    .drop(columns=['gene_id'])
    .groupby('eggnog')
    .sum()
    .gt(0)
    [lambda x: x.sum(1) > 0]
)
strain_gene_filt_eggnog

In [None]:
# Features per strain, ref (blue) vs SPGC (orange), at three levels:
# filtered UHGG genes (dotted), eggNOG groups (solid outline) and filtered
# eggNOG groups (filled).  The commented-out lines are disabled variants; the
# first two reference `strain_gene`, which is not defined in this notebook as shown.
bins = np.linspace(1000, 12500, num=50)
# plt.hist(strain_gene[ref_list].sum(), bins=bins, histtype='step', label='ref (uhgg)', density=True, linestyle='--', color='tab:blue', lw=2)
# plt.hist(strain_gene[spgc_list].sum(), bins=bins, histtype='step', label='spgc (uhgg)', density=True, linestyle='--', color='tab:orange', lw=2)
plt.hist(strain_gene_uhgg_filt[ref_list].sum(), bins=bins, histtype='step', label='ref (filtered uhgg)', density=True, linestyle=':', color='tab:blue', lw=2)
plt.hist(strain_gene_uhgg_filt[spgc_list].sum(), bins=bins, histtype='step', label='spgc (filtered uhgg)', density=True, linestyle=':', color='tab:orange', lw=2)
plt.hist(strain_gene_eggnog[ref_list].sum(), bins=bins, histtype='step', label='ref (eggnog)', density=True, color='tab:blue')
plt.hist(strain_gene_eggnog[spgc_list].sum(), bins=bins, histtype='step', label='spgc (eggnog)', density=True, color='tab:orange')
plt.hist(strain_gene_filt_eggnog[ref_list].sum(), bins=bins, histtype='stepfilled', label='ref (filtered eggnog)', density=True, color='tab:blue', alpha=0.5)
plt.hist(strain_gene_filt_eggnog[spgc_list].sum(), bins=bins, histtype='stepfilled', label='spgc (filtered eggnog)', density=True, color='tab:orange', alpha=0.5)
# plt.hist(strain_gene_top_eggnog[ref_list].sum(), bins=bins, histtype='stepfilled', alpha=0.5, label='ref (top eggnog)', density=True, color='tab:blue')
# plt.hist(strain_gene_top_eggnog[spgc_list].sum(), bins=bins, histtype='stepfilled', alpha=0.5, label='spgc (top eggnog)', density=True, color='tab:orange')
plt.legend()

# Suppress the cell's repr output.
None

In [None]:
%%time
# NOTE: This will take about ~3 minutes for 40,000 genes.
# Condensed cosine distances between eggNOG groups (rows of
# strain_gene_filt_eggnog), plus the same distances as a labelled square DataFrame.
eggnog_filt_cdmat = sp.spatial.distance.pdist(strain_gene_filt_eggnog, metric='cosine')
eggnog_filt_pdist = pd.DataFrame(sp.spatial.distance.squareform(eggnog_filt_cdmat), index=strain_gene_filt_eggnog.index, columns=strain_gene_filt_eggnog.index)

In [None]:
%%time
# Average-linkage clustering of eggNOG groups from their condensed cosine
# distances; the shape is displayed as a sanity check.
eggnog_filt_linkage = sp.cluster.hierarchy.linkage(eggnog_filt_cdmat, method='average')
eggnog_filt_linkage.shape

In [None]:
# Jaccard distances between strains (columns of strain_gene_filt_eggnog), in
# condensed form and as a labelled square DataFrame.
strain_eggnog_filt_cdmat = sp.spatial.distance.pdist(strain_gene_filt_eggnog.T, metric='jaccard')
strain_eggnog_filt_pdist = pd.DataFrame(sp.spatial.distance.squareform(strain_eggnog_filt_cdmat), index=strain_gene_filt_eggnog.columns, columns=strain_gene_filt_eggnog.columns)

In [None]:
# Average-linkage clustering of strains from their Jaccard eggNOG-content distances.
strain_eggnog_filt_linkage = sp.cluster.hierarchy.linkage(strain_eggnog_filt_cdmat, method='average')

In [None]:
# Filtered eggNOG-content heatmap: columns follow the genotype linkage, rows
# are pre-sorted by the eggNOG dendrogram.
_col_linkage = strain_geno_linkage
_row_linkage = eggnog_filt_linkage
# Order x by leaf order.
# See <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.ClusterNode.pre_order.html#scipy.cluster.hierarchy.ClusterNode.pre_order>
_leaf_order = sp.cluster.hierarchy.to_tree(_row_linkage).pre_order(lambda node: node.id)
x = strain_gene_filt_eggnog.iloc[_leaf_order]

# Column annotation bars: black marks membership in ref_list / spgc_list.
_strain_ids = x.columns.to_series()
_membership_colors = {True: 'black', False: 'white'}
colors = pd.DataFrame(dict(
    is_ref=_strain_ids.isin(ref_list).map(_membership_colors),
    is_spgc=_strain_ids.isin(spgc_list).map(_membership_colors),
))

sns.clustermap(x, row_cluster=False, col_linkage=_col_linkage, col_colors=colors)

In [None]:
# Filtered eggNOG-content heatmap: columns follow the eggNOG-content (Jaccard)
# linkage, rows are pre-sorted by the eggNOG dendrogram.
_col_linkage = strain_eggnog_filt_linkage
_row_linkage = eggnog_filt_linkage
# Order x by leaf order.
# See <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.ClusterNode.pre_order.html#scipy.cluster.hierarchy.ClusterNode.pre_order>
_leaf_order = sp.cluster.hierarchy.to_tree(_row_linkage).pre_order(lambda node: node.id)
x = strain_gene_filt_eggnog.iloc[_leaf_order]

# Column annotation bars: black marks membership in ref_list / spgc_list.
_strain_ids = x.columns.to_series()
_membership_colors = {True: 'black', False: 'white'}
colors = pd.DataFrame(dict(
    is_ref=_strain_ids.isin(ref_list).map(_membership_colors),
    is_spgc=_strain_ids.isin(spgc_list).map(_membership_colors),
))

sns.clustermap(x, row_cluster=False, col_linkage=_col_linkage, col_colors=colors)

In [None]:
# Genotype distance (x) against filtered-eggNOG content distance (y) for every
# strain pair, split by group, with the Pearson correlation printed per group.
_pdistA = strain_geno_pdist
_pdistB = strain_eggnog_filt_pdist

# Within-group blocks are condensed with squareform() so each pair appears
# once; the cross-group block is flattened.
_pairwise = {
    'ref-ref': (
        squareform(_pdistA.loc[ref_list, ref_list]),
        squareform(_pdistB.loc[ref_list, ref_list]),
    ),
    'spgc-spgc': (
        squareform(_pdistA.loc[spgc_list, spgc_list]),
        squareform(_pdistB.loc[spgc_list, spgc_list]),
    ),
    'spgc-ref': (
        _pdistA.loc[spgc_list, ref_list].values.flatten(),
        _pdistB.loc[spgc_list, ref_list].values.flatten(),
    ),
}
for _label, (_geno_diss, _gene_diss) in _pairwise.items():
    plt.scatter(_geno_diss, _gene_diss, s=1, alpha=0.5, label=_label)
    print(_label, sp.stats.pearsonr(_geno_diss, _gene_diss))

plt.legend(markerscale=5)

In [None]:
# Nearest-neighbour genotype distance against nearest-neighbour filtered-eggNOG
# distance, per strain and per group.

# Remove the diagonal from "minimum distance".
_pdistA = strain_geno_pdist + np.eye(len(strain_geno_pdist))
_pdistB = strain_eggnog_filt_pdist + np.eye(len(strain_eggnog_filt_pdist))

# (row group, column group, label); .min() is taken down each column.
_min_dist_groups = [
    (ref_list, ref_list, 'ref-to-ref'),
    (spgc_list, spgc_list, 'spgc-to-spgc'),
    (spgc_list, ref_list, 'ref-to-spgc'),
    (ref_list, spgc_list, 'spgc-to-ref'),
]
for _rows, _cols, _label in _min_dist_groups:
    plt.scatter(
        _pdistA.loc[_rows, _cols].min(),
        _pdistB.loc[_rows, _cols].min(),
        s=10,
        alpha=0.5,
        label=_label,
    )
plt.xlabel('minimum_genotype_diss')
plt.ylabel('minimum_gene_diss')

plt.legend()

In [None]:
# Ref-ref pairs in black as background; then, for each SPGC strain, its
# genotype vs filtered-eggNOG distances to every reference in that strain's
# palette colour, with the per-strain Pearson correlation printed.
_pdistA = strain_geno_pdist
_pdistB = strain_eggnog_filt_pdist

plt.scatter(
    squareform(_pdistA.loc[ref_list, ref_list]),
    squareform(_pdistB.loc[ref_list, ref_list]),
    s=1,
    alpha=0.5,
    color='k',
    label='ref-ref',
)

for spgc_strain_id in spgc_list:
    _geno_diss = _pdistA.loc[spgc_strain_id, ref_list]
    _gene_diss = _pdistB.loc[spgc_strain_id, ref_list]
    # Per-strain series are left unlabelled, so only 'ref-ref' is in the legend.
    plt.scatter(_geno_diss, _gene_diss, s=1, alpha=0.5, color=spgc_palette[spgc_strain_id])
    print(spgc_strain_id, sp.stats.pearsonr(_geno_diss, _gene_diss))

plt.legend()

In [None]:
# Per-eggNOG prevalence: the row mean of the (unfiltered) eggNOG presence table
# within each strain group.
ref_gene_eggnog_prevalence = strain_gene_eggnog[ref_list].mean(axis=1)
spgc_gene_eggnog_prevalence = strain_gene_eggnog[spgc_list].mean(axis=1)

In [None]:
# Reference vs inferred eggNOG prevalence: full range on top, the 0.1-0.9
# mid-range below.
x = ref_gene_eggnog_prevalence
y = spgc_gene_eggnog_prevalence

print(sp.stats.pearsonr(x, y))

fig, axs = plt.subplots(2, figsize=(5, 10))
_ax_full, _ax_mid = axs

# Cube-root colour scale; the full-range panel is capped at 1e3 counts.
bins0 = np.linspace(0.0, 1.0, num=50)
_ax_full.hist2d(x, y, bins=bins0, norm=mpl.colors.PowerNorm(1/3, vmin=0, vmax=1e3))

bins1 = np.linspace(0.1, 0.9, num=40)
_ax_mid.hist2d(x, y, bins=bins1, norm=mpl.colors.PowerNorm(1/3))
_ax_mid.set_xlabel('reference prevalence')
_ax_mid.set_ylabel('inferred prevalence')
None

In [None]:
# Genotype distance matrix with rows ordered by the genotype linkage and
# columns by the filtered-eggNOG linkage, to compare the two orderings.
_is_spgc = strain_geno_pdist.index.to_series().isin(spgc_list)
colors = pd.DataFrame(dict(
    is_spgc=_is_spgc.map({True: 'black', False: 'lightgrey'}),
))

sns.clustermap(
    strain_geno_pdist,
    row_linkage=strain_geno_linkage,
    col_linkage=strain_eggnog_filt_linkage,
    row_colors=colors,
    col_colors=colors,
    # figsize=(40, 40),
)

In [None]:
# Filtered-eggNOG distance matrix with rows ordered by the genotype linkage and
# columns by the filtered-eggNOG linkage, to compare the two orderings.
_is_spgc = strain_geno_pdist.index.to_series().isin(spgc_list)
colors = pd.DataFrame(dict(
    is_spgc=_is_spgc.map({True: 'black', False: 'lightgrey'}),
))

sns.clustermap(
    strain_eggnog_filt_pdist,
    row_linkage=strain_geno_linkage,
    col_linkage=strain_eggnog_filt_linkage,
    row_colors=colors,
    col_colors=colors,
    # figsize=(40, 40),
)

In [None]:
# HMP2 metadata tables, each indexed by its own id column.
mgen = pd.read_table('meta/hmp2/mgen.tsv', index_col='library_id')
preparation = pd.read_table('meta/hmp2/preparation.tsv', index_col='preparation_id')
stool = pd.read_table('meta/hmp2/stool.tsv', index_col='stool_id')
subject = pd.read_table('meta/hmp2/subject.tsv', index_col='subject_id')

# One row per library, widened with its preparation, stool and subject
# metadata.  Columns shared by mgen and preparation get the '_mgen' /
# '_preparation' suffixes.
mgen_meta = (
    mgen
    .join(preparation, on='preparation_id', lsuffix='_mgen', rsuffix='_preparation')
    .join(stool, on='stool_id')
    .join(subject, on='subject_id')
)

In [None]:
# Build a gene x subject presence table: a gene counts as present in a subject
# when at least one SPGC strain carried by that subject has the gene.
_row_linkage = gene_uhgg_linkage
_gene_data = strain_gene_uhgg

# u: subject x strain booleans.  A strain counts as carried when its community
# value, averaged over the subject's samples, exceeds 0.2.  Subjects carrying
# none of the selected strains are dropped.
# `axis` is passed by keyword because DataFrame.any() no longer accepts it
# positionally in pandas 2.x.
u = (
    sfacts_fit.community.sel(strain=spgc_list).to_pandas()
    .groupby(mgen_meta.subject_id)
    .mean()
    > 0.2
)[lambda carried: carried.any(axis=1)]
# v: gene x strain table restricted to the strains in u.
v = _gene_data[u.columns]
# (subject x strain) @ (strain x gene) gives, per subject and gene, the total
# over carried strains; > 0 turns that into presence.
y = ((u @ v.T).T > 0)

# IBD diagnosis of each subject, in the column order of y.
x = subject.loc[y.columns].ibd_diagnosis

col_colors = x.replace({'CD': 'tab:green', 'UC': 'tab:blue', 'nonIBD': 'lightgrey'})
# Rows follow the leaf order of the gene dendrogram; columns are clustered by seaborn.
row_order = sp.cluster.hierarchy.to_tree(_row_linkage).pre_order(lambda node: node.id)
sns.clustermap(y.iloc[row_order], col_colors=col_colors, row_cluster=False)

In [None]:
%%time

test_gene_list = idxwhere((y.mean(1) > 0.25) & (y.mean(1) < 0.75))
print(len(test_gene_list))

test_results = {}
for ibd_diagnosis in x.unique():
    for _gene in test_gene_list:
        _test = sp.stats.fisher_exact(pd.DataFrame(dict(diagnosis=(x == ibd_diagnosis), gene=y.loc[_gene])).value_counts().unstack().reindex(index=[True, False], columns=[True, False]).fillna(0))
        test_results[(ibd_diagnosis, _gene)] = _test.pvalue
        
test_results = (
    pd.Series(test_results)
    .rename_axis(['ibd_diagnosis', 'gene'])
    .unstack('ibd_diagnosis')
    .stack()
    .to_frame(name='pvalue')
    .assign(fdr=lambda x: fdrcorrection(x.pvalue, method="poscorr")[1])
    .join(gene_annotations[['Description', 'COG_category', 'eggNOG_OGs']], on='gene')
    .sort_values('pvalue')
)

In [None]:
# p-value histogram per diagnosis on log-spaced bins covering [1e-5, 1].
_pvalue_bins = np.logspace(-5, 0, num=50)
for ibd_diagnosis in test_results.reset_index().ibd_diagnosis.unique():
    _pvalues = test_results.xs(ibd_diagnosis, level='ibd_diagnosis').pvalue
    # The 1e-5 pseudocount keeps p == 0 on the log axis.  It also pushes p ~ 1
    # (common for Fisher exact tests) past the last bin edge, where plt.hist
    # would silently drop it, so the shifted values are clipped to that edge.
    plt.hist((_pvalues + 1e-5).clip(upper=1.0), label=ibd_diagnosis, alpha=0.5, bins=_pvalue_bins)
plt.legend()
plt.xscale('log')

In [None]:
# Display the ten (gene, diagnosis) tests with the smallest p-values
# (test_results is sorted by 'pvalue').
test_results.head(10)

In [None]:
# Take the gene of the top-ranked test (first element of the first index entry
# of the p-value-sorted results).
_gene = test_results.index[0][0]
print(_gene)
# Diagnosis x gene-presence count table for that gene; empty cells become 0.
pd.DataFrame(dict(diagnosis=x, gene=y.loc[_gene])).value_counts().unstack().fillna(0)

In [None]:
# Gene depth array for this species.
uhgg_depth = xr.load_dataarray(
    f"data/group/xjin_hmp2/species/sp-{species_id}/r.proc.gene99-v22-agg75.depth2.nc"
)
# Attach depths to each library's subject_id by index, average within each
# subject, and drop subjects with any missing value.
subject_uhgg_depth = (
    mgen_meta[['subject_id']]
    .join(uhgg_depth.to_pandas())
    .groupby('subject_id')
    .mean()
    .dropna()
)

In [None]:
# Clustered heatmap of mean per-subject depth for the tested genes
# (gene x subject), with both axes clustered on cosine distance and a
# symmetric-log colour scale (linear threshold 1e-3).
# NOTE(review): _row_linkage is assigned but not used in this cell.
_row_linkage = gene_uhgg_linkage
_gene_data = subject_uhgg_depth.T.loc[test_gene_list]

# Diagnosis colour bar, defined for every subject in the subject table.
col_colors = subject[['ibd_diagnosis']].replace({'CD': 'tab:green', 'UC': 'tab:blue', 'nonIBD': 'lightgrey'})
sns.clustermap(_gene_data, col_colors=col_colors, norm=mpl.colors.SymLogNorm(1e-3), metric='cosine')

In [None]:
# Per-subject mean depth of `_gene`.  `_gene` is whatever an earlier cell left
# behind: the top-ranked gene if the cells above were run in order.
mean_subject_depth = subject_uhgg_depth[_gene]

# Depth distribution per diagnosis on log-spaced bins; the 1e-4 pseudocount
# keeps zero depths on the log axis.
d = subject[['ibd_diagnosis']].join(mean_subject_depth).dropna()
for ibd_diagnosis in d.ibd_diagnosis.unique():
    plt.hist(d.loc[d.ibd_diagnosis == ibd_diagnosis, _gene] + 1e-4, bins=np.logspace(-4, 3), alpha=0.5, label=ibd_diagnosis)

# Depth cut-off used below to call the gene present in a subject.
thresh = 1e-3
plt.axvline(thresh, lw=1, linestyle='--', color='k')
plt.xscale('log')
# plt.yscale('log')
plt.legend()

# Diagnosis x presence count table using the depth cut-off.
d.assign(gene=lambda x: x[_gene] > thresh)[['ibd_diagnosis', 'gene']].value_counts().unstack()