In [None]:
import os as _os

# Later cells read paths relative to the working directory ('meta/...',
# 'ref/...', 'data/...'), so step up one level from where the kernel starts.
# Guarded on the presence of 'meta' so that re-running this cell does not
# climb one more directory each time.
if not _os.path.isdir('meta'):
    _os.chdir('..')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from operator import eq, sub
import scipy.stats
import scipy as sp
import seaborn as sns
from lib.pandas_util import idxwhere
import matplotlib as mpl

In [None]:
# Metadata tables, each indexed by its own identifier column.
mgen = pd.read_table('meta/mgen.tsv', index_col='library_id')
preparation = pd.read_table('meta/preparation.tsv', index_col='preparation_id')
stool = pd.read_table('meta/stool.tsv', index_col='stool_id')
visit = pd.read_table('meta/visit.tsv', index_col='visit_id')
subject = pd.read_table('meta/subject.tsv', index_col='subject_id')

# One row per library, widened step by step through the foreign keys
# preparation_id -> stool_id -> visit_id -> subject_id.
# 'library_type' is dropped from `preparation` before joining; columns of
# `visit` that clash with existing names get a trailing '_'.
_preparation_columns = preparation.drop(columns='library_type')
mgen_meta = (
    mgen
    .join(_preparation_columns, on='preparation_id')
    .join(stool, on='stool_id')
    .join(visit, on='visit_id', rsuffix='_')
    .join(subject, on='subject_id')
)

# Every library must resolve to a subject.
assert not mgen_meta.subject_id.isna().any()

# mgen_meta.columns

In [None]:
# Headerless three-column table; keep the taxonomy string keyed by species_id
# (cast to str), split on ';' into a list of ranks.
species_taxonomy = (
    pd.read_table(
        'ref/gtpro/species_taxonomy_ext.tsv',
        names=['genome_id', 'species_id', 'taxonomy_string'],
    )
    .assign(species_id=lambda x: x.species_id.astype(str))
    .set_index('species_id')
    .taxonomy_string
    .str.split(';')
    .to_frame()
)

# For each column name, re-join the first `level_number` ranks into a lineage string.
for level_name, level_number in [('p__', 2), ('f__', 5), ('s__', 7)]:
    species_taxonomy[level_name] = species_taxonomy['taxonomy_string'].apply(
        lambda x: ';'.join(x[:level_number])
    )

species_taxonomy

In [None]:
# Long table indexed by (sample, strain) -> wide sample x strain matrix;
# combinations missing from the file are filled with 0.
strain_depth_with_minor = (
    pd.read_table(
        'data/hmp2.a.r.proc.gtpro.filt-poly05-cvrg05.fit-sfacts9-s75-g10000-seed0.collapse-10.strain_depth.tsv',
        index_col=['sample', 'strain'],
    )
    .squeeze()
    .unstack('strain', fill_value=0)
)

In [None]:
# The species id is the part of the strain label before the first '-';
# use it to look up each strain's taxonomy.
strain_taxonomy = (
    strain_depth_with_minor.columns.to_series()
    .str.split('-').str[0]
    .to_frame(name='species_id')
    .join(species_taxonomy, on='species_id')
)

In [None]:
# Long table indexed by (sample, species_id) -> wide sample x species matrix, zero-filled.
species_depth = (
    pd.read_table(
        'data/hmp2.a.r.proc.gtpro.species_depth.tsv',
        index_col=['sample', 'species_id'],
    )
    .squeeze()
    .unstack('species_id', fill_value=0)
)
# Per-sample difference between total strain depth and total species depth.
plt.hist(strain_depth_with_minor.sum(axis=1) - species_depth.sum(axis=1))

In [None]:
# Row-normalise: each sample's species depths divided by that sample's total.
species_rabund = species_depth.div(species_depth.sum(axis=1), axis='index')

In [None]:
# Strains whose maximum depth across samples never exceeds `thresh` are
# relabelled '<prefix>-other', where <prefix> is everything before the last '-'.
thresh = 0.5

strain_collapse = strain_depth_with_minor.columns.to_series()
# `n` must be given by keyword: it is keyword-only in pandas >= 2.0.
strain_other = strain_collapse.str.rsplit('-', n=1).str[0] + '-other'
strain_collapse = strain_collapse.where(strain_depth_with_minor.max() > thresh, strain_other)

# Number of distinct labels after collapsing.
strain_collapse.value_counts().shape

In [None]:
# Sum the columns that share a collapsed label.  Grouping is done on the
# transposed frame because `groupby(axis='columns')` is deprecated in
# pandas >= 2.1; the result is transposed back to samples x strains.
strain_depth = strain_depth_with_minor.T.groupby(strain_collapse).sum().T

In [None]:
# Per-sample total depth before minus after collapsing.
_depth_change = strain_depth_with_minor.sum(axis=1) - strain_depth.sum(axis=1)
plt.hist(_depth_change)

In [None]:
# Row-normalise: each sample's strain depths divided by that sample's total.
strain_rabund = strain_depth.div(strain_depth.sum(axis=1), axis='index')

In [None]:
# Distribution of log10(maximum relative abundance) per strain column.
_log10_max_rabund = np.log10(strain_rabund.max())
plt.hist(_log10_max_rabund, bins=np.linspace(-5, 0, num=51))
None

In [None]:
# Condensed pairwise Bray-Curtis dissimilarity between samples on
# row-normalised species depth.
_species_fraction = species_depth.divide(species_depth.sum(1), axis=0)
species_bc_dist = sp.spatial.distance.pdist(_species_fraction, metric='braycurtis')

In [None]:
# Condensed pairwise Bray-Curtis dissimilarity between samples at strain level.
bc_dist = sp.spatial.distance.pdist(strain_rabund, metric='braycurtis')

# Boolean mask over the same sample pairs, in the same condensed order as
# `pdist` (all i < j, row-major): True when both samples share a subject_id.
# Built with vectorised indexing rather than a Python-level `pdist` callback,
# which was slow and returned 1-element arrays that NumPy had to coerce to
# scalars (deprecated since NumPy 1.25).
_subject_ids = mgen_meta.loc[strain_rabund.index].subject_id.to_numpy()
_first, _second = np.triu_indices(len(_subject_ids), k=1)
same_subject = np.asarray(_subject_ids[_first] == _subject_ids[_second], dtype=bool)

In [None]:
# Condensed pairwise Bray-Curtis dissimilarity on species relative abundance.
species_bc_dist = sp.spatial.distance.pdist(
    species_rabund,
    metric='braycurtis',
)

In [None]:
# Species-level Bray-Curtis: pairs from the same subject ('cis') versus
# pairs from different subjects ('trans').
dist = species_bc_dist
bins = np.linspace(0, 1, num=51)
_trans, _cis = dist[~same_subject], dist[same_subject]

fig, ax = plt.subplots()
for _values, _label in [(_trans, 'trans'), (_cis, 'cis')]:
    ax.hist(_values, bins=bins, density=True, alpha=0.5, label=_label)
ax.set_yscale('log')

# Mann-Whitney U statistic divided by the number of trans x cis comparisons.
mwu, p = sp.stats.mannwhitneyu(_trans, _cis)
auc = mwu / (len(_trans) * len(_cis))
print(auc)

ax.legend()

In [None]:
# Strain-level Bray-Curtis: pairs from the same subject ('cis') versus
# pairs from different subjects ('trans').
dist = bc_dist
bins = np.linspace(0, 1, num=51)
_trans, _cis = dist[~same_subject], dist[same_subject]

fig, ax = plt.subplots()
for _values, _label in [(_trans, 'trans'), (_cis, 'cis')]:
    ax.hist(_values, bins=bins, density=True, alpha=0.5, label=_label)
ax.set_yscale('log')

# Mann-Whitney U statistic divided by the number of trans x cis comparisons.
mwu, p = sp.stats.mannwhitneyu(_trans, _cis)
auc = mwu / (len(_trans) * len(_cis))
print(auc)

ax.legend()

In [None]:
# Presence/absence (relative abundance > 1e-5) Jaccard distance between
# samples, leaving out columns whose second '-'-separated field is 'other'.
_not_other = strain_rabund.columns.str.split('-').str[1] != 'other'
_strain_present = strain_rabund.loc[:, _not_other] > 1e-5
jc_dist = sp.spatial.distance.pdist(_strain_present, metric='jaccard')

In [None]:
# Presence/absence (relative abundance > 1e-5) Jaccard distance at species level.
_species_present = species_rabund > 1e-5
species_jc_dist = sp.spatial.distance.pdist(_species_present, metric='jaccard')

In [None]:
# Strain-level Jaccard: pairs from the same subject ('cis') versus
# pairs from different subjects ('trans').
dist = jc_dist
bins = np.linspace(0, 1, num=51)
_trans, _cis = dist[~same_subject], dist[same_subject]

fig, ax = plt.subplots()
for _values, _label in [(_trans, 'trans'), (_cis, 'cis')]:
    ax.hist(_values, bins=bins, density=True, alpha=0.5, label=_label)
ax.set_yscale('log')

# Mann-Whitney U statistic divided by the number of trans x cis comparisons.
mwu, p = sp.stats.mannwhitneyu(_trans, _cis)
auc = mwu / (len(_trans) * len(_cis))
print(auc)

ax.legend()

In [None]:
# Species-level Jaccard: pairs from the same subject ('cis') versus
# pairs from different subjects ('trans').
dist = species_jc_dist
bins = np.linspace(0, 1, num=51)
_trans, _cis = dist[~same_subject], dist[same_subject]

fig, ax = plt.subplots()
for _values, _label in [(_trans, 'trans'), (_cis, 'cis')]:
    ax.hist(_values, bins=bins, density=True, alpha=0.5, label=_label)
ax.set_yscale('log')

# Mann-Whitney U statistic divided by the number of trans x cis comparisons.
mwu, p = sp.stats.mannwhitneyu(_trans, _cis)
auc = mwu / (len(_trans) * len(_cis))
print(auc)

ax.legend()

In [None]:
def _count_shared_species(u, v):
    """Return how many entries are set in both presence vectors."""
    return (u * v).sum()


# Number of species above 1e-5 relative abundance in both samples, per sample pair.
shared_species = sp.spatial.distance.pdist(species_rabund > 1e-5, metric=_count_shared_species)

In [None]:
def _count_shared_strains(u, v):
    """Return how many entries are set in both presence vectors."""
    return (u * v).sum()


# Number of strains above 1e-5 relative abundance in both samples, per sample pair.
shared_strains = sp.spatial.distance.pdist(strain_rabund > 1e-5, metric=_count_shared_strains)

In [None]:
# Count of shared strains per sample pair, same-subject ('cis') versus
# different-subject ('trans'); one bin per integer from 0 to 500.
dist = shared_strains
bins = np.linspace(0, 500, num=501)

fig, ax = plt.subplots()
for _mask, _label in [(~same_subject, 'trans'), (same_subject, 'cis')]:
    ax.hist(dist[_mask], bins=bins, density=True, alpha=0.5, label=_label)
ax.set_yscale('log')

ax.legend()

In [None]:
# Count of shared species per sample pair, same-subject ('cis') versus
# different-subject ('trans'); one bin per integer from 0 to 500.
dist = shared_species
bins = np.linspace(0, 500, num=501)

fig, ax = plt.subplots()
for _mask, _label in [(~same_subject, 'trans'), (same_subject, 'cis')]:
    ax.hist(dist[_mask], bins=bins, density=True, alpha=0.5, label=_label)
ax.set_yscale('log')

ax.legend()

In [None]:
# Shared-strain count turned into a dissimilarity, 1 / (count + 1), and
# compared between same-subject ('cis') and different-subject ('trans') pairs.
dist = 1 / (shared_strains + 1)
bins = np.linspace(0, 1, num=501)
_trans, _cis = dist[~same_subject], dist[same_subject]

fig, ax = plt.subplots()
for _values, _label in [(_trans, 'trans'), (_cis, 'cis')]:
    ax.hist(_values, bins=bins, density=True, alpha=0.5, label=_label)
ax.set_yscale('log')

# Mann-Whitney U statistic divided by the number of trans x cis comparisons.
mwu, p = sp.stats.mannwhitneyu(_trans, _cis)
auc = mwu / (len(_trans) * len(_cis))
print(auc)

# Only the low end of the axis is shown.
ax.set_xlim(right=0.2)

ax.legend()

In [None]:
def _shared_strain_fraction(u, v):
    """Return 2 * (count set in both) / (count set in u + count set in v)."""
    return 2 * (u * v).sum() / (u.sum() + v.sum())


# Per sample pair, on strain presence (relative abundance > 1e-5).
frac_shared_strains = sp.spatial.distance.pdist(strain_rabund > 1e-5, metric=_shared_strain_fraction)

In [None]:
# 1 - fraction of shared strains, same-subject ('cis') versus
# different-subject ('trans') pairs.
dist = 1 - frac_shared_strains
bins = np.linspace(0, 1, num=51)
_trans, _cis = dist[~same_subject], dist[same_subject]

fig, ax = plt.subplots()
for _values, _label in [(_trans, 'trans'), (_cis, 'cis')]:
    ax.hist(_values, bins=bins, density=True, alpha=0.5, label=_label)
ax.set_yscale('log')

# Mann-Whitney U statistic divided by the number of trans x cis comparisons.
mwu, p = sp.stats.mannwhitneyu(_trans, _cis)
auc = mwu / (len(_trans) * len(_cis))
print(auc)

ax.legend()

In [None]:
def _shared_species_fraction(u, v):
    """Return 2 * (count set in both) / (count set in u + count set in v)."""
    return 2 * (u * v).sum() / (u.sum() + v.sum())


# Per sample pair, on species presence (relative abundance > 1e-5).
frac_shared_species = sp.spatial.distance.pdist(species_rabund > 1e-5, metric=_shared_species_fraction)

In [None]:
# 1 - fraction of shared species, same-subject ('cis') versus
# different-subject ('trans') pairs.
dist = 1 - frac_shared_species
bins = np.linspace(0, 1, num=51)
_trans, _cis = dist[~same_subject], dist[same_subject]

fig, ax = plt.subplots()
for _values, _label in [(_trans, 'trans'), (_cis, 'cis')]:
    ax.hist(_values, bins=bins, density=True, alpha=0.5, label=_label)
ax.set_yscale('log')

# Mann-Whitney U statistic divided by the number of trans x cis comparisons.
mwu, p = sp.stats.mannwhitneyu(_trans, _cis)
auc = mwu / (len(_trans) * len(_cis))
print(auc)

ax.legend()

In [None]:
# Absolute difference in week_number for every sample pair.  On a single
# column the built-in 'cityblock' metric is exactly |u - v|; it replaces a
# Python callback that returned 1-element arrays (relying on NumPy's
# deprecated size-1-array-to-scalar conversion) and ran once per pair.
time_delta = sp.spatial.distance.pdist(
    mgen_meta.loc[strain_rabund.index][['week_number']],
    metric='cityblock',
)

# One row per sample pair (condensed `pdist` order); pairs with any missing
# value are dropped.
pairwise_dist = pd.DataFrame(dict(
    same_subject=same_subject,
    bc_dist=bc_dist,
    time_delta=time_delta,
    jc_dist=jc_dist,
    species_jc_dist=species_jc_dist,
    species_bc_dist=species_bc_dist,
    frac_shared_strains=frac_shared_strains
)).dropna()

In [None]:
# Strain-level Jaccard distance against time_delta, same-subject pairs only:
# hexbin density, LOWESS trend line and Spearman correlation.
k = 'jc_dist'
d = pairwise_dist[lambda x: x.same_subject]

plt.hexbin('time_delta', k, data=d, gridsize=(56, 50), norm=mpl.colors.PowerNorm(1/3, vmin=0), cmap='Greys')
# x and y are passed by keyword: they are keyword-only in seaborn >= 0.12.
sns.regplot(x='time_delta', y=k, data=d, scatter_kws=dict(s=1, alpha=0.2), scatter=False, lowess=True, color='k')
sp.stats.spearmanr(d.time_delta, d[k])

In [None]:
# Species-level Jaccard distance against time_delta, same-subject pairs only:
# hexbin density, LOWESS trend line and Spearman correlation.
k = 'species_jc_dist'
d = pairwise_dist[lambda x: x.same_subject]

plt.hexbin('time_delta', k, data=d, gridsize=(56, 50), norm=mpl.colors.PowerNorm(1/3, vmin=0), cmap='Greys')
# x and y are passed by keyword: they are keyword-only in seaborn >= 0.12.
sns.regplot(x='time_delta', y=k, data=d, scatter_kws=dict(s=1, alpha=0.2), scatter=False, lowess=True, color='k')
sp.stats.spearmanr(d.time_delta, d[k])

In [None]:
# Fraction of shared strains against time_delta for pairs from DIFFERENT
# subjects: scatter, LOWESS trend line and Spearman correlation.
k = 'frac_shared_strains'
d = pairwise_dist[lambda x: ~x.same_subject]

plt.scatter('time_delta', k, data=d, s=1, alpha=0.2)
# x and y are passed by keyword: they are keyword-only in seaborn >= 0.12.
sns.regplot(x='time_delta', y=k, data=d, scatter_kws=dict(s=1, alpha=0.2), scatter=False, lowess=True, color='k')
sp.stats.spearmanr(d.time_delta, d[k])

In [None]:
# Strain-level Bray-Curtis against time_delta, same-subject pairs only:
# hexbin density, LOWESS trend line and Spearman correlation.
k = 'bc_dist'
d = pairwise_dist[lambda x: x.same_subject]

plt.hexbin('time_delta', k, data=d, gridsize=(28, 20), norm=mpl.colors.PowerNorm(1/2, vmin=0), cmap='Greys')
# x and y are passed by keyword: they are keyword-only in seaborn >= 0.12.
sns.regplot(x='time_delta', y=k, data=d, scatter_kws=dict(s=1, alpha=0.2), scatter=False, lowess=True, color='r')
sp.stats.spearmanr(d.time_delta, d[k])

In [None]:
# Species-level Bray-Curtis against time_delta, same-subject pairs only:
# hexbin density, LOWESS trend line and Spearman correlation.
k = 'species_bc_dist'
d = pairwise_dist[lambda x: x.same_subject]

plt.hexbin('time_delta', k, data=d, gridsize=(28, 20), norm=mpl.colors.PowerNorm(1/2, vmin=0), cmap='Greys')
# x and y are passed by keyword: they are keyword-only in seaborn >= 0.12.
sns.regplot(x='time_delta', y=k, data=d, scatter_kws=dict(s=1, alpha=0.2), scatter=False, lowess=True, color='r')
sp.stats.spearmanr(d.time_delta, d[k])

In [None]:
# Strain-level Bray-Curtis against time_delta for pairs from DIFFERENT
# subjects: scatter, LOWESS trend line and Spearman correlation.
k = 'bc_dist'
d = pairwise_dist[lambda x: ~x.same_subject]

plt.scatter('time_delta', k, data=d, s=1, alpha=0.2)
# x and y are passed by keyword: they are keyword-only in seaborn >= 0.12.
sns.regplot(x='time_delta', y=k, data=d, scatter_kws=dict(s=1, alpha=0.2), scatter=False, lowess=True, color='k')
sp.stats.spearmanr(d.time_delta, d[k])

In [None]:
# Strain-level Jaccard distance against time_delta, same-subject pairs only:
# scatter, LOWESS trend line and Spearman correlation.
k = 'jc_dist'
d = pairwise_dist[lambda x: x.same_subject]

plt.scatter('time_delta', k, data=d, s=1, alpha=0.2)
# x and y are passed by keyword: they are keyword-only in seaborn >= 0.12.
sns.regplot(x='time_delta', y=k, data=d, scatter_kws=dict(s=1, alpha=0.2), scatter=False, lowess=True, color='k')
sp.stats.spearmanr(d.time_delta, d[k])

In [None]:
# Shared-strain count against time_delta, same-subject pairs only:
# scatter, LOWESS trend line and Spearman correlation.
k = 'shared_strains'
# `pairwise_dist` is built without a 'shared_strains' column, so selecting it
# raised KeyError.  Attach the counts here; the Series aligns on the pair
# index that survived `dropna()`.
d = pairwise_dist.assign(shared_strains=pd.Series(shared_strains))[lambda x: x.same_subject]

plt.scatter('time_delta', k, data=d, s=1, alpha=0.2)
# x and y are passed by keyword: they are keyword-only in seaborn >= 0.12.
sns.regplot(x='time_delta', y=k, data=d, scatter_kws=dict(s=1, alpha=0.2), scatter=False, lowess=True, color='k')
sp.stats.spearmanr(d.time_delta, d[k])

In [None]:
def _first_by_week(group):
    """Return the single row of `group` with the smallest week_number."""
    return group.sort_values('week_number').head(1)


# One library per subject: the earliest sampled one among the libraries
# present in `strain_rabund`.
mgen_meta_baseline = (
    mgen_meta.loc[strain_rabund.index]
    .groupby('subject_id')
    .apply(_first_by_week)
    .reset_index('subject_id', drop=True)
)
baseline_library = mgen_meta_baseline.index.to_series()

In [None]:
import lib.plot

# Square sample x sample form of the strain-level Bray-Curtis distances.
_bc_matrix = pd.DataFrame(
    sp.spatial.distance.squareform(bc_dist),
    index=strain_rabund.index,
    columns=strain_rabund.index,
)

# Ordination restricted to the baseline libraries, coloured by 'ibd_diagnosis'.
lib.plot.ordination_plot(
    _bc_matrix,
    meta=mgen_meta,
    subset=baseline_library,
    ordin=lib.plot.nmds2_ordination,
    ordin_kws={},
    colorby='ibd_diagnosis',
    scatter_kws={'lw': 0},
)
None

In [None]:
# Cumulative relative abundance over species ordered by decreasing prevalence,
# summarised across the baseline libraries as quantile bands.
_depth = species_depth.loc[baseline_library]
_rabund = _depth.divide(_depth.sum(1), axis=0)
_n_taxa = _depth.shape[1]

c = 'tab:blue'

# Prevalence = fraction of samples with relative abundance above 1e-5.
_prevalence = (_rabund > 1e-5).mean()
_mean_rabund = _rabund.mean()
_decreasing_prevalence = _prevalence.sort_values(ascending=False).index
_quantile_rabund = (
    _rabund.loc[:, _decreasing_prevalence]
    .cumsum(axis=1)
    .quantile([0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
    .T
)

xx = np.arange(_n_taxa)
plt.plot(xx, _quantile_rabund[0.5], c=c, lw=2, label='median_rabund')
# Shaded bands: min-max, 5-95 % and 25-75 %.
for _lower, _upper, _alpha in [(0.0, 1.0, 0.05), (0.05, 0.95, 0.2), (0.25, 0.75, 0.2)]:
    plt.fill_between(xx, _quantile_rabund[_lower], _quantile_rabund[_upper], color=c, alpha=_alpha, edgecolor=None)

# Number of taxa whose minimum cumulative abundance is still below 0.99.
plt.axvline((_quantile_rabund[0.0] < 0.99).sum(), linestyle='--', lw=1, color='grey')

In [None]:
# Cumulative relative abundance over strains ordered by decreasing prevalence,
# summarised across the baseline libraries as quantile bands.
_depth = strain_depth.loc[baseline_library]
_rabund = _depth.divide(_depth.sum(1), axis=0)
_n_taxa = _depth.shape[1]

c = 'tab:blue'

# Prevalence = fraction of samples with relative abundance above 1e-5.
_prevalence = (_rabund > 1e-5).mean()
_mean_rabund = _rabund.mean()
_decreasing_prevalence = _prevalence.sort_values(ascending=False).index
_quantile_rabund = (
    _rabund.loc[:, _decreasing_prevalence]
    .cumsum(axis=1)
    .quantile([0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
    .T
)

xx = np.arange(_n_taxa)
plt.plot(xx, _quantile_rabund[0.5], c=c, lw=2, label='median_rabund')
# Shaded bands: min-max, 5-95 % and 25-75 %.
for _lower, _upper, _alpha in [(0.0, 1.0, 0.05), (0.05, 0.95, 0.2), (0.25, 0.75, 0.2)]:
    plt.fill_between(xx, _quantile_rabund[_lower], _quantile_rabund[_upper], color=c, alpha=_alpha, edgecolor=None)

# Number of taxa whose minimum cumulative abundance is still below 0.99.
plt.axvline((_quantile_rabund[0.0] < 0.99).sum(), linestyle='--', lw=1, color='grey')

# Distinct species prefixes (text before the first '-') among those taxa.
_species_prefixes = {s.split('-')[0] for s in idxwhere(_quantile_rabund[0.0] < 0.99)}
print('Unique species:', len(_species_prefixes))

In [None]:
# Cumulative relative abundance over strains ordered by decreasing prevalence,
# summarised across ALL samples as quantile bands.
_depth = strain_depth
_rabund = _depth.divide(_depth.sum(1), axis=0)
_n_taxa = _depth.shape[1]

c = 'tab:blue'

# Prevalence = fraction of samples with relative abundance above 1e-5.
_prevalence = (_rabund > 1e-5).mean()
_mean_rabund = _rabund.mean()
_decreasing_prevalence = _prevalence.sort_values(ascending=False).index
_quantile_rabund = (
    _rabund.loc[:, _decreasing_prevalence]
    .cumsum(axis=1)
    .quantile([0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
    .T
)

xx = np.arange(_n_taxa)
plt.plot(xx, _quantile_rabund[0.5], c=c, lw=2, label='median_rabund')
# Shaded bands: min-max, 5-95 % and 25-75 %.
for _lower, _upper, _alpha in [(0.0, 1.0, 0.05), (0.05, 0.95, 0.2), (0.25, 0.75, 0.2)]:
    plt.fill_between(xx, _quantile_rabund[_lower], _quantile_rabund[_upper], color=c, alpha=_alpha, edgecolor=None)

# Number of taxa whose minimum cumulative abundance is still below 0.99.
plt.axvline((_quantile_rabund[0.0] < 0.99).sum(), linestyle='--', lw=1, color='grey')

# Distinct species prefixes (text before the first '-') among those taxa.
_species_prefixes = {s.split('-')[0] for s in idxwhere(_quantile_rabund[0.0] < 0.99)}
print('Unique species:', len(_species_prefixes))

In [None]:
# Length of the pairwise same-subject mask.  The previous `_same_subject` is
# only assigned inside the per-species loop in the next cell, so this cell
# raised NameError when the notebook was run top to bottom.
same_subject.shape

In [None]:
from tqdm import tqdm

# Species id (text before the first '-') for every strain column.
strain_to_species = strain_rabund.columns.to_series().str.split('-').str[0]
# Square sample x sample version of the same-subject mask, so it can be
# subset to the samples kept for each species.
same_subject_matrix = pd.DataFrame(
    sp.spatial.distance.squareform(same_subject),
    index=strain_rabund.index,
    columns=strain_rabund.index
)

# species_id -> (U / (n_trans * n_cis), number of subjects), comparing
# within-species strain composition between trans and cis sample pairs.
out = {}
for species_id in tqdm(strain_to_species.unique()):
    d = strain_rabund.loc[:, strain_to_species == species_id]
    # Renormalise within the species; samples where the species total is 0
    # become NaN rows and are dropped.
    d = d.divide(d.sum(1), axis=0).dropna()
    if len(d) < 2:
        # Fewer than two samples means no pairs: the mask mean would be NaN
        # and mannwhitneyu would be handed empty inputs.
        continue
    num_subjects = len(mgen_meta.loc[d.index].subject_id.unique())
    _dist = sp.spatial.distance.pdist(d, metric='braycurtis')
    _same_subject = sp.spatial.distance.squareform(same_subject_matrix.loc[d.index, d.index])
    if _same_subject.all() or not _same_subject.any():
        # Only cis or only trans pairs: nothing to compare.
        continue
    mwu, p = sp.stats.mannwhitneyu(_dist[~_same_subject], _dist[_same_subject])
    out[species_id] = mwu / (len(_dist[~_same_subject]) * len(_dist[_same_subject])), num_subjects

In [None]:
# One row per species with columns 'auc' and 'num_subjects'.
d = pd.DataFrame(out, index=['auc', 'num_subjects']).transpose()

plt.scatter(x='auc', y='num_subjects', data=d, s=5, alpha=0.7)
plt.xlim(left=-0.05, right=1.05)

In [None]:
# Number of species with auc > 0.9 and more than 50 subjects.
(d['auc'].gt(0.9) & d['num_subjects'].gt(50)).sum()

In [None]:
# The 20 species with the most subjects, annotated with the three lineage columns.
_lineage_columns = species_taxonomy[['p__', 'f__', 's__']]
d.join(_lineage_columns).sort_values('num_subjects').tail(20)

In [None]:
# Distribution of the per-species AUC values held in `d`.  The bare name `auc`
# is the scalar left over from the last cis/trans histogram cell, so plotting
# it produced a histogram of a single number.
plt.hist(d['auc'], bins=np.linspace(0, 1, num=101))