In [None]:
import os as _os

# Run the notebook from the parent directory so that the relative 'ref/...'
# and 'data/...' paths used by later cells resolve (presumably the notebook
# lives one level below the project root -- confirm).
#
# The target directory is resolved to an absolute path only the first time
# this cell runs in a kernel and is cached in `_project_root`; re-running the
# cell therefore returns to the same directory instead of climbing one level
# higher on every execution.
if '_project_root' not in globals():
    _project_root = _os.path.abspath('..')
_os.chdir(_project_root)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Read the headerless three-column taxonomy table, key it by species_id
# (as a string), and keep only the raw taxonomy string plus a temporary
# column holding its ';'-separated fields.
species_taxonomy = (
    pd.read_table(
        'ref/gtpro/species_taxonomy_ext.tsv',
        names=['genome_id', 'species_id', 'taxonomy_string'],
    )
    .assign(species_id=lambda df: df.species_id.astype(str))
    .set_index('species_id')
    .loc[:, ['taxonomy_string']]
    .assign(taxonomy_split=lambda df: df.taxonomy_string.str.split(';'))
)

# Add one column per rank, taken from positions 1..6 of the split string
# (position 0 is skipped -- presumably the domain/kingdom field), then drop
# the temporary list column.
species_taxonomy = species_taxonomy.assign(
    **{
        rank_prefix: species_taxonomy.taxonomy_split.apply(
            lambda fields, position=rank_position: fields[position]
        )
        for rank_position, rank_prefix in enumerate(
            ['p__', 'c__', 'o__', 'f__', 'g__', 's__'], start=1
        )
    }
).drop(columns=['taxonomy_split'])

In [None]:
# Long-format depth table indexed by (sample, species_id); squeeze it down
# (presumably to the single value column -- confirm against the file) and
# pivot species_id into columns, filling unobserved pairs with 0.
# Column labels are coerced to str.
depth = (
    pd.read_table(
        'data/group/hmp2/r.proc.gtpro.species_depth.tsv',
        index_col=['sample', 'species_id'],
    )
    .squeeze()
    .unstack(fill_value=0)
    .rename(columns=str)
)

# Relative abundance: divide each row (sample) by its total depth.
rabund = depth.divide(depth.sum(axis=1), axis=0)

In [None]:
# Ridgeline-style figure: one log-scale histogram of relative abundance per
# species, stacked vertically, for the species that exceed the detection
# threshold in the largest number of samples.
n_species = 40
min_rabund = 1e-5  # relative-abundance threshold used to call a species "present"

# Rank species (columns of `rabund`) by the number of samples above threshold.
top_species = (rabund > min_rabund).sum().sort_values(ascending=False).head(n_species).index

# `head` may return fewer than n_species entries; size the figure from what
# was actually selected so no blank panels are created.
n_panels = len(top_species)

# squeeze=False always yields a 2-D array of Axes (even for a single panel);
# take its only column so `axs` is a 1-D array, one Axes per species.
fig, axs = plt.subplots(
    n_panels,
    figsize=(5, 0.5 * n_panels),
    sharex=True,
    sharey=True,
    squeeze=False,
)
axs = axs[:, 0]

# 50 logarithmically spaced bins spanning 1e-7 .. 1.
bins = np.logspace(-7, 0, num=51)

for species_id, ax in zip(top_species, axs):
    ax.hist(rabund[species_id], bins=bins, alpha=0.7)
    ax.set_xscale('log')
    # Fraction of samples in which this species exceeds the threshold.
    prevalence = (rabund[species_id] > min_rabund).mean()

    # Strip axes, spines and background so the overlapping panels
    # (see the negative hspace below) do not hide each other.
    ax.yaxis.set_visible(False)
    ax.xaxis.set_visible(False)
    ax.patch.set_alpha(0.0)
    for spine in ['left', 'right', 'top', 'bottom']:
        ax.spines[spine].set_visible(False)

    ax.annotate(f'{species_id} ({prevalence:0.0%})', xy=(0.05, 0.1), ha='left', xycoords="axes fraction")
    ax.set_xlim(left=1e-7)
    ax.set_ylim(top=300)

# Only the bottom panel keeps its x-axis. Index it explicitly rather than
# relying on the loop variable still pointing at the last subplot.
bottom_ax = axs[-1]
bottom_ax.xaxis.set_visible(True)
bottom_ax.spines['bottom'].set_visible(True)

# Negative spacing makes neighbouring panels overlap.
fig.subplots_adjust(hspace=-0.75)

In [None]:
# Taxonomy rows for the plotted species, in the same order as `top_species`.
species_taxonomy.loc[top_species, :]

In [None]:
# Single-species summary: fraction of samples with relative abundance above
# 1e-5, median relative abundance across samples, and the taxonomy string.
_species_id = '102478'
print(rabund[_species_id].gt(1e-5).mean())
print(rabund[_species_id].median())
species_taxonomy.loc[_species_id, 'taxonomy_string']

In [None]:
# Single-species summary: fraction of samples with relative abundance above
# 1e-5, median relative abundance across samples, and the taxonomy string.
_species_id = '100099'
print(rabund[_species_id].gt(1e-5).mean())
print(rabund[_species_id].median())
species_taxonomy.loc[_species_id, 'taxonomy_string']

In [None]:
# Single-species summary: fraction of samples with relative abundance above
# 1e-5, median relative abundance across samples, and the taxonomy string.
_species_id = '101346'
print(rabund[_species_id].gt(1e-5).mean())
print(rabund[_species_id].median())
species_taxonomy.loc[_species_id, 'taxonomy_string']

In [None]:
# Single-species summary: fraction of samples with relative abundance above
# 1e-5, median relative abundance across samples, and the taxonomy string.
_species_id = '102492'
print(rabund[_species_id].gt(1e-5).mean())
print(rabund[_species_id].median())
species_taxonomy.loc[_species_id, 'taxonomy_string']

In [None]:
# Single-species summary: fraction of samples with relative abundance above
# 1e-5, median relative abundance across samples, and the taxonomy string.
_species_id = '102506'
print(rabund[_species_id].gt(1e-5).mean())
print(rabund[_species_id].median())
species_taxonomy.loc[_species_id, 'taxonomy_string']