In [None]:
# Move the working directory up one level; every path used later in this
# notebook ('meta/...', 'ref/...', 'data/...') is relative to it.
# NOTE(review): the chdir is relative, so re-running this cell without
# restarting the kernel moves up one more directory each time.
import os as _os
_os.chdir('..')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from operator import eq, sub
import scipy.stats
import scipy as sp
import seaborn as sns
from lib.pandas_util import idxwhere
import sfacts as sf
from tqdm import tqdm

In [None]:
# Load the metadata tables, each indexed by its own identifier column.
mgen = pd.read_table('meta/mgen.tsv', index_col='library_id')
preparation = pd.read_table('meta/preparation.tsv', index_col='preparation_id')
stool = pd.read_table('meta/stool.tsv', index_col='stool_id')
# This line was previously garbled ("pd.read_tabsfacts'meta/visit.tsv'"), which
# is a SyntaxError; restored to the same read_table call used by its siblings.
visit = pd.read_table('meta/visit.tsv', index_col='visit_id')
subject = pd.read_table('meta/subject.tsv', index_col='subject_id')

# Attach metadata to each library by following the id columns:
# library -> preparation -> stool -> visit -> subject.
# 'library_type' is dropped from `preparation` before joining
# (presumably because `mgen` carries a column of the same name — TODO confirm).
mgen_meta = (
    mgen
    .join(preparation.drop(columns='library_type'), on='preparation_id')
    .join(stool, on='stool_id')
    .join(visit, on='visit_id')
    .join(subject, on='subject_id')
)

# Left joins leave NaN where a key is unmatched; fail fast if any library
# ended up without a subject_id.
assert not mgen_meta.subject_id.isna().any(), 'some libraries have no subject_id after joining metadata'

# mgen_meta.columns

In [None]:
# Read the headerless taxonomy table from ref/gtpro and keep only the taxonomy
# string, split on ';' into a list, in a one-column frame indexed by species_id
# (cast to str).
species_taxonomy = (
    pd.read_table(
        'ref/gtpro/species_taxonomy_ext.tsv',
        names=['genome_id', 'species_id', 'taxonomy_string'],
    )
    .assign(species_id=lambda x: x.species_id.astype(str))
    .set_index('species_id')
    .taxonomy_string
    .str.split(';')
    .to_frame()
)

# For each (column name, depth) pair add a column holding the first `depth`
# taxonomy fields re-joined with ';'.
# NOTE(review): 'p__' / 'f__' / 's__' presumably stand for phylum / family /
# species prefixes — confirm against the reference file.
for level_name, level_number in [('p__', 2), ('f__', 5), ('s__', 7)]:
    truncated_lineage = species_taxonomy['taxonomy_string'].apply(
        lambda x: ';'.join(x[:level_number])
    )
    species_taxonomy = species_taxonomy.assign(**{level_name: truncated_lineage})

species_taxonomy

In [None]:
# Read the long-format depth table indexed by (sample, species_id) and pivot
# it to a sample x species matrix; combinations absent from the file become 0.
species_depth = (
    pd.read_table(
        'data/core.a.r.proc.gtpro.species_depth.tsv',
        index_col=['sample', 'species_id'],
    )
    .squeeze()
    .unstack('species_id', fill_value=0)
)

# Species ids as strings, in column order.
species_list = list(species_depth.columns.astype(str))

In [None]:
import warnings

# Suppress the "divide by zero encountered in log2" RuntimeWarning.
# NOTE(review): the filter is pinned to lineno=727 of whichever module emits
# the warning; if that dependency changes, the line number may no longer
# match and the warning will show up again.
warnings.filterwarnings('ignore', message='divide by zero encountered in log2', category=RuntimeWarning, lineno=727)

# For every species that has a filtered metagenotype file on disk, record
# (mean metagenotype entropy, number of positions, number of samples).
_result = {}
for species_id in tqdm(species_list):
    path = f'data/sp-{species_id}.core.a.r.proc.gtpro.filt-poly05-cvrg05.mgen.nc'
    # The os module is bound as `_os` in this notebook (first cell); the bare
    # name `os` used here before was undefined and raised NameError.
    if not _os.path.exists(path):
        # Species without a metagenotype file are skipped.
        continue
    world = sf.World.load(path)
    _result[species_id] = (float(world.metagenotype.entropy().mean()), world.sizes['position'], world.sizes['sample'])

# The dict is keyed by species, so species start as columns; transpose to get
# one row per species with the three summary columns.
result = pd.DataFrame(_result, index=['entropy', 'num_positions', 'num_samples']).T

In [None]:
# Display the per-species summary table (entropy, num_positions, num_samples).
result

In [None]:
# Attach the per-species summary to the taxonomy table and show the 20 species
# with the highest mean entropy.
(
    species_taxonomy
    .join(result)
    .sort_values('entropy', ascending=False)
    .head(20)
)