In [1]:
import calour as ca
import calour_utils as cu

failed to load logging config file


In [2]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import glob
import os
import pandas as pd
import shutil

In [3]:
ca.set_log_level('INFO')

In [4]:
%matplotlib notebook

In [5]:
pwd

'/Users/amnon/git/paper-metaanalysis/scripts'

In [6]:
pd.__version__

'1.2.5'

In [7]:
from os.path import join
save_dir = '../ratios/'

# Prepare the ratios table
## For each study in the directory, calculate the sick/healthy ratio for each bacteria
### Steps:
#### For each study identify bacteria significantly different between sick/healthy
Using at most 23 sick and 10 healthy samples per study (for fair comparison between study sizes)

Using FDR=0.25
#### Join the significant bacteria from all studies
To a single list
#### Calculate the log2(sick/healthy) for each bacteria in each study
On all samples from each study (to get better estimate of the effect size).

Save in a biom table. 1 entry per bacteria per study (log2 ratio sick/healthy)

In [8]:
def norm_diff(exp, field, val1, val2, **kwargs):
    '''same parameters as diff_abundance, but normalize effect size to -1:1'''
    dd=exp.diff_abundance(field, val1, val2, **kwargs)
    g1 = exp.filter_samples(field, val1)
    if val2 is None:
        g2 = exp.filter_samples(field, val1, negate=True)
    else:
        g2 = exp.filter_samples(field, val2)
    n_g1 = len(g1.sample_metadata)
    n_g2 = len(g2.sample_metadata)
    odif = dd.feature_metadata['_calour_stat'] / ((n_g1 + n_g2) / 2)
    dd.feature_metadata['_calour_stat'] = odif
    return dd

In [9]:
def diff_all(fasta_file=None, alpha=0.1, subset_size=None, subset_control=None, random_seed=None):
    '''Iterate over "studies" directory. Each subdir is an experiment with a biom table (all.XXX.biom) and map file (up.map.csv).
    The map file contains the field "type" with the values "HC"/"disease".
    If fasta_file is not None, we keep only bacteria from the experiment present in fasta_file (for second stage diff abundace to reduce FDR cost)
    We then do diff. abundance with thresh. alpha
    Finally we save these disease associated bacteria to a new biom table diff-ALPHA.biom
    
    Parameters
    ----------
    fasta_file:str or None, optional
        if not None, keep only bacteria present in fasta_file
    alpha: float, optional
        the alpha value for the dsFDR (see diff_abundance()  )
    subset_size: int or None, optional
        if not None, keep only subset_size samples from each group (HC/disease)
    subset_control: int or None, optional
        if None,  control subset_size is the same as subset_size.
        if not None, this is the minimal control subset_size
    '''
    x = []
    savedir = join(save_dir,'diff-%s' % alpha)
    if subset_control is None:
        subset_control = subset_size
    if subset_size is not None:
        savedir += '-subset-%d' % subset_size
    print('deleting output dir %s' % savedir)
    try:
        shutil.rmtree(savedir)
    except:
        pass
    if fasta_file is not None:
        savedir += '-filtered'
    try:
        os.mkdir(savedir)
    except:
        pass
    num_processed = 0
    for cname in glob.glob('../studies/*'):
        if os.path.isdir(cname):
            # re-set the random seed for each directory since order of directories may vary between OSs
            if random_seed is not None:
                np.random.seed(random_seed)
            print('**********')
            print(cname)
            x.append(cname)
            tables = glob.glob(os.path.join(cname,'all.*biom'))
            print(tables)
            if len(tables)==0:
                print('dir %s does not contain a biom table' % cname)
                continue
            bt=tables[0]
            data=ca.read_amplicon(os.path.join(bt),os.path.join(cname,'up.map.csv'),normalize=10000,min_reads=1000)
            print('-------------')
            print(data)
            # if enough samples, keep only random subet
            if subset_size is not None:
                tt=data.filter_samples('type','HC')
                ns = len(tt.sample_metadata)
                if ns < subset_control:
                    print('not enough controls (%d). skipping' % ns)
                    continue
                # randomly subsample
                rp = np.random.permutation(ns)
                rp = rp[:subset_size]
                new_ids = [tt.sample_metadata['_sample_id'][x] for x in rp]
                # similar for disease
                tt=data.filter_samples('type','disease')
                ns = len(tt.sample_metadata)
                if ns < subset_size:
                    print('not enough disease (%d). skipping' % ns)
                    continue
                # randomly subsample
                rp = np.random.permutation(ns)
                rp = rp[:subset_size]
                new_ids2 = [tt.sample_metadata['_sample_id'][x] for x in rp]
                # join the ids for the sick and controls
                all_ids = new_ids + new_ids2
                # and finally
                data = data.filter_samples('_sample_id',all_ids)
                print('keeping')
            data=data.filter_sum_abundance(10)
            if fasta_file is not None:
                data=data.filter_by_fasta(fasta_file)
            dd=norm_diff(data,'type','disease', 'HC',alpha=alpha,random_seed=1234)
#             dd=data.diff_abundance('type','disease', 'HC',alpha=alpha,random_seed=1234)
    #         dd=cu.add_taxonomy(dd)
            savename = os.path.join(savedir,'diff-%s' % (os.path.basename(cname)))
            dd.save_biom(savename+'.biom',add_metadata=None)
            #dd.save_metadata(savename+'.tsv',axis='f')
            dd.save(savename)
            num_processed += 1
    print('processed %d studies' % num_processed)

In [10]:
ca.set_log_level('ERROR')

In [43]:
diff_all(alpha=0.25,subset_size=23, subset_control=10, random_seed=2020)

deleting output dir ../ratios/diff-0.25-subset-23
**********
../studies/61
['../studies/61/all.biom']
-------------
AmpliconExperiment with 41 samples, 2715 features
not enough disease (20). skipping
**********
../studies/59
['../studies/59/all.biom']
-------------
AmpliconExperiment with 33 samples, 2637 features
not enough disease (17). skipping
**********
../studies/50
['../studies/50/all.biom']
-------------
AmpliconExperiment with 58 samples, 959 features
keeping
**********
../studies/57
['../studies/57/all.biom']
-------------
AmpliconExperiment with 85 samples, 4058 features
keeping
**********
../studies/32
['../studies/32/all.biom']
-------------
AmpliconExperiment with 43 samples, 1748 features
keeping
**********
../studies/56
['../studies/56/all.biom']
-------------
AmpliconExperiment with 43 samples, 3045 features
not enough disease (21). skipping
**********
../studies/51
['../studies/51/all.biom']
-------------
AmpliconExperiment with 164 samples, 3240 features
keeping
****

# load everything and save to a single table and fasta file

In [44]:
def merge_results(files_dir):
    '''Load the results of the diff_all() and merge to a single table
    save the sequences to "combined-FILES-DIR.fasta"
    returns the Experiment
    '''
    df = None
    all_filenames = glob.glob(join(save_dir, files_dir) + "/*_feature.txt")
    print(all_filenames)
    for j,f in enumerate(all_filenames):
        dt=pd.read_csv((f), sep='\t')
        dt=dt[['_feature_id', '_calour_stat']]
        ccolname='-'.join(os.path.basename(f).split('-')[1:]).split('_feature.txt')[0]
        dt=dt.rename(columns = { '_calour_stat': ccolname})
        if df is None:
            df = dt
        else:
            df=pd.merge(df, dt, how='outer', on = '_feature_id')
#     print('processed %d files' % (j+1))
    df.fillna(value=0, inplace=True)
    oname = join(save_dir, 'combined-%s' % files_dir)
    df.to_csv(oname + '.tsv', sep='\t', index=False, encoding='utf-8-sig')
    df=df.set_index('_feature_id',drop=False)
    exp = ca.Experiment.from_pandas(df.transpose())
    exp.save_fasta(oname+'.fasta')
    return exp

In [45]:
exp = merge_results('diff-0.25-subset-23')

['../ratios/diff-0.25-subset-23/diff-12_feature.txt', '../ratios/diff-0.25-subset-23/diff-62_feature.txt', '../ratios/diff-0.25-subset-23/diff-50_feature.txt', '../ratios/diff-0.25-subset-23/diff-45_feature.txt', '../ratios/diff-0.25-subset-23/diff-4_feature.txt', '../ratios/diff-0.25-subset-23/diff-29_feature.txt', '../ratios/diff-0.25-subset-23/diff-40_feature.txt', '../ratios/diff-0.25-subset-23/diff-49_feature.txt', '../ratios/diff-0.25-subset-23/diff-39_feature.txt', '../ratios/diff-0.25-subset-23/diff-17_feature.txt', '../ratios/diff-0.25-subset-23/diff-55_feature.txt', '../ratios/diff-0.25-subset-23/diff-14_feature.txt', '../ratios/diff-0.25-subset-23/diff-2_feature.txt', '../ratios/diff-0.25-subset-23/diff-43_feature.txt', '../ratios/diff-0.25-subset-23/diff-7_feature.txt', '../ratios/diff-0.25-subset-23/diff-46_feature.txt', '../ratios/diff-0.25-subset-23/diff-18_feature.txt', '../ratios/diff-0.25-subset-23/diff-53_feature.txt', '../ratios/diff-0.25-subset-23/diff-23_feature.t

# redo diff abundance with fasta filtering
in order to get effect size. so we use alpha=1

In [46]:
diff_all(fasta_file=join(save_dir,'combined-diff-0.25-subset-23.fasta'),alpha=1,subset_size=None)

deleting output dir ../ratios/diff-1
**********
../studies/61
['../studies/61/all.biom']
-------------
AmpliconExperiment with 41 samples, 2715 features
**********
../studies/59
['../studies/59/all.biom']
-------------
AmpliconExperiment with 33 samples, 2637 features
**********
../studies/50
['../studies/50/all.biom']
-------------
AmpliconExperiment with 58 samples, 959 features
**********
../studies/57
['../studies/57/all.biom']
-------------
AmpliconExperiment with 85 samples, 4058 features
**********
../studies/32
['../studies/32/all.biom']
-------------
AmpliconExperiment with 43 samples, 1748 features
**********
../studies/56
['../studies/56/all.biom']
-------------
AmpliconExperiment with 43 samples, 3045 features
**********
../studies/51
['../studies/51/all.biom']
-------------
AmpliconExperiment with 164 samples, 3240 features
**********
../studies/58
['../studies/58/all.biom']
-------------
AmpliconExperiment with 45 samples, 3005 features
**********
../studies/60
['../studi

In [47]:
exp = merge_results('diff-1-filtered')

['../ratios/diff-1-filtered/diff-12_feature.txt', '../ratios/diff-1-filtered/diff-62_feature.txt', '../ratios/diff-1-filtered/diff-50_feature.txt', '../ratios/diff-1-filtered/diff-20_feature.txt', '../ratios/diff-1-filtered/diff-45_feature.txt', '../ratios/diff-1-filtered/diff-4_feature.txt', '../ratios/diff-1-filtered/diff-29_feature.txt', '../ratios/diff-1-filtered/diff-59_feature.txt', '../ratios/diff-1-filtered/diff-40_feature.txt', '../ratios/diff-1-filtered/diff-1_feature.txt', '../ratios/diff-1-filtered/diff-8_feature.txt', '../ratios/diff-1-filtered/diff-49_feature.txt', '../ratios/diff-1-filtered/diff-39_feature.txt', '../ratios/diff-1-filtered/diff-17_feature.txt', '../ratios/diff-1-filtered/diff-25_feature.txt', '../ratios/diff-1-filtered/diff-55_feature.txt', '../ratios/diff-1-filtered/diff-26_feature.txt', '../ratios/diff-1-filtered/diff-56_feature.txt', '../ratios/diff-1-filtered/diff-14_feature.txt', '../ratios/diff-1-filtered/diff-33_feature.txt', '../ratios/diff-1-filt

In [48]:
def prepare_exp(f, o):
    '''Prepare the merged experiment for analysis and save it
    
    Parameters
    ----------
    f: str
        name of the tsv file output from merge_results() (XXX.tsv)
    o: str
        name of the output o.biom o.map.tsv
    
    Returns
    -------
    ca.Experiment
    '''
    data=pd.read_csv(join(save_dir,f),sep='\t',index_col='_feature_id')
    dat=data.to_numpy(dtype=float)
    newexp=ca.Experiment(dat.T,sample_metadata=pd.DataFrame(data.columns,index=data.columns),feature_metadata=pd.DataFrame(data.index, index=data.index))
    o = join(save_dir, o)
    newexp.save(o)
#    newexp.save_biom(o+'.biom',add_metadata=None)
#    newexp.save_metadata(o+'.map.tsv')
    xx=ca.read_amplicon(o+'.biom',o+'_sample.txt',normalize=None, min_reads=None)
    return xx

# Save the ratios table to 'ratios/ratios.biom'
## sample metadata in ratios_sample.txt

In [49]:
xx=prepare_exp('combined-diff-1-filtered.tsv','ratios')

# Remove the blooming bacteria (since we are using American Gut dataset)
see:

Amir, Amnon, et al. "Correcting for microbial blooms in fecal samples during room-temperature shipping." MSystems 2.2 (2017).


In [11]:
xx=ca.read_amplicon('../ratios/ratios.biom','../studies/index.csv',normalize=None, min_reads=None)

In [51]:
xx

AmpliconExperiment with 59 samples, 737 features

In [52]:
xx=xx.filter_by_fasta('../allsamples/blooming-bacteria.fa',negate=True)


In [53]:
xx.save('../ratios/ratios_no_bloom')

In [54]:
xx

AmpliconExperiment with 59 samples, 731 features