In [1]:
import calour as ca
import calour_utils as cu

failed to load logging config file


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import pandas as pd
import shutil
import matplotlib.colors
import skbio as skbio

  import pandas.util.testing as pdt


In [3]:
# Log at INFO level so calour reports sample/feature counts while loading and filtering.
ca.set_log_level('INFO')

In [4]:
%matplotlib notebook

In [5]:
pwd

'/Users/amnon/Projects/sheba/metaanalysis/scripts'

# Prepare the data

In [33]:
def _pick_random_sample_ids(group, num_keep, rng):
    '''Randomly choose up to num_keep sample ids from an experiment.

    Parameters
    ----------
    group: experiment object
        must expose a sample_metadata DataFrame containing a '_sample_id' column
    num_keep: int
        maximal number of samples to select. If the experiment holds fewer samples,
        all of them are returned (in random order)
    rng: np.random.Generator
        the random number generator used to draw the permutation

    Returns
    -------
    list
        the '_sample_id' values of the selected samples
    '''
    order = rng.permutation(len(group.sample_metadata))[:num_keep]
    # .iloc keeps the lookup strictly positional; plain [] with an integer on a
    # label-indexed Series is ambiguous (and deprecated in recent pandas).
    return group.sample_metadata['_sample_id'].iloc[order].tolist()


def join_all_exps(subset_size=23, subset_control=23, keep_less=True,normalize=False, random_seed=None):
    '''Load all experiments into a single dataset, optionally after subsampling

    Every sub-directory of ../studies that contains an all.*biom table (and an up.map.csv
    mapping file) is loaded, optionally reduced to a random subset of controls ('HC') and
    cases ('disease'), and joined into one experiment.

    Parameters
    ----------
    subset_size: int or None, optional
        if not None, randomly select subset_size 'disease' samples from each study.
        if None, no subsampling is done and all 'HC' and 'disease' samples are kept.
    subset_control: int or None, optional
        if subset_size is not None, randomly select subset_control 'HC' samples from each study.
        If None, use the value of subset_size
    keep_less: bool
        True to also keep studies with fewer samples than requested (all samples of the small group are kept).
        False to throw away studies with fewer samples than requested.
    normalize: bool, optional
        True to normalize each sample to 10k reads, False (default) to skip normalization
    random_seed : int, np.random.Generator instance or None, optional, default=None
        set the random number generator seed for the random permutations. Used to select subset of samples.
        If int, random_seed is the seed used by the random number generator;
        If Generator instance, random_seed is set to the random number generator;
        If None, then fresh, unpredictable entropy will be pulled from the OS

    Returns
    -------
    exp: the joined experiment, or None if no study was loaded
    '''
    # Resolve the normalization depth once. Rebinding `normalize` inside the loop would
    # turn True into 10000 on the first study and then into None for every later study.
    normalize_depth = 10000 if normalize is True else None
    # Documented fallback: controls use the same subset size as cases when not given.
    if subset_size is not None and subset_control is None:
        subset_control = subset_size

    exp = None
    total_ok = 0
    total_low = 0
    # Pre-set so the summary line below works even if the glob matches nothing.
    idx = -1
    for idx, cname in enumerate(glob.glob('../studies/*')):
        if not os.path.isdir(cname):
            continue
        # Initialize the random seed for each study, since the order of studies is OS dependent
        rng = np.random.default_rng(random_seed)
        print('**********')
        print('processing %s (%d)' % (cname, idx))
        tables = glob.glob(os.path.join(cname,'all.*biom'))
        print(tables)
        if len(tables)==0:
            print('dir %s does not contain a biom table' % cname)
            continue
        bt = tables[0]
        data = ca.read_amplicon(bt, os.path.join(cname,'up.map.csv'), normalize=normalize_depth, min_reads=1000)
        print('-------------')
        print(data)
        if subset_size is not None:
            # --- controls ---
            controls = data.filter_samples('type','HC')
            num_controls = len(controls.sample_metadata)
            if num_controls < subset_control:
                # NOTE: the message says "skipping" even when keep_less retains the study;
                # wording kept unchanged for compatibility with existing logs.
                print('not enough controls (%d). skipping' % num_controls)
                if not keep_less:
                    total_low += 1
                    continue
            control_ids = _pick_random_sample_ids(controls, subset_control, rng)
            # --- cases ---
            cases = data.filter_samples('type','disease')
            num_cases = len(cases.sample_metadata)
            if num_cases < subset_size:
                print('not enough disease (%d). skipping' % num_cases)
                if not keep_less:
                    total_low += 1
                    continue
            # Drawn from the same seeded generator as the controls so that the
            # selection is reproducible for a given random_seed.
            case_ids = _pick_random_sample_ids(cases, subset_size, rng)
            # keep only the selected controls and cases
            data = data.filter_samples('_sample_id', control_ids + case_ids)
            print('keeping %d total samples' % len(data.sample_metadata))
        else:
            print('filtering for HC and disease')
            data = data.filter_samples('type', ['HC', 'disease'])
            print(data)
        data.description = cname
        if exp is None:
            exp = data
            exp.sample_metadata['exp']=cname
        else:
            # NOTE(review): field=cname adds one metadata column per study (named after the
            # study path); the 'exp' column filled below is what records the study of origin.
            exp = exp.join_experiments(data, field=cname)
            # samples that were just joined have no 'exp' value yet - tag them with this study
            exp.sample_metadata.loc[exp.sample_metadata['exp'].isnull(),'exp']=cname
        total_ok += 1
    print('finished. Total processed %d. Total ok %d. Not enough samples %d' % (idx+1, total_ok, total_low))
    return exp

In [7]:
# Merge all studies using the function defaults (subset_size=23, subset_control=23,
# keep_less=True, no normalization); random_seed=2020 seeds the per-study sample selection.
exp=join_all_exps(random_seed=2020)

**********
processing ../studies/61 (0)
['../studies/61/all.biom']
2021-04-05 17:46:18 INFO loaded 41 samples, 2715 features
2021-04-05 17:46:18 INFO After filtering, 41 remain.
-------------
AmpliconExperiment with 41 samples, 2715 features
not enough controls (21). skipping
not enough disease (20). skipping
keeping 41 total samples
**********
processing ../studies/59 (1)
['../studies/59/all.biom']
2021-04-05 17:46:18 INFO loaded 33 samples, 2637 features
2021-04-05 17:46:18 INFO After filtering, 33 remain.
-------------
AmpliconExperiment with 33 samples, 2637 features
not enough controls (16). skipping
not enough disease (17). skipping
keeping 33 total samples
**********
processing ../studies/50 (2)
['../studies/50/all.biom']
2021-04-05 17:46:18 INFO loaded 58 samples, 959 features
2021-04-05 17:46:18 INFO After filtering, 58 remain.
-------------
AmpliconExperiment with 58 samples, 959 features
keeping 46 total samples
**********
processing ../studies/57 (3)
['../studies/57/all.bio

2021-04-05 17:46:54 INFO After filtering, 96 remain.
-------------
AmpliconExperiment with 96 samples, 3829 features
keeping 46 total samples
**********
processing ../studies/45 (21)
['../studies/45/all.biom']
2021-04-05 17:47:00 INFO loaded 1095 samples, 14981 features
2021-04-05 17:47:01 INFO After filtering, 1043 remain.
-------------
AmpliconExperiment with 1043 samples, 14981 features
keeping 46 total samples
**********
processing ../studies/6 (22)
['../studies/6/all.biom']
2021-04-05 17:47:19 INFO loaded 620 samples, 4175 features
2021-04-05 17:47:20 INFO After filtering, 612 remain.
-------------
AmpliconExperiment with 612 samples, 4175 features
keeping 46 total samples
**********
processing ../studies/28 (23)
['../studies/28/all.biom']
2021-04-05 17:47:37 INFO loaded 142 samples, 1306 features
2021-04-05 17:47:37 INFO After filtering, 135 remain.
-------------
AmpliconExperiment with 135 samples, 1306 features
keeping 46 total samples
**********
processing ../studies/17 (24)
[

2021-04-05 17:49:13 INFO After filtering, 1172 remain.
-------------
AmpliconExperiment with 1172 samples, 11834 features
not enough controls (13). skipping
not enough disease (12). skipping
keeping 25 total samples
**********
processing ../studies/8 (29)
['../studies/8/all.biom']
2021-04-05 17:49:53 INFO loaded 33 samples, 780 features
2021-04-05 17:49:53 INFO After filtering, 33 remain.
-------------
AmpliconExperiment with 33 samples, 780 features
not enough controls (11). skipping
not enough disease (22). skipping
keeping 33 total samples
**********
processing ../studies/21 (30)
['../studies/21/all.biom']
2021-04-05 17:50:27 INFO loaded 178 samples, 2597 features
2021-04-05 17:50:28 INFO After filtering, 178 remain.
-------------
AmpliconExperiment with 178 samples, 2597 features
keeping 46 total samples
**********
processing ../studies/44 (31)
['../studies/44/all.biom']
2021-04-05 17:51:10 INFO loaded 875 samples, 11915 features
2021-04-05 17:51:11 INFO After filtering, 835 remain

2021-04-05 18:00:07 INFO After filtering, 30 remain.
-------------
AmpliconExperiment with 30 samples, 2580 features
not enough controls (17). skipping
not enough disease (13). skipping
keeping 30 total samples
**********
processing ../studies/55 (41)
['../studies/55/all.biom']
2021-04-05 18:01:05 INFO loaded 74 samples, 3939 features
2021-04-05 18:01:05 INFO After filtering, 74 remain.
-------------
AmpliconExperiment with 74 samples, 3939 features
keeping 46 total samples
**********
processing ../studies/46 (42)
['../studies/46/all.biom']
2021-04-05 18:02:04 INFO loaded 584 samples, 10980 features
2021-04-05 18:02:04 INFO After filtering, 554 remain.
-------------
AmpliconExperiment with 554 samples, 10980 features
keeping 46 total samples
**********
processing ../studies/41 (43)
['../studies/41/all.biom']
2021-04-05 18:03:23 INFO loaded 84 samples, 3557 features
2021-04-05 18:03:23 INFO After filtering, 80 remain.
-------------
AmpliconExperiment with 80 samples, 3557 features
keepi

2021-04-05 18:09:37 INFO After filtering, 2373 remain.
-------------
AmpliconExperiment with 2373 samples, 13732 features
keeping 46 total samples
**********
processing ../studies/49 (52)
['../studies/49/all.biom']
2021-04-05 18:10:47 INFO loaded 273 samples, 8112 features
2021-04-05 18:10:47 INFO After filtering, 263 remain.
-------------
AmpliconExperiment with 263 samples, 8112 features
keeping 46 total samples
**********
processing ../studies/40 (53)
['../studies/40/all.biom']
2021-04-05 18:11:53 INFO loaded 86 samples, 3917 features
2021-04-05 18:11:53 INFO After filtering, 85 remain.
-------------
AmpliconExperiment with 85 samples, 3917 features
keeping 46 total samples
**********
processing ../studies/47 (54)
['../studies/47/all.biom']
2021-04-05 18:12:56 INFO loaded 257 samples, 7437 features
2021-04-05 18:12:56 INFO After filtering, 247 remain.
-------------
AmpliconExperiment with 247 samples, 7437 features
keeping 46 total samples
**********
processing ../studies/2 (55)
['.

2021-04-05 18:16:10 INFO After filtering, 333 remain.
-------------
AmpliconExperiment with 333 samples, 4664 features
keeping 46 total samples
**********
processing ../studies/14 (58)
['../studies/14/all.biom']
2021-04-05 18:17:25 INFO loaded 123 samples, 1271 features
2021-04-05 18:17:25 INFO After filtering, 123 remain.
-------------
AmpliconExperiment with 123 samples, 1271 features
keeping 46 total samples
**********
processing ../studies/22 (59)
['../studies/22/all.biom']
2021-04-05 18:18:34 INFO loaded 144 samples, 2687 features
2021-04-05 18:18:34 INFO After filtering, 144 remain.
-------------
AmpliconExperiment with 144 samples, 2687 features
keeping 46 total samples
**********
processing ../studies/25 (61)
['../studies/25/all.biom']
2021-04-05 18:19:48 INFO loaded 92 samples, 1781 features
2021-04-05 18:19:48 INFO After filtering, 89 remain.
-------------
AmpliconExperiment with 89 samples, 1781 features
not enough disease (20). skipping
keeping 43 total samples
finished. To

In [8]:
# Label the merged experiment, then display its summary.
exp.description='merged'
exp

AmpliconExperiment ("merged") with 2533 samples, 44537 features

In [9]:
exp.sample_metadata

Unnamed: 0,type,disease_cohort,age,_sample_id,_calour_original_abundance,exp,../studies/59,host_subject,title,host_sex,...,../studies/13,Isolation_source,sequencing_lab,../studies/5,../studies/14,../studies/22,Sample,rep,sample_type,../studies/25
ERR1385491,disease,T2D,adult,ERR1385491,68395.0,../studies/61,exp,,,,...,exp,,,exp,exp,exp,,,,exp
ERR1385206,HC,T2D,adult,ERR1385206,95408.0,../studies/61,exp,,,,...,exp,,,exp,exp,exp,,,,exp
ERR1383735,disease,T2D,adult,ERR1383735,60137.0,../studies/61,exp,,,,...,exp,,,exp,exp,exp,,,,exp
ERR1384980,disease,T2D,adult,ERR1384980,74487.0,../studies/61,exp,,,,...,exp,,,exp,exp,exp,,,,exp
ERR1382783,HC,T2D,adult,ERR1382783,48282.0,../studies/61,exp,,,,...,exp,,,exp,exp,exp,,,,exp
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR1776911,disease,,child,ERR1776911,4943.0,../studies/25,,,,,...,,,,,,,S74,a,Crohn_at _start_of_EEN,other
ERR1776929,disease,,child,ERR1776929,27155.0,../studies/25,,,,,...,,,,,,,S80,c,Crohn_at _start_of_EEN,other
ERR1776938,HC,,adult,ERR1776938,4227.0,../studies/25,,,,,...,,,,,,,S84,a,Relative_mother,other
ERR1776947,disease,,child,ERR1776947,10203.0,../studies/25,,,,,...,,,,,,,S88,a,Crohn_at _start_of_EEN,other


## Save the resulting biom table and mapping file

In [10]:
# Save the subsampled merged experiment under the '../allsamples/allsamples' prefix.
# The log shows the biom table is written without taxonomy metadata.
exp.save('../allsamples/allsamples')

2021-04-05 18:21:04 INFO Metadata field taxonomy not found. Saving biom table without metadata


In [11]:
# Presumably writes the feature sequences of the merged experiment as FASTA - confirm in calour docs.
exp.save_fasta('../allsamples/allsamples.fa')

In [12]:
exp

AmpliconExperiment ("merged") with 2533 samples, 44537 features

## Also save after filtering out blooming bacteria

In [13]:
# Filter features against blooming-bacteria.fa with negate=True; the displayed feature
# count drops from 44537 to 44526. Note: this rebinds `exp`, so re-running the cell
# filters the already-filtered experiment.
exp=exp.filter_by_fasta('../allsamples/blooming-bacteria.fa',negate=True)

In [14]:
# Save the bloom-filtered, subsampled experiment (reloaded later in the "Load the data" section).
exp.save('../allsamples/allsamples-no-bloom')

2021-04-05 18:21:10 INFO Metadata field taxonomy not found. Saving biom table without metadata


In [15]:
# Presumably writes the remaining (non-bloom) feature sequences as FASTA - confirm in calour docs.
exp.save_fasta('../allsamples/allsamples-no-bloom.fa')

In [16]:
exp

AmpliconExperiment ("merged") with 2533 samples, 44526 features

# Now create the single biom table for all samples without subsampling

In [34]:
# subset_size=None disables subsampling: every 'HC' and 'disease' sample of each study is kept.
# random_seed is passed for symmetry with the subsampled run; no random selection happens here.
expall=join_all_exps(subset_size=None, random_seed=2020)

**********
processing ../studies/61 (0)
['../studies/61/all.biom']
2021-04-05 19:56:44 INFO loaded 41 samples, 2715 features
2021-04-05 19:56:44 INFO After filtering, 41 remain.
-------------
AmpliconExperiment with 41 samples, 2715 features
filtering for HC and disease
AmpliconExperiment with 41 samples, 2715 features
**********
processing ../studies/59 (1)
['../studies/59/all.biom']
2021-04-05 19:56:44 INFO loaded 33 samples, 2637 features
2021-04-05 19:56:44 INFO After filtering, 33 remain.
-------------
AmpliconExperiment with 33 samples, 2637 features
filtering for HC and disease
AmpliconExperiment with 33 samples, 2637 features
**********
processing ../studies/50 (2)
['../studies/50/all.biom']
2021-04-05 19:56:44 INFO loaded 58 samples, 959 features
2021-04-05 19:56:44 INFO After filtering, 58 remain.
-------------
AmpliconExperiment with 58 samples, 959 features
filtering for HC and disease
AmpliconExperiment with 58 samples, 959 features
**********
processing ../studies/57 (3)


2021-04-05 19:57:12 INFO After filtering, 119 remain.
-------------
AmpliconExperiment with 119 samples, 2116 features
filtering for HC and disease
AmpliconExperiment with 119 samples, 2116 features
**********
processing ../studies/42 (20)
['../studies/42/all.biom']
2021-04-05 19:57:17 INFO loaded 99 samples, 3829 features
2021-04-05 19:57:17 INFO After filtering, 96 remain.
-------------
AmpliconExperiment with 96 samples, 3829 features
filtering for HC and disease
AmpliconExperiment with 96 samples, 3829 features
**********
processing ../studies/45 (21)
['../studies/45/all.biom']
2021-04-05 19:57:24 INFO loaded 1095 samples, 14981 features
2021-04-05 19:57:25 INFO After filtering, 1043 remain.
-------------
AmpliconExperiment with 1043 samples, 14981 features
filtering for HC and disease
AmpliconExperiment with 1043 samples, 14981 features
**********
processing ../studies/6 (22)
['../studies/6/all.biom']
2021-04-05 19:57:44 INFO loaded 620 samples, 4175 features
2021-04-05 19:57:44 I

2021-04-05 19:59:26 INFO After filtering, 1172 remain.
-------------
AmpliconExperiment with 1172 samples, 11834 features
filtering for HC and disease
AmpliconExperiment with 25 samples, 11834 features
**********
processing ../studies/8 (29)
['../studies/8/all.biom']
2021-04-05 19:59:58 INFO loaded 33 samples, 780 features
2021-04-05 19:59:58 INFO After filtering, 33 remain.
-------------
AmpliconExperiment with 33 samples, 780 features
filtering for HC and disease
AmpliconExperiment with 33 samples, 780 features
**********
processing ../studies/21 (30)
['../studies/21/all.biom']
2021-04-05 20:00:30 INFO loaded 178 samples, 2597 features
2021-04-05 20:00:30 INFO After filtering, 178 remain.
-------------
AmpliconExperiment with 178 samples, 2597 features
filtering for HC and disease
AmpliconExperiment with 178 samples, 2597 features
**********
processing ../studies/44 (31)
['../studies/44/all.biom']
2021-04-05 20:01:04 INFO loaded 875 samples, 11915 features
2021-04-05 20:01:04 INFO Af

2021-04-05 20:07:30 INFO After filtering, 587 remain.
-------------
AmpliconExperiment with 587 samples, 10422 features
filtering for HC and disease
AmpliconExperiment with 587 samples, 10422 features
**********
processing ../studies/52 (40)
['../studies/52/all.biom']
2021-04-05 20:08:31 INFO loaded 30 samples, 2580 features
2021-04-05 20:08:31 INFO After filtering, 30 remain.
-------------
AmpliconExperiment with 30 samples, 2580 features
filtering for HC and disease
AmpliconExperiment with 30 samples, 2580 features
**********
processing ../studies/55 (41)
['../studies/55/all.biom']
2021-04-05 20:09:31 INFO loaded 74 samples, 3939 features
2021-04-05 20:09:31 INFO After filtering, 74 remain.
-------------
AmpliconExperiment with 74 samples, 3939 features
filtering for HC and disease
AmpliconExperiment with 74 samples, 3939 features
**********
processing ../studies/46 (42)
['../studies/46/all.biom']
2021-04-05 20:10:31 INFO loaded 584 samples, 10980 features
2021-04-05 20:10:31 INFO Af

2021-04-05 20:19:22 INFO After filtering, 2373 remain.
-------------
AmpliconExperiment with 2373 samples, 13732 features
filtering for HC and disease
AmpliconExperiment with 224 samples, 13732 features
**********
processing ../studies/49 (52)
['../studies/49/all.biom']
2021-04-05 20:20:38 INFO loaded 273 samples, 8112 features
2021-04-05 20:20:38 INFO After filtering, 263 remain.
-------------
AmpliconExperiment with 263 samples, 8112 features
filtering for HC and disease
AmpliconExperiment with 263 samples, 8112 features
**********
processing ../studies/40 (53)
['../studies/40/all.biom']
2021-04-05 20:21:52 INFO loaded 86 samples, 3917 features
2021-04-05 20:21:52 INFO After filtering, 85 remain.
-------------
AmpliconExperiment with 85 samples, 3917 features
filtering for HC and disease
AmpliconExperiment with 85 samples, 3917 features
**********
processing ../studies/47 (54)
['../studies/47/all.biom']
2021-04-05 20:23:05 INFO loaded 257 samples, 7437 features
2021-04-05 20:23:05 IN

2021-04-05 20:26:54 INFO After filtering, 333 remain.
-------------
AmpliconExperiment with 333 samples, 4664 features
filtering for HC and disease
AmpliconExperiment with 333 samples, 4664 features
**********
processing ../studies/14 (58)
['../studies/14/all.biom']
2021-04-05 20:28:22 INFO loaded 123 samples, 1271 features
2021-04-05 20:28:22 INFO After filtering, 123 remain.
-------------
AmpliconExperiment with 123 samples, 1271 features
filtering for HC and disease
AmpliconExperiment with 123 samples, 1271 features
**********
processing ../studies/22 (59)
['../studies/22/all.biom']
2021-04-05 20:29:54 INFO loaded 144 samples, 2687 features
2021-04-05 20:29:54 INFO After filtering, 144 remain.
-------------
AmpliconExperiment with 144 samples, 2687 features
filtering for HC and disease
AmpliconExperiment with 144 samples, 2687 features
**********
processing ../studies/25 (61)
['../studies/25/all.biom']
2021-04-05 20:31:29 INFO loaded 92 samples, 1781 features
2021-04-05 20:31:29 INF

In [35]:
# Label the non-subsampled merge, then display its summary.
expall.description='merged no subsampling'
expall

AmpliconExperiment ("merged no subsampling") with 11459 samples, 44537 features

In [36]:
# Save the full (non-subsampled) merged experiment.
expall.save('../allsamples/allsamples-no-subsampling')

2021-04-05 20:33:17 INFO Metadata field taxonomy not found. Saving biom table without metadata


In [57]:
# Number of samples contributed by each study ('exp' holds the study directory), largest first.
expall.sample_metadata.exp.value_counts()

../studies/45    1043
../studies/44     835
../studies/37     727
../studies/6      612
../studies/29     594
../studies/39     587
../studies/46     554
../studies/9      451
../studies/7      441
../studies/3      334
../studies/5      333
../studies/48     280
../studies/49     263
../studies/47     247
../studies/27     233
../studies/12     224
../studies/62     196
../studies/2      179
../studies/21     178
../studies/10     174
../studies/51     164
../studies/15     162
../studies/20     151
../studies/22     144
../studies/28     135
../studies/14     123
../studies/16     119
../studies/53     115
../studies/23     114
../studies/11     109
../studies/42      96
../studies/25      89
../studies/43      89
../studies/57      85
../studies/40      85
../studies/18      84
../studies/4       82
../studies/41      80
../studies/55      74
../studies/60      73
../studies/31      70
../studies/19      68
../studies/54      63
../studies/33      58
../studies/50      58
../studies

## Also save after filtering out blooming bacteria

In [37]:
# Same bloom filter as applied to the subsampled experiment (negate=True). Rebinds `expall`.
expall=expall.filter_by_fasta('../allsamples/blooming-bacteria.fa',negate=True)

In [38]:
# Save the bloom-filtered, non-subsampled experiment.
expall.save('../allsamples/allsamples-no-subsampling-no-bloom')

2021-04-05 20:34:01 INFO Metadata field taxonomy not found. Saving biom table without metadata


# Load the data
(up to 23 control and 23 cases in each study)

In [39]:
# Reload the subsampled, bloom-filtered table (presumably the files written by the earlier
# exp.save('../allsamples/allsamples-no-bloom') cell). min_reads=1000 and normalize=10000
# are applied on load; the log shows 2533 samples loaded and 2514 remaining after filtering.
exp=ca.read_amplicon('../allsamples/allsamples-no-bloom.biom',
                     '../allsamples/allsamples-no-bloom_sample.txt',
                     min_reads=1000, normalize=10000)

2021-04-05 22:16:40 INFO loaded 2533 samples, 44526 features


  This is separate from the ipykernel package so we can avoid doing imports until


2021-04-05 22:16:45 INFO After filtering, 2514 remain.


In [40]:
exp

AmpliconExperiment with 2514 samples, 44526 features

### Keep only features present in at least 10 samples
(so we filter with a minimum prevalence of 9.5/num_of_samples)

In [41]:
# The threshold is given as a fraction of samples: 9.5/num_samples corresponds to the
# "at least 10 samples" stated in the heading above.
# NOTE(review): the result is stored in `tt`, but the next code cell reassigns `tt` from
# `exp`, so this filter is not applied to anything saved downstream (those tables still
# show 44526 features) - confirm whether that is intended.
tt=exp.filter_prevalence(9.5/len(exp.sample_metadata))
tt

2021-04-05 22:16:50 INFO After filtering, 2692 remain.


AmpliconExperiment with 2514 samples, 2692 features

### merge by sample type for taxonomy plot

In [42]:
# Build a new sample field 'exptype' from the 'exp' (study) and 'type' (HC/disease) fields.
# Starts from `exp` (all features) and returns a new experiment (inplace=False).
tt=exp.join_metadata_fields('exp','type','exptype',axis=0,inplace=False)

In [43]:
# Collapse samples sharing the same 'exptype' value using the mean - presumably one
# mean profile per study/type combination. Rebinds `tt`.
tt=tt.aggregate_by_metadata('exptype', agg='mean')

In [44]:
# Order the aggregated samples by study so the two groups of a study end up adjacent.
tt=tt.sort_samples('exp')

In [45]:
# Open the interactive calour heatmap (the output is a PlotGUI_QT5 object);
# 'type' is presumably the sample field used to annotate the plot - confirm in calour_utils.
cu.splot(tt,'type')

creating logger


<calour.heatmap.plotgui_qt5.PlotGUI_QT5 at 0x7f911ba8fe10>

In [46]:
# Save the per-study, per-type mean profiles.
tt.save('../allsamples/cohort-disease-mean')

2021-04-05 22:17:05 INFO Metadata field taxonomy not found. Saving biom table without metadata


### Save separately the healthy and sick groups

In [47]:
# Keep only the healthy-control ('HC') mean profiles and display the summary.
ttt=tt.filter_samples('type','HC')
ttt

AmpliconExperiment with 59 samples, 44526 features

In [48]:
# Save the healthy-control mean profiles.
ttt.save('../allsamples/cohort-disease-mean-HC')

2021-04-05 22:17:08 INFO Metadata field taxonomy not found. Saving biom table without metadata


In [49]:
# Keep only the 'disease' mean profiles and display the summary. Reuses the name `ttt`.
ttt=tt.filter_samples('type','disease')
ttt

AmpliconExperiment with 59 samples, 44526 features

In [50]:
# Save the disease mean profiles.
ttt.save('../allsamples/cohort-disease-mean-disease')

2021-04-05 22:17:10 INFO Metadata field taxonomy not found. Saving biom table without metadata


### Save single combined healthy/sick (for all cohorts together)

In [51]:
# Average the per-study mean profiles within each 'type', giving 2 samples (see output).
# Because `tt` already holds per-study means, this is a mean of study means rather than
# a mean over individual samples.
ttt=tt.aggregate_by_metadata('type', agg='mean')
ttt

AmpliconExperiment with 2 samples, 44526 features

In [52]:
# Save the two combined (HC / disease) mean profiles.
ttt.save('../allsamples/mean-type-combined')

2021-04-05 22:17:12 INFO Metadata field taxonomy not found. Saving biom table without metadata
