# Preprocess the kindergarten data and save biom tables and mapping files

In [1]:
import calour as ca
import calour_utils as cu

  import pandas.util.testing as pdt


failed to load logging config file


In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
import numpy as np
import matplotlib as mpl
import pandas as pd
import scipy as sp;


In [3]:
pwd

'/Users/amnon/Projects/sheba/ortal-kindergarten'

In [4]:
# Set calour's log threshold to the numeric level 11.
# NOTE(review): presumably this follows Python logging levels (DEBUG=10, INFO=20) — confirm.
ca.set_log_level(11)

In [5]:
%matplotlib inline

In [6]:
# Obtain the 'dbbact' database interface through calour's private helper.
# `db` is not referenced again in the visible part of this notebook.
db=ca.database._get_database_class('dbbact')

# Load the data
## We have 2 runs, with some samples duplicated between the two runs,
## so we load without normalization (only the min_reads=500 cutoff is applied at load time), then aggregate duplicated samples (the code below uses agg='mean') and only then filter

In [7]:
# Quiet calour's logger while the first run is read, then restore INFO.
ca.set_log_level('ERROR')
# normalize=None: no read-count normalization is requested at load time.
# NOTE(review): min_reads=500 presumably drops samples with fewer than
# 500 reads already at this stage — confirm.
dat = ca.read_amplicon(
    'data/16S_DB1-10.biom',
    'data/DB7_10_11_ganim_amnon_map_14July.txt',
    normalize=None,
    min_reads=500,
)
ca.set_log_level('INFO')

In [8]:
# Quiet calour's logger while the second run is read, then restore INFO.
ca.set_log_level('ERROR')
# Same mapping file and load settings as the first run; only the biom
# table differs.
dat2 = ca.read_amplicon(
    'data/all.db11.biom',
    'data/DB7_10_11_ganim_amnon_map_14July.txt',
    normalize=None,
    min_reads=500,
)
ca.set_log_level('INFO')

## get rid of samples not in mapping file


In [9]:
# Apply the same 'BarcodeSequence' filter to both runs.
# NOTE(review): passing None as the value presumably keeps only samples
# whose BarcodeSequence is not missing, i.e. samples present in the
# mapping file — confirm.
dat = dat.filter_samples('BarcodeSequence', None)
dat2 = dat2.filter_samples('BarcodeSequence', None)

## combine the 2 runs

In [10]:
# Merge the second run (dat2) into the first.
# NOTE(review): 'orig_run' presumably names the sample-metadata field that
# records which run each sample came from — confirm.
dat=dat.join_experiments(dat2,'orig_run')

In [11]:
# Display the joined experiment summary (sample and feature counts).
dat

AmpliconExperiment ("join  & ") with 271 samples, 14052 features

## Join samples with same sample_ID (i.e. rerun of same sample in the 2 runs)

In [12]:
# Collapse samples that share a 'sample_ID' into one sample, aggregating
# with the mean (so re-sequenced samples may end up with fractional counts).
dat=dat.aggregate_by_metadata('sample_ID',agg='mean')
# Display the summary to see how many samples remain.
dat

AmpliconExperiment ("join  & ") with 268 samples, 14052 features

## Get rid of non-child samples (family members)

In [13]:
# negate=True inverts the filter: samples whose 'kindergarten' field is
# 'Family' are presumably the ones removed — confirm.
dat=dat.filter_samples('kindergarten','Family',negate=True)
# Display the summary to see how many samples remain.
dat

AmpliconExperiment ("join  & ") with 268 samples, 14052 features

## Sort by kindergarten, then by timepoint within each kindergarten, and by subject ID (pn_ID) within each timepoint

In [38]:
# Sort from the least to the most significant key: the last sort
# ('kindergarten') determines the primary order.
# NOTE(review): the nesting relies on sort_samples being a stable sort — confirm.
dat = (
    dat.sort_samples('pn_ID')
    .sort_samples('Time')
    .sort_samples('kindergarten')
)

## Save the non-normalized/rarefied table

In [39]:
# Save the joined, non-normalized experiment using 'data/gan-joined' as the
# output path (the log output reports a biom table being written).
dat.save('data/gan-joined')

2020-08-16 16:12:34 INFO Metadata field taxonomy not found. Saving biom table without metadata


# get rid of samples with not enough reads
## How many samples do we lose for different read-depths?

In [40]:
# Total number of reads per sample (sum over the feature axis, axis=1).
# Request a dense array explicitly: summing a scipy sparse matrix returns an
# (n, 1) np.matrix, which is not usable as the 1-D boolean sample mask
# built from `nreads` when the low-read samples are filtered out.
nreads = np.sum(dat.get_data(sparse=False), axis=1)

In [41]:
# For each candidate threshold (0, 1000, ..., 7000 reads), report how many
# samples fall strictly below it and would therefore be discarded.
for min_reads in range(0, 8000, 1000):
    num_deleted = np.sum(nreads < min_reads)
    print('min reads %d: num samples deleted %d' % (min_reads, num_deleted))

min reads 0: num samples deleted 0
min reads 1000: num samples deleted 0
min reads 2000: num samples deleted 0
min reads 3000: num samples deleted 0
min reads 4000: num samples deleted 0
min reads 5000: num samples deleted 6
min reads 6000: num samples deleted 11
min reads 7000: num samples deleted 22


In [42]:
# Display the experiment summary before applying the read-depth threshold.
dat

AmpliconExperiment ("join  & ") with 268 samples, 14052 features

### We choose the threshold to be 4000 (so we lose 0 samples)

In [43]:
# Keep only the samples (axis=0) whose total read count is at least 4000,
# using a boolean mask derived from nreads.
dat=dat.reorder(nreads>=4000,axis=0)

# Prepare the normalized biom table

## Normalize to 10k reads/sample

In [44]:
# Normalize each sample to 10000 reads and keep the result in `gan`;
# `dat` is reused later to build the subsampled table.
gan=dat.normalize(10000)

## Get rid of features with <10 reads total over all samples, and cluster the features

In [45]:
# Drop rare features from the normalized table, using 10 as the cutoff on
# the abundance summed over all samples.
gan = gan.filter_sum_abundance(10)
# Cluster the features.
# NOTE(review): the log shows a second "After filtering" message, so
# cluster_features presumably applies its own abundance filter first — confirm.
gan = gan.cluster_features()

2020-08-16 16:12:37 INFO After filtering, 928 remain.
2020-08-16 16:12:37 INFO After filtering, 928 remain.


In [46]:
# Display the normalized, filtered experiment summary.
gan

AmpliconExperiment ("join  & ") with 268 samples, 928 features

## Save the normalized biom table

In [47]:
# Save the normalized, filtered experiment using 'data/gan-normalized' as the
# output path (the log output reports a biom table being written).
gan.save('data/gan-normalized')

2020-08-16 16:12:38 INFO Metadata field taxonomy not found. Saving biom table without metadata


### and save the fasta file for adding qiime2 taxonomy

In [52]:
# Write the features of the normalized table to a FASTA file, to be used as
# input for the qiime2 taxonomy assignment.
gan.save_fasta('data/gan-normalized.fa')

### The output of the qiime2 taxonomy is in data/taxonomy.fa
We load it using the feature_metadata_file parameter of read_amplicon

# Prepare the subsampled (rarefied) table for the alpha-diversity

## Subsample to 4000 reads/sample

In [48]:
# Cast the count matrix to integers in place before subsampling.
# NOTE(review): astype(int) truncates toward zero, so fractional counts
# produced by the earlier mean aggregation of duplicated samples lose their
# fractional part; this also modifies `dat` itself — confirm this is intended.
dat.data = dat.data.astype(int)
# Subsample every sample to 4000 reads.
dat_subsampled = dat.subsample_count(4000)

In [49]:
# Per-sample richness: count the features with a non-zero value in each
# sample of the subsampled table and store it as the 'numSpecies'
# sample-metadata column.
dat_subsampled.sample_metadata['numSpecies']=np.sum(dat_subsampled.data>0,axis=1)

In [50]:
# Display the subsampled experiment summary.
dat_subsampled

AmpliconExperiment ("join  & ") with 268 samples, 14052 features

In [51]:
# Save the subsampled experiment using 'data/gan-subsampled' as the output
# path (the log output reports a biom table being written).
dat_subsampled.save('data/gan-subsampled')

2020-08-16 16:12:39 INFO Metadata field taxonomy not found. Saving biom table without metadata
