#### This notebook applies Discrete-FDR (DS-FDR) to identify significant OTUs (reference: Jiang et al., mSystems, 2017)

In [1]:
import numpy as np
import pandas as pd
from biom import load_table
from gneiss.util import match
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from dsfdr import dsfdr
from dsfdr import simulation
from dsfdr import statistics
from dsfdr import transform

In [3]:
# Fix the state of NumPy's global random generator so repeated runs start from the same seed.
# NOTE(review): presumably dsfdr draws its permutations from np.random — confirm.
np.random.seed(2018)

## Prepare biom table and mapping file

### load biom table

In [4]:
def convert_biom_to_pandas(table):
    """Return the contents of a BIOM table as a pandas DataFrame.

    The table's ``matrix_data`` is densified and transposed, so the rows of
    the result are labelled with the sample ids and the columns with the
    observation ids.
    """
    sample_ids = table.ids(axis='sample')
    observation_ids = table.ids(axis='observation')
    # densify, then flip the matrix so that samples become the rows
    counts = np.array(table.matrix_data.todense()).T
    return pd.DataFrame(counts, index=sample_ids, columns=observation_ids)

In [5]:
# Read the feature table from the BIOM file in the working directory.
table = load_table('deblur-feature-table.biom') 
# Convert it to a DataFrame with samples as rows and features as columns.
otu_table = convert_biom_to_pandas(table)

In [6]:
otu_table.head()

Unnamed: 0,TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGACTGTCAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCCCGCCGTTGAAACTGGTGGTCTTGAGTGGATGAGAAGTACGCGGAATGCGTGGT,TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGACAGTTAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCCCGCCGTTGAAACTGATTGTCTTGAGTGGGCGAGAAGTATGCGGAATGCGTGGT,TACGTAGGTGGCGAGCGTTATCCGGAATGATTGGGCGTAAAGGGTGCGCAGGCGGTCCTGCAAGTCTGGAGTGAAACGCATGAGCTCAACTCATGCATGGCTTTGGAAACTGGAGGACTGGAGAGCAGGAGAGGGCGGTGGAACTCCATG,TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCAGCAAAGGTCTGTGGTGAAAGACTGAAGCTTAACTTCAGTAAGCCATAGAAACCGGGCAGCTAGAGTGCAGGAGAGGATCGTGGAATTCCATGT,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGGGCGCAGACGGCGATGCAAGCCAGGAGTGAAAGCCCGGGGCCCAACCCCGGGACTGCTCTTGGAACTGCGTGGCTGGAGTGCAGGAGGGGCAGGCGGAATTCCTGG,TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTCCGTTAAGTCAGCGGTAAAATTGCGGGGCTCAACCCCGTCGAGCCGTTGAAACTGGCAGACTTGAGTTGGCGAGAAGTACGCGGAATGCGCGGT,TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCCTGCCAAGTCTGATGTGAAATACCGGGGCTCAACCCCGGGGCTGCATTGGAAACTGGCAGGCTGGAGTGTCGGAGAGGCAGGCGGAATTCCTAG,TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGAAAGCAAGTCAGATGTGAAAACCATGGGCTCAACCTGTGGCCTGCATTTGAAACTGTTTTTCTTGAGTACTGGAGAGGCAGACGGAATTCCTAG,TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCTCAAGTCAGCGGTCAAATCGCGGGGCTCAACCCCGTTCCGCCGTTGAAACTGGGAGCCTTGAGTGCGCGAGAAGTAGGCGGAATGCGTGGT,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTGATGCAAGTCTGAAGTGAAAGGCGGGGGCTCAACCCCCGGACTGCTTTGGAAACTGTATGACTGGAGTGCAGGAGAGGTAAGTGGAATTCCTAG,...,TACGGAGGGTGCAAGCGTTACCCGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGATTAATAAGTCAGAGGTGAAATCCCACAGCTTAACTGTGGAACTGCCTTTGATACTGTTAGTCTTGAGTTATATGGAAGTAGATGGAATGTGTAG,TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTATGTAAGACAGTTGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGTGACTGCATAGCTAGAGTACGGTAGAGTGGGATGGAATTCCGCG,TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGTTGACAAGTCAGATGTGAAATCCTGCGGCTTAACCGCAGAACTGCATTTGAAACTGTTGATCTTGAGTACTGGAGAGGCAGACGGAAATCCTAG,TATAAGACAGATGTGAAATCCC
CGGGCTCAACCTGGGACCTGCATTTGTGACTGTATCGCTAGAGTACGGTAGAGGGGGATGGAATTCCGCGTGTAGCAGTGAAATGCGTAGATATGCGGAGGAACACCGATGGCGAAGGCAATCCCCTG,TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTATGTAAGACAGTTGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGTGACTGCATAGCTAGAGTACGGTAGAGTGGGATGGAATTCCGCGT,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCCATGCAAGCCTGGAGTGAAAGCCCGGGGCCCAACCCCGGGACTGCTCTGGGAACTGTGCGGCTGGAGTGCGGGAGGGGCAGGCGGAATTCCTGG,TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGAAAGCAAGTCAGATGTGAAAACCATGGGCTCAACCTGTGGCCTGCATTTGAAACTGTTTTTCTTGAGTACTGGAGAGGCAGACGGACTTCCTAG,TACGAAGGGGTCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGAGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTGATACTGGGTATCTAGAGTATGGAAGAGGTGAGTGGAATTCCGAG,TACGTAGGGTGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTTGTAGGCGGTTTGTCGCGTCTGCTGTGAAAGACCGGGGCGTAACTCCGGTTCTGCAGTGGGTACGGGCAGACTAGAGTGTGGTAGGGGAGACTGGAATTCCTGG,TACGAAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCGCGTAGGTGGCTTGATAAGTTGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCCAAAACTGTCTGGCTAGAGTGTGGTAGAGGGTAGTGGAATTTCCAG
10422.17.F.10,825.0,370.0,0.0,149.0,120.0,85.0,0.0,13.0,48.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10422.17.F.11,713.0,306.0,0.0,43.0,229.0,69.0,0.0,95.0,21.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10422.17.F.12,660.0,258.0,0.0,40.0,326.0,52.0,0.0,65.0,19.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10422.17.F.13,582.0,280.0,0.0,28.0,408.0,58.0,0.0,29.0,19.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10422.17.F.3,563.0,287.0,0.0,130.0,311.0,56.0,0.0,16.0,23.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
otu_table.shape

(182, 740)

### load mapping file

In [8]:
# Tab-separated metadata: first row holds the column names, first column the sample ids
mapping = pd.read_csv("metadata_rare2k.txt", sep='\t',
                      header=0, index_col=0)

In [9]:
mapping.shape

(182, 69)

In [10]:
mapping.head()

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,center_name,experiment_design_description,extraction_robot,extractionkit_lot,instrument_model,library_construction_protocol,linker,mastermix_lot,...,physical_specimen_location,physical_specimen_remaining,sample_type,scientific_name,sex,title,weekly_cage_food_consumption,weight,weight_units,Description
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10422.17.F.10,GTTGTTCTGGGA,GTGTGCCAGCMGCCGCGGTAA,UCSDMI,Mouse cohort exposed to apnea and controls to ...,HOWE_KF1,PM16B24,Illumina HiSeq 2500,"EMP 16S V4 protocol 515fbc, 806r",GT,14663,...,UCSD LBR -80 freezer,True,stool,mouse gut metagenome,male,OSA,Missing: Not provided,25.6,g,feces mouse 17 collection 10 of 13
10422.17.F.11,TGTGCTTGTAGG,GTGTGCCAGCMGCCGCGGTAA,UCSDMI,Mouse cohort exposed to apnea and controls to ...,HOWE_KF2,PM16B24,Illumina HiSeq 2500,"EMP 16S V4 protocol 515fbc, 806r",GT,14663,...,UCSD LBR -80 freezer,True,stool,mouse gut metagenome,male,OSA,71.8,25.2,g,feces mouse 17 collection 11 of 13
10422.17.F.12,AGAATCCACCAC,GTGTGCCAGCMGCCGCGGTAA,UCSDMI,Mouse cohort exposed to apnea and controls to ...,HOWE_KF1,PM16B24,Illumina HiSeq 2500,"EMP 16S V4 protocol 515fbc, 806r",GT,14663,...,UCSD LBR -80 freezer,True,stool,mouse gut metagenome,male,OSA,Missing: Not provided,25.7,g,feces mouse 17 collection 12 of 13
10422.17.F.13,CTGTAAAGGTTG,GTGTGCCAGCMGCCGCGGTAA,UCSDMI,Mouse cohort exposed to apnea and controls to ...,HOWE_KF2,PM16B24,Illumina HiSeq 2500,"EMP 16S V4 protocol 515fbc, 806r",GT,14663,...,UCSD LBR -80 freezer,True,stool,mouse gut metagenome,male,OSA,71.7,26.3,g,final feces mouse 17 collection 13 of 13
10422.17.F.3,CTCCCGAGCTCC,GTGTGCCAGCMGCCGCGGTAA,UCSDMI,Mouse cohort exposed to apnea and controls to ...,HOWE_KF2,PM16B24,Illumina HiSeq 2500,"EMP 16S V4 protocol 515fbc, 806r",GT,14663,...,UCSD LBR -80 freezer,True,stool,mouse gut metagenome,male,OSA,105.7,24.9,g,feces mouse 17 collection 3 of 13


In [11]:
# number of samples in each exposure group
mapping['exposure_type'].value_counts()

IHH    92
Air    90
Name: exposure_type, dtype: int64

### match mapping file and biom table

In [12]:
# Presumably aligns the metadata and the feature table on their shared sample ids.
# NOTE(review): later outputs list the samples in a different order than the
# freshly loaded table, so match() appears to reorder rows — confirm that this
# order is deterministic between runs, since `labels` and `dat` depend on it.
mapping, otu_table = match(mapping, otu_table)

In [13]:
# after matching, both frames should report the same number of rows (samples)
for frame in (mapping, otu_table):
    print(frame.shape)

(182, 69)
(182, 740)


In [14]:
# encode exposure_type as integers: 1 for 'IHH' samples, 0 for everything else
is_ihh = mapping['exposure_type'] == 'IHH'
labels = np.array(is_ihh.astype(int))

In [15]:
labels

array([1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0])

In [16]:
# flip the table so that each row is an OTU and each column is a sample
dat = np.array(otu_table).T

In [17]:
dat.shape

(740, 182)

## Apply DS-FDR

In [18]:
# Settings for the DS-FDR run, collected in one place so they are easy to tune.
dsfdr_options = {
    'transform_type': 'rank',
    'method': 'meandiff',
    'alpha': 0.01,
    'numperm': 10000,
    'fdr_method': 'dsfdr',
}
# `result` is read by position in the cells below:
# [0] rejection flags, [1] test statistics, [2] p-values.
result = dsfdr.dsfdr(dat, labels, **dsfdr_options)

In [19]:
# how many hypotheses (OTUs) were flagged as significant
rej = result[0]
np.sum(rej)

80

## Output result

In [20]:
# Keep the p-value and the test statistic of every rejected hypothesis,
# in the same (OTU) order as the rejection flags. A boolean mask replaces
# the element-by-element loop and its `== True` comparison.
rejected_mask = np.asarray(result[0], dtype=bool)
pvals = np.asarray(result[2])[rejected_mask].tolist()
teststat = np.asarray(result[1])[rejected_mask].tolist()

In [21]:
# wrap the rejection flags in a named Series; its values serve as a row mask below
s = pd.Series(data=rej, name='bools')

In [22]:
# Rows of the transposed table are OTUs; keep only those flagged as significant.
# The explicit copy makes `out` an independent frame, so the columns added to it
# in the following cell are never written through a slice of `otu_table`.
out = otu_table.T.loc[s.values].copy()

In [23]:
out.shape

(80, 182)

In [24]:
# append the per-OTU p-value and test statistic as two extra columns
for column_name, column_values in (('pvalue', pvals),
                                   ('test_statistic', teststat)):
    out[column_name] = column_values

In [25]:
out.head()

Unnamed: 0,10422.17.F.13,10422.25.F.12,10422.18.F.7,10422.22.F.13,10422.31.F.2,10422.17.F.6,10422.20.F.4,10422.32.F.1,10422.25.F.8,10422.19.F.7,...,10422.21.F.6,10422.17.F.9,10422.17.F.5,10422.29.F.13,10422.28.F.4,10422.26.F.5,10422.27.F.8,10422.30.F.9,pvalue,test_statistic
TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCAGCAAAGGTCTGTGGTGAAAGACTGAAGCTTAACTTCAGTAAGCCATAGAAACCGGGCAGCTAGAGTGCAGGAGAGGATCGTGGAATTCCATGT,28.0,5.0,53.0,48.0,0.0,136.0,79.0,0.0,8.0,135.0,...,58.0,66.0,83.0,2.0,72.0,20.0,2.0,40.0,0.0001,45.027415
TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGGGCGCAGACGGCGATGCAAGCCAGGAGTGAAAGCCCGGGGCCCAACCCCGGGACTGCTCTTGGAACTGCGTGGCTGGAGTGCAGGAGGGGCAGGCGGAATTCCTGG,408.0,26.0,332.0,331.0,1.0,177.0,143.0,0.0,76.0,119.0,...,782.0,446.0,151.0,702.0,265.0,507.0,233.0,80.0,0.0001,34.103019
TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCCTGCCAAGTCTGATGTGAAATACCGGGGCTCAACCCCGGGGCTGCATTGGAAACTGGCAGGCTGGAGTGTCGGAGAGGCAGGCGGAATTCCTAG,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,-20.222222
TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGAAAGCAAGTCAGATGTGAAAACCATGGGCTCAACCTGTGGCCTGCATTTGAAACTGTTTTTCTTGAGTACTGGAGAGGCAGACGGAATTCCTAG,29.0,5.0,46.0,65.0,0.0,19.0,19.0,0.0,43.0,31.0,...,14.0,52.0,7.0,12.0,27.0,59.0,44.0,12.0,0.0004,28.223188
TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGTGGTTTATTAAGTCTGGTGTAAAAGGCAGTGGCTCAACCATTGTATGCATTGGAAACTGGTAGACTTGAGTGCAGGAGAGGAGAGTGGAATTCCATGT,10.0,5.0,21.0,30.0,0.0,38.0,43.0,0.0,10.0,37.0,...,10.0,20.0,26.0,5.0,17.0,13.0,8.0,18.0,0.0003,29.157367


In [26]:
# write the significant OTUs (counts plus statistics) as a tab-separated file
out.to_csv(path_or_buf='dblr_dsfdr.txt', sep='\t')