# Description:
        Implement sparsity filter
        Prune control (non-sample) libraries and the 2 ECHCG plots where only soil, not roots, was collected
        Final threshold: physeq.thresh = filter_taxa(physeq, 
                 function(x) sum(x > 3) > 3, TRUE)
        Join plant growth and enzyme metadata
        Save output for consistent use in other notebooks

# Setting variables

In [2]:
# Directory passed to R as the working directory (setwd) and used as the
# destination for the saved RDS files at the end of the notebook.
workDir = '/home/bryan/RhizCG/data/MiSeq_amplicon/MergedRuns/physeq/'
# Directory that holds the OTU-binning output; not referenced by the cells shown here.
biomFileDir = '/home/bryan/RhizCG/data/MiSeq_amplicon/MergedRuns/OTU_binning/'

# Input files handed to the R cells via `%%R -i`: the biom OTU table,
# the sample metadata mapping file, and the tree file.
biomFile = '/home/bryan/RhizCG/data/MiSeq_amplicon/MergedRuns/OTU_binning/otu_table_wtax.biom'
metadataFile = '/home/bryan/RhizCG/data/MiSeq_amplicon/metadata_RhizCG_merged.txt'
treeFile = '/home/bryan/RhizCG/data/MiSeq_amplicon/MergedRuns/fasttree/otusn.tree'

# Init

In [3]:
import os
from pandas import DataFrame
from IPython.display import FileLink

In [4]:

%load_ext rpy2.ipython

In [5]:
%%R
library(vegan)
library(phyloseq)
library(ggplot2)
library(foreach)
library(doParallel)
library(gridExtra)
library(dplyr)
library(tidyr)


  res = super(Function, self).__call__(*new_args, **new_kwargs)

  res = super(Function, self).__call__(*new_args, **new_kwargs)

  res = super(Function, self).__call__(*new_args, **new_kwargs)
Use Revolution R for scalability, fault tolerance and more.
http://www.revolutionanalytics.com

  res = super(Function, self).__call__(*new_args, **new_kwargs)

  res = super(Function, self).__call__(*new_args, **new_kwargs)

  res = super(Function, self).__call__(*new_args, **new_kwargs)
Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)


# Loading/editing OTU table

In [6]:
%%R -i biomFile -i metadataFile -i workDir -i treeFile
# Switch the R session to the phyloseq working directory
setwd(workDir)

# Build the phyloseq object from the biom OTU table together with its tree
physeq <- import_biom(biomFile, treefilename = treeFile)

# Read the QIIME-format sample metadata and merge it into the object
sample.data <- import_qiime_sample_data(metadataFile)
physeq <- merge_phyloseq(physeq, sample.data)



In [7]:
%%R
# Drop control libraries: these are the rows with no value in `Plant`
physeq <- subset_samples(physeq, !is.na(Plant))

# Drop the two ECHCG plots where only soil (no roots) was collected and sequenced
physeq <- subset_samples(
  physeq,
  !(PlotID %in% c("4.16_Nneg", "4.16_Npos"))
)
print(physeq)


phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 11246 taxa and 238 samples ]
sample_data() Sample Data:       [ 238 samples by 21 sample variables ]
tax_table()   Taxonomy Table:    [ 11246 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 11246 tips and 11245 internal nodes ]


In [8]:
%%R

# Add a two-level `Planted` variable: samples whose Plant is "BARE" keep that
# label, every other sample is labelled "Rhizosphere"
sample_data(physeq)$Planted <- ifelse(
  sample_data(physeq)$Plant == "BARE",
  "BARE",
  "Rhizosphere"
)
sample_data(physeq)$Planted

  [1] "BARE"        "BARE"        "BARE"        "BARE"        "BARE"       
  [6] "BARE"        "BARE"        "BARE"        "BARE"        "Rhizosphere"
 [11] "Rhizosphere" "BARE"        "Rhizosphere" "Rhizosphere" "Rhizosphere"
 [16] "BARE"        "Rhizosphere" "BARE"        "BARE"        "BARE"       
 [21] "BARE"        "Rhizosphere" "BARE"        "BARE"        "BARE"       
 [26] "Rhizosphere" "Rhizosphere" "BARE"        "Rhizosphere" "Rhizosphere"
 [31] "Rhizosphere" "Rhizosphere" "Rhizosphere" "Rhizosphere" "Rhizosphere"
 [36] "Rhizosphere" "BARE"        "BARE"        "Rhizosphere" "Rhizosphere"
 [41] "Rhizosphere" "Rhizosphere" "Rhizosphere" "BARE"        "BARE"       
 [46] "Rhizosphere" "Rhizosphere" "Rhizosphere" "Rhizosphere" "Rhizosphere"
 [51] "Rhizosphere" "BARE"        "Rhizosphere" "Rhizosphere" "Rhizosphere"
 [56] "BARE"        "BARE"        "Rhizosphere" "Rhizosphere" "Rhizosphere"
 [61] "Rhizosphere" "Rhizosphere" "BARE"        "Rhizosphere" "Rhizosphere"
 [66] "Rhizo

### Apply gsub across the taxonomy columns to replace "__" with "", then reload the tax_table into the phyloseq object

In [9]:
%%R
# Pull out the taxonomy table, strip every "__" from each column,
# and write the cleaned table back into the phyloseq object
t <- tax_table(physeq)
t <- apply(t, MARGIN = 2, FUN = function(rank_col) {
  gsub("__", "", rank_col)
})
tax_table(physeq) <- t

# Explore Filtering Functions

In [10]:
%%R 
# Sparsity filter: keep a taxon only if it has a count greater than 3
# in more than 3 samples; prune = TRUE returns the pruned phyloseq object
physeq.thresh <- filter_taxa(
  physeq,
  function(x) sum(x > 3) > 3,
  prune = TRUE
)
physeq.thresh

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4982 taxa and 238 samples ]
sample_data() Sample Data:       [ 238 samples by 22 sample variables ]
tax_table()   Taxonomy Table:    [ 4982 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4982 tips and 4981 internal nodes ]


### rarefy to even depth

In [11]:
%%R
# Fix the session RNG seed so the random subsampling below is repeatable
set.seed(605)
# Rarefy the filtered table with the function defaults; in the recorded
# output every sample ends up with the same depth (4989 reads)
physeq.r <- rarefy_even_depth(physeq.thresh)
sample_sums(physeq.r)

 the random seed of your session for reproducibility.
See `?set.seed`


  res = super(Function, self).__call__(*new_args, **new_kwargs)

  res = super(Function, self).__call__(*new_args, **new_kwargs)


sample2-1.06_Npos sample6-4.27_Npos sample4-4.27_Nneg sample4-1.06_Nneg 
             4989              4989              4989              4989 
sample2-1.06_Nneg sample6-3.07_Nneg sample5-3.07_Nneg sample6-3.07_Npos 
             4989              4989              4989              4989 
sample3-3.07_Npos sample5-1.22_Nneg sample6-4.12_Npos sample1-4.27_Npos 
             4989              4989              4989              4989 
sample4-1.03_Npos sample5-2.06_Npos sample5-4.08_Npos sample4-3.07_Nneg 
             4989              4989              4989              4989 
sample5-4.28_Npos sample6-1.06_Npos sample4-2.05_Nneg sample2-4.27_Nneg 
             4989              4989              4989              4989 
sample1-2.05_Nneg sample5-1.29_Npos sample5-4.27_Npos sample5-4.27_Nneg 
             4989              4989              4989              4989 
sample1-3.07_Nneg sample4-2.02_Nneg sample4-4.21_Nneg sample2-2.05_Nneg 
             4989              4989              49

In [12]:
%%R
# Names of the taxa that survived the sparsity filter
PassFilter <- taxa_names(physeq.thresh)

# Convert counts to per-sample proportions using the unfiltered table, so each
# proportion is relative to the sample's full read total
physeq.snorm <- transform_sample_counts(physeq, function(x) x / sum(x))

# Keep only the taxa that passed the filter; because pruning happens after
# normalisation, per-sample sums of the result can be below 1
physeq.snorm <- prune_taxa(PassFilter, physeq.snorm)

physeq.snorm

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4982 taxa and 238 samples ]
sample_data() Sample Data:       [ 238 samples by 22 sample variables ]
tax_table()   Taxonomy Table:    [ 4982 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4982 tips and 4981 internal nodes ]


Filtering function
`function(x) sum(x > 3) > 3` with prune = TRUE: 4982 taxa retained


In [13]:
%%R
# Preview the first rows of the sample metadata on the filtered object
physeq.thresh %>% sample_data() %>% head()

Sample Data:        [6 samples by 22 sample variables]:
                           X.sample Library Primer_Plate Primer_Well_ID
sample2-1.06_Npos sample2-1.06_Npos       1            1             H9
sample6-4.27_Npos sample6-4.27_Npos       1            2            F11
sample4-4.27_Nneg sample4-4.27_Nneg       1            1            B12
sample4-1.06_Nneg sample4-1.06_Nneg       1            1            E11
sample2-1.06_Nneg sample2-1.06_Nneg       1            1             G9
sample6-3.07_Nneg sample6-3.07_Nneg       1            2            C11
                  primerFR_ID_byPlate primerFR_ID_total barcode_ID    PlotID
sample2-1.06_Npos                  72                72         72 1.06_Npos
sample6-4.27_Npos                  86               182        182 4.27_Npos
sample4-4.27_Nneg                  90                90         90 4.27_Nneg
sample4-1.06_Nneg                  85                85         85 1.06_Nneg
sample2-1.06_Nneg                  71                71

## Join plant data for consistent meta-data set across notebooks

In [14]:
%%R
# Load the per-plot plant data table (first line holds the column names)
plantData <- read.table(
  file = '/home/bryan/RhizCG/data/MiSeq_amplicon/PlotData.txt',
  header = TRUE
)
# List the available columns; the joins below select from them by position
colnames(plantData)

 [1] "Unique"             "PlotID"             "Plant"             
 [4] "Plot"               "Treatment"          "Rep"               
 [7] "DAP"                "Sampling"           "PlantSample"       
[10] "Type"               "Grass_Broad"        "Status"            
[13] "Subfamily"          "Symbol_class"       "Plant_num"         
[16] "Leaf_AreaBiomass_g" "Biomass_T"          "Biomass_P"         
[19] "Leaf_Area_cm2"      "SLA"                "Leaf_N"            
[22] "Leaf_C"             "LeafC_N"            "LNa"               
[25] "Plant_N"            "Plant_C"            "Plant_CN"          
[28] "d15N"               "tN_p"               "BX_act"            
[31] "CB_act"             "LAP_act"            "NAG_act"           
[34] "soil_dw"            "BX_activity_g"      "CB_activity_g"     
[37] "LAP_activity_g"     "NAG_activity_g"     "BX_barefactor"     
[40] "CB_barefactor"      "LAP_barefactor"     "NAG_barefactor"    
[43] "BX_dif"             "CB_dif"             "

In [15]:
%%R
# Current sample metadata of the filtered count object
s <- sample_data(physeq.thresh)
# Subset of plantData selected by column position; column 1 ("Unique") is the join key
PD <- plantData[, c(1, 18, 20:29, 35:38, 47:58)]
# Attach the plant data to each sample, matching X.sample against Unique
s2 <- left_join(s, PD, by = c("X.sample" = "Unique"))
# Set the row names to the sample IDs before storing the table on the object
rownames(s2) <- s2$X.sample
sample_data(physeq.thresh) <- s2
head(sample_data(physeq.thresh))

Sample Data:        [6 samples by 49 sample variables]:
                           X.sample Library Primer_Plate Primer_Well_ID
sample2-1.06_Npos sample2-1.06_Npos       1            1             H9
sample6-4.27_Npos sample6-4.27_Npos       1            2            F11
sample4-4.27_Nneg sample4-4.27_Nneg       1            1            B12
sample4-1.06_Nneg sample4-1.06_Nneg       1            1            E11
sample2-1.06_Nneg sample2-1.06_Nneg       1            1             G9
sample6-3.07_Nneg sample6-3.07_Nneg       1            2            C11
                  primerFR_ID_byPlate primerFR_ID_total barcode_ID    PlotID
sample2-1.06_Npos                  72                72         72 1.06_Npos
sample6-4.27_Npos                  86               182        182 4.27_Npos
sample4-4.27_Nneg                  90                90         90 4.27_Nneg
sample4-1.06_Nneg                  85                85         85 1.06_Nneg
sample2-1.06_Nneg                  71                71

In [16]:
%%R
# Current sample metadata of the relative-abundance object
s <- sample_data(physeq.snorm)
# Subset of plantData selected by column position; column 1 ("Unique") is the join key
PD <- plantData[, c(1, 18, 20:29, 35:38, 47:58)]
# Attach the plant data to each sample, matching X.sample against Unique
s2 <- left_join(s, PD, by = c("X.sample" = "Unique"))
# Set the row names to the sample IDs before storing the table on the object
rownames(s2) <- s2$X.sample
sample_data(physeq.snorm) <- s2
head(sample_data(physeq.snorm))

Sample Data:        [6 samples by 49 sample variables]:
                           X.sample Library Primer_Plate Primer_Well_ID
sample2-1.06_Npos sample2-1.06_Npos       1            1             H9
sample6-4.27_Npos sample6-4.27_Npos       1            2            F11
sample4-4.27_Nneg sample4-4.27_Nneg       1            1            B12
sample4-1.06_Nneg sample4-1.06_Nneg       1            1            E11
sample2-1.06_Nneg sample2-1.06_Nneg       1            1             G9
sample6-3.07_Nneg sample6-3.07_Nneg       1            2            C11
                  primerFR_ID_byPlate primerFR_ID_total barcode_ID    PlotID
sample2-1.06_Npos                  72                72         72 1.06_Npos
sample6-4.27_Npos                  86               182        182 4.27_Npos
sample4-4.27_Nneg                  90                90         90 4.27_Nneg
sample4-1.06_Nneg                  85                85         85 1.06_Nneg
sample2-1.06_Nneg                  71                71

In [17]:
%%R
# Current sample metadata of the rarefied object
s <- sample_data(physeq.r)
# Subset of plantData selected by column position; column 1 ("Unique") is the join key
PD <- plantData[, c(1, 18, 20:29, 35:38, 47:58)]
# Attach the plant data to each sample, matching X.sample against Unique
s2 <- left_join(s, PD, by = c("X.sample" = "Unique"))
# Set the row names to the sample IDs before storing the table on the object
rownames(s2) <- s2$X.sample
sample_data(physeq.r) <- s2
head(sample_data(physeq.r))

Sample Data:        [6 samples by 49 sample variables]:
                           X.sample Library Primer_Plate Primer_Well_ID
sample2-1.06_Npos sample2-1.06_Npos       1            1             H9
sample6-4.27_Npos sample6-4.27_Npos       1            2            F11
sample4-4.27_Nneg sample4-4.27_Nneg       1            1            B12
sample4-1.06_Nneg sample4-1.06_Nneg       1            1            E11
sample2-1.06_Nneg sample2-1.06_Nneg       1            1             G9
sample6-3.07_Nneg sample6-3.07_Nneg       1            2            C11
                  primerFR_ID_byPlate primerFR_ID_total barcode_ID    PlotID
sample2-1.06_Npos                  72                72         72 1.06_Npos
sample6-4.27_Npos                  86               182        182 4.27_Npos
sample4-4.27_Nneg                  90                90         90 4.27_Nneg
sample4-1.06_Nneg                  85                85         85 1.06_Nneg
sample2-1.06_Nneg                  71                71

In [19]:
%%R
# Per-sample totals of the relative-abundance table after pruning
physeq.snorm %>% sample_sums()

sample2-1.06_Npos sample6-4.27_Npos sample4-4.27_Nneg sample4-1.06_Nneg 
        0.9772621         0.9742362         0.9730047         0.9782814 
sample2-1.06_Nneg sample6-3.07_Nneg sample5-3.07_Nneg sample6-3.07_Npos 
        0.9773245         0.9755345         0.9746259         0.9753871 
sample3-3.07_Npos sample5-1.22_Nneg sample6-4.12_Npos sample1-4.27_Npos 
        0.9790742         0.9792347         0.9738639         0.9751155 
sample4-1.03_Npos sample5-2.06_Npos sample5-4.08_Npos sample4-3.07_Nneg 
        0.9740260         0.9733520         0.9751371         0.9729829 
sample5-4.28_Npos sample6-1.06_Npos sample4-2.05_Nneg sample2-4.27_Nneg 
        0.9745989         0.9784634         0.9746913         0.9741586 
sample1-2.05_Nneg sample5-1.29_Npos sample5-4.27_Npos sample5-4.27_Nneg 
        0.9741184         0.9855837         0.9752546         0.9739945 
sample1-3.07_Nneg sample4-2.02_Nneg sample4-4.21_Nneg sample2-2.05_Nneg 
        0.9757936         0.9853445         0.97633

# Output for other analyses

In [20]:
%%R -i workDir
# Serialize the sparsity-filtered count object into the working directory
outFile <- file.path(workDir, 'Full-Sparsity3in3')
saveRDS(object = physeq.thresh, file = outFile)

In [21]:
%%R -i workDir
# Serialize the relative-abundance object into the working directory
outFile <- file.path(workDir, 'Full-Sparsity3in3_relabund')
saveRDS(object = physeq.snorm, file = outFile)

In [22]:
%%R -i workDir
# Serialize the rarefied object into the working directory
outFile <- file.path(workDir, 'Full-Sparsity3in3_r')
saveRDS(object = physeq.r, file = outFile)