# Finalize merged dataset 
* Incorporate phylogenetic tree into phyloseq object
* Save master and split versions

In [1]:
library(dada2)
library(tidyr)
library(dplyr)
library(phyloseq)
library(ggplot2)
library(ape)

Loading required package: Rcpp
“multiple methods tables found for ‘colMeans’”
Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

“package ‘ape’ was built under R version 3.6.1”

In [2]:
# Load the thresholded phyloseq object saved earlier and print its summary.
ps.thresh <- readRDS(file = "~/Hyphosphere/data/3Exp/phyloseq/3Exp_phyloseq_thresh.rds")
ps.thresh

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19559 taxa and 434 samples ]
sample_data() Sample Data:       [ 434 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 19559 taxa by 6 taxonomic ranks ]

# Remove duplicate samples from experiment 3

In [3]:
# Experiment 3 re-runs that are dropped instead of merged: they carry more
# contaminants than their replicates on plate 1.
Duplicates <- c("155_1.3RH", "156_2.3RH", "157_3.3RH", "158_1.4RH", "159_2.4RH")

# Keep every sample whose name has no match in the drop list.
samples <- sample_names(ps.thresh)
samplestokeep <- samples[is.na(match(samples, Duplicates))]
ps.thresh <- prune_samples(samplestokeep, ps.thresh)

# Print the remaining sample names for a visual check.
sample_names(ps.thresh)
# allTaxa = taxa_names(ps)
# allTaxa <- allTaxa[!(allTaxa %in% FC)]
# ps.sub = prune_taxa(allTaxa, ps)
# ps.thresh %>% subset_samples(Experiment == 3) %>% subset_samples(!sample_names(ps.thresh) %in% ) %>% sample_names()

In [5]:
# Read the FastTree output. errorIfNULL = TRUE makes an unreadable or
# unparsable tree file stop here with a clear error instead of handing NULL
# on to root().
thresh.tree <- read_tree(
  treefile = "~/Hyphosphere/data/3Exp/Fasttree/Master_wSulfo.tree",
  errorIfNULL = TRUE
)
#is.rooted(thresh.tree)

# Root on the outgroup tip "X90478". `resolve.root` is written out in full:
# the previous `r = TRUE` only reached it through partial argument matching,
# which breaks silently if another argument starting with "r" is ever added.
thresh.tree <- root(thresh.tree, outgroup = "X90478", resolve.root = TRUE)

# Confirm the rooting took effect, then print the tree summary.
is.rooted(thresh.tree)
thresh.tree

# root.by.outgroup <- function(tree.unrooted) {
#   longest.edge <- which.max(tree.unrooted$edge.length)
#   long.nodes <- tree.unrooted$edge[longest.edge,]     #this should usually include one tip
#   new.outgroup <- long.nodes[long.nodes < Ntip(tree.unrooted)]
#   tree.rooted <- root(tree.unrooted, outgroup=new.outgroup, resolve.root=T)
#   tree.rooted
# }

# require(ape)
# thresh.tree = root.by.outgroup(thresh.tree)





Phylogenetic tree with 19309 tips and 19245 internal nodes.

Tip labels:
	X90478, ASV6971, ASV2480, ASV5742, ASV15180, ASV9985, ...
Node labels:
	Root, 0.804, 0.000, 0.876.74, 0.947, 0.950, ...

Rooted; includes branch lengths.

# Join bacterial tree with phyloseq object

In [6]:
# Attach the rooted tree to the thresholded phyloseq object.
ps <- merge_phyloseq(ps.thresh, thresh.tree)

# Force polytomies into dichotomies; random = FALSE resolves them in the
# order they appear in the tree instead of at random.
phy_tree(ps) <- multi2di(phy_tree(ps), random = FALSE)
ps

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19308 taxa and 429 samples ]
sample_data() Sample Data:       [ 429 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 19308 taxa by 6 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 19308 tips and 19307 internal nodes ]

In [7]:
# Remove organelle sequences: taxa whose Family is "Mitochondria" and taxa
# whose Order is "Chloroplast". The is.na() terms keep taxa that are
# unclassified at that rank, which the != comparison alone would drop.
ps <- subset_taxa(ps, Family != "Mitochondria" | is.na(Family))
ps <- subset_taxa(ps, Order != "Chloroplast" | is.na(Order))
ps

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19241 taxa and 429 samples ]
sample_data() Sample Data:       [ 429 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 19241 taxa by 6 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 19241 tips and 19240 internal nodes ]

In [8]:
# Save the tree-bearing object (organelles removed, contaminants still in).
saveRDS(ps, file = "~/Hyphosphere/data/3Exp/phyloseq/final/3Exp_phyloseq_thresh_wtree.rds")


## Save with contaminants removed (see notebook 6)

In [10]:
# ASV IDs that are removed from the dataset in the cells that follow.
# The original list named "ASV1900", "ASV2516" and "ASV2962" twice each; the
# repeats are dropped here (first-occurrence order kept). Membership tests
# with %in% give the same result either way.
# NOTE(review): confirm against the source list that none of the repeated
# entries was a typo for a different ASV.
FC <- c(
  "ASV1", "ASV54", "ASV78", "ASV136", "ASV3", "ASV12",
  "ASV2516", "ASV681", "ASV447", "ASV49", "ASV12695", "ASV2395",
  "ASV284", "ASV1706", "ASV1900", "ASV26", "ASV14332", "ASV2962",
  "ASV9101", "ASV2180", "ASV8061", "ASV4437", "ASV6987", "ASV2714",
  "ASV2160", "ASV2117", "ASV2704", "ASV1002", "ASV4083", "ASV7871",
  "ASV171", "ASV8536", "ASV285", "ASV3225", "ASV5265", "ASV14282"
)

# Guard against a repeated ID slipping back in when the list is edited.
stopifnot(anyDuplicated(FC) == 0L)

In [11]:
# Keep every taxon whose name has no match in FC, then prune the
# tree-bearing object down to that set.
allTaxa <- taxa_names(ps)
allTaxa <- allTaxa[is.na(match(allTaxa, FC))]
ps.sub <- prune_taxa(allTaxa, ps)

# Print the pruned object's summary.
ps.sub

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19205 taxa and 429 samples ]
sample_data() Sample Data:       [ 429 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 19205 taxa by 6 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 19205 tips and 19204 internal nodes ]

In [12]:

# Save the tree-bearing object with the FC taxa removed.
saveRDS(ps.sub, file = "~/Hyphosphere/data/3Exp/phyloseq/final/3Exp_phyloseq_thresh_wtree_woContam.rds")

### merge duplicates and save experimental samples

In [13]:
# Collapse samples that share the same value of the "Sample" variable.
ps.sub.m <- merge_samples(ps.sub, "Sample")

### re-map factors

In [14]:
# Rebuild the sample metadata of the merged object (ps.sub.m).
# The grouping variable is restored from the merged sample names.
sample_data(ps.sub.m)$Sample <- sample_names(ps.sub.m)
# Each categorical column is restored by indexing the level labels of the
# unmerged object (ps.sub) with the merged object's column values.
# Presumably merge_samples() left these columns as integer factor codes --
# confirm against the phyloseq documentation.
# NOTE(review): `[` silently truncates a non-integer index, so this assumes
# every merged group held a single level per variable.
sample_data(ps.sub.m)$Fungus <- levels(sample_data(ps.sub)$Fungus)[get_variable(ps.sub.m, "Fungus")]
sample_data(ps.sub.m)$Plant <- levels(sample_data(ps.sub)$Plant)[get_variable(ps.sub.m, "Plant")]
sample_data(ps.sub.m)$Soil <- levels(sample_data(ps.sub)$Soil)[get_variable(ps.sub.m, "Soil")]
sample_data(ps.sub.m)$SampleType <- levels(sample_data(ps.sub)$SampleType)[get_variable(ps.sub.m, "SampleType")]
sample_data(ps.sub.m)$Treatment <- levels(sample_data(ps.sub)$Treatment)[get_variable(ps.sub.m, "Treatment")]

In [15]:
# Spot-check the re-mapped metadata: first rows belonging to experiment 3.
head(sample_data(ps.sub.m)[sample_data(ps.sub.m)$Experiment == 3, ])

Unnamed: 0,Sample,Experiment,Plant,Fungus,Soil,SampleType,Treatment,TimePoint,DAI,Rep,Concentration_ng.ul
1.1BP,1.1BP,3,1.1,Gv,Dryden,BP,HN,1,14,1,1.067
1.1BS,1.1BS,3,1.1,Gv,Dryden,BS,HN,1,14,1,4.421
1.1CH,1.1CH,3,1.1,Gv,Dryden,CH,HN,1,14,1,0.325
1.1CS,1.1CS,3,1.1,Gv,Dryden,CS,HN,1,14,1,2.116
1.1RH,1.1RH,3,1.1,Gv,Dryden,RH,HN,1,14,1,0.174
1.1RT,1.1RT,3,1.1,Gv,Dryden,RT,HN,1,14,1,1.561


In [16]:
# List the distinct sample types present after merging.
unique(sample_data(ps.sub.m)$SampleType)

In [17]:
# Drop the sample types "MK", "EB", "PB" and "T0", then drop the sample
# named "Unknown-RS".
outfile <- subset_samples(ps.sub.m, !SampleType %in% c("MK", "EB", "PB", "T0"))
outfile <- subset_samples(outfile, Sample != "Unknown-RS")
outfile

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19205 taxa and 385 samples ]
sample_data() Sample Data:       [ 385 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 19205 taxa by 6 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 19205 tips and 19204 internal nodes ]

In [18]:
# Save the filtered, merged, tree-bearing object.
saveRDS(outfile, file = "~/Hyphosphere/data/3Exp/phyloseq/final/3Exp_phyloseq_thresh_wtree_experimental_woContam.rds")

# Now build the larger object without the tree

In [19]:
# Same FC removal as before, but starting from ps.thresh (no tree attached).
allTaxa <- taxa_names(ps.thresh)
allTaxa <- allTaxa[is.na(match(allTaxa, FC))]
ps.sub <- prune_taxa(allTaxa, ps.thresh)

# Print the pruned object's summary.
ps.sub

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19523 taxa and 429 samples ]
sample_data() Sample Data:       [ 429 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 19523 taxa by 6 taxonomic ranks ]

In [20]:
# Remove organelle sequences: taxa whose Family is "Mitochondria" and taxa
# whose Order is "Chloroplast". The is.na() terms keep taxa that are
# unclassified at that rank, which the != comparison alone would drop.
ps.sub <- subset_taxa(ps.sub, Family != "Mitochondria" | is.na(Family))
ps.sub <- subset_taxa(ps.sub, Order != "Chloroplast" | is.na(Order))
ps.sub

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19433 taxa and 429 samples ]
sample_data() Sample Data:       [ 429 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 19433 taxa by 6 taxonomic ranks ]

### merge duplicates and save experimental samples

In [21]:
# Collapse samples that share the same value of the "Sample" variable.
ps.sub.m <- merge_samples(ps.sub, "Sample")

### re-map factors

In [22]:
# Rebuild the sample metadata of the merged, tree-less object (ps.sub.m).
# The grouping variable is restored from the merged sample names.
sample_data(ps.sub.m)$Sample <- sample_names(ps.sub.m)
# Each categorical column is restored by indexing the level labels of the
# unmerged object (ps.sub) with the merged object's column values.
# Presumably merge_samples() left these columns as integer factor codes --
# confirm against the phyloseq documentation.
# NOTE(review): `[` silently truncates a non-integer index, so this assumes
# every merged group held a single level per variable.
sample_data(ps.sub.m)$Fungus <- levels(sample_data(ps.sub)$Fungus)[get_variable(ps.sub.m, "Fungus")]
sample_data(ps.sub.m)$Plant <- levels(sample_data(ps.sub)$Plant)[get_variable(ps.sub.m, "Plant")]
sample_data(ps.sub.m)$Soil <- levels(sample_data(ps.sub)$Soil)[get_variable(ps.sub.m, "Soil")]
sample_data(ps.sub.m)$SampleType <- levels(sample_data(ps.sub)$SampleType)[get_variable(ps.sub.m, "SampleType")]
sample_data(ps.sub.m)$Treatment <- levels(sample_data(ps.sub)$Treatment)[get_variable(ps.sub.m, "Treatment")]

In [23]:
# Spot-check the re-mapped metadata: first rows belonging to experiment 3.
head(sample_data(ps.sub.m)[sample_data(ps.sub.m)$Experiment == 3, ])

Unnamed: 0,Sample,Experiment,Plant,Fungus,Soil,SampleType,Treatment,TimePoint,DAI,Rep,Concentration_ng.ul
1.1BP,1.1BP,3,1.1,Gv,Dryden,BP,HN,1,14,1,1.067
1.1BS,1.1BS,3,1.1,Gv,Dryden,BS,HN,1,14,1,4.421
1.1CH,1.1CH,3,1.1,Gv,Dryden,CH,HN,1,14,1,0.325
1.1CS,1.1CS,3,1.1,Gv,Dryden,CS,HN,1,14,1,2.116
1.1RH,1.1RH,3,1.1,Gv,Dryden,RH,HN,1,14,1,0.174
1.1RT,1.1RT,3,1.1,Gv,Dryden,RT,HN,1,14,1,1.561


In [24]:
# List the distinct sample types present after merging.
unique(sample_data(ps.sub.m)$SampleType)

In [25]:
# Drop the sample types "MK", "EB", "PB" and "T0", then drop the sample
# named "Unknown-RS".
outfile <- subset_samples(ps.sub.m, !SampleType %in% c("MK", "EB", "PB", "T0"))
outfile <- subset_samples(outfile, Sample != "Unknown-RS")
outfile

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19433 taxa and 385 samples ]
sample_data() Sample Data:       [ 385 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 19433 taxa by 6 taxonomic ranks ]

In [26]:
# Save the filtered, merged object that carries no tree.
saveRDS(outfile, file = "~/Hyphosphere/data/3Exp/phyloseq/final/3Exp_phyloseq_thresh_woTree_experimental_woContam.rds")

In [27]:
# Print the R version, platform and attached package versions for this run.
sessionInfo()

R version 3.6.0 (2019-04-26)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 16.04.6 LTS

Matrix products: default
BLAS/LAPACK: /data/home/be68/anaconda3/envs/MyR/lib/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ape_5.3         ggplot2_3.2.1   phyloseq_1.28.0 dplyr_1.0.0    
[5] tidyr_1.1.0     dada2_1.10.0    Rcpp_1.0.2     

loaded via a namespace (and not attached):
 [1] Biobase_2.44.0              splines_3.6.0              
 [3] jsonlite_1.6                foreach_1.4.7              
 [5] RcppParallel_4.