# Merge sequence tables from multiple runs and create phyloseq object

In [None]:
# ---- Run configuration ----

# Thread count handed to the `multithread` argument of assignTaxonomy().
ncores <- 20

# Folder that receives the intermediate .rds files written by this notebook.
OutFolder <- "~/Hyphosphere/data/MiSeq/merged/dada2_intermediates"

# Taxonomy reference databases.
# NOTE(review): SpeciesTraining is defined here but is not referenced by any
# cell visible in this notebook.
TrainingSet <- "~/databases/silva_nr_v132_train_set.fa.gz"
SpeciesTraining <- "~/databases/silva_species_assignment_v132.fa.gz"

# Tab-delimited sample metadata file.
SamData <- "~/Hyphosphere/3exp_metadata.txt"

In [3]:
library(dada2)
library(tidyr)
library(dplyr)
library(phyloseq)
library(ggplot2)

Loading required package: Rcpp

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [None]:

                   # Merge multiple runs (if necessary)
# One chimera-filtered sequence table was saved per experiment; load each
# of them and combine the three into a single table.
run_paths <- c(
  "~/Hyphosphere/data/Exp1/DADA2Files/seqtab-nochim.rds",
  "~/Hyphosphere/data/Exp10/DADA2Files/seqtab-nochim.rds",
  "~/Hyphosphere/data/Exp11/DADA2Files/seqtab-nochim.rds"
)
st1 <- readRDS(run_paths[1])
st2 <- readRDS(run_paths[2])
st3 <- readRDS(run_paths[3])
st.all <- mergeSequenceTables(st1, st2, st3)

In [None]:
# Dimensions (rows, columns) of the merged table before collapsing.
dim(st.all)

In [None]:
# Collapse sequences that are identical apart from shifts or length
# differences (see the dada2 documentation for collapseNoMismatch).
st.all.2 <- collapseNoMismatch(st.all)


In [None]:
# Dimensions (rows, columns) of the table after collapseNoMismatch().
dim(st.all.2)

* Note that the summed total of sequences across all three experiments is 77,639, so approximately 16k sequences are shared between experiments.

In [None]:
# Write the collapsed table to disk so the notebook can be resumed from the
# checkpoint cell without repeating the merge.
outFile <- file.path(OutFolder, "seqtab-merged.rds")
saveRDS(object = st.all.2, file = outFile)

# Checkpoint
* read merged seqtab

In [None]:
# Resume point: reload the merged sequence table saved earlier.
CheckPoint <- file.path(OutFolder, "seqtab-merged.rds")
seqtab.nochim <- readRDS(file = CheckPoint)

# Assign taxonomy

In [None]:
# Classify every sequence in the merged table against the training set,
# using `ncores` threads.
taxa <- assignTaxonomy(
  seqs = seqtab.nochim,
  refFasta = TrainingSet,
  multithread = ncores
)


In [None]:
# Save the taxonomy assignments; the bare `outFile` line echoes the path
# in the notebook output.
outFile <- file.path(OutFolder, "taxa.rds")
outFile
saveRDS(object = taxa, file = outFile)

# Checkpoint read taxa file

In [None]:
# Path of the saved taxonomy assignments (same file written as taxa.rds).
CheckPoint <- file.path(OutFolder, "taxa.rds")



In [None]:
# Resume point: reload the taxonomy assignments from the checkpoint file.
taxa <- readRDS(file = CheckPoint)

In [None]:
# Inspect the taxonomic assignments. A copy is used so that `taxa` itself
# keeps its row names, which are needed again further down.

taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)


# hand off to phyloseq

In [None]:
# Load the tab-delimited sample metadata; the SampleID column supplies the
# row names. The last two lines preview the table and its first row names.
samdf <- read.delim(
  file = SamData,
  header = TRUE,
  sep = "\t",
  row.names = "SampleID"
)
head(samdf)
rownames(samdf)[1:4]

In [None]:
# Keep each full sequence as an extra "Seq" column of the taxonomy matrix.
# The row names of `taxa` hold the sequences (they are copied verbatim here).
# The column is named directly in cbind() rather than by position, so this
# does not depend on how many rank columns `taxa` has. The guard makes the
# cell safe to re-run: without it every re-run would append another copy.
if (!"Seq" %in% colnames(taxa)) {
  taxa <- cbind(taxa, Seq = rownames(taxa))
}

In [None]:
# Check the shape and first rows of the taxonomy matrix after adding "Seq".
dim(taxa)
head(taxa)

## Create phyloseq object

In [None]:
# Assemble the phyloseq object from its three components. The sequence
# table is passed with taxa_are_rows = FALSE, i.e. taxa are its columns.
ps <- phyloseq(
  otu_table(seqtab.nochim, taxa_are_rows = FALSE),
  sample_data(samdf),
  tax_table(taxa)
)
ps

## rename taxa

In [None]:
# Replace the taxa names with short labels ASV1, ASV2, ...
# `seqs` keeps the original names as a vector named by the new labels, so
# each label can be mapped back to the name it replaced.
seqs <- taxa_names(ps) # Original taxa names, captured before renaming
# seq_len() gives an empty sequence for zero taxa, where seq() gives c(1, 0)
new.names <- paste0("ASV", seq_len(ntaxa(ps)))
names(seqs) <- new.names # Map from ASV label to original name
taxa_names(ps) <- new.names # Rename to human-friendly format
taxa_names(ps)[1:10]

In [None]:
# Show taxa_sums() for the first ten taxa.
taxa_sums(ps)[1:10]

## save full phyloseq object

In [None]:
# Save the complete, unfiltered phyloseq object.
saveRDS(
  object = ps,
  file = "~/Hyphosphere/data/3Exp/phyloseq/3Exp_phyloseq_full.rds"
)

In [5]:
# Resume point: reload the full phyloseq object saved above.
ps <- readRDS(file = "~/Hyphosphere/data/3Exp/phyloseq/3Exp_phyloseq_full.rds")

## Threshold to remove minor sequences

In [6]:
# Preview the taxonomy table, then drop minor taxa: a taxon is kept only if
# more than 3 of its values are greater than 3. prune = TRUE returns the
# pruned phyloseq object rather than a logical vector.
head(tax_table(ps))
ps.thresh <- filter_taxa(
  ps,
  function(x) sum(x > 3) > 3,
  prune = TRUE
)
ps.thresh

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus,Seq
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium,TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV2,Bacteria,,,,,,GACATAGGTGGCAAACATTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCATGATAAGTTGCTGGTGGGAAATCAAGGCTCAACCTTGTGGAAGCTAGCAATACTGTCAAGCTAGAGGGCAGAAGAGGTTAACGGAACTCTATGTGGAGCGGTAAAATGTGTAGATATATAGAAGAACATCAATAAAGGCGAAGGCAGTTAACTAGTCTGTCCCTGACGTTGAGGCACGAAAGCGTGGGGAGCAAAACGG
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas,TACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV4,Bacteria,,,,,,GACATAGGTGGCGAACGTTATCCGGAATTATTGGGCGTAAAGGATGCGTAGATGGCAGAGTAAGTTACTGGTTGATGTCAAACTCAATTTGACGGAAGCTGGTAATACTGTTTTGCTAGAGGACAGGAGAGGTTGATGGAATTCTGTGTGGAGCGGTGAAATGCGTTGATCTACAGAGGAACACCAAAAAAGGCGAAGGCAGTCAACTATCCTGTTCCTGACATTGAGGCATGAAAGCGTGGGGAGCAAACCGG
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter,TACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGG
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio,TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGTAGGTGGTTTGTTAAGCTAGCTGTGAAATCCCCGGGCTCAACCTGGGCACTGCAGTTAGAACTGGCAAGCTAGAGTAGGGTAGAGGGGTGTGGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAACATCAGTGGCGAAGGCGACACCCTGGACTCATACTGACACTGAGGTGCGAAAGCGTGGGGAGCAAACAGG


phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 14615 taxa and 382 samples ]
sample_data() Sample Data:       [ 382 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 14615 taxa by 7 taxonomic ranks ]

In [7]:
# Drop the unfiltered object; the remaining cells only use ps.thresh.
rm(ps)

## Remove and save sequences from tax_table
* Removing the sequences will greatly speed up `psmelt` and subsequent operations.

In [8]:
# Two-column table pairing each taxon name with its "Seq" entry.
thresh_tax <- tax_table(ps.thresh)
Seqs_df <- cbind(rownames(thresh_tax), thresh_tax[, "Seq"])

In [10]:
# Label the two columns and preview the result.
colnames(Seqs_df)[1:2] <- c("ASV", "Seq")
head(Seqs_df)


Unnamed: 0,ASV,Seq
ASV1,ASV1,TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV2,ASV2,GACATAGGTGGCAAACATTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCATGATAAGTTGCTGGTGGGAAATCAAGGCTCAACCTTGTGGAAGCTAGCAATACTGTCAAGCTAGAGGGCAGAAGAGGTTAACGGAACTCTATGTGGAGCGGTAAAATGTGTAGATATATAGAAGAACATCAATAAAGGCGAAGGCAGTTAACTAGTCTGTCCCTGACGTTGAGGCACGAAAGCGTGGGGAGCAAAACGG
ASV3,ASV3,TACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV4,ASV4,GACATAGGTGGCGAACGTTATCCGGAATTATTGGGCGTAAAGGATGCGTAGATGGCAGAGTAAGTTACTGGTTGATGTCAAACTCAATTTGACGGAAGCTGGTAATACTGTTTTGCTAGAGGACAGGAGAGGTTGATGGAATTCTGTGTGGAGCGGTGAAATGCGTTGATCTACAGAGGAACACCAAAAAAGGCGAAGGCAGTCAACTATCCTGTTCCTGACATTGAGGCATGAAAGCGTGGGGAGCAAACCGG
ASV5,ASV5,TACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGG
ASV6,ASV6,TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGTAGGTGGTTTGTTAAGCTAGCTGTGAAATCCCCGGGCTCAACCTGGGCACTGCAGTTAGAACTGGCAAGCTAGAGTAGGGTAGAGGGGTGTGGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAACATCAGTGGCGAAGGCGACACCCTGGACTCATACTGACACTGAGGTGCGAAAGCGTGGGGAGCAAACAGG


In [13]:
# Taxonomy table without the "Seq" column. The rank columns are selected by
# excluding "Seq" by name rather than by the fixed range 1:6, so the result
# stays correct if the number of rank columns changes.
rank_cols <- setdiff(colnames(tax_table(ps.thresh)), "Seq")
taxa_df <- tax_table(ps.thresh)[, rank_cols]
head(taxa_df)


Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium
ASV2,Bacteria,,,,,
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas
ASV4,Bacteria,,,,,
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio


In [14]:
# Save the ASV/sequence table as tab-separated text.
# NOTE(review): with write.table()'s defaults the row names are written too
# and the header line has no entry for them, so the header is one field
# shorter than the data rows; confirm downstream readers expect that.
write.table(
  x = Seqs_df,
  file = "~/Hyphosphere/data/3Exp/phyloseq/taxa_seqs.txt",
  sep = "\t"
)

In [18]:
# Write the retained sequences to a FASTA file, one record per ASV.
outfile <- "~/Hyphosphere/data/3Exp/phyloseq/seqs_thresh.fasta"

# Record names and sequences as lists; the subset lines preview the first
# four entries of each in the notebook output.
SeqNames <- as.list(Seqs_df[, "ASV"])
SeqNames[1:4]
seqs <- as.list(Seqs_df[, "Seq"])
seqs[1:4]
seqinr::write.fasta(sequences = seqs, names = SeqNames, file.out = outfile)

## Save thresholded phyloseq with simplified taxa_table

In [19]:
# Preview the sequence-free taxonomy table that will replace the current one.
head(taxa_df)

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium
ASV2,Bacteria,,,,,
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas
ASV4,Bacteria,,,,,
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio


In [20]:
# Taxonomy table as currently stored in ps.thresh.
head(tax_table(ps.thresh))

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus,Seq
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium,TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV2,Bacteria,,,,,,GACATAGGTGGCAAACATTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCATGATAAGTTGCTGGTGGGAAATCAAGGCTCAACCTTGTGGAAGCTAGCAATACTGTCAAGCTAGAGGGCAGAAGAGGTTAACGGAACTCTATGTGGAGCGGTAAAATGTGTAGATATATAGAAGAACATCAATAAAGGCGAAGGCAGTTAACTAGTCTGTCCCTGACGTTGAGGCACGAAAGCGTGGGGAGCAAAACGG
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas,TACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV4,Bacteria,,,,,,GACATAGGTGGCGAACGTTATCCGGAATTATTGGGCGTAAAGGATGCGTAGATGGCAGAGTAAGTTACTGGTTGATGTCAAACTCAATTTGACGGAAGCTGGTAATACTGTTTTGCTAGAGGACAGGAGAGGTTGATGGAATTCTGTGTGGAGCGGTGAAATGCGTTGATCTACAGAGGAACACCAAAAAAGGCGAAGGCAGTCAACTATCCTGTTCCTGACATTGAGGCATGAAAGCGTGGGGAGCAAACCGG
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter,TACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGG
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio,TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGTAGGTGGTTTGTTAAGCTAGCTGTGAAATCCCCGGGCTCAACCTGGGCACTGCAGTTAGAACTGGCAAGCTAGAGTAGGGTAGAGGGGTGTGGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAACATCAGTGGCGAAGGCGACACCCTGGACTCATACTGACACTGAGGTGCGAAAGCGTGGGGAGCAAACAGG


In [21]:
# Replace the taxonomy table with the sequence-free version and check it.
tax_table(ps.thresh) <- taxa_df
head(tax_table(ps.thresh))

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium
ASV2,Bacteria,,,,,
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas
ASV4,Bacteria,,,,,
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio


In [22]:
# Save the thresholded phyloseq object with its simplified taxonomy table.
saveRDS(
  object = ps.thresh,
  file = "~/Hyphosphere/data/3Exp/phyloseq/3Exp_phyloseq_thresh.rds"
)

# Next steps: the sequences will be used to build a tree in a Python script, and the tree will then be reunited with the phyloseq object in a subsequent notebook

In [23]:
# Record the R version and package versions used for this run.
sessionInfo()

R version 3.5.1 (2018-07-02)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 16.04.3 LTS

Matrix products: default
BLAS/LAPACK: /data/home/be68/anaconda3/envs/MyR/lib/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ggplot2_3.0.0   phyloseq_1.26.1 dplyr_0.7.6     tidyr_0.8.1    
[5] dada2_1.10.0    Rcpp_0.12.18   

loaded via a namespace (and not attached):
 [1] Biobase_2.42.0              splines_3.5.1              
 [3] jsonlite_1.5                foreach_1.4.4              
 [5] RcppParallel_4.4.2          ass