# Merge sequence tables from multiple runs and create phyloseq object

In [1]:
ncores = 20
OutFolder = "~/Hyphosphere/data/MiSeq/merged/dada2_intermediates"

#location of taxonomy database
TrainingSet = '~/databases/silva_nr_v132_train_set.fa.gz'
SpeciesTraining = '~/databases/silva_species_assignment_v132.fa.gz'

#Metadata file
SamData = '~/Hyphosphere/3exp_metadata.txt'

In [2]:
library(dada2)
library(tidyr)
library(dplyr)
library(phyloseq)
library(ggplot2)

Loading required package: Rcpp
“package ‘Rcpp’ was built under R version 3.6.1”Registered S3 method overwritten by 'dplyr':
  method               from  
  as.data.frame.tbl_df tibble
“multiple methods tables found for ‘colMeans’”
Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

“package ‘ggplot2’ was built under R version 3.6.1”

In [92]:

                   # Merge multiple runs (if necessary)
st1 <- readRDS("~/Hyphosphere/data/Exp1/DADA2Files/seqtab-nochim.rds")
st2 <- readRDS("~/Hyphosphere/data/Exp10/DADA2Files/seqtab-nochim.rds")
st3 <- readRDS("~/Hyphosphere/data/Exp11/DADA2Files/seqtab-nochim.rds")
st.all <- mergeSequenceTables(st1, st2, st3)

In [93]:
rownames(st1)

In [94]:
dim(st.all)

In [95]:
st1["HCBP3_S212",1:25]

In [96]:
st.all.2 = collapseNoMismatch(st.all)


In [97]:
dim(st.all.2)

*Note that summed total of sequences in all three experiments is 77,639 so approximately 16k sequences are shared between expriments

In [98]:
outFile = file.path(OutFolder,"seqtab-merged.rds")

saveRDS(st.all.2, outFile)

# Checkpoint
* read merged seqtab

In [3]:
CheckPoint = file.path(OutFolder,"seqtab-merged.rds")

seqtab.nochim = readRDS(CheckPoint)

# Assign taxonomy

In [100]:
taxa <- assignTaxonomy(seqtab.nochim, TrainingSet, multithread=ncores)


In [None]:
outFile = file.path(OutFolder,'taxa.rds')
outFile
saveRDS(taxa, outFile)

# Checkpoint read taxa file

In [4]:
CheckPoint = file.path(OutFolder,"taxa.rds")



In [5]:
taxa = readRDS(CheckPoint)

In [6]:
dim(taxa)

In [7]:
#inspect taxonomic assignment

taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)


Kingdom,Phylum,Class,Order,Family,Genus
Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium
Bacteria,,,,,
Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas
Bacteria,,,,,
Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter
Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio


# Build tree with DECIPHER and raxml

In [9]:
library(DECIPHER)

“package ‘DECIPHER’ was built under R version 3.6.1”Loading required package: Biostrings
“package ‘Biostrings’ was built under R version 3.6.1”Loading required package: BiocGenerics
“package ‘BiocGenerics’ was built under R version 3.6.1”Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order

In [None]:
seqs <- getSequences(seqtab.nochim)
names(seqs) <- seqs # This propagates to the tip labels of the tree
alignment <- AlignSeqs(DNAStringSet(seqs), anchor=NA)

Determining distance matrix based on shared 8-mers:

In [None]:
# phang.align <- phyDat(as(alignment, "matrix"), type="DNA")
# dm <- dist.ml(phang.align)
# treeNJ <- NJ(dm) # Note, tip order != sequence order
# fit = pml(treeNJ, data=phang.align)

# ## negative edges length changed to 0!

# fitGTR <- update(fit, k=4, inv=0.2)
# fitGTR <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE,
#                       rearrangement = "stochastic", control = pml.control(trace = 0))
# detach("package:phangorn", unload=TRUE)

# hand off to phyloseq

In [33]:
# read sample data
samdf = read.delim(file = SamData, header = TRUE, sep = '\t', row.names = "SampleID")
head(samdf)
rownames(samdf)[1:4]

Unnamed: 0,Sample,Experiment,Plant,Fungus,Soil,SampleType,Treatment,TimePoint,DAI,Rep,Concentration_ng.ul
HCBN1_S211,HCBN1,1,N1,Gv,Lansing,CS,HN,,81,1,6.08
HCBN2_S223,HCBN2,1,N2,Gv,Lansing,CS,HN,,81,3,7.58
HCBN3_S235,HCBN3,1,N3,Gv,Lansing,CS,HN,,81,5,7.05
HCBF1_S247,HCBF1,1,F1,Gv,Florence,CS,HN,,81,7,2.32
HCBF2_S259,HCBF2,1,F2,Gv,Florence,CS,HN,,81,9,2.08
HCBF3_S271,HCBF3,1,F3,Gv,Florence,CS,HN,,81,1,1.89


In [34]:
dim(samdf)

In [35]:
#rename taxa
taxa = cbind(taxa, row.names(taxa))
colnames(taxa)[7] = "Seq"

In [36]:
dim(taxa)
head(taxa)

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus,Seq
TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium,TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
GACATAGGTGGCAAACATTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCATGATAAGTTGCTGGTGGGAAATCAAGGCTCAACCTTGTGGAAGCTAGCAATACTGTCAAGCTAGAGGGCAGAAGAGGTTAACGGAACTCTATGTGGAGCGGTAAAATGTGTAGATATATAGAAGAACATCAATAAAGGCGAAGGCAGTTAACTAGTCTGTCCCTGACGTTGAGGCACGAAAGCGTGGGGAGCAAAACGG,Bacteria,,,,,,GACATAGGTGGCAAACATTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCATGATAAGTTGCTGGTGGGAAATCAAGGCTCAACCTTGTGGAAGCTAGCAATACTGTCAAGCTAGAGGGCAGAAGAGGTTAACGGAACTCTATGTGGAGCGGTAAAATGTGTAGATATATAGAAGAACATCAATAAAGGCGAAGGCAGTTAACTAGTCTGTCCCTGACGTTGAGGCACGAAAGCGTGGGGAGCAAAACGG
TACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas,TACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
GACATAGGTGGCGAACGTTATCCGGAATTATTGGGCGTAAAGGATGCGTAGATGGCAGAGTAAGTTACTGGTTGATGTCAAACTCAATTTGACGGAAGCTGGTAATACTGTTTTGCTAGAGGACAGGAGAGGTTGATGGAATTCTGTGTGGAGCGGTGAAATGCGTTGATCTACAGAGGAACACCAAAAAAGGCGAAGGCAGTCAACTATCCTGTTCCTGACATTGAGGCATGAAAGCGTGGGGAGCAAACCGG,Bacteria,,,,,,GACATAGGTGGCGAACGTTATCCGGAATTATTGGGCGTAAAGGATGCGTAGATGGCAGAGTAAGTTACTGGTTGATGTCAAACTCAATTTGACGGAAGCTGGTAATACTGTTTTGCTAGAGGACAGGAGAGGTTGATGGAATTCTGTGTGGAGCGGTGAAATGCGTTGATCTACAGAGGAACACCAAAAAAGGCGAAGGCAGTCAACTATCCTGTTCCTGACATTGAGGCATGAAAGCGTGGGGAGCAAACCGG
TACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGG,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter,TACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGG
TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGTAGGTGGTTTGTTAAGCTAGCTGTGAAATCCCCGGGCTCAACCTGGGCACTGCAGTTAGAACTGGCAAGCTAGAGTAGGGTAGAGGGGTGTGGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAACATCAGTGGCGAAGGCGACACCCTGGACTCATACTGACACTGAGGTGCGAAAGCGTGGGGAGCAAACAGG,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio,TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGTAGGTGGTTTGTTAAGCTAGCTGTGAAATCCCCGGGCTCAACCTGGGCACTGCAGTTAGAACTGGCAAGCTAGAGTAGGGTAGAGGGGTGTGGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAACATCAGTGGCGAAGGCGACACCCTGGACTCATACTGACACTGAGGTGCGAAAGCGTGGGGAGCAAACAGG


In [37]:
rownames(seqtab.nochim)[1:50]

In [38]:
rownames(seqtab.nochim) %>% length()

In [39]:
rownames(samdf) %>% length

In [40]:
toremove = setdiff(rownames(seqtab.nochim),rownames(samdf))

In [41]:
toremove

In [42]:

allsamples = rownames(samdf)
allsamples = allsamples[!(allsamples %in% toremove)]
seqtab.nochim2 = seqtab.nochim[allsamples,]
seqtab.nochim2 %>% dim()

## Create phyloseq object

In [43]:
ps <- phyloseq(otu_table(seqtab.nochim2, taxa_are_rows=FALSE), 
               sample_data(samdf), 
               tax_table(taxa))
ps

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 61130 taxa and 434 samples ]
sample_data() Sample Data:       [ 434 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 61130 taxa by 7 taxonomic ranks ]

In [44]:
sample_names(ps)[1:55]

In [45]:
ps %>% subset_samples(., Experiment == "3")

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 61130 taxa and 191 samples ]
sample_data() Sample Data:       [ 191 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 61130 taxa by 7 taxonomic ranks ]

## rename taxa

* 'TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG' should have 305 reads in sample 100_5.5BP

In [46]:
new.names <- paste0("ASV", seq(ntaxa(ps))) # Define new names ASV1, ASV2,
seqs <- taxa_names(ps) # Store sequences
names(seqs) <- new.names # Make map from ASV1 to full sequence
taxa_names(ps) <- new.names # Rename to human-friendly format
taxa_names(ps)[1:10]

In [50]:
subset_taxa(ps, Seq == 'TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG', TRUE) %>% 
 subset_samples(Sample == '5.5BP') %>% sample_sums()

* taxa names assigned properly

In [51]:
taxa_sums(ps)[1:10]

## save full phyloseq object

In [52]:
saveRDS(ps, file = '~/Hyphosphere/data/3Exp/phyloseq/3Exp_phyloseq_full.rds')

In [53]:
ps = readRDS(file = '~/Hyphosphere/data/3Exp/phyloseq/3Exp_phyloseq_full.rds')

## Threshold to remove minor sequences

In [54]:
tax_table(ps) %>% head
ps.thresh = filter_taxa(ps, function(x) sum(x > 3) > 3, TRUE)
ps.thresh

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus,Seq
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium,TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV2,Bacteria,,,,,,GACATAGGTGGCAAACATTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCATGATAAGTTGCTGGTGGGAAATCAAGGCTCAACCTTGTGGAAGCTAGCAATACTGTCAAGCTAGAGGGCAGAAGAGGTTAACGGAACTCTATGTGGAGCGGTAAAATGTGTAGATATATAGAAGAACATCAATAAAGGCGAAGGCAGTTAACTAGTCTGTCCCTGACGTTGAGGCACGAAAGCGTGGGGAGCAAAACGG
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas,TACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV4,Bacteria,,,,,,GACATAGGTGGCGAACGTTATCCGGAATTATTGGGCGTAAAGGATGCGTAGATGGCAGAGTAAGTTACTGGTTGATGTCAAACTCAATTTGACGGAAGCTGGTAATACTGTTTTGCTAGAGGACAGGAGAGGTTGATGGAATTCTGTGTGGAGCGGTGAAATGCGTTGATCTACAGAGGAACACCAAAAAAGGCGAAGGCAGTCAACTATCCTGTTCCTGACATTGAGGCATGAAAGCGTGGGGAGCAAACCGG
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter,TACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGG
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio,TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGTAGGTGGTTTGTTAAGCTAGCTGTGAAATCCCCGGGCTCAACCTGGGCACTGCAGTTAGAACTGGCAAGCTAGAGTAGGGTAGAGGGGTGTGGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAACATCAGTGGCGAAGGCGACACCCTGGACTCATACTGACACTGAGGTGCGAAAGCGTGGGGAGCAAACAGG


phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 15600 taxa and 434 samples ]
sample_data() Sample Data:       [ 434 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 15600 taxa by 7 taxonomic ranks ]

In [55]:
tax_table(ps.thresh)[1:18]

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus,Seq
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium,TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV2,Bacteria,,,,,,GACATAGGTGGCAAACATTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCATGATAAGTTGCTGGTGGGAAATCAAGGCTCAACCTTGTGGAAGCTAGCAATACTGTCAAGCTAGAGGGCAGAAGAGGTTAACGGAACTCTATGTGGAGCGGTAAAATGTGTAGATATATAGAAGAACATCAATAAAGGCGAAGGCAGTTAACTAGTCTGTCCCTGACGTTGAGGCACGAAAGCGTGGGGAGCAAAACGG
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas,TACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV4,Bacteria,,,,,,GACATAGGTGGCGAACGTTATCCGGAATTATTGGGCGTAAAGGATGCGTAGATGGCAGAGTAAGTTACTGGTTGATGTCAAACTCAATTTGACGGAAGCTGGTAATACTGTTTTGCTAGAGGACAGGAGAGGTTGATGGAATTCTGTGTGGAGCGGTGAAATGCGTTGATCTACAGAGGAACACCAAAAAAGGCGAAGGCAGTCAACTATCCTGTTCCTGACATTGAGGCATGAAAGCGTGGGGAGCAAACCGG
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter,TACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGG
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio,TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGTAGGTGGTTTGTTAAGCTAGCTGTGAAATCCCCGGGCTCAACCTGGGCACTGCAGTTAGAACTGGCAAGCTAGAGTAGGGTAGAGGGGTGTGGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAACATCAGTGGCGAAGGCGACACCCTGGACTCATACTGACACTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV7,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,Mitochondria,,GACGGGGGGGGCAAGTGTTCTTCGGAATGACTGGGCGTAAAGGGCACGTAGGCGGTGAATCGGGTTGAAAGTGAAAGTCGCCAAAAAGTGGCGGAATGCTCTCGAAACCAATTCACTTGAGTGAGACAGAGGAGAGTGGAATTTCGTGTGTAGGGGTGAAATCCGTAGATCTACGAAGGAACGCCAAAAGCGAAGGCAGCTCTCTGGGTCCCTACCGACGCTGGGGTGCGAAAGCATGGGGAGCGAACAGG
ASV8,Bacteria,Proteobacteria,Alphaproteobacteria,Caulobacterales,Caulobacteraceae,Asticcacaulis,TACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGGTTATCAAGTTGGAGGTGAAAGCCCAGGGCTCAACCTTGGAATTGCCTTCAAAACTGATAACCTAGAGGATGATAGAGGTAAGTGGAACTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGACTTACTGGATCATTACTGACGCTGAGGCTCGAAAGCGTGGGGAGCAAACAGG
ASV9,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,Mitochondria,,TACGGGAGGAGCGAGCATTATTCGGAATGATTAGGCGTAAAGGGTTTGTAGGTGGTTTTTTAAGTTGAAAAAAACAGATTAAAGCTCAACTTTATAAATTTTTTCAAAACTGAGAAACTTGAGTATAAATAGAGGATAATGGAATTTCTATTGGAGGGATAAAATACGTTGATAATAGAAGGAAGGCCTAAAGCGAAGGCAATTATCTGGGTATATACTGACACTGAGAAACGAAAGCTTGGGTAGCAAACGGG
ASV10,Bacteria,Cyanobacteria,Oxyphotobacteria,Chloroplast,,,GACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGTCTGTAGGTGGCTTTTCAAGTCCGCCGTCAAATCCCAGGGCTCAACCCTGGACAGGCGGTGGAAACTACCAAGCTGGAGTACGGTAGGGGCAGAGGGAATTTCCGGTGGAGCGGTGAAATGCATTGAGATCGGAAAGAACACCAACGGCGAAAGCACTCTGCTGGGCCGACACTGACACTGAGAGACGAAAGCTAGGGGAGCAAATGGG


In [59]:
rm(ps)
rm(seqtab.nochim, seqtab.nochim2, taxa)

“object 'taxa' not found”

## Remove and save sequences from tax_table
*removing sequences will greatly speed up psmelt and subsequent operations

In [60]:
Seqs_df = cbind(rownames(tax_table(ps.thresh)), tax_table(ps.thresh)[,'Seq'])

In [61]:
colnames(Seqs_df)[1:2] = c("ASV", "Seq")
head(Seqs_df)


Unnamed: 0,ASV,Seq
ASV1,ASV1,TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV2,ASV2,GACATAGGTGGCAAACATTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCATGATAAGTTGCTGGTGGGAAATCAAGGCTCAACCTTGTGGAAGCTAGCAATACTGTCAAGCTAGAGGGCAGAAGAGGTTAACGGAACTCTATGTGGAGCGGTAAAATGTGTAGATATATAGAAGAACATCAATAAAGGCGAAGGCAGTTAACTAGTCTGTCCCTGACGTTGAGGCACGAAAGCGTGGGGAGCAAAACGG
ASV3,ASV3,TACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV4,ASV4,GACATAGGTGGCGAACGTTATCCGGAATTATTGGGCGTAAAGGATGCGTAGATGGCAGAGTAAGTTACTGGTTGATGTCAAACTCAATTTGACGGAAGCTGGTAATACTGTTTTGCTAGAGGACAGGAGAGGTTGATGGAATTCTGTGTGGAGCGGTGAAATGCGTTGATCTACAGAGGAACACCAAAAAAGGCGAAGGCAGTCAACTATCCTGTTCCTGACATTGAGGCATGAAAGCGTGGGGAGCAAACCGG
ASV5,ASV5,TACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGG
ASV6,ASV6,TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGTAGGTGGTTTGTTAAGCTAGCTGTGAAATCCCCGGGCTCAACCTGGGCACTGCAGTTAGAACTGGCAAGCTAGAGTAGGGTAGAGGGGTGTGGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAACATCAGTGGCGAAGGCGACACCCTGGACTCATACTGACACTGAGGTGCGAAAGCGTGGGGAGCAAACAGG


In [62]:
taxa_df = tax_table(ps.thresh)[,1:6]
head(taxa_df)


Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium
ASV2,Bacteria,,,,,
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas
ASV4,Bacteria,,,,,
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio


In [63]:
#save table of seqs
write.table(Seqs_df, file = '~/Hyphosphere/data/3Exp/phyloseq/taxa_seqs.txt', sep = '\t')

In [64]:
# save fasta file of seqs
outfile = '~/Hyphosphere/data/3Exp/phyloseq/seqs_thresh.fasta'

SeqNames = Seqs_df[,'ASV'] %>%
    as.list()
SeqNames[1:4]
seqs = Seqs_df[,'Seq'] %>% as.list()
seqs[1:4]
seqinr::write.fasta(sequences = as.list(seqs), names = SeqNames, file.out = outfile)

## Save thresholded phyloseq with simplified taxa_table

In [65]:
head(taxa_df)

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium
ASV2,Bacteria,,,,,
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas
ASV4,Bacteria,,,,,
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio


In [66]:
tax_table(ps.thresh) %>% head

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus,Seq
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium,TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV2,Bacteria,,,,,,GACATAGGTGGCAAACATTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCATGATAAGTTGCTGGTGGGAAATCAAGGCTCAACCTTGTGGAAGCTAGCAATACTGTCAAGCTAGAGGGCAGAAGAGGTTAACGGAACTCTATGTGGAGCGGTAAAATGTGTAGATATATAGAAGAACATCAATAAAGGCGAAGGCAGTTAACTAGTCTGTCCCTGACGTTGAGGCACGAAAGCGTGGGGAGCAAAACGG
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas,TACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
ASV4,Bacteria,,,,,,GACATAGGTGGCGAACGTTATCCGGAATTATTGGGCGTAAAGGATGCGTAGATGGCAGAGTAAGTTACTGGTTGATGTCAAACTCAATTTGACGGAAGCTGGTAATACTGTTTTGCTAGAGGACAGGAGAGGTTGATGGAATTCTGTGTGGAGCGGTGAAATGCGTTGATCTACAGAGGAACACCAAAAAAGGCGAAGGCAGTCAACTATCCTGTTCCTGACATTGAGGCATGAAAGCGTGGGGAGCAAACCGG
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter,TACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGG
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio,TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGTAGGTGGTTTGTTAAGCTAGCTGTGAAATCCCCGGGCTCAACCTGGGCACTGCAGTTAGAACTGGCAAGCTAGAGTAGGGTAGAGGGGTGTGGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAACATCAGTGGCGAAGGCGACACCCTGGACTCATACTGACACTGAGGTGCGAAAGCGTGGGGAGCAAACAGG


In [67]:
tax_table(ps.thresh) = taxa_df
head(tax_table(ps.thresh))

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus
ASV1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Phyllobacterium
ASV2,Bacteria,,,,,
ASV3,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas
ASV4,Bacteria,,,,,
ASV5,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Micrococcaceae,Pseudarthrobacter
ASV6,Bacteria,Proteobacteria,Gammaproteobacteria,Cellvibrionales,Cellvibrionaceae,Cellvibrio


In [68]:
ps.thresh

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 15600 taxa and 434 samples ]
sample_data() Sample Data:       [ 434 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 15600 taxa by 6 taxonomic ranks ]

In [69]:
saveRDS(ps.thresh, file = '~/Hyphosphere/data/3Exp/phyloseq/3Exp_phyloseq_thresh.rds')

# Next steps: sequences will be used to build a tree in a python script and then tree will be re-united with phyloseq object in subsequent notebook

In [70]:
sessionInfo()

R version 3.5.1 (2018-07-02)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 16.04.6 LTS

Matrix products: default
BLAS/LAPACK: /data/home/be68/anaconda3/envs/MyR/lib/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ggplot2_3.0.0   phyloseq_1.26.1 dplyr_0.7.6     tidyr_0.8.1    
[5] dada2_1.10.0    Rcpp_0.12.18   

loaded via a namespace (and not attached):
 [1] Biobase_2.42.0              splines_3.5.1              
 [3] jsonlite_1.5                foreach_1.4.4              
 [5] RcppParallel_4.4.2          ass