# Merge sequence tables from multiple runs and create phyloseq object

In [90]:
# ---- Run configuration ----
# Number of threads handed to multithreaded dada2 steps
ncores <- 20
# Folder that receives intermediate .rds files for the merged runs
OutFolder <- "~/Hyphosphere/data/MiSeq/merged/dada2_intermediates"

# Taxonomy reference databases
TrainingSet <- '~/databases/silva_nr_v132_train_set.fa.gz'
SpeciesTraining <- '~/databases/silva_species_assignment_v132.fa.gz'

# Tab-delimited sample metadata file
SamData <- '~/Hyphosphere/3exp_metadata.txt'

In [91]:
library(dada2)
library(tidyr)
library(dplyr)
library(phyloseq)
library(ggplot2)

In [92]:

                   # Merge multiple runs (if necessary)
# Load the per-run sequence tables saved as seqtab-nochim.rds
# (presumably the chimera-filtered DADA2 output of each run -- confirm).
st1 <- readRDS("~/Hyphosphere/data/Exp1/DADA2Files/seqtab-nochim.rds")
st2 <- readRDS("~/Hyphosphere/data/Exp10/DADA2Files/seqtab-nochim.rds")
st3 <- readRDS("~/Hyphosphere/data/Exp11/DADA2Files/seqtab-nochim.rds")
# Combine the three tables into one sequence table covering all runs.
st.all <- mergeSequenceTables(st1, st2, st3)

In [93]:
# List the row names of the first run's sequence table.
rownames(st1)

In [94]:
# Dimensions of the merged sequence table.
dim(st.all)

In [95]:
# Spot check: first 25 columns of row "HCBP3_S212" in the first run's table.
st1["HCBP3_S212",1:25]

In [96]:
# Collapse the merged table with dada2::collapseNoMismatch() and keep the
# result under a new name so the uncollapsed table stays available.
st.all.2 <- collapseNoMismatch(st.all)


In [97]:
# Dimensions after collapsing, for comparison with dim(st.all).
dim(st.all.2)

* Note that the summed total of sequences across all three experiments is 77,639, so approximately 16k sequences are shared between experiments.

In [98]:
# Write the collapsed, merged sequence table to OutFolder so later steps can
# restart from this point without redoing the merge.
outFile <- file.path(OutFolder, "seqtab-merged.rds")

# saveRDS() does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
dir.create(OutFolder, recursive = TRUE, showWarnings = FALSE)
saveRDS(st.all.2, outFile)

# Checkpoint
* read merged seqtab

In [99]:
# Restart point: reload the merged sequence table written to OutFolder.
CheckPoint <- file.path(OutFolder, "seqtab-merged.rds")
seqtab.nochim <- readRDS(CheckPoint)

# Assign taxonomy

In [None]:
# Classify every sequence in the merged table against the TrainingSet
# reference, using ncores threads.
# NOTE(review): SpeciesTraining is defined in the configuration cell but is not
# used anywhere in the visible notebook -- confirm whether a species-level
# assignment step was intended.
taxa <- assignTaxonomy(seqtab.nochim, TrainingSet, multithread=ncores)


In [None]:
# Persist the taxonomy assignments next to the merged sequence table.
outFile <- file.path(OutFolder, 'taxa.rds')
outFile  # echo the destination path
saveRDS(taxa, file = outFile)

# Checkpoint read taxa file

In [None]:
# Path of the saved taxonomy assignments, used as the restart point.
CheckPoint <- file.path(OutFolder, "taxa.rds")



In [None]:
# Reload the taxonomy assignments from the checkpoint file.
taxa <- readRDS(CheckPoint)

In [None]:
# Inspect the taxonomic assignments.
# Work on a copy so that `taxa` itself keeps its row names, which are reused
# further down when the Seq column is added.

taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)


# hand off to phyloseq

In [None]:
# Load the tab-delimited sample metadata; the SampleID column supplies the
# row names of the resulting data frame.
samdf <- read.delim(
  file = SamData,
  header = TRUE,
  sep = '\t',
  row.names = "SampleID"
)
head(samdf)
rownames(samdf)[1:4]

In [None]:
# Dimensions of the sample metadata table.
dim(samdf)

In [None]:
# Keep each full sequence as an extra "Seq" column of the taxonomy matrix so
# it is still available once the row names are replaced by short ASV labels.
# The column is named directly in cbind() instead of by a fixed position, and
# the guard makes the cell safe to re-run without appending a duplicate column.
if (!"Seq" %in% colnames(taxa)) {
  taxa <- cbind(taxa, Seq = rownames(taxa))
}

In [None]:
# Check the taxonomy matrix after the Seq column has been added.
dim(taxa)
head(taxa)

In [None]:
# First 50 row names of the merged sequence table.
rownames(seqtab.nochim)[1:50]

## Create phyloseq object

In [None]:
# Build the phyloseq object from the count table, sample metadata and taxonomy.
# phyloseq() keeps only the samples present in every component, so report any
# sample name found on one side only before it is dropped silently.
local({
  count_samples <- rownames(seqtab.nochim)
  meta_samples <- rownames(samdf)
  no_metadata <- setdiff(count_samples, meta_samples)
  no_counts <- setdiff(meta_samples, count_samples)
  if (length(no_metadata) > 0) {
    message("Samples in seqtab.nochim without metadata: ",
            paste(no_metadata, collapse = ", "))
  }
  if (length(no_counts) > 0) {
    message("Samples in metadata without sequence data: ",
            paste(no_counts, collapse = ", "))
  }
})

# Rows of seqtab.nochim are samples, hence taxa_are_rows = FALSE.
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows = FALSE),
               sample_data(samdf),
               tax_table(taxa))
ps

In [None]:
# First 55 sample names retained in the phyloseq object.
sample_names(ps)[1:55]

In [None]:
# Show the subset of samples whose Experiment metadata value is "2".
subset_samples(ps, Experiment == "2")

* missing two samples, what happened?

## rename taxa

* 'TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG' should have 305 reads in sample 100_5.5BP

In [None]:
# Replace the full-length sequences used as taxa names with short labels.
# seq_len() is used instead of seq() so that an object with zero taxa yields
# an empty name vector rather than c("ASV1", "ASV0").
new.names <- paste0("ASV", seq_len(ntaxa(ps))) # Define new names ASV1, ASV2, ...
seqs <- taxa_names(ps) # Store sequences
names(seqs) <- new.names # Make map from ASV1 to full sequence
taxa_names(ps) <- new.names # Rename to human-friendly format
taxa_names(ps)[1:10]

In [None]:
# Sanity check for the renaming: select the taxon whose stored Seq value equals
# the reference sequence quoted above and print its counts per sample.
# NOTE(review): the trailing TRUE is passed through subset_taxa()'s `...`;
# its effect is not evident from this notebook -- confirm it is needed.
subset_taxa(ps, Seq == 'TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTATTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGTAGTCTTGAGTTCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG', TRUE) %>% otu_table

* taxa names assigned properly

In [None]:
# Per-taxon totals for the first ten ASVs.
taxa_sums(ps)[1:10]

## save full phyloseq object

In [None]:
# Save the complete (unfiltered) phyloseq object, including the Seq column.
saveRDS(ps, file = '~/Hyphosphere/data/3Exp/phyloseq/3Exp_phyloseq_full.rds')

In [None]:
# Restart point: reload the full phyloseq object from disk.
ps <- readRDS(file = '~/Hyphosphere/data/3Exp/phyloseq/3Exp_phyloseq_full.rds')

## Threshold to remove minor sequences

In [None]:
# Peek at the taxonomy table before filtering.
head(tax_table(ps))
# Keep a taxon only if more than 3 of its per-sample counts exceed 3;
# prune = TRUE returns the filtered phyloseq object rather than a logical mask.
ps.thresh <- filter_taxa(ps, function(x) sum(x > 3) > 3, prune = TRUE)
ps.thresh

In [None]:
# Look at a slice of the thresholded taxonomy table.
# NOTE(review): single-index subsetting with 1:18 -- presumably meant to show
# the first 18 taxa; confirm what this returns for a taxonomyTable.
tax_table(ps.thresh)[1:18]

In [None]:
# Drop the full object from the workspace; only ps.thresh is used from here on.
rm(ps)

## Remove and save sequences from tax_table
* Removing sequences will greatly speed up psmelt and subsequent operations.

In [None]:
# Pair every ASV label (the tax_table row names) with its full sequence
# taken from the Seq column.
Seqs_df <- cbind(rownames(tax_table(ps.thresh)), tax_table(ps.thresh)[, 'Seq'])

In [None]:
# Label the two columns of the ASV-to-sequence table, then preview it.
colnames(Seqs_df)[1:2] <- c("ASV", "Seq")
head(Seqs_df)


In [None]:
# Taxonomy without the sequence column: keep only the first six columns.
taxa_df <- tax_table(ps.thresh)[, 1:6]
head(taxa_df)


In [None]:
# Save the ASV-to-sequence table as tab-separated text.
# NOTE(review): row.names and quote are left at the write.table() defaults, so
# row names and quoted strings are written too -- confirm that whatever reads
# this file expects that layout.
write.table(Seqs_df, file = '~/Hyphosphere/data/3Exp/phyloseq/taxa_seqs.txt', sep = '\t')

In [None]:
# Export the thresholded sequences as FASTA, one record per ASV.
outfile <- '~/Hyphosphere/data/3Exp/phyloseq/seqs_thresh.fasta'

# Record names: the ASV labels, as a list
SeqNames <- as.list(Seqs_df[, 'ASV'])
SeqNames[1:4]

# Record bodies: the full sequences, as a list
# (this replaces the ASV -> sequence map previously stored in `seqs`)
seqs <- as.list(Seqs_df[, 'Seq'])
seqs[1:4]

seqinr::write.fasta(sequences = seqs, names = SeqNames, file.out = outfile)

## Save thresholded phyloseq with simplified taxa_table

In [None]:
# Preview the six-column taxonomy that will replace the current tax_table.
head(taxa_df)

In [None]:
# Current tax_table (still containing the Seq column) for comparison.
tax_table(ps.thresh) %>% head

In [None]:
# Swap in the taxonomy table without the Seq column and confirm the result.
tax_table(ps.thresh) <- taxa_df
head(tax_table(ps.thresh))

In [None]:
# Print a summary of the thresholded object after the tax_table replacement.
ps.thresh

In [None]:
# Save the thresholded phyloseq object with the simplified taxonomy table.
saveRDS(ps.thresh, file = '~/Hyphosphere/data/3Exp/phyloseq/3Exp_phyloseq_thresh.rds')

# Next steps: the sequences will be used to build a tree in a Python script, and the tree will then be re-united with the phyloseq object in a subsequent notebook

In [None]:
# Record the R session details for this run.
sessionInfo()