# Shotgun nMDS

In [2]:
# Libraries
library(ROBITools)
library(tidyverse)
library(reshape2)
library(vegan)
library(ggalt)
library(ggpubr)
library(ggrepel)
library(grid)
library(wordcloud)

*Load data*

In [None]:
# Data
# Read the per-taxon count table. check.names = FALSE keeps the column names
# exactly as they appear in the file header.
dfraw <- read.delim("lca.taxa.count.tsv", check.names = FALSE)
# Strip the "06-mergeLCA//" prefix from the column names.
colnames(dfraw) <- str_replace(colnames(dfraw), "06-mergeLCA//", "")

# Rename Samples
oldnames <- c(
  'GP_1', 'GP_2', 'GP_3',
  'A0_2', 'B0_2', 'C0_2',
  'A1_2', 'B1_2', 'C1_2',
  'A5_0', 'B5_0', 'C5_0',
  'A8_0', 'B8_0', 'C8_0'
)
newnames <- c(
  'EN0.2A', 'EN0.2B', 'EN0.2C',
  'OP0.2A', 'OP0.2B', 'OP0.2C',
  'OP1.2A', 'OP1.2B', 'OP1.2C',
  'OP5.0A', 'OP5.0B', 'OP5.0C',
  'OP8.0A', 'OP8.0B', 'OP8.0C'
)
# Naming newnames by the old labels turns it into an old -> new lookup table.
names(newnames) <- oldnames
# rename_with() + all_of() replaces the superseded rename_at(vars(...)).
# all_of() errors if any of the expected sample columns is missing, and the
# lookup by name makes the mapping independent of column order.
dfraw <- dfraw %>%
  rename_with(~ unname(newnames[.x]), .cols = all_of(oldnames))

*Superkingdom relative abundance Barplot*

In [None]:
## Plots Superkingdom
# One fixed colour per superkingdom. The palette is *named* so that a colour
# stays attached to its superkingdom even if one of them is missing from the
# data (an unnamed palette is assigned by position to the levels present).
# The order of the names is also the stacking/legend order.
spkm_cols <- c(
  Archaea = "coral3",
  Viruses = "darkolivegreen3",
  Bacteria = "darkcyan",
  Eukaryota = "orange3"
)

# Long format: one row per superkingdom x sample with the summed count and
# its percentage within the sample.
# Column 1 is the superkingdom, columns 9:23 are the 15 sample columns.
spkm_bars <- dfraw %>%
  filter(superkingdom %in% names(spkm_cols)) %>%
  select(c(1, 9:23)) %>%
  melt(id.vars = "superkingdom") %>%
  group_by(superkingdom, variable) %>%
  summarise(count = sum(value), .groups = "drop") %>%
  group_by(variable) %>%
  mutate(perc = count / sum(count) * 100)

spkm_bars$superkingdom <- factor(spkm_bars$superkingdom, levels = names(spkm_cols))
spkm_bars$variable <- factor(spkm_bars$variable, levels = newnames)
# Samples whose label contains "EN" are "Enclosed"; everything else is "Open".
spkm_bars <- spkm_bars %>%
  mutate(type = ifelse(grepl("EN", variable), "Enclosed", "Open"))

p <- spkm_bars %>%
  ggplot() +
  geom_col(aes(x = variable, y = perc, fill = superkingdom)) +
  labs(x = "Sample", y = "Percentage", fill = "Superkingdom") +
  scale_fill_manual(values = spkm_cols) +
  facet_wrap(~type, scales = "free_x") +
  theme_classic() %+replace% theme(
    axis.text.x = element_text(angle = 90, size = 18),
    axis.text.y = element_text(size = 18),
    axis.title = element_text(size = 20),
    strip.text.x = element_text(size = 16, face = "bold"),
    strip.background = element_blank(),
    legend.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 16)
  )

# Convert to a gtable so a single layout column can be narrowed by hand.
# NOTE(review): column 5 is presumably the first ("Enclosed") facet panel,
# which holds fewer bars than "Open" - confirm against gt$layout, since the
# index depends on the ggplot2 version.
gt <- ggplot_gtable(ggplot_build(p))
gt$widths[5] <- 0.26 * gt$widths[5]

png(file = "07-Plots/bars.spkm.raw.png", width = 2300, height = 2000, res = 300)
grid.draw(gt)
dev.off()

*Phylum relative abundance barplot*

In [None]:
## Plots Phylum
# Named palette shared with the superkingdom plots. The original three-colour
# unnamed palette only worked when exactly three superkingdoms survived
# drop_na(); with all four present ggplot2 stops with "Insufficient values in
# manual scale". Unused names are simply ignored.
spkm_cols <- c(
  Archaea = "coral3",
  Viruses = "darkolivegreen3",
  Bacteria = "darkcyan",
  Eukaryota = "orange3"
)

# Long format: one row per superkingdom x phylum x sample. Rows with a missing
# value (e.g. no phylum assigned) are dropped before aggregation, and
# phylum/sample combinations with zero reads are removed at the end.
# Columns 1 and 3 are superkingdom and phylum, 9:23 are the sample columns.
phyl_bars <- dfraw %>%
  filter(superkingdom %in% names(spkm_cols)) %>%
  select(c(1, 3, 9:23)) %>%
  melt(id.vars = c("superkingdom", "phylum")) %>%
  drop_na() %>%
  group_by(superkingdom, phylum, variable) %>%
  summarise(count = sum(value), .groups = "drop") %>%
  group_by(variable) %>%
  mutate(perc = count / sum(count) * 100) %>%
  filter(count > 0)

phyl_bars$superkingdom <- factor(phyl_bars$superkingdom, levels = names(spkm_cols))
phyl_bars$variable <- factor(phyl_bars$variable, levels = newnames)
# Samples whose label contains "EN" are "Enclosed"; everything else is "Open".
phyl_bars <- phyl_bars %>%
  mutate(type = ifelse(grepl("EN", variable), "Enclosed", "Open"))

# One black-outlined segment per phylum, coloured by its superkingdom.
p <- phyl_bars %>%
  ggplot() +
  geom_col(aes(x = variable, y = perc, fill = superkingdom), color = "black") +
  labs(x = "Sample", y = "Percentage", fill = "Superkingdom") +
  #guides(fill = "none") +
  scale_fill_manual(values = spkm_cols) +
  facet_wrap(~type, scales = "free_x") +
  theme_classic() %+replace% theme(
    axis.text.x = element_text(angle = 90, size = 18),
    axis.text.y = element_text(size = 18),
    axis.title = element_text(size = 20),
    strip.text.x = element_text(size = 16, face = "bold"),
    strip.background = element_blank(),
    legend.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 16)
  )

# Convert to a gtable so a single layout column can be narrowed by hand.
# NOTE(review): column 5 is presumably the first ("Enclosed") facet panel -
# confirm against gt$layout, since the index depends on the ggplot2 version.
gt <- ggplot_gtable(ggplot_build(p))
gt$widths[5] <- 0.28 * gt$widths[5]

# NOTE(review): other plots in this notebook are written under "07-Plots/";
# this one goes to the working directory - confirm that is intended.
png(file = "bars.phyl.raw.png", width = 2300, height = 2000, res = 300)
grid.draw(gt)
dev.off()

*Top phyla relative abundance barplot*

In [None]:
## Plot Phylum
# Long format: one row per superkingdom x phylum x sample with the summed
# count and its percentage within the sample (rows with missing values and
# zero-count combinations are removed).
# Columns 1 and 3 are superkingdom and phylum, 9:23 are the sample columns.
phyl_bars <- dfraw %>%
  filter(superkingdom %in% c("Bacteria", "Archaea", "Eukaryota", "Viruses")) %>%
  select(c(1, 3, 9:23)) %>%
  melt(id.vars = c("superkingdom", "phylum")) %>%
  drop_na() %>%
  group_by(superkingdom, phylum, variable) %>%
  summarise(count = sum(value), .groups = "drop") %>%
  group_by(variable) %>%
  mutate(perc = count / sum(count) * 100) %>%
  filter(count > 0)

phyl_bars$superkingdom <- factor(
  phyl_bars$superkingdom,
  levels = c("Archaea", "Viruses", "Bacteria", "Eukaryota")
)
phyl_bars$variable <- factor(phyl_bars$variable, levels = newnames)
phyl_bars <- phyl_bars %>%
  mutate(type = ifelse(grepl("EN", variable), "Enclosed", "Open"))

# Phyla behind the 15 highest per-sample percentages within each superkingdom.
# The ranking is over phylum x sample rows, so the number of distinct phyla
# kept is usually well below 15 per superkingdom. slice_max() (ties kept)
# replaces the superseded top_n(15), which chose `perc` implicitly as the
# last column.
top15 <- phyl_bars %>%
  ungroup() %>%
  group_by(superkingdom) %>%
  slice_max(perc, n = 15) %>%
  ungroup() %>%
  distinct(phylum)

# Collapse every phylum outside the top set into "Others".
# as.character() guards against `phylum` being a factor, in which case
# ifelse() would return integer codes instead of names.
phyl_bars_others <- phyl_bars %>%
  mutate(category = ifelse(phylum %in% top15$phylum, as.character(phylum), "Others")) %>%
  group_by(variable, category) %>%
  summarise(total_perc = sum(perc), .groups = "drop")

phyl_bars_others <- phyl_bars_others %>%
  mutate(type = ifelse(grepl("EN", variable), "Enclosed", "Open"))

# Legend/stacking order: phyla by decreasing total percentage, "Others" last.
order <- phyl_bars_others %>%
  filter(category != "Others") %>%
  group_by(category) %>%
  summarise(total_perc = sum(total_perc), .groups = "drop") %>%
  arrange(desc(total_perc)) %>%
  pull(category)
phyl_bars_others$category <- factor(phyl_bars_others$category, levels = c(order, "Others"))

# Palette: the six hand-picked colours go to the phyla in `order`, and
# "Others" is always grey. The palette is named and sized from the data, so
# the plot neither fails when the number of selected phyla differs from six
# nor hands "Others" a phylum colour; extra phyla get generated colours.
phylum_cols <- c("#FF5733", "#33FF57", "#3366FF", "#9933FF", "#FFA500", "#A52A2A")
n_extra <- length(order) - length(phylum_cols)
if (n_extra > 0) {
  phylum_cols <- c(phylum_cols, grDevices::hcl.colors(n_extra, palette = "Dark 3"))
}
col <- setNames(c(phylum_cols[seq_along(order)], "grey"), c(order, "Others"))

p <- phyl_bars_others %>%
  ggplot() +
  geom_col(aes(x = variable, y = total_perc, fill = category), color = "black") +
  labs(x = "Sample", y = "Percentage", fill = "Phylum") +
  scale_fill_manual(values = col) +
  facet_wrap(~type, scales = "free_x") +
  theme_classic() %+replace% theme(
    axis.text.x = element_text(angle = 90, size = 18),
    axis.text.y = element_text(size = 18),
    axis.title = element_text(size = 20),
    strip.text.x = element_text(size = 16, face = "bold"),
    strip.background = element_blank(),
    legend.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 16)
  )

# Convert to a gtable so a single layout column can be narrowed by hand.
# NOTE(review): column 5 is presumably the first ("Enclosed") facet panel -
# confirm against gt$layout, since the index depends on the ggplot2 version.
gt <- ggplot_gtable(ggplot_build(p))
gt$widths[5] <- 0.34 * gt$widths[5]

# NOTE(review): other plots in this notebook are written under "07-Plots/";
# this one goes to the working directory - confirm that is intended.
png(file = "bars.phyl.top15.png", width = 2300, height = 2000, res = 300)
grid.draw(gt)
dev.off()

*nMDS function*

In [4]:
# Relative Abundance PCA (nMDS) - Mads!

# Run an nMDS (vegan::metaMDS, Bray-Curtis distance) and plot the site scores.
#
# Arguments:
#   reads   - community table handed to metaMDS(); its rows must line up
#             one-to-one, and in the same order, with the rows of `samples`.
#   samples - data frame with a `sample` column (point labels) and a `pore`
#             column (grouping). `pore` values outside EN0.2/OP0.2/OP1.2/
#             OP5.0/OP8.0 become NA when converted to a factor.
#
# Returns the ggplot object; nothing is drawn until it is printed.
nMDS_plot <- function(reads, samples) {
  # Fail early with a clear message instead of a confusing error further down.
  stopifnot(
    all(c("sample", "pore") %in% names(samples)),
    nrow(samples) == nrow(reads)
  )

  # Create scores
  nmds <- metaMDS(reads, distance = "bray")
  # Request the site scores explicitly: scores(nmds)$sites only works when
  # scores() returns a list, which depends on the vegan version, whereas
  # display = "sites" always yields the site-score matrix.
  nmds.sites.scores <- as.data.frame(vegan::scores(nmds, display = "sites"))
  #nmds.species.scores = as.data.frame(vegan::scores(nmds)$species)

  # Add metadata
  nmds.sites.scores$sample <- samples$sample
  nmds.sites.scores$pore <- samples$pore

  # Factorize (fixes the legend and colour order of the filter types)
  nmds.sites.scores <- nmds.sites.scores %>%
    dplyr::mutate(pore = factor(pore, levels = c("EN0.2", "OP0.2", "OP1.2", "OP5.0", "OP8.0")))

  # Plot: points coloured by filter type, one outline per filter type,
  # and repelled sample labels.
  ggplot(nmds.sites.scores, aes(x = NMDS1, y = NMDS2, label = sample)) +
    geom_point(size = 5, aes(colour = pore)) +
    geom_encircle(aes(fill = pore), s_shape = 1, expand = 0,
                  alpha = 0.2, color = "black", show.legend = FALSE) +
    geom_text_repel(size = 4) +
    labs(colour = 'Filter type') +
    theme_classic() %+replace% theme(
      axis.text.x = element_text(size = 14),
      axis.text.y = element_text(size = 14),
      axis.title = element_text(size = 20),
      legend.title = element_text(size = 18, face = "bold"),
      legend.text = element_text(size = 16)
    )
}

*Load data into specific format for easy rarefaction*

In [5]:
# Normalization (Rarefy)
# Function rarefy gives the expected species richness in random subsamples of
# size sample from the community. The function rarefy is based on Hurlbert's
# (1971) formulation, and the standard errors on Heck et al. (1975). In order
# to rarefy the data we need to create a count of species (rows) per sample
# (columns). The workflow is based on Eva's script including Mads
# modifications (Vandloeb_Summer_20200529.Rmd).
# Read data with ROBITools
dfimp <- import.metabarcoding.data("Pore_size_table_for_rarefy.txt")

## nMDS plot (before rarefaction)
# Derive the filter type ("pore") of every sample from its label; any sample
# that is not one of the listed OP replicates is labelled "EN0.2".
samples <- dfimp@samples %>%
  mutate(
    pore = case_when(
      sample %in% c("OP0.2A", "OP0.2B", "OP0.2C") ~ "OP0.2",
      sample %in% c("OP1.2A", "OP1.2B", "OP1.2C") ~ "OP1.2",
      sample %in% c("OP5.0A", "OP5.0B", "OP5.0C") ~ "OP5.0",
      sample %in% c("OP8.0A", "OP8.0B", "OP8.0C") ~ "OP8.0",
      TRUE ~ "EN0.2"
    )
  )
reads <- dfimp@reads

*Raw nMDS*

In [6]:
# nMDS of the raw (non-rarefied) reads.
p <- nMDS_plot(reads, samples)

png(file = "07-Plots/nMDS.raw.png", width = 2300, height = 2000, res = 300)
# Print explicitly: a bare `p` is only auto-printed at top level, so the PNG
# would be empty when this code is run through source() or inside a function.
print(p)
dev.off()

Square root transformation
Wisconsin double standardization
Run 0 stress 0.04893688 
Run 1 stress 0.04893688 
... Procrustes: rmse 3.534301e-06  max resid 8.456717e-06 
... Similar to previous best
Run 2 stress 0.04893688 
... Procrustes: rmse 9.355799e-06  max resid 2.402621e-05 
... Similar to previous best
Run 3 stress 0.04893688 
... New best solution
... Procrustes: rmse 2.61904e-06  max resid 6.451125e-06 
... Similar to previous best
Run 4 stress 0.214736 
Run 5 stress 0.04893688 
... Procrustes: rmse 2.215447e-06  max resid 5.745128e-06 
... Similar to previous best
Run 6 stress 0.04893689 
... Procrustes: rmse 5.351117e-06  max resid 1.278868e-05 
... Similar to previous best
Run 7 stress 0.04893688 
... Procrustes: rmse 3.810022e-06  max resid 1.036425e-05 
... Similar to previous best
Run 8 stress 0.04893688 
... Procrustes: rmse 1.128188e-05  max resid 2.808414e-05 
... Similar to previous best
Run 9 stress 0.04893688 
... Procrustes: rmse 3.094298e-06  max resid 8.290937e-

*Data rarefaction*

In [7]:
## Rarefy by sample
# Rarefaction depth = median of the per-sample read totals. median() is used
# instead of summary(...)[[3]], which picks the value by position and, before
# R 4.0.0, returned it rounded to four significant digits.
mdn <- median(rowSums(dfimp@reads[dfimp@samples$sample, ]))
dfrar <- ROBITools::rarefy(dfimp, n = mdn, MARGIN = "sample")

## Update TaxaCounts
dfrar@motus$count <- colSums(dfrar@reads) # Add count to rarefy species

## Clean data after rarefy
# FALSE = taxa left without any read after rarefaction (795 in the recorded
# run of this notebook).
table(colSums(dfrar@reads) > 0)
dfrar <- dfrar[, colSums(dfrar@reads) > 0] # Remove species with no hits
dfrar <- dfrar[rowSums(dfrar@reads) > 0, ] # Remove samples with no hits

## Transform data to final dataframe
# Transpose to taxa x samples and carry the taxon id along as a column so the
# taxonomy in dfrar@motus can be joined on it.
dfend <- dfrar@reads %>% t() %>% as.data.frame() %>% mutate(id = colnames(dfrar@reads))
dfend <- merge(dfend, dfrar@motus, by = "id") %>% select(-c(Var.17, id, count))
# Reorder by position: columns 16:23 are moved in front of the 15 sample
# columns and then named as the eight taxonomic ranks.
dfend <- dfend[, c(16:23, 1:15)]
colnames(dfend)[1:8] <- c("superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species")



FALSE  TRUE 
  795  7471 

*Superkingdom rarefied relative abundance Barplot*

In [19]:
# Plots
# One fixed colour per superkingdom. The palette is *named* so that a colour
# stays attached to its superkingdom even if one of them has no taxa left
# after rarefaction (an unnamed palette is assigned by position to the levels
# present). The order of the names is also the stacking/legend order.
spkm_cols <- c(
  Archaea = "coral3",
  Viruses = "darkolivegreen3",
  Bacteria = "darkcyan",
  Eukaryota = "orange3"
)

# Long format: one row per superkingdom x sample with the summed rarefied
# count and its percentage within the sample.
# Column 1 is the superkingdom, columns 9:23 are the 15 sample columns.
spkm_bars <- dfend %>%
  filter(superkingdom %in% names(spkm_cols)) %>%
  select(c(1, 9:23)) %>%
  melt(id.vars = "superkingdom") %>%
  group_by(superkingdom, variable) %>%
  summarise(count = sum(value), .groups = "drop") %>%
  group_by(variable) %>%
  mutate(perc = count / sum(count) * 100)

spkm_bars$superkingdom <- factor(spkm_bars$superkingdom, levels = names(spkm_cols))
spkm_bars$variable <- factor(spkm_bars$variable, levels = newnames)
# Samples whose label contains "EN" are "Enclosed"; everything else is "Open".
spkm_bars <- spkm_bars %>%
  mutate(type = ifelse(grepl("EN", variable), "Enclosed", "Open"))

p <- spkm_bars %>%
  ggplot() +
  geom_col(aes(x = variable, y = perc, fill = superkingdom)) +
  labs(x = "Sample", y = "Percentage", fill = "Superkingdom") +
  scale_fill_manual(values = spkm_cols) +
  facet_wrap(~type, scales = "free_x") +
  theme_classic() %+replace% theme(
    axis.text.x = element_text(angle = 90, size = 18),
    axis.text.y = element_text(size = 18),
    axis.title = element_text(size = 20),
    strip.text.x = element_text(size = 16, face = "bold"),
    strip.background = element_blank(),
    legend.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 16)
  )

# Convert to a gtable so a single layout column can be narrowed by hand.
# NOTE(review): column 5 is presumably the first ("Enclosed") facet panel -
# confirm against gt$layout, since the index depends on the ggplot2 version.
gt <- ggplot_gtable(ggplot_build(p))
gt$widths[5] <- 0.28 * gt$widths[5]

png(file = "07-Plots/bars.spkm.rarefy.png", width = 2300, height = 2000, res = 300)
grid.draw(gt)
dev.off()

Using superkingdom as id variables

[1m[22m`summarise()` has grouped output by 'superkingdom'. You can override using the
`.groups` argument.


*Rarefied nMDS plot*

In [8]:
## nMDS plot (after rarefaction)
# Derive the filter type ("pore") of every sample from its label; any sample
# that is not one of the listed OP replicates is labelled "EN0.2".
samples <- dfrar@samples
samples <- samples %>%
  mutate(
    pore = case_when(
      sample %in% c("OP0.2A", "OP0.2B", "OP0.2C") ~ "OP0.2",
      sample %in% c("OP1.2A", "OP1.2B", "OP1.2C") ~ "OP1.2",
      sample %in% c("OP5.0A", "OP5.0B", "OP5.0C") ~ "OP5.0",
      sample %in% c("OP8.0A", "OP8.0B", "OP8.0C") ~ "OP8.0",
      TRUE ~ "EN0.2"
    )
  )
reads <- dfrar@reads
p <- nMDS_plot(reads, samples)

png(file = "07-Plots/nMDS.rarefy.png", width = 2300, height = 2000, res = 300)
# Print explicitly: a bare `p` is only auto-printed at top level, so the PNG
# would be empty when this code is run through source() or inside a function.
print(p)
dev.off()

Square root transformation
Wisconsin double standardization
Run 0 stress 0.03937219 
Run 1 stress 0.03937219 
... Procrustes: rmse 2.3399e-06  max resid 5.479825e-06 
... Similar to previous best
Run 2 stress 0.03937219 
... Procrustes: rmse 3.228395e-06  max resid 6.784848e-06 
... Similar to previous best
Run 3 stress 0.2211713 
Run 4 stress 0.03937219 
... Procrustes: rmse 3.915327e-06  max resid 7.596578e-06 
... Similar to previous best
Run 5 stress 0.03937219 
... Procrustes: rmse 4.892426e-06  max resid 1.131157e-05 
... Similar to previous best
Run 6 stress 0.03937219 
... Procrustes: rmse 2.518187e-06  max resid 4.901389e-06 
... Similar to previous best
Run 7 stress 0.03937219 
... Procrustes: rmse 3.479705e-06  max resid 8.136378e-06 
... Similar to previous best
Run 8 stress 0.03937219 
... Procrustes: rmse 3.552628e-06  max resid 6.868828e-06 
... Similar to previous best
Run 9 stress 0.2335904 
Run 10 stress 0.03937219 
... Procrustes: rmse 7.179775e-06  max resid 1.51293