# Pre-Validation

In [None]:
# Libraries
library(tidyverse)
library(reshape2)
library(grid)

In [51]:
# Data
# Taxonomy table: one row per sequence (qseqid) with its best-hit identity
# and taxonomic ranks.
metabar_taxa <- read.delim(
  file = "GenomeDK_Metabar/classified.txt",
  header = TRUE,
  sep = "\t"
)
# Count table: an id column followed by one read-count column per sample.
metabar_count <- read.delim(
  file = "GenomeDK_Metabar/DADA2_nochim.table",
  header = TRUE,
  sep = "\t"
)

In [52]:
# Select controls
# Flag the columns whose names contain "CNE" / "NTC" (the control samples).
# The first column is always kept because it carries the sequence id that the
# later cells read through `$id`.
count_cols <- names(metabar_count)
m_cne <- grepl("CNE", count_cols)
m_cne[1] <- TRUE
m_ntc <- grepl("NTC", count_cols)
m_ntc[1] <- TRUE

# drop = FALSE keeps a data frame even when no control column matches, so the
# id column is never collapsed into a bare vector.
cne <- metabar_count[, m_cne, drop = FALSE]
ntc <- metabar_count[, m_ntc, drop = FALSE]

# Find contaminants
# A sequence counts as a contaminant when it has at least one read in any
# control column. drop = FALSE keeps rowSums() working when there is only one
# (or no) control column left after removing the id column.
cne_cont <- cne %>%
  mutate(total = rowSums(cne[, -1, drop = FALSE])) %>%
  filter(total > 0)
ntc_cont <- ntc %>%
  mutate(total = rowSums(ntc[, -1, drop = FALSE])) %>%
  filter(total > 0)

In [53]:
# Select contaminant sequences
# Ids seen with reads in either set of controls.
cont <- unique(c(cne_cont$id, ntc_cont$id))

# Remove contaminants and filter
# Keep non-contaminant sequences whose best-hit identity is above 90.
metabarcoding <- metabar_taxa %>%
  filter(!qseqid %in% cont, pident.max.best > 90)

# Select contaminants
contaminants <- metabar_taxa %>%
  filter(qseqid %in% cont)

# Metabarcoding diversity
# Number of distinct, non-missing labels at each taxonomic rank.
count_known <- function(labels) {
  length(unique(labels[!is.na(labels)]))
}

tx <- metabarcoding[, 8:14]
dk <- count_known(tx$kingdom)
dp <- count_known(tx$phylum)
dc <- count_known(tx$class)
do <- count_known(tx$order)
df <- count_known(tx$family)
dg <- count_known(tx$genus)
ds <- count_known(tx$species)

message(paste0(
  "Metabarcoding diversity: kingdom: ", dk,
  ", phylum: ", dp,
  ", class: ", dc,
  ", order: ", do,
  ", family: ", df,
  ", genus: ", dg,
  " and species: ", ds
))

Metabarcoding diversity: kingdom: 3, phylum: 46, class: 155, order: 432, family: 808, genus: 1292 and species: 2058



In [54]:
# Remove contaminant from counts
# First drop the contaminant sequences (rows) ...
metabar_count_clean <- metabar_count %>%
  filter(!id %in% cont)
# ... then drop the control samples themselves (columns named NTC or CNE).
clean_names <- names(metabar_count_clean)
is_control <- grepl("NTC", clean_names) | grepl("CNE", clean_names)
metabar_count_clean <- metabar_count_clean[, !is_control]

In [55]:
## Merge replicates

# Transpose dataset: rows become samples/replicates, columns become sequences.
x <- as.data.frame(t(metabar_count_clean))
# The first row holds the sequence ids; promote it to the column names.
colnames(x) <- unlist(x[1, ])
x <- x[-1, ]
# To numeric: convert column by column so row and column names are preserved
# and a single-row table is not collapsed into a vector.
mat <- x
mat[] <- lapply(mat, function(col) as.numeric(as.character(col)))
# Split dataframe: "GP" rows and all other rows use different naming schemes.
is_gp <- grepl("GP", rownames(mat))
nogps <- mat[!is_gp, , drop = FALSE]
gps <- mat[is_gp, , drop = FALSE]
# Mutate: derive the sample name shared by all replicates of a sample.
# Non-GP rows keep the first two "_"-separated fields, GP rows only the first.
nogps <- nogps %>%
  mutate(sample = vapply(
    str_split(rownames(nogps), "_"),
    function(parts) paste(parts[1], parts[2], sep = "_"),
    character(1)
  ))
gps <- gps %>%
  mutate(sample = vapply(
    str_split(rownames(gps), "_"),
    function(parts) parts[1],
    character(1)
  ))
# Bind
mat <- rbind(nogps, gps)
# Group replicates: mean read count per sample and sequence.
# The result is ungrouped, with columns sample, variable (sequence id), mean.
mat <- melt(mat, id.vars = "sample") %>%
  group_by(sample, variable) %>%
  summarise(mean = mean(value), .groups = "drop")

Using sample as id variables

[1m[22m`summarise()` has grouped output by 'sample'. You can override using the `.groups`
argument.


In [56]:
# Rename samples
cols <- mat$sample

# Old-name fragment -> final sample label. Replacements are applied in this
# order, one fragment at a time, on the names stripped of "sample." and "u".
sample_labels <- c(
  "GP1"   = "EN0.2A", "GP2"   = "EN0.2B", "GP3"   = "EN0.2C",
  "1_0.2" = "OP0.2A", "2_0.2" = "OP0.2B", "3_0.2" = "OP0.2C",
  "1_1.2" = "OP1.2A", "2_1.2" = "OP1.2B", "3_1.2" = "OP1.2C",
  "1_5.0" = "OP5.0A", "2_5.0" = "OP5.0B", "3_5.0" = "OP5.0C",
  "1_8.0" = "OP8.0A", "2_8.0" = "OP8.0B", "3_8.0" = "OP8.0C"
)

# fixed() matches the text literally; as plain regexes the "." in these
# patterns would match any character (e.g. "1_0.2" would also hit "1_0x2").
clean_cols <- str_replace(cols, fixed("sample."), "")
clean_cols <- str_replace(clean_cols, fixed("u"), "")
for (old_name in names(sample_labels)) {
  clean_cols <- str_replace(clean_cols, fixed(old_name), sample_labels[[old_name]])
}

mat$correct_sample <- clean_cols

# Validation


In [7]:
# Sample labels in plotting order; used as factor levels for the x axis.
newnames <- c(
  'EN0.2A', 'EN0.2B', 'EN0.2C',
  'OP0.2A', 'OP0.2B', 'OP0.2C',
  'OP1.2A', 'OP1.2B', 'OP1.2C',
  'OP5.0A', 'OP5.0B', 'OP5.0C',
  'OP8.0A', 'OP8.0B', 'OP8.0C'
)

In [67]:
# Database
# Reference list of taxa; the seven columns are renamed below, the last one
# flagging whether the taxon is marine.
arter <- read.table("arter.tsv", sep = "\t", header = TRUE)
colnames(arter) <- c("Phylum", "Class", "Order", "Family", "Genus", "Species", "Marine")
# %in% treats a missing Marine flag as "not marine". `== "Yes"` would instead
# return an all-NA row for each missing flag and put NA into marine$Phylum.
marine <- arter[arter$Marine %in% "Yes", ]

### Shotgun

In [68]:
# Data
# Shotgun taxon table with taxonomy columns followed by per-sample counts.
shotgun_taxa_count <- read.delim(
  file = "GenomeDK_LCA/counts.lca.rarefy.tsv",
  header = TRUE,
  sep = "\t"
)

**Phylum**

In [69]:
# Transform
# Keep eukaryotes only, then the phylum column (3) and the 15 sample count
# columns (9 to 23).
ps <- shotgun_taxa_count %>% filter(superkingdom == "Eukaryota")
ps <- ps[, c(3, 9:23)]
ps <- ps %>% drop_na(phylum)
# Long format (one row per phylum and sample), reads summed per phylum and
# sample, then each phylum tagged by whether it is in the marine reference.
# id.vars and .groups are spelled out so nothing depends on implicit guesses
# and the result carries no leftover grouping.
ps <- melt(ps, id.vars = "phylum") %>%
  group_by(phylum, variable) %>%
  summarise(value = sum(value), .groups = "drop") %>%
  mutate(origin = ifelse(phylum %in% marine$Phylum, "Danish", "Exotic"))

Using phylum as id variables

[1m[22m`summarise()` has grouped output by 'phylum'. You can override using the `.groups`
argument.


In [70]:
# Phyla found in the data, split by presence in the marine reference list.
in_reference <- ps$phylum %in% marine$Phylum
matched_phyla <- unique(ps$phylum[in_reference])
unmatched_phyla <- unique(ps$phylum[!in_reference])

matched_phyla
unmatched_phyla

length(matched_phyla)
length(unmatched_phyla)

In [71]:
# Copy dataframe
ps_count <- ps

# Plot
# Order the samples along the x axis. Samples not listed in newnames become NA
# and are removed by drop_na() before plotting.
ps_count$variable <- factor(ps_count$variable, levels = newnames)
# Sample labels containing "EN" go to the "Enclosed" facet, the rest to "Open".
ps_count <- ps_count %>%
  mutate(type = ifelse(grepl("EN", variable), "Enclosed", "Open"))

# Named colours keep each origin on its own colour even if one origin is
# missing from the data (unnamed values are handed out in level order).
origin_colours <- c(Danish = "forestgreen", Exotic = "firebrick")

p <- ps_count %>%
  drop_na() %>%
  ggplot() +
  geom_col(aes(x = variable, y = value, fill = origin), color = "black") +
  scale_fill_manual(values = origin_colours) +
  labs(fill = "", x = "", y = "#Reads per phyla", title = "Phyla") +
  facet_wrap(~type, scales = "free_x") +
  theme_classic() %+replace% theme(
    title = element_text(size = 20, face = "bold"),
    axis.text.x = element_text(angle = 90, size = 18),
    axis.text.y = element_text(size = 18),
    axis.title = element_text(size = 20),
    strip.text.x = element_text(size = 16, face = "bold"),
    strip.background = element_blank(),
    legend.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 16))

# Build the grob so one layout column width can be overridden by hand.
# NOTE(review): index 5 presumably is the first facet panel (fewer samples);
# it depends on the ggplot2 layout - confirm after upgrades.
gt <- ggplot_gtable(ggplot_build(p))
gt$widths[5] <- unit(0.28, "null")

png(filename = "GenomeDK_LCA/07-Plots/validation.shotgun.phylum.count.png", width = 2300, height = 2000, res = 300)
grid.draw(gt)
dev.off()

In [72]:
# Copy dataframe
ps_perc <- ps

# Plot
# Percentage of each sample's reads per phylum: drop missing phyla and zero
# counts, divide by the per-sample total and scale to 0-100.
ps_perc <- ps_perc %>%
  group_by(variable) %>%
  drop_na(phylum) %>%
  filter(value != 0) %>%
  mutate(total = sum(value)) %>%
  ungroup() %>%
  mutate(perc = value / total * 100)

# Order the samples along the x axis. Samples not listed in newnames become NA
# and are removed by drop_na() before plotting.
ps_perc$variable <- factor(ps_perc$variable, levels = newnames)
ps_perc <- ps_perc %>%
  mutate(type = ifelse(grepl("EN", variable), "Enclosed", "Open"))

# Named colours keep each origin on its own colour even if one origin is
# missing from the data (unnamed values are handed out in level order).
origin_colours <- c(Danish = "forestgreen", Exotic = "firebrick")

# The y axis shows percentages, so it is labelled "%Reads" (not "#Reads").
p <- ps_perc %>%
  drop_na() %>%
  ggplot() +
  geom_col(aes(x = variable, y = perc, fill = origin), color = "black") +
  scale_fill_manual(values = origin_colours) +
  labs(fill = "", x = "", y = "%Reads per phyla", title = "Phylum") +
  facet_wrap(~type, scales = "free_x") +
  theme_classic() %+replace% theme(
    title = element_text(size = 20, face = "bold"),
    axis.text.x = element_text(angle = 90, size = 18),
    axis.text.y = element_text(size = 18),
    axis.title = element_text(size = 20),
    strip.text.x = element_text(size = 16, face = "bold"),
    strip.background = element_blank(),
    legend.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 16))

# Build the grob so one layout column width can be overridden by hand.
# NOTE(review): index 5 presumably is the first facet panel (fewer samples);
# it depends on the ggplot2 layout - confirm after upgrades.
gt <- ggplot_gtable(ggplot_build(p))
gt$widths[5] <- unit(0.28, "null")

png(filename = "GenomeDK_LCA/07-Plots/validation.shotgun.phylum.perc.png", width = 2300, height = 2000, res = 300)
grid.draw(gt)
dev.off()

In [73]:
# Percentage of reads per sample and origin (Danish / Exotic), written to a
# tab-separated file and displayed. The result is returned ungrouped.
val_perc_phyla <- ps_perc %>%
  select(variable, origin, perc) %>%
  group_by(variable, origin) %>%
  summarise(perc = sum(perc), .groups = "drop")
write.table(val_perc_phyla, "validation.shotgun.phylum_perc.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
val_perc_phyla

[1m[22m`summarise()` has grouped output by 'variable'. You can override using the `.groups`
argument.


variable,origin,perc
<fct>,<chr>,<dbl>
EN0.2A,Danish,1.4447632
EN0.2A,Exotic,98.5552368
EN0.2B,Danish,2.5044818
EN0.2B,Exotic,97.4955182
EN0.2C,Danish,0.9695176
EN0.2C,Exotic,99.0304824
OP0.2A,Danish,1.9528834
OP0.2A,Exotic,98.0471166
OP0.2B,Danish,0.9043133
OP0.2B,Exotic,99.0956867


In [74]:
# How many phyla of the marine reference list do (TRUE) / do not (FALSE)
# appear in the shotgun phylum table.
reference_phyla <- unique(marine$Phylum)
table(reference_phyla %in% ps_count$phylum)


FALSE  TRUE 
    8    17 

**Metazoa**

In [75]:
# Transform
# Keep eukaryotic Metazoa only, then the phylum column (3) and the 15 sample
# count columns (9 to 23).
ps <- shotgun_taxa_count %>% filter(superkingdom == "Eukaryota", kingdom == "Metazoa")
ps <- ps[, c(3, 9:23)]
# Rows without a phylum are dropped, as in the Phylum section; otherwise they
# would form an NA "phylum" that gets labelled "Exotic".
ps <- ps %>% drop_na(phylum)
# Long format (one row per phylum and sample), reads summed per phylum and
# sample, then each phylum tagged by whether it is in the marine reference.
ps <- melt(ps, id.vars = "phylum") %>%
  group_by(phylum, variable) %>%
  summarise(value = sum(value), .groups = "drop") %>%
  mutate(origin = ifelse(phylum %in% marine$Phylum, "Danish", "Exotic"))

Using phylum as id variables

[1m[22m`summarise()` has grouped output by 'phylum'. You can override using the `.groups`
argument.


In [76]:
# Metazoan phyla found in the data, split by presence in the marine reference.
in_reference <- ps$phylum %in% marine$Phylum
matched_phyla <- unique(ps$phylum[in_reference])
unmatched_phyla <- unique(ps$phylum[!in_reference])

matched_phyla
unmatched_phyla

length(matched_phyla)
length(unmatched_phyla)

In [77]:
# Copy dataframe
ps_count <- ps

# Plot
# Order the samples along the x axis. Samples not listed in newnames become NA
# and are removed by drop_na() before plotting.
ps_count$variable <- factor(ps_count$variable, levels = newnames)
# Sample labels containing "EN" go to the "Enclosed" facet, the rest to "Open".
ps_count <- ps_count %>%
  mutate(type = ifelse(grepl("EN", variable), "Enclosed", "Open"))

# Named colours keep each origin on its own colour even if one origin is
# missing from the data (unnamed values are handed out in level order).
origin_colours <- c(Danish = "forestgreen", Exotic = "firebrick")

p <- ps_count %>%
  drop_na() %>%
  ggplot() +
  geom_col(aes(x = variable, y = value, fill = origin), color = "black") +
  scale_fill_manual(values = origin_colours) +
  labs(fill = "", x = "", y = "#Reads per phyla", title = "Metazoa") +
  facet_wrap(~type, scales = "free_x") +
  theme_classic() %+replace% theme(
    title = element_text(size = 20, face = "bold"),
    axis.text.x = element_text(angle = 90, size = 18),
    axis.text.y = element_text(size = 18),
    axis.title = element_text(size = 20),
    strip.text.x = element_text(size = 16, face = "bold"),
    strip.background = element_blank(),
    legend.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 16))

# Build the grob so one layout column width can be overridden by hand.
# NOTE(review): index 5 presumably is the first facet panel (fewer samples);
# it depends on the ggplot2 layout - confirm after upgrades.
gt <- ggplot_gtable(ggplot_build(p))
gt$widths[5] <- unit(0.28, "null")

png(filename = "GenomeDK_LCA/07-Plots/validation.shotgun.metazoa.read_count.png", width = 2300, height = 2000, res = 300)
grid.draw(gt)
dev.off()

In [78]:
# Copy dataframe
ps_perc <- ps

# Plot
# Percentage of each sample's reads per phylum: drop missing phyla and zero
# counts, divide by the per-sample total and scale to 0-100.
ps_perc <- ps_perc %>%
  group_by(variable) %>%
  drop_na(phylum) %>%
  filter(value != 0) %>%
  mutate(total = sum(value)) %>%
  ungroup() %>%
  mutate(perc = value / total * 100)

# Order the samples along the x axis. Samples not listed in newnames become NA
# and are removed by drop_na() before plotting.
ps_perc$variable <- factor(ps_perc$variable, levels = newnames)
ps_perc <- ps_perc %>%
  mutate(type = ifelse(grepl("EN", variable), "Enclosed", "Open"))

# Named colours keep each origin on its own colour even if one origin is
# missing from the data (unnamed values are handed out in level order).
origin_colours <- c(Danish = "forestgreen", Exotic = "firebrick")

p <- ps_perc %>%
  drop_na() %>%
  ggplot() +
  geom_col(aes(x = variable, y = perc, fill = origin), color = "black") +
  scale_fill_manual(values = origin_colours) +
  labs(fill = "", x = "", y = "%Reads per phyla", title = "Metazoa") +
  facet_wrap(~type, scales = "free_x") +
  theme_classic() %+replace% theme(
    title = element_text(size = 20, face = "bold"),
    axis.text.x = element_text(angle = 90, size = 18),
    axis.text.y = element_text(size = 18),
    axis.title = element_text(size = 20),
    strip.text.x = element_text(size = 16, face = "bold"),
    strip.background = element_blank(),
    legend.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 16))

# Build the grob so one layout column width can be overridden by hand.
# NOTE(review): index 5 presumably is the first facet panel (fewer samples);
# it depends on the ggplot2 layout - confirm after upgrades.
gt <- ggplot_gtable(ggplot_build(p))
gt$widths[5] <- unit(0.28, "null")

png(filename = "GenomeDK_LCA/07-Plots/validation.shotgun.metazoa.read_perc.png", width = 2300, height = 2000, res = 300)
grid.draw(gt)
dev.off()

In [79]:
# Percentage of metazoan reads per sample and origin (Danish / Exotic),
# written to a tab-separated file and displayed. The result is ungrouped.
val_perc_metazoa <- ps_perc %>%
  select(variable, origin, perc) %>%
  group_by(variable, origin) %>%
  summarise(perc = sum(perc), .groups = "drop")
write.table(val_perc_metazoa, "validation.shotgun.metazoa_perc.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
val_perc_metazoa

[1m[22m`summarise()` has grouped output by 'variable'. You can override using the `.groups`
argument.


variable,origin,perc
<fct>,<chr>,<dbl>
EN0.2A,Danish,99.93493819
EN0.2A,Exotic,0.06506181
EN0.2B,Danish,99.94680851
EN0.2B,Exotic,0.05319149
EN0.2C,Danish,100.0
OP0.2A,Danish,100.0
OP0.2B,Danish,100.0
OP0.2C,Danish,100.0
OP1.2A,Danish,100.0
OP1.2B,Danish,100.0


In [80]:
# How many phyla of the marine reference list do (TRUE) / do not (FALSE)
# appear among the metazoan phyla with reads.
reference_phyla <- unique(marine$Phylum)
table(reference_phyla %in% ps_perc$phylum)


FALSE  TRUE 
    8    17 

### Metabarcoding

In [57]:
# Dataset
# Taxonomy lookup: column 2 (used as the qseqid join key below) plus the
# rank columns 8 to 14.
# NOTE(review): this joins against the full metabar_taxa table, not the
# identity-filtered `metabarcoding` table built earlier - confirm intended.
taxa_columns <- c(2, 8:14)
taxa <- metabar_taxa[, taxa_columns]
# Attach taxonomy to every per-sample mean count; sequences without a match
# on either side are dropped (merge's default inner join).
dataset <- merge(x = mat, y = taxa, by.x = "variable", by.y = "qseqid")
head(dataset)

Unnamed: 0_level_0,variable,sample,mean,correct_sample,kingdom,phylum,class,order,family,genus,species
Unnamed: 0_level_1,<fct>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,seq1,sample.3_5.0u,50763.5,OP5.0C,Metazoa,Arthropoda,Hexanauplia,Calanoida,Paracalanidae,Paracalanus,Paracalanus parvus
2,seq1,sample.1_8.0u,299849.75,OP8.0A,Metazoa,Arthropoda,Hexanauplia,Calanoida,Paracalanidae,Paracalanus,Paracalanus parvus
3,seq1,sample.2_5.0u,29921.25,OP5.0B,Metazoa,Arthropoda,Hexanauplia,Calanoida,Paracalanidae,Paracalanus,Paracalanus parvus
4,seq1,sample.3_8.0u,2406.75,OP8.0C,Metazoa,Arthropoda,Hexanauplia,Calanoida,Paracalanidae,Paracalanus,Paracalanus parvus
5,seq1,sample.3_1.2u,1291.25,OP1.2C,Metazoa,Arthropoda,Hexanauplia,Calanoida,Paracalanidae,Paracalanus,Paracalanus parvus
6,seq1,sample.1_0.2u,1224.75,OP0.2A,Metazoa,Arthropoda,Hexanauplia,Calanoida,Paracalanidae,Paracalanus,Paracalanus parvus


**Phylum**

In [58]:
# Per-sample read share of each phylum.
# `mean` is the summed replicate-mean count per sample and phylum; `perc` is
# that count divided by the sample total, i.e. a proportion that is NOT
# multiplied by 100. `origin` flags phyla present in the marine reference.
phylmat <- dataset %>%
  select(correct_sample, mean, phylum) %>%
  drop_na(phylum) %>%
  group_by(correct_sample, phylum) %>%
  summarise(mean = sum(mean), .groups = "drop") %>%
  group_by(correct_sample) %>%
  mutate(perc = mean / sum(mean)) %>%
  ungroup() %>%
  mutate(origin = ifelse(phylum %in% marine$Phylum, "Danish", "Exotic"))

[1m[22m`summarise()` has grouped output by 'correct_sample'. You can override using the
`.groups` argument.


In [59]:
# Phyla found by metabarcoding, split by presence in the marine reference.
in_reference <- phylmat$phylum %in% marine$Phylum
matched_phyla <- unique(phylmat$phylum[in_reference])
unmatched_phyla <- unique(phylmat$phylum[!in_reference])

length(matched_phyla)
length(unmatched_phyla)

matched_phyla
unmatched_phyla

# Reference phyla that do (TRUE) / do not (FALSE) appear in the data.
reference_phyla <- unique(marine$Phylum)
table(reference_phyla %in% phylmat$phylum)


FALSE  TRUE 
    8    17 

In [61]:
# Sample labels in plotting order; used as factor levels for the x axis.
newnames <- c('EN0.2A','EN0.2B','EN0.2C','OP0.2A','OP0.2B','OP0.2C','OP1.2A','OP1.2B','OP1.2C','OP5.0A','OP5.0B','OP5.0C','OP8.0A','OP8.0B','OP8.0C')
# Samples not listed in newnames become NA and are removed by drop_na() below.
phylmat$correct_sample <- factor(phylmat$correct_sample, levels = newnames)
# Sample labels containing "EN" go to the "Enclosed" facet, the rest to "Open".
phylmat <- phylmat %>%
  mutate(type = ifelse(grepl("EN", correct_sample), "Enclosed", "Open"))

# Named colours keep each origin on its own colour even if one origin is
# missing from the data (unnamed values are handed out in level order).
origin_colours <- c(Danish = "forestgreen", Exotic = "firebrick")

# phylmat$perc is a 0-1 proportion, so it is scaled to percent for plotting and
# the axis is labelled "%Reads" (it was "#Reads", which suggests raw counts).
p <- phylmat %>%
  drop_na() %>%
  ggplot() +
  geom_col(aes(x = correct_sample, y = perc * 100, fill = origin), color = "black") +
  scale_fill_manual(values = origin_colours) +
  labs(fill = "", x = "", y = "%Reads per phyla", title = "Phylum") +
  facet_wrap(~type, scales = "free_x") +
  theme_classic() %+replace% theme(
    title = element_text(size = 20, face = "bold"),
    axis.text.x = element_text(angle = 90, size = 18),
    axis.text.y = element_text(size = 18),
    axis.title = element_text(size = 20),
    strip.text.x = element_text(size = 16, face = "bold"),
    strip.background = element_blank(),
    legend.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 16))

# Build the grob so one layout column width can be overridden by hand.
# NOTE(review): index 5 presumably is the first facet panel (fewer samples);
# it depends on the ggplot2 layout - confirm after upgrades.
gt <- ggplot_gtable(ggplot_build(p))
gt$widths[5] <- unit(0.28, "null")

png(filename = "GenomeDK_Metabar/validation.metabar.phylum.read_mean_perc.png", width = 2300, height = 2000, res = 300)
grid.draw(gt)
dev.off()

In [62]:
# Read share per sample and origin (Danish / Exotic), written to a
# tab-separated file and displayed. `perc` is summed from phylmat, where it
# is a 0-1 proportion. The result is returned ungrouped.
val_perc_phyla <- phylmat %>%
  select(correct_sample, origin, perc) %>%
  group_by(correct_sample, origin) %>%
  summarise(perc = sum(perc), .groups = "drop")
write.table(val_perc_phyla, "validation.metabar.phyla_perc.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
val_perc_phyla

[1m[22m`summarise()` has grouped output by 'correct_sample'. You can override using the
`.groups` argument.


correct_sample,origin,perc
<fct>,<chr>,<dbl>
EN0.2A,Danish,0.2585448
EN0.2A,Exotic,0.7414552
EN0.2B,Danish,0.2695439
EN0.2B,Exotic,0.7304561
EN0.2C,Danish,0.1326119
EN0.2C,Exotic,0.8673881
OP0.2A,Danish,0.1168476
OP0.2A,Exotic,0.8831524
OP0.2B,Danish,0.2086304
OP0.2B,Exotic,0.7913696


**Metazoa**

In [63]:
# Per-sample read share of each metazoan phylum.
# `mean` is the summed replicate-mean count per sample and phylum; `perc` is
# that count divided by the sample's metazoan total, i.e. a proportion that
# is NOT multiplied by 100. `origin` flags phyla in the marine reference.
metazoamat <- dataset %>%
  filter(kingdom == "Metazoa") %>%
  select(correct_sample, mean, phylum) %>%
  drop_na(phylum) %>%
  group_by(correct_sample, phylum) %>%
  summarise(mean = sum(mean), .groups = "drop") %>%
  group_by(correct_sample) %>%
  mutate(perc = mean / sum(mean)) %>%
  ungroup() %>%
  mutate(origin = ifelse(phylum %in% marine$Phylum, "Danish", "Exotic"))

[1m[22m`summarise()` has grouped output by 'correct_sample'. You can override using the
`.groups` argument.


In [64]:
# Metazoan phyla found by metabarcoding, split by presence in the marine
# reference list: first the counts, then the names.
length(unique(metazoamat$phylum[metazoamat$phylum %in% marine$Phylum]))
length(unique(metazoamat$phylum[!metazoamat$phylum %in% marine$Phylum]))

unique(metazoamat$phylum[metazoamat$phylum %in% marine$Phylum])
unique(metazoamat$phylum[!metazoamat$phylum %in% marine$Phylum])

# Reference phyla that do (TRUE) / do not (FALSE) appear among the Metazoa.
# This must look at metazoamat; the previous version checked phylmat (the
# all-kingdom table from the Phylum section), a copy-paste slip.
table(unique(marine$Phylum) %in% metazoamat$phylum)


FALSE  TRUE 
    8    17 

In [65]:
# Sample labels in plotting order; used as factor levels for the x axis.
newnames <- c('EN0.2A','EN0.2B','EN0.2C','OP0.2A','OP0.2B','OP0.2C','OP1.2A','OP1.2B','OP1.2C','OP5.0A','OP5.0B','OP5.0C','OP8.0A','OP8.0B','OP8.0C')
# Samples not listed in newnames become NA and are removed by drop_na() below.
metazoamat$correct_sample <- factor(metazoamat$correct_sample, levels = newnames)
# Sample labels containing "EN" go to the "Enclosed" facet, the rest to "Open".
metazoamat <- metazoamat %>%
  mutate(type = ifelse(grepl("EN", correct_sample), "Enclosed", "Open"))

# Named colours keep each origin on its own colour even if one origin is
# missing from the data (unnamed values are handed out in level order).
origin_colours <- c(Danish = "forestgreen", Exotic = "firebrick")

# metazoamat$perc is a 0-1 proportion, so it is scaled to percent for plotting
# and the axis is labelled "%Reads" (it was "#Reads", suggesting raw counts).
p <- metazoamat %>%
  drop_na() %>%
  ggplot() +
  geom_col(aes(x = correct_sample, y = perc * 100, fill = origin), color = "black") +
  scale_fill_manual(values = origin_colours) +
  labs(fill = "", x = "", y = "%Reads per phyla", title = "Metazoa") +
  facet_wrap(~type, scales = "free_x") +
  theme_classic() %+replace% theme(
    title = element_text(size = 20, face = "bold"),
    axis.text.x = element_text(angle = 90, size = 18),
    axis.text.y = element_text(size = 18),
    axis.title = element_text(size = 20),
    strip.text.x = element_text(size = 16, face = "bold"),
    strip.background = element_blank(),
    legend.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 16))

# Build the grob so one layout column width can be overridden by hand.
# NOTE(review): index 5 presumably is the first facet panel (fewer samples);
# it depends on the ggplot2 layout - confirm after upgrades.
gt <- ggplot_gtable(ggplot_build(p))
gt$widths[5] <- unit(0.28, "null")

png(filename = "GenomeDK_Metabar/validation.metabar.metazoa.read_mean_perc.png", width = 2300, height = 2000, res = 300)
grid.draw(gt)
dev.off()

In [66]:
# Metazoan read share per sample and origin (Danish / Exotic), written to a
# tab-separated file and displayed. `perc` is summed from metazoamat, where
# it is a 0-1 proportion. The result is returned ungrouped.
val_perc_metazoa <- metazoamat %>%
  select(correct_sample, origin, perc) %>%
  group_by(correct_sample, origin) %>%
  summarise(perc = sum(perc), .groups = "drop")
write.table(val_perc_metazoa, "validation.metabar.metazoa_perc.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
val_perc_metazoa

[1m[22m`summarise()` has grouped output by 'correct_sample'. You can override using the
`.groups` argument.


correct_sample,origin,perc
<fct>,<chr>,<dbl>
EN0.2A,Danish,0.9972116
EN0.2A,Exotic,0.002788352
EN0.2B,Danish,0.9988592
EN0.2B,Exotic,0.001140805
EN0.2C,Danish,0.9991932
EN0.2C,Exotic,0.0008067894
OP0.2A,Danish,0.9971967
OP0.2A,Exotic,0.002803326
OP0.2B,Danish,0.9956435
OP0.2B,Exotic,0.004356528
