In [None]:
library(tidyverse)
library(here)
library(cowplot)

In [None]:
# Input directories for the two call sets, resolved relative to the project
# root. The trailing slash is kept because later cells build file names with
# paste0(<dir>, <file>).
inPathF <- here("analysis", "variant_calling_GWAS_SNPS/")
inPathU <- here("analysis", "variant_calling_GWAS_SNPS_subset100000/")

## Analyzing the overall QUAL of the SNPs for both the filtered and the unfiltered VCF files

In [None]:
# Per-site quality tables (tab-separated; presumably vcftools --site-quality
# output -- confirm). The file's own header row is skipped and replaced by the
# column names chr / pos / qual.
varQualF <- read_delim(
  file = paste0(inPathF, "SNPS_GWAS.lqual"),
  delim = "\t",
  col_names = c("chr", "pos", "qual"),
  skip = 1
)
varQualU <- read_delim(
  file = paste0(inPathU, "SNPS_135-samples_subset100000.lqual"),
  delim = "\t",
  col_names = c("chr", "pos", "qual"),
  skip = 1
)

In [None]:
# Five-number summary plus quartiles/maximum of per-site QUAL,
# filtered call set first, then the unfiltered subset.
summary(varQualF[["qual"]])
quantile(varQualF[["qual"]], probs = c(0.25, 0.5, 0.75, 1))
summary(varQualU[["qual"]])
quantile(varQualU[["qual"]], probs = c(0.25, 0.5, 0.75, 1))

In [None]:
# Number of rows (one per site) in the filtered and unfiltered QUAL tables
nrow(varQualF)
nrow(varQualU)

In [None]:
# Percentage of variants at or below / above a QUAL cut-off, read off the
# empirical cumulative distribution function (ECDF) of each call set.
# Each cut-off is defined once so the "<=" and ">" percentages always refer to
# the same value and sum to 100.

# --- Filtered call set ---
# NOTE(review): the code evaluated the ECDF at 100000 while the old comment
# mentioned 1000; the value used by the code is kept here -- confirm.
qualCutoffF <- 100000
fF <- ecdf(varQualF$qual)
# Sites with QUAL <= qualCutoffF (percent)
fF(qualCutoffF) * 100
# Sites with QUAL > qualCutoffF (percent)
100 - (fF(qualCutoffF) * 100)


# --- Unfiltered subset ---
# The "<=" line previously used 100 while the ">" line used 30; both now use
# the QUAL > 30 filter cut-off named in the comments.
qualCutoffU <- 30
fU <- ecdf(varQualU$qual)
# Sites with QUAL <= qualCutoffU (percent)
fU(qualCutoffU) * 100
# Sites with QUAL > qualCutoffU (percent)
100 - (fU(qualCutoffU) * 100)

In [None]:
# Estimating the number of SNPs retained after filtering for QUAL > 30

In [None]:
# Density of per-site QUAL for the filtered call set.
# The x-bar / x-tilde labels (presumably mean and median) are literal plotmath
# strings, not values computed from varQualF.
# NOTE(review): xlim() sets scale limits, so sites with QUAL outside
# [0, 10000] are dropped before the density is estimated -- confirm intended.
PlotQualF <- ggplot(data = varQualF, mapping = aes(x = qual)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) +
  annotate(
    geom = "text", x = 6250, y = 0.000145, hjust = 0, size = 7,
    label = "bar(x) == '278,941'", parse = TRUE
  ) +
  annotate(
    geom = "text", x = 6250, y = 0.00013, hjust = 0, size = 7,
    label = "tilde(x) == '16,593'", parse = TRUE
  ) +
  xlim(0, 10000) +
  # Hand-written axis labels in "a x 10^b" notation for the small densities
  scale_y_continuous(
    breaks = c(0, 0.00005, 0.0001, 0.00015, 0.0002),
    labels = expression(
      '0',
      paste(5, 'x', 10^-5),
      paste(10^-4),
      paste(1.5, 'x', 10^-4),
      paste(2, 'x', 10^-4)
    )
  ) +
  labs(title = "Filtered", x = "Quality per SNP", y = "Density") +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 25),
    legend.position = "none",
    axis.title.x = element_text(size = 20, margin = margin(t = 20)),
    axis.title.y = element_text(size = 20, margin = margin(r = 20)),
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15)
  )
# Density of per-site QUAL for the unfiltered subset.
# The x-bar / x-tilde labels (presumably mean and median) are literal plotmath
# strings, not values computed from varQualU.
# NOTE(review): xlim() sets scale limits, so sites with QUAL outside [0, 200]
# are dropped before the density is estimated -- confirm intended.
PlotQualU <- ggplot(data = varQualU, mapping = aes(x = qual)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) +
  annotate(
    geom = "text", x = 125, y = 0.1, hjust = 0, size = 7,
    label = "bar(x) == '19,579'", parse = TRUE
  ) +
  annotate(
    geom = "text", x = 125, y = 0.092, hjust = 0, size = 7,
    label = "tilde(x) == '6'", parse = TRUE
  ) +
  xlim(0, 200) +
  labs(title = "Unfiltered", x = "Quality per SNP", y = "Density") +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 25),
    legend.position = "none",
    axis.title.x = element_text(size = 20, margin = margin(t = 20)),
    axis.title.y = element_text(size = 20, margin = margin(r = 20)),
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15)
  )

# Display both QUAL density plots
PlotQualF
PlotQualU

## Analyzing the average read depth of the SNPs for both the filtered and the unfiltered VCF files

In [None]:
# Per-site depth tables (tab-separated; presumably vcftools --site-mean-depth
# output -- confirm). The file's own header row is skipped and replaced by the
# column names chr / pos / mean_depth / var_depth.
varDepthF <- read_delim(
  file = paste0(inPathF, "SNPS_GWAS.ldepth.mean"),
  delim = "\t",
  col_names = c("chr", "pos", "mean_depth", "var_depth"),
  skip = 1
)
varDepthU <- read_delim(
  file = paste0(inPathU, "SNPS_135-samples_subset100000.ldepth.mean"),
  delim = "\t",
  col_names = c("chr", "pos", "mean_depth", "var_depth"),
  skip = 1
)

In [None]:
# Summary statistics and quartiles/maximum of the per-site mean depth
# (missing values are ignored for the quantiles).

# Filtered call set
summary(varDepthF[["mean_depth"]])
quantile(varDepthF[["mean_depth"]], probs = c(0.25, 0.5, 0.75, 1), na.rm = TRUE)

# Unfiltered subset
summary(varDepthU[["mean_depth"]])
quantile(varDepthU[["mean_depth"]], probs = c(0.25, 0.5, 0.75, 1), na.rm = TRUE)

In [None]:
# Percentage of sites at or below / above a mean-depth cut-off, read off the
# empirical cumulative distribution function of the per-site mean depth.
# NOTE(review): the two call sets are evaluated at different cut-offs
# (61 for filtered, 36 for unfiltered) -- confirm this is intended.

# Filtered call set, cut-off 61
fDF <- ecdf(varDepthF[["mean_depth"]])
# Percent of sites with mean depth <= 61
100 * fDF(61)
# Percent of sites with mean depth > 61
100 - (100 * fDF(61))

# Unfiltered subset, cut-off 36
fDU <- ecdf(varDepthU[["mean_depth"]])
# Percent of sites with mean depth <= 36
100 * fDU(36)
# Percent of sites with mean depth > 36
100 - (100 * fDU(36))

In [None]:
# Density of per-site mean read depth for the filtered call set.
# The x-bar / x-tilde labels (presumably mean and median) are literal plotmath
# strings, not values computed from varDepthF.
# NOTE(review): xlim() sets scale limits, so sites with mean depth outside
# [0, 1500] are dropped before the density is estimated -- confirm intended.
PlotDepthF <- ggplot(data = varDepthF, mapping = aes(x = mean_depth)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) +
  annotate(
    geom = "text", x = 875, y = 0.00148, hjust = 0, size = 7,
    label = "bar(x) == '465'", parse = TRUE
  ) +
  annotate(
    geom = "text", x = 875, y = 0.00135, hjust = 0, size = 7,
    label = "tilde(x) == '425'", parse = TRUE
  ) +
  xlim(0, 1500) +
  # Hand-written axis labels in "a x 10^b" notation for the small densities
  scale_y_continuous(
    breaks = c(0, 0.0005, 0.001, 0.0015, 0.002),
    labels = expression(
      '0',
      paste(5, 'x', 10^-4),
      paste(10^-3),
      paste(1.5, 'x', 10^-3),
      paste(2, 'x', 10^-3)
    )
  ) +
  labs(title = "Filtered", x = "Mean depth per SNP", y = "Density") +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 25),
    legend.position = "none",
    axis.title.x = element_text(size = 20, margin = margin(t = 20)),
    axis.title.y = element_text(size = 20, margin = margin(r = 20)),
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15)
  )
# Density of per-site mean read depth for the unfiltered subset.
# The x-bar / x-tilde labels (presumably mean and median) are literal plotmath
# strings, not values computed from varDepthU.
# NOTE(review): xlim() sets scale limits, so sites with mean depth outside
# [0, 100] are dropped before the density is estimated -- confirm intended.
PlotDepthU <- ggplot(data = varDepthU, mapping = aes(x = mean_depth)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) +
  annotate(
    geom = "text", x = 60, y = 0.18, hjust = 0, size = 7,
    label = "bar(x) == '36'", parse = TRUE
  ) +
  annotate(
    geom = "text", x = 60, y = 0.162, hjust = 0, size = 7,
    label = "tilde(x) == '4'", parse = TRUE
  ) +
  xlim(0, 100) +
  labs(title = "Unfiltered", x = "Mean depth per SNP", y = "Density") +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 25),
    legend.position = "none",
    axis.title.x = element_text(size = 20, margin = margin(t = 20)),
    axis.title.y = element_text(size = 20, margin = margin(r = 20)),
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15)
  )

# Display both per-site depth density plots
PlotDepthF
PlotDepthU

## Analyzing the distribution of average read depth among the samples for both the filtered and the unfiltered VCF files

In [None]:
# Per-sample depth tables (tab-separated; presumably vcftools --depth output
# -- confirm). The file's own header row is skipped and replaced by the column
# names ind / nsites / depth.
indDepthF <- read_delim(
  file = paste0(inPathF, "SNPS_GWAS.idepth"),
  delim = "\t",
  col_names = c("ind", "nsites", "depth"),
  skip = 1
)
indDepthU <- read_delim(
  file = paste0(inPathU, "SNPS_135-samples_subset100000.idepth"),
  delim = "\t",
  col_names = c("ind", "nsites", "depth"),
  skip = 1
)

In [None]:
# Summary statistics and quartiles/maximum of the mean depth per individual.

# Filtered call set
summary(indDepthF[["depth"]])
quantile(indDepthF[["depth"]], probs = c(0.25, 0.5, 0.75, 1))

# Unfiltered subset
summary(indDepthU[["depth"]])
quantile(indDepthU[["depth"]], probs = c(0.25, 0.5, 0.75, 1))

In [None]:
# Percentage of samples whose mean read depth is at or below / above 61, read
# off the empirical cumulative distribution function of per-sample depth.

# Filtered call set
fDeF <- ecdf(indDepthF[["depth"]])
# Percent of samples with mean depth <= 61
100 * fDeF(61)
# Percent of samples with mean depth > 61
100 - (100 * fDeF(61))

# Unfiltered subset
fDeU <- ecdf(indDepthU[["depth"]])
# Percent of samples with mean depth <= 61
100 * fDeU(61)
# Percent of samples with mean depth > 61
100 - (100 * fDeU(61))

In [None]:
# Density of mean read depth per sample for the filtered call set.
# The x-bar / x-tilde labels (presumably mean and median) are literal plotmath
# strings, not values computed from indDepthF.
# NOTE(review): xlim() sets scale limits, so samples with depth outside
# [0, 1000] are dropped before the density is estimated -- confirm intended.
PlotDepthSampleF <- ggplot(data = indDepthF, mapping = aes(x = depth)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) +
  annotate(
    geom = "text", x = 670, y = 0.005, hjust = 0, size = 7,
    label = "bar(x) == '465'", parse = TRUE
  ) +
  annotate(
    geom = "text", x = 670, y = 0.0045, hjust = 0, size = 7,
    label = "tilde(x) == '467'", parse = TRUE
  ) +
  xlim(0, 1000) +
  # Hand-written axis labels in "a x 10^b" notation for the small densities
  scale_y_continuous(
    breaks = c(0, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008),
    labels = expression(
      '0',
      paste(10^-3),
      paste(2, 'x', 10^-3),
      paste(3, 'x', 10^-3),
      paste(4, 'x', 10^-3),
      paste(5, 'x', 10^-3),
      paste(6, 'x', 10^-3),
      paste(7, 'x', 10^-3),
      paste(8, 'x', 10^-3)
    )
  ) +
  labs(title = "Filtered", x = "Mean depth per sample", y = "Density") +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 25),
    legend.position = "none",
    axis.title.x = element_text(size = 20, margin = margin(t = 20)),
    axis.title.y = element_text(size = 20, margin = margin(r = 20)),
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15)
  )

# Density of mean read depth per sample for the unfiltered subset.
# The x-bar / x-tilde labels (presumably mean and median) are literal plotmath
# strings, not values computed from indDepthU.
# NOTE(review): xlim() sets scale limits, so samples with depth outside
# [0, 200] are dropped before the density is estimated -- confirm intended.
PlotDepthSampleU <- ggplot(data = indDepthU, mapping = aes(x = depth)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) +
  annotate(
    geom = "text", x = 150, y = 0.05, hjust = 0, size = 7,
    label = "bar(x) == '66'", parse = TRUE
  ) +
  annotate(
    geom = "text", x = 150, y = 0.045, hjust = 0, size = 7,
    label = "tilde(x) == '65'", parse = TRUE
  ) +
  xlim(0, 200) +
  scale_y_continuous(breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06)) +
  labs(title = "Unfiltered", x = "Mean depth per sample", y = "Density") +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 25),
    legend.position = "none",
    axis.title.x = element_text(size = 20, margin = margin(t = 20)),
    axis.title.y = element_text(size = 20, margin = margin(r = 20)),
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15)
  )

# Display both per-sample depth density plots
PlotDepthSampleF
PlotDepthSampleU

In [None]:
# Combine the six QC density plots into a 3 x 2 panel figure (one row per
# metric, unfiltered plot first and filtered plot second in each row) and
# write it to Figures/VCF_QC.png (1400 x 1800 px at 100 ppi).
png(filename = here('Figures', 'VCF_QC.png'), width = 1400, height = 1800, res = 100)
# Draw explicitly: a ggplot/cowplot object is only rendered when printed, and
# top-level auto-printing does not happen when this code is run via source()
# or inside a function, which would leave the PNG empty.
print(
  plot_grid(
    PlotQualU, PlotQualF,
    PlotDepthU, PlotDepthF,
    PlotDepthSampleU, PlotDepthSampleF,
    nrow = 3, ncol = 2,
    labels = "auto", label_size = 20, scale = 0.9
  )
)
# Close the device so the file is flushed to disk
dev.off()

# Analyzing QUAL>30 & MaxDepth filtered data

In [None]:
# Per-site depth table of the QUAL-30 / MaxDepth filtered call set
# (tab-separated; the file's own header row is skipped and replaced by the
# column names chr / pos / mean_depth / var_depth).
inPath <- here("analysis", "variant_calling_GWAS_SNPS/")
varDepth <- read_delim(
  file = paste0(inPath, "SNPS_135-samples_QUAL-30_MaxDepth.ldepth.mean"),
  delim = "\t",
  col_names = c("chr", "pos", "mean_depth", "var_depth"),
  skip = 1
)

In [None]:
# Summary statistics and quartiles/maximum of the per-site mean depth
# (missing values are ignored for the quantiles).
summary(varDepth[["mean_depth"]])
quantile(varDepth[["mean_depth"]], probs = c(0.25, 0.5, 0.75, 1), na.rm = TRUE)

# Percentage of sites at or below / above a minimum mean-depth threshold,
# read off the empirical cumulative distribution function.
fD <- ecdf(varDepth[["mean_depth"]])
# Mean-depth threshold under inspection
threshold <- 10
# Percent of sites with mean depth <= threshold
100 * fD(threshold)
# Percent of sites with mean depth > threshold
100 - (100 * fD(threshold))
# Estimated number of SNPs retained when keeping sites with mean depth above
# the threshold.
# NOTE(review): 210229 is a hard-coded site count, presumably the number of
# SNPs in this call set -- confirm it still matches the input file.
210229 * (1 - fD(threshold))

In [None]:
# Unannotated density plot of the per-site mean depth over its full range
ggplot(data = varDepth, mapping = aes(x = mean_depth)) +
  theme_light() +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)