# Estimating archaic TMRCAs from the real data

In [2]:
library(tidyverse)
library(magrittr)
library(here)
library(furrr)
library(scales)

devtools::load_all(".")

plan(multiprocess)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.2.0     [32m✔[39m [34mpurrr  [39m 0.3.2
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 0.8.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract

here() starts at /mnt/expressions/mp/ychr
Loading required package: future

Attaching package: ‘scales’

The following object is masked from ‘package:purrr’:

    discard

The following objec

In [None]:
# Load the high-coverage genotype VCF, addressed relative to the project root.
# mindp / maxdp are forwarded to the project helper read_vcf(); presumably a
# minimum depth and an upper depth quantile -- TODO confirm in read_vcf().
highcov_gt <- read_vcf(
    here("data/vcf/full_highcov.vcf.gz"),
    mindp = 3,
    maxdp = 0.975
)

In [None]:
# Build the `tafr` table from the high-coverage genotypes; it is handed to
# calculate_tarch() by the TMRCA cells further down.
tafr <- highcov_gt %>% calculate_tafr()

In [None]:
# Expand the nested counts_afr column and average every numeric column
# separately for each value of `afr`.
tafr %>%
    group_by(afr) %>%
    unnest(counts_afr) %>%
    summarise_if(is.numeric, mean)

In [None]:
# Future assignment: evaluate the TMRCA grid for the listed "full" samples
# without blocking the session. The grid is sample x site set ("all" or
# "tv_only") x minimum depth cutoff 1..8; the depth cutoffs are mapped with
# future_map_dfr().
tmrca_full_low %<-% {
    map_dfr(c("den4", "den8", "spy1", "shotgun_mez2", "shotgun_spy1"), function(sample_id) {
        map_dfr(c("all", "tv_only"), function(sites) {
            future_map_dfr(1:8, function(dp) {
                gt <- read_genotypes(sample_id, "full", mindp = dp, maxdp = 0.975,
                                     tv_only = (sites == "tv_only"))
                # record which cutoff and site set produced these rows
                mutate(calculate_tarch(gt, tafr), dp = dp, sites = sites)
            })
        })
    })
}

In [None]:
# Future assignment meant to compute the TMRCA grid (site set "all"/"tv_only"
# x minimum depth cutoff 1..8) for the subsampled "full" data.
# NOTE(review): the first argument of the outer map_dfr() is empty, so no
# sample names are supplied and the call will most likely fail with a
# missing-argument error once the future is evaluated. The vector of sample
# names has to be restored; a later plot filters on "mez2sub" -- confirm
# whether that is the intended name.
tmrca_full_subsampled %<-%
    map_dfr(, function(arch) {
        map_dfr(c("all", "tv_only"), function(sites) {
            future_map_dfr(1:8, function(dp) {
                read_genotypes(arch, "full", mindp = dp, maxdp = 0.975, tv_only = sites == "tv_only") %>%
                    calculate_tarch(tafr) %>%
                    mutate(dp = dp, sites = sites)
            })
        })
    })

In [None]:
# Future assignment: same grid as for the other "full" samples, but for the
# two Mez2 call sets the minimum depth cutoff is scanned from 1 to 20.
tmrca_full_high %<-% {
    map_dfr(c("mez2", "mez2_snpad"), function(sample_id) {
        map_dfr(c("all", "tv_only"), function(sites) {
            future_map_dfr(1:20, function(dp) {
                gt <- read_genotypes(sample_id, "full", mindp = dp, maxdp = 0.975,
                                     tv_only = (sites == "tv_only"))
                # record which cutoff and site set produced these rows
                mutate(calculate_tarch(gt, tafr), dp = dp, sites = sites)
            })
        })
    })
}

In [None]:
# Future assignment: TMRCA grid for elsidron2 read from the "lippold" data set,
# for both site sets and minimum depth cutoffs 1..20.
tmrca_lippold %<-% {
    map_dfr(c("elsidron2"), function(sample_id) {
        map_dfr(c("all", "tv_only"), function(sites) {
            future_map_dfr(1:20, function(dp) {
                gt <- read_genotypes(sample_id, "lippold", mindp = dp, maxdp = 0.975,
                                     tv_only = (sites == "tv_only"))
                # record which cutoff and site set produced these rows
                mutate(calculate_tarch(gt, tafr), dp = dp, sites = sites)
            })
        })
    })
}

In [None]:
# Future assignment: TMRCA grid for elsidron1 read from the "exome" data set,
# for both site sets and minimum depth cutoffs 1..8.
tmrca_exome %<-% {
    map_dfr(c("elsidron1"), function(sample_id) {
        map_dfr(c("all", "tv_only"), function(sites) {
            future_map_dfr(1:8, function(dp) {
                gt <- read_genotypes(sample_id, "exome", mindp = dp, maxdp = 0.975,
                                     tv_only = (sites == "tv_only"))
                # record which cutoff and site set produced these rows
                mutate(calculate_tarch(gt, tafr), dp = dp, sites = sites)
            })
        })
    })
}

In [3]:
# Poll the asynchronous computations started above: futureOf() looks up the
# future behind a variable created with %<-%, and resolved() reports whether
# that future has finished. This only works in the session in which the
# %<-% assignments were made (otherwise futureOf() errors out).
resolved(futureOf(tmrca_full_low))
resolved(futureOf(tmrca_full_subsampled))
resolved(futureOf(tmrca_full_high))
resolved(futureOf(tmrca_lippold))
resolved(futureOf(tmrca_exome))

ERROR: Error: No such future variable: tmrca_full_low


In [None]:
# Tag every result table with two constant columns: `capture` (which data set
# the genotypes were read from) and `tafr` (set to "all" throughout).
tmrca_full_low <- mutate(tmrca_full_low, capture = "full", tafr = "all")
tmrca_full_subsampled <- mutate(tmrca_full_subsampled, capture = "full", tafr = "all")
tmrca_full_high <- mutate(tmrca_full_high, capture = "full", tafr = "all")
tmrca_lippold <- mutate(tmrca_lippold, capture = "lippold", tafr = "all")
tmrca_exome <- mutate(tmrca_exome, capture = "exome", tafr = "all")

In [None]:
# Stack the per-data-set result tables into a single data frame.
# NOTE(review): tmrca_full_subsampled is not part of this list -- confirm that
# leaving it out is intended.
tmrca_df <- list(tmrca_full_low, tmrca_full_high, tmrca_lippold, tmrca_exome) %>%
    bind_rows()

In [None]:
# Cache the combined table on disk so the plotting cells can start from it.
saveRDS(object = tmrca_df, file = here("data/rds/tmrca_df.rds"))

In [None]:
# Reload the cached combined table.
tmrca_df <- readRDS(file = here("data/rds/tmrca_df.rds"))

In [None]:
# Mean of every numeric column for Mez2 ("full" data, transversions only,
# minimum depth 3) against A00, after expanding the nested counts_arch column.
tmrca_df %>%
    filter(
        sites == "tv_only",
        arch == "mez2",
        capture == "full",
        dp == 3,
        afr == "a00"
    ) %>%
    unnest(counts_arch) %>%
    group_by(arch) %>%
    summarise_if(is.numeric, mean)

In [None]:
# Figure size via the project helper set_dim().
set_dim(8, 6)

# TMRCA against the minimum coverage cutoff for two estimators: the stored
# tmrca_arch column and tmrca_arch2 = ((a + d) / d) * tmrca_f. Rows are limited
# to afr == "a00" and to the listed capture/sample combinations; estimates are
# averaged per sample, site set and cutoff before plotting.
tmrca_df %>%
    filter(
        afr == "a00",
        (capture == "full" & arch %in% c("den8", "mez2", "mez2sub")) |
            (capture == "lippold" & arch == "elsidron2") |
            (capture == "exome" & arch == "elsidron1"),
        !is.infinite(alpha),
        !is.nan(alpha)
    ) %>%
    unnest(counts_arch) %>%
    mutate(alpha2 = (a + d) / d, tmrca_arch2 = alpha2 * tmrca_f) %>%
    select(arch, tmrca_arch, tmrca_arch2, sites, dp) %>%
    group_by(arch, sites, dp) %>%
    summarise_all(mean) %>%
    gather(tmrca, value, -c(arch, sites, dp)) %>%
    ggplot(aes(dp, value, color = arch)) +
        # layers (drawing order matters)
        geom_point(aes(as.factor(dp), value), alpha = 1/4) +
        geom_smooth(aes(fill = arch), size = 0.5) +
        geom_hline(yintercept = c(350000, 650000), linetype = 2, size = 1/3) +
        # one panel per estimator (rows) and site set (columns)
        facet_grid(tmrca ~ sites) +
        coord_cartesian(ylim = c(200000, 850000)) +
        labs(x = "Minimum coverage required", y = "TMRCA estimate") +
        ggtitle("TMRCAs calculated using two different formulas") +
        theme_bw() +
        theme(legend.position = "bottom")

In [None]:
# Figure size via the project helper set_dim().
set_dim(8, 6)

# Same comparison of the two TMRCA estimators (tmrca_arch vs
# tmrca_arch2 = ((a + d) / d) * tmrca_f) as a function of the minimum coverage
# cutoff, here for den8 and mez2 ("full"), elsidron2 ("lippold") and
# elsidron1 ("exome"), all relative to afr == "a00".
tmrca_df %>%
    filter(
        afr == "a00",
        (capture == "full" & arch %in% c("den8", "mez2")) |
            (capture == "lippold" & arch == "elsidron2") |
            (capture == "exome" & arch == "elsidron1"),
        !is.infinite(alpha),
        !is.nan(alpha)
    ) %>%
    unnest(counts_arch) %>%
    mutate(alpha2 = (a + d) / d, tmrca_arch2 = alpha2 * tmrca_f) %>%
    select(arch, tmrca_arch, tmrca_arch2, sites, dp) %>%
    group_by(arch, sites, dp) %>%
    summarise_all(mean) %>%
    gather(tmrca, value, -c(arch, sites, dp)) %>%
    ggplot(aes(dp, value, color = arch)) +
        # layers (drawing order matters)
        geom_point(aes(as.factor(dp), value), alpha = 1/4) +
        geom_smooth(aes(fill = arch), size = 0.5) +
        geom_hline(yintercept = c(350000, 650000), linetype = 2, size = 1/3) +
        # one panel per estimator (rows) and site set (columns)
        facet_grid(tmrca ~ sites) +
        coord_cartesian(ylim = c(200000, 850000)) +
        labs(x = "Minimum coverage required", y = "TMRCA estimate") +
        ggtitle("TMRCAs calculated using two different formulas") +
        theme_bw() +
        theme(legend.position = "bottom")

In [None]:
# Figure size via the project helper set_dim().
set_dim(8, 5)

# Individual TMRCA estimates (points) and their per-group means (lines) for
# minimum coverage cutoffs 1..4, relative to afr == "a00". Two estimators are
# shown in separate facet rows: the stored tmrca_arch column and
# tmrca_arch2 = ((a + d) / d) * tmrca_f.
tmrca_df %>%
    filter(afr == "a00", dp <= 4) %>%
    filter(
        (capture == "full" & arch %in% c("den8", "mez2", "den4", "spy1")) |
            (capture == "lippold" & arch == "elsidron2") |
            (capture == "exome" & arch == "elsidron1")
    ) %>%
    unnest(counts_arch) %>%
    mutate(alpha2 = (a + d) / d, tmrca_arch2 = alpha2 * tmrca_f) %>%
    select(arch, tmrca_arch, tmrca_arch2, sites, dp) %>%
    gather(tmrca, value, -c(arch, sites, dp)) %>%
    # drop estimates that could not be computed (division by zero counts etc.)
    filter(!is.na(value), !is.infinite(value)) %>% {
        ggplot(., aes(as.factor(dp), value, color = arch, group = arch)) +
            geom_point(alpha = 1/4) +
            # the line connects the mean estimate at each cutoff
            geom_line(data = group_by(., arch, sites, dp, tmrca) %>% summarise_all(mean), size = 1 / 2) +
            facet_grid(tmrca ~ sites) +
            theme_bw() +
            scale_y_continuous(labels = comma) +
            xlab("Minimum coverage") +
            # the y axis shows TMRCA values, not branch proportions
            ylab("TMRCA estimate")
    }

In [None]:
# Figure size via the project helper set_dim().
set_dim(8, 6)

# Two-estimator TMRCA comparison (tmrca_arch vs
# tmrca_arch2 = ((a + d) / d) * tmrca_f) against the minimum coverage cutoff,
# restricted to den8 and mez2 ("full") plus elsidron2 ("lippold"), with a wider
# visible y range than the plots above.
tmrca_df %>%
    filter(
        afr == "a00",
        (capture == "full" & arch %in% c("den8", "mez2")) |
            (capture == "lippold" & arch == "elsidron2"),
        !is.infinite(alpha),
        !is.nan(alpha)
    ) %>%
    unnest(counts_arch) %>%
    mutate(alpha2 = (a + d) / d, tmrca_arch2 = alpha2 * tmrca_f) %>%
    select(arch, tmrca_arch, tmrca_arch2, sites, dp) %>%
    group_by(arch, sites, dp) %>%
    summarise_all(mean) %>%
    gather(tmrca, value, -c(arch, sites, dp)) %>%
    ggplot(aes(dp, value, color = arch)) +
        # layers (drawing order matters)
        geom_point(aes(as.factor(dp), value), alpha = 1/4) +
        geom_smooth(aes(fill = arch), size = 0.5) +
        geom_hline(yintercept = c(350000, 650000), linetype = 2, size = 1/3) +
        # one panel per estimator (rows) and site set (columns)
        facet_grid(tmrca ~ sites) +
        coord_cartesian(ylim = c(100000, 1000000)) +
        labs(x = "Minimum coverage required", y = "TMRCA estimate") +
        ggtitle("TMRCAs calculated using two different formulas") +
        theme_bw() +
        theme(legend.position = "bottom")

# Why do `d` and `e` counts differ so much depending on coverage?

In [None]:
# Figure size via the project helper set_dim().
set_dim(6, 3)

# Table of mean branch counts (and their sum n_muts) for den8 vs A00 at every
# minimum coverage cutoff, all sites, sorted by cutoff.
tmrca_df %>%
    filter(
        !is.infinite(alpha),
        !is.nan(alpha),
        afr %in% "a00",
        capture == "full",
        arch %in% c("den8"),
        sites == "all"
    ) %>%
    unnest(counts_arch) %>%
    mutate(n_muts = a + b + c + d + e + f) %>%
    select(-starts_with("tmrca"), -mut_rate) %>%
    group_by(arch, afr, dp) %>%
    summarise_if(is.numeric, mean) %>%
    arrange(dp)

^^^ note that the `a` length is getting A LOT shorter as we increase the coverage cutoff

d/e ~1 for DP >= 1, as it should be, because there's no reason for them to be different, really

### The following means that the `a` branch is getting increasingly shorter than `e` the higher the required coverage cutoff is

But note that it's the same even for Mez2! Probably not archaic-caused ref bias?!

In [None]:
# Figure size via the project helper set_dim().
set_dim(8, 4)

# Ratio of mean `e` to mean `a` counts per minimum coverage cutoff, one curve
# per `afr` value, one panel per sample (transversions only).
# NOTE(review): arch == "asd" presumably never matches, which would make the
# "lippold" alternative a no-op -- confirm.
tmrca_df %>%
    filter(
        !is.infinite(alpha),
        !is.nan(alpha),
        sites == "tv_only",
        (capture == "full" & arch %in% c("den8", "mez2")) |
            (capture == "lippold" & arch == "asd")
    ) %>%
    unnest(counts_arch) %>%
    mutate(n_muts = a + b + c + d + e + f) %>%
    # disabled alternative (long format restricted to the d and e branches):
    # select(arch, afr, dp, tmrca_arch, alpha, a, b, c, d, e, f, total, n_muts, sites) %>%
    # gather(branch, count, -c(arch, afr, dp, alpha, tmrca_arch, total, n_muts, sites)) %>%
    # filter(branch %in% c("d", "e")) %>%
    group_by(arch, afr, dp, sites) %>%
    summarise_if(is.numeric, mean) %>%
    mutate(prop = e / a) %>%
    ggplot(aes(as.factor(dp), prop, color = afr, fill = afr, group = afr)) +
        geom_point() +
        geom_smooth(size = 1/3, alpha = 1/2) +
        facet_wrap(~ arch, scales = "free_y") +
        labs(x = "Minimum coverage", y = "Proportion e / a") +
        theme_bw() +
        theme(
            legend.position = "bottom",
            axis.text.x = element_text(hjust = 1, angle = 45)
        )

### ... but not the `d` branch... ?

In [None]:
# Figure size via the project helper set_dim().
set_dim(8, 4)

# Ratio of mean `d` to mean `a` counts for minimum coverage cutoffs below 9,
# one curve per `afr` value, one panel per sample (transversions only).
# NOTE(review): arch == "asd" presumably never matches, which would make the
# "lippold" alternative a no-op -- confirm.
tmrca_df %>%
    filter(
        dp < 9,
        !is.infinite(alpha),
        !is.nan(alpha),
        sites == "tv_only",
        (capture == "full" & arch %in% c("den8", "mez2")) |
            (capture == "lippold" & arch == "asd")
    ) %>%
    unnest(counts_arch) %>%
    mutate(n_muts = a + b + c + d + e + f) %>%
    group_by(arch, afr, dp, sites) %>%
    summarise_if(is.numeric, mean) %>%
    mutate(prop = d / a) %>%
    ggplot(aes(as.factor(dp), prop, color = afr, fill = afr, group = afr)) +
        geom_point() +
        geom_smooth(size = 1/3, alpha = 1/2) +
        facet_wrap(~ arch, scales = "free_y") +
        labs(x = "Minimum coverage", y = "Proportion d / a") +
        theme_bw() +
        theme(
            legend.position = "bottom",
            axis.text.x = element_text(hjust = 1, angle = 45)
        )

### What about `d` vs `e`? This proportion should be ~1.

In [None]:
# Figure size via the project helper set_dim().
set_dim(8, 4)

# Ratio of mean `e` to mean `d` counts per minimum coverage cutoff for den8 and
# mez2 ("full", transversions only); one curve per `afr` value, samples side by
# side on a shared y axis.
tmrca_df %>%
    filter(
        !is.infinite(alpha),
        !is.nan(alpha),
        sites == "tv_only",
        capture == "full" & arch %in% c("den8", "mez2")
    ) %>%
    unnest(counts_arch) %>%
    mutate(n_muts = a + b + c + d + e + f) %>%
    group_by(arch, afr, dp, sites) %>%
    summarise_if(is.numeric, mean) %>%
    mutate(prop = e / d) %>%
    ggplot(aes(as.factor(dp), prop, color = afr, fill = afr, group = afr)) +
        geom_point() +
        geom_smooth(size = 1/3, alpha = 1/4) +
        facet_grid(. ~ arch) +
        labs(x = "Minimum coverage", y = "Proportion e / d") +
        theme_bw() +
        theme(
            legend.position = "bottom",
            axis.text.x = element_text(hjust = 1, angle = 45)
        )

In [None]:
# Figure size via the project helper set_dim().
set_dim(8, 4)

# Ratio of mean `b` to mean `a` counts per minimum coverage cutoff for den8 and
# mez2 ("full", transversions only); one curve per `afr` value, free axes per
# sample panel.
tmrca_df %>%
    filter(
        !is.infinite(alpha),
        !is.nan(alpha),
        sites == "tv_only",
        capture == "full" & arch %in% c("den8", "mez2")
    ) %>%
    unnest(counts_arch) %>%
    mutate(n_muts = a + b + c + d + e + f) %>%
    group_by(arch, afr, dp, sites) %>%
    summarise_if(is.numeric, mean) %>%
    mutate(prop = b / a) %>%
    ggplot(aes(as.factor(dp), prop, color = afr, fill = afr, group = afr)) +
        geom_point() +
        geom_smooth(size = 1/3, alpha = 1/4) +
        facet_wrap(~ arch, scales = "free") +
        labs(x = "Minimum coverage", y = "Proportion b / a") +
        theme_bw() +
        theme(
            legend.position = "bottom",
            axis.text.x = element_text(hjust = 1, angle = 45)
        )

In [None]:
# Figure size via the project helper set_dim().
set_dim(8, 4)

# Ratio of mean `a` to mean `c` counts per minimum coverage cutoff for den8 and
# mez2 ("full", transversions only); one curve per `afr` value, free axes per
# sample panel.
tmrca_df %>%
    filter(
        !is.infinite(alpha),
        !is.nan(alpha),
        sites == "tv_only",
        capture == "full" & arch %in% c("den8", "mez2")
    ) %>%
    unnest(counts_arch) %>%
    mutate(n_muts = a + b + c + d + e + f) %>%
    group_by(arch, afr, dp, sites) %>%
    summarise_if(is.numeric, mean) %>%
    mutate(prop = a / c) %>%
    ggplot(aes(as.factor(dp), prop, color = afr, fill = afr, group = afr)) +
        geom_point() +
        geom_smooth(size = 1/3, alpha = 1/4) +
        facet_wrap(~ arch, scales = "free") +
        labs(x = "Minimum coverage", y = "Proportion a / c") +
        theme_bw() +
        theme(
            legend.position = "bottom",
            axis.text.x = element_text(hjust = 1, angle = 45)
        )

In [None]:
# Branch counts for den4 ("full", all sites, minimum depth 3) with the c / b
# ratio appended as an extra column.
tmrca_df %>%
    filter(arch == "den4", capture == "full", dp == 3, sites == "all") %>%
    unnest(counts_arch) %>%
    select(-starts_with("tmrca"), -counts_afr) %>%
    mutate(c / b)

In [None]:
# Branch counts for den8 ("full", all sites, minimum depth 3) with the c / b
# ratio appended as an extra column.
tmrca_df %>%
    filter(arch == "den8", capture == "full", dp == 3, sites == "all") %>%
    unnest(counts_arch) %>%
    select(-starts_with("tmrca"), -counts_afr) %>%
    mutate(c / b)

In [None]:
# Branch counts for spy1 ("full", all sites, minimum depth 3) with the c / b
# ratio appended as an extra column.
tmrca_df %>%
    filter(arch == "spy1", capture == "full", dp == 3, sites == "all") %>%
    unnest(counts_arch) %>%
    select(-starts_with("tmrca"), -counts_afr) %>%
    mutate(c / b)

In [None]:
# Branch counts for mez2 ("full", all sites, minimum depth 3) with the c / b
# ratio appended as an extra column.
tmrca_df %>%
    filter(arch == "mez2", capture == "full", dp == 3, sites == "all") %>%
    unnest(counts_arch) %>%
    select(-starts_with("tmrca"), -counts_afr) %>%
    mutate(c / b)

### A00 - both `a` and `d` relatively stable across coverage, but `e` is increasing

In [None]:
# Figure size via the project helper set_dim().
set_dim(13, 7)

# Share of mutations on each branch (a..f, as a fraction of their sum) against
# the minimum coverage cutoff, relative to A00. Points are individual rows,
# lines are per-cutoff means; branches other than a, d and f get a different
# line type. Each sample is limited to its own maximum cutoff (on top of the
# global dp <= 8).
tmrca_df %>%
    filter(!is.infinite(alpha), !is.nan(alpha)) %>%
    filter(afr %in% "a00", dp <= 8) %>%
    filter(
        (capture == "full" & arch %in% c("den8", "mez2", "spy1", "den4")) |
            arch == "elsidron1"
    ) %>%
    filter(
        (dp <= 4 & arch == "den4") |
            (dp <= 10 & arch == "den8") |
            (dp <= 29 & arch == "mez2") |
            (dp <= 4 & arch == "spy1") |
            (dp <= 5 & arch == "elsidron1")
    ) %>%
    unnest(counts_arch) %>%
    mutate(n_muts = a + b + c + d + e + f) %>%
    select(arch, afr, dp, tmrca_arch, alpha, a, b, c, d, e, f, total, n_muts, sites) %>%
    # long format: one row per branch
    gather(branch, count, -c(arch, afr, dp, alpha, tmrca_arch, total, n_muts, sites)) %>%
    mutate(prop = count / n_muts) %>% {
        ggplot(., aes(as.factor(dp), prop, color = branch, fill = branch, group = branch)) +
            geom_point() +
            geom_line(
                mapping = aes(linetype = !branch %in% c("a", "d", "f")),
                data = group_by(., arch, afr, dp, sites, branch) %>% summarise_if(is.numeric, mean),
                size = 1 / 2
            ) +
            # marks the third position on the discrete coverage axis
            geom_vline(xintercept = 3) +
            facet_wrap(sites ~ arch, scales = "free", ncol = 5) +
            labs(x = "Minimum coverage", y = "Proportion of mutations falling on a branch") +
            theme_bw() +
            theme(legend.position = "bottom")
    }

In [None]:
# Figure size via the project helper set_dim().
set_dim(13, 7)

# Share of mutations on each branch (a..f, as a fraction of their sum) against
# the minimum coverage cutoff for the four "full" samples, relative to A00.
# Points are individual rows, lines are per-cutoff means; branches other than
# a, d and f get a different line type. Each sample has its own maximum cutoff.
tmrca_df %>%
    filter(!is.infinite(alpha), !is.nan(alpha)) %>%
    filter(afr %in% "a00") %>%
    filter(capture == "full", arch %in% c("den8", "mez2", "spy1", "den4")) %>%
    filter(
        (dp <= 4 & arch == "den4") |
            (dp <= 10 & arch == "den8") |
            (dp <= 29 & arch == "mez2") |
            (dp <= 4 & arch == "spy1")
    ) %>%
    unnest(counts_arch) %>%
    mutate(n_muts = a + b + c + d + e + f) %>%
    select(arch, afr, dp, tmrca_arch, alpha, a, b, c, d, e, f, total, n_muts, sites) %>%
    # long format: one row per branch
    gather(branch, count, -c(arch, afr, dp, alpha, tmrca_arch, total, n_muts, sites)) %>%
    mutate(prop = count / n_muts) %>% {
        ggplot(., aes(as.factor(dp), prop, color = branch, fill = branch, group = branch)) +
            geom_point() +
            geom_line(
                mapping = aes(linetype = !branch %in% c("a", "d", "f")),
                data = group_by(., arch, afr, dp, sites, branch) %>% summarise_if(is.numeric, mean),
                size = 1 / 2
            ) +
            # marks the third position on the discrete coverage axis
            geom_vline(xintercept = 3) +
            facet_wrap(sites ~ arch, scales = "free", ncol = 4) +
            labs(x = "Minimum coverage", y = "Proportion of mutations falling on a branch") +
            theme_bw() +
            theme(legend.position = "bottom")
    }

^^^ `a` and `f` appear to level off after `dp >= 3`, which seems OK???

but `e` is increasing steadily - this is super weird and could be driving `a / (a + d + e)` higher at higher cutoffs?
* is the increased proportion of `e` branch the reference bias signal?


In [None]:
# Figure size via the project helper set_dim().
set_dim(8, 5)

# Compare three versions of the scaling factor alpha for mez2 and den8 vs A00
# at minimum coverage cutoffs below 9: the stored value (relabelled
# alpha_mendez), (a + d) / d and (a + e) / e.
tmrca_df %>%
    filter(
        dp < 9,
        capture == "full",
        afr == "a00",
        arch %in% c("mez2", "den8")
    ) %>%
    unnest(counts_arch) %>%
    select(arch, dp, tmrca_arch, alpha_mendez = alpha, sites, tmrca_f, a, b, c, d, e, f) %>%
    mutate(alpha_ad = (a + d) / d, alpha_ae = (a + e) / e) %>%
    # keep only the identifying columns and the three alpha variants
    select(arch, dp, sites, starts_with("alpha")) %>%
    gather(stat, value, -arch, -dp, -sites) %>%
    ggplot(aes(as.factor(dp), value, color = stat)) +
        geom_point() +
        geom_smooth(aes(group = stat)) +
        facet_wrap(sites ~ arch, scales = "free") +
        ylab("archaic->A00 TMRCA scaling factor 'alpha'") +
        theme_bw()

points here:

1. 3X coverage seems to remove most of the error effects nicely - best argument for using this cutoff in the middle panel.
2. TV-only removes all errors (TMRCA doesn't change) => sequencing errors minimal beyond aDNA damage.
3. Minimal effect of reference bias with increasing coverage??? Based on the Neanderthal line in the center panel, but significant bias in Denisova?

TMRCAs of A00 are overlapping the ones of other Africans, especially striking in the Lippold captures, but exome data looks weird too - too much noise in the data or not enough sequence to accumulate enough informative sites?

In [None]:
library(scales)

colors <- c("red", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# A00 TMRCA values (tmrca_afr), taken from the den8 / "full" / all-sites rows
# at minimum depth 3. Stored under its own name so that the global `tafr`
# (the table that the calculate_tarch(tafr) cells rely on) is not overwritten
# when this cell is run.
tafr_a00 <- tmrca_df %>%
    filter(dp == 3, afr == "a00", arch == "den8", capture == "full", sites == "all") %>%
    select(name = afr, tmrca = tmrca_afr)

# Archaic TMRCA values relative to A00: transversions at minimum depth 2 for
# the "full" and "lippold" samples, all sites at minimum depth 1 and 3 for the
# exome sample (the two depths become separate entries).
tarch <- tmrca_df %>%
    filter(afr == "a00") %>%
    filter(
        (dp == 2 & sites == "tv_only" & capture == "full" &
             arch %in% c("den4", "den8", "mez2", "spy1", "shotgun_spy1", "shotgun_mez2")) |
            (dp == 2 & sites == "tv_only" & capture == "lippold" & arch == "elsidron2") |
            (dp %in% c(1, 3) & sites == "all" & capture == "exome" & arch == "elsidron1")
    ) %>%
    mutate(arch = case_when(arch == "elsidron1" & dp == 1 ~ "elsidron_dp1",
                            arch == "elsidron1" & dp == 3 ~ "elsidron_dp3",
                            TRUE ~ arch)) %>%
    select(name = arch, tmrca = tmrca_arch, tmrca_arch2)

# Combined plotting table: `set` groups the samples for colouring and `name`
# holds the axis labels. The A00 rows carry no tmrca_arch2 value, so
# bind_rows() fills that column with NA for them.
tmrca <- bind_rows(tafr_a00, tarch) %>%
    mutate(set = case_when(name == "a00" ~ "A00",
                           name %in% c("den4", "den8") ~ "Denisovan",
                           name %in% c("spy1", "mez2", "elsidron2", "shotgun_spy1", "shotgun_mez2") ~ "Neanderthal",
                           TRUE ~ "other")) %>%
    mutate(name = case_when(name == "den4" ~ "Denisova 4 (1.6X)",
                            name == "den8" ~ "Denisova 8 (3.6X)",
                            name == "elsidron2" ~ "El Sidron (8X)",
                            name == "mez2" ~ "Mezmaiskaya 2 (15X)",
                            name == "spy1" ~ "Spy 1 (0.9X)",
                            name == "shotgun_spy1" ~ "Spy1 shot",
                            name == "shotgun_mez2" ~ "Mez2 shot",
                            name == "elsidron_dp1" ~ "El Sidron (3X, Mendez et al.)",
                            name == "elsidron_dp3" ~ "El Sidron (3X, Mendez et al., filtered)",
                            name == "a00" ~ "A00 lineage (21X)")) %>%
    # move the El Sidron and A00 entries to the end of the axis, in this order
    mutate(name = fct_relevel(name, "El Sidron (8X)", "El Sidron (3X, Mendez et al.)",
                              "El Sidron (3X, Mendez et al., filtered)", "A00 lineage (21X)", after = Inf))

In [None]:
# Figure size via the project helper set_dim().
set_dim(5, 4)

# Box plot with jittered points of the `tmrca` column per sample label,
# coloured by sample set; horizontal reference lines at 350,000 and 630,000.
tmrca %>%
    ggplot(aes(name, tmrca, color = set)) +
        geom_boxplot() +
        geom_jitter() +
        geom_hline(yintercept = c(630000, 350000)) +
        # make sure the axis reaches down to 130,000
        expand_limits(y = 130000) +
        scale_y_continuous(labels = comma) +
        guides(color = guide_legend("archaic human")) +
        labs(x = "", y = "Y chromosome TMRCA [years ago]") +
        theme_classic() +
        theme(
            legend.position = "none",
            axis.text.x = element_text(hjust = 1, angle = 30, size = 12),
            axis.title.x = element_blank()
        )

In [None]:
# Figure size via the project helper set_dim().
set_dim(5, 4)

# Same box plot as for `tmrca`, but showing the tmrca_arch2 column (the
# alternative estimate) per sample label.
tmrca %>%
    ggplot(aes(name, tmrca_arch2, color = set)) +
        geom_boxplot() +
        geom_jitter() +
        geom_hline(yintercept = c(630000, 350000)) +
        # make sure the axis reaches down to 130,000
        expand_limits(y = 130000) +
        scale_y_continuous(labels = comma) +
        guides(color = guide_legend("archaic human")) +
        labs(x = "", y = "Y chromosome TMRCA [years ago]") +
        theme_classic() +
        theme(
            legend.position = "none",
            axis.text.x = element_text(hjust = 1, angle = 30, size = 12),
            axis.title.x = element_blank()
        )

In [None]:
# Future assignment: read the shotgun Mez2 VCF at minimum depth 3. The path is
# resolved from the project root with here() (as for the high-coverage VCF), so
# the cell does not depend on the current working directory.
x %<-% read_vcf(here("data/vcf/full_shotgun_mez2.vcf.gz"), mindp = 3, maxdp = 1)

In [None]:
# Future assignment: read the Mez2 VCF at minimum depth 3. The path is resolved
# from the project root with here() (as for the high-coverage VCF), so the cell
# does not depend on the current working directory.
y %<-% read_vcf(here("data/vcf/full_mez2.vcf.gz"), mindp = 3, maxdp = 1)

In [None]:
# Peek at the first rows of the shotgun Mez2 genotypes.
x %>% head()

In [None]:
# Full join of the two genotype tables; with no `by` given, dplyr joins on all
# columns the two tables have in common.
df <- full_join(x, y)

In [None]:
# Peek at the first rows of the joined table.
df %>% head()

In [None]:
# Rows where the shotgun_mez2 and mez2 columns disagree. Rows in which either
# value is NA are dropped, because filter() only keeps rows whose condition is
# TRUE.
df %>% filter(shotgun_mez2 != mez2)