In [1]:
library(tidyverse)
library(here)
library(glue)
library(furrr)
suppressPackageStartupMessages(library(rtracklayer))
devtools::load_all(".")

# Run the future_map_*() calls below on 20 parallel workers.
# `multiprocess` is deprecated in the future package; `multicore` is the
# forked backend it resolved to on Linux/macOS outside of RStudio.
# NOTE(review): forked workers are presumably also what makes the functions
# loaded through devtools::load_all() visible in the workers -- confirm
# before switching to `multisession`.
plan(multicore, workers = 20)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──

[32m✔[39m [34mggplot2[39m 3.3.0     [32m✔[39m [34mpurrr  [39m 0.3.3
[32m✔[39m [34mtibble [39m 3.0.0     [32m✔[39m [34mdplyr  [39m 0.8.5
[32m✔[39m [34mtidyr  [39m 1.0.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

here() starts at /mnt/expressions/mp/archaic-ychr


Attaching package: ‘glue’


The following object is masked from ‘package:dplyr’:

    collapse


Loading required package: future

Loading ychr



# Full 6.9 Mb capture

In [2]:
# Sample names for the full capture, derived from the `full_<name>.vcf.gz`
# files found in data/vcf. Files whose name contains "mez2_dp", "modern",
# "merged" or "deam" are skipped. The dots in the patterns are escaped and the
# prefix is anchored so that only file names of exactly this form are picked
# up: the extracted name is later pasted back into `full_{ind}.vcf.gz`, so a
# looser match would yield names that point at non-existent files.
samples_full <- list.files(here("data/vcf"), "^full_.*\\.vcf\\.gz$") %>%
    str_subset("mez2_dp|modern|merged|deam", negate = TRUE) %>%
    str_replace("^full_(.*)\\.vcf\\.gz$", "\\1")

In [3]:
# Display the sample names collected for the full capture.
samples_full

In [4]:
# Path to the BED file of the full capture; used both to load the regions
# and as the `bed_filter` argument of read_vcf().
bed_full <- here("data/coord/capture_full.bed")

In [5]:
# Load the full-capture regions from the BED file.
capture_full <- bed_full %>% import.bed()

In [6]:
# Total width (summed over all regions) of the full capture.
capture_full %>% width() %>% sum()

In [7]:
# Per-sample site counts in the full capture with mindp = 0 and maxdp = 1
# (labelled "unfiltered" in the summary table further down).
# For every sample the VCF is read twice -- once as is and once with
# `nodmg = TRUE` -- and the non-NA entries in the sample's column are counted.
# The `*_prop` columns are those counts divided by the total capture width.
# Returns one row per individual.

# Total capture width is the same for every sample, so compute it once
# instead of once per worker call.
total_full_bp <- sum(width(capture_full))

counts_full <- future_map_dfr(samples_full, function(ind) {
    vcf_path <- here(glue("data/vcf/full_{ind}.vcf.gz"))

    gt_all <- read_vcf(vcf_path, mindp = 0, maxdp = 1, bed_filter = bed_full)
    gt_nodmg <- read_vcf(
        vcf_path,
        mindp = 0, maxdp = 1, nodmg = TRUE, bed_filter = bed_full
    )

    tibble(
        individual = ind,
        total = total_full_bp,
        all = sum(!is.na(gt_all[[ind]])),
        all_prop = all / total,
        nodmg = sum(!is.na(gt_nodmg[[ind]])),
        nodmg_prop = nodmg / total
    )
})

In [8]:
# Per-sample site counts in the full capture with mindp = 3 and maxdp = 1
# (labelled "min DP >= 3" in the summary table further down).
# For every sample the VCF is read twice -- once as is and once with
# `nodmg = TRUE` -- and the non-NA entries in the sample's column are counted.
# The `*_prop` columns are those counts divided by the total capture width.
# Returns one row per individual.

# Total capture width is the same for every sample, so compute it once
# instead of once per worker call.
total_full_bp <- sum(width(capture_full))

mindp_counts_full <- future_map_dfr(samples_full, function(ind) {
    vcf_path <- here(glue("data/vcf/full_{ind}.vcf.gz"))

    gt_all <- read_vcf(vcf_path, mindp = 3, maxdp = 1, bed_filter = bed_full)
    gt_nodmg <- read_vcf(
        vcf_path,
        mindp = 3, maxdp = 1, nodmg = TRUE, bed_filter = bed_full
    )

    tibble(
        individual = ind,
        total = total_full_bp,
        all = sum(!is.na(gt_all[[ind]])),
        all_prop = all / total,
        nodmg = sum(!is.na(gt_nodmg[[ind]])),
        nodmg_prop = nodmg / total
    )
})

In [9]:
# Per-sample site counts in the full capture with mindp = 3 and maxdp = 0.98
# (labelled "min DP >= 3 and max DP <= 98% quantile" in the summary table
# further down).
# For every sample the VCF is read twice -- once as is and once with
# `nodmg = TRUE` -- and the non-NA entries in the sample's column are counted.
# The `*_prop` columns are those counts divided by the total capture width.
# Returns one row per individual.

# Total capture width is the same for every sample, so compute it once
# instead of once per worker call.
total_full_bp <- sum(width(capture_full))

maxdp_mindp_counts_full <- future_map_dfr(samples_full, function(ind) {
    vcf_path <- here(glue("data/vcf/full_{ind}.vcf.gz"))

    gt_all <- read_vcf(vcf_path, mindp = 3, maxdp = 0.98, bed_filter = bed_full)
    gt_nodmg <- read_vcf(
        vcf_path,
        mindp = 3, maxdp = 0.98, nodmg = TRUE, bed_filter = bed_full
    )

    tibble(
        individual = ind,
        total = total_full_bp,
        all = sum(!is.na(gt_all[[ind]])),
        all_prop = all / total,
        nodmg = sum(!is.na(gt_nodmg[[ind]])),
        nodmg_prop = nodmg / total
    )
})

In [10]:
# Keep only individuals whose name starts with Spy/Den/Mez (first letter in
# either case) or with "shotgun".
counts_full_archaic_df <- filter(
    counts_full,
    str_detect(individual, "^([Ss]py|[Dd]en|[Mm]ez|shotgun)")
)

In [11]:
# Keep only individuals whose name starts with Spy/Den/Mez (first letter in
# either case) or with "shotgun".
mindp_counts_full_archaic_df <- filter(
    mindp_counts_full,
    str_detect(individual, "^([Ss]py|[Dd]en|[Mm]ez|shotgun)")
)

In [12]:
# Keep only individuals whose name starts with Spy/Den/Mez (first letter in
# either case) or with "shotgun".
maxdp_mindp_counts_full_archaic_df <- filter(
    maxdp_mindp_counts_full,
    str_detect(individual, "^([Ss]py|[Dd]en|[Mm]ez|shotgun)")
)

In [13]:
# Display table for the unfiltered full-capture counts:
#   - rows sorted by the number of counted sites (`all`),
#   - `*_prop` columns rendered as percentages with one decimal,
#   - sample ID replaced by the display name returned by fix_name(),
#   - counts rendered with "," as thousands separator,
#   - `filter` column labelling which filtering the rows belong to.
# mutate_at() is kept (rather than across()) because the session above runs
# dplyr 0.8.5.
full_unfiltered <- counts_full_archaic_df %>%
    arrange(all) %>%
    mutate_at(vars(contains("prop")), ~ sprintf("%.1f%%", 100 * .)) %>%
    mutate(name = fix_name(individual)) %>%
    select(name, everything(), -individual, -total) %>%
    mutate(
        all = format(all, big.mark = ",", scientific = FALSE),
        nodmg = format(nodmg, big.mark = ",", scientific = FALSE),
        filter = "unfiltered"
    )

In [14]:
# Display table for the full-capture counts obtained with mindp = 3:
#   - rows sorted by the number of counted sites (`all`),
#   - `*_prop` columns rendered as percentages with one decimal,
#   - sample ID replaced by the display name returned by fix_name(),
#   - counts rendered with "," as thousands separator,
#   - `filter` column labelling which filtering the rows belong to.
# mutate_at() is kept (rather than across()) because the session above runs
# dplyr 0.8.5.
full_mindp <- mindp_counts_full_archaic_df %>%
    arrange(all) %>%
    mutate_at(vars(contains("prop")), ~ sprintf("%.1f%%", 100 * .)) %>%
    mutate(name = fix_name(individual)) %>%
    select(name, everything(), -individual, -total) %>%
    mutate(
        all = format(all, big.mark = ",", scientific = FALSE),
        nodmg = format(nodmg, big.mark = ",", scientific = FALSE),
        filter = "min DP >= 3"
    )

In [15]:
# Display table for the full-capture counts obtained with mindp = 3 and
# maxdp = 0.98:
#   - rows sorted by the number of counted sites (`all`),
#   - `*_prop` columns rendered as percentages with one decimal,
#   - sample ID replaced by the display name returned by fix_name(),
#   - counts rendered with "," as thousands separator,
#   - `filter` column labelling which filtering the rows belong to.
# mutate_at() is kept (rather than across()) because the session above runs
# dplyr 0.8.5.
full_maxdp <- maxdp_mindp_counts_full_archaic_df %>%
    arrange(all) %>%
    mutate_at(vars(contains("prop")), ~ sprintf("%.1f%%", 100 * .)) %>%
    mutate(name = fix_name(individual)) %>%
    select(name, everything(), -individual, -total) %>%
    mutate(
        all = format(all, big.mark = ",", scientific = FALSE),
        nodmg = format(nodmg, big.mark = ",", scientific = FALSE),
        filter = "min DP >= 3 and max DP <= 98% quantile"
    )

In [16]:
# Stack the three full-capture tables (one per filtering setting).
list(full_unfiltered, full_mindp, full_maxdp) %>%
    bind_rows()

name,all,all_prop,nodmg,nodmg_prop,filter
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Spy 94a (shotgun),2664787,38.5%,2636405,38.1%,unfiltered
Spy 94a,3502380,50.7%,3470671,50.2%,unfiltered
Spy 94a (snpAD),3556151,51.4%,3522222,51.0%,unfiltered
Mezmaiskaya 2 (shotgun),3822106,55.3%,3762768,54.4%,unfiltered
Denisova 4,4731302,68.4%,4705860,68.1%,unfiltered
Denisova 4 (snpAD),4823855,69.8%,4797798,69.4%,unfiltered
Denisova 8,5851332,84.6%,5828356,84.3%,unfiltered
Denisova 8 (snpAD),6259573,90.6%,6235802,90.2%,unfiltered
Mezmaiskaya 2,6348948,91.8%,6346684,91.8%,unfiltered
Mezmaiskaya 2 (snpAD),6669431,96.5%,6667075,96.4%,unfiltered


# 560 kb capture of regions from Lippold et al.

In [17]:
# Sample names for the 560 kb capture, derived from the
# `lippold_<name>.vcf.gz` files found in data/vcf. Files whose name contains
# "mez2_dp", "modern", "merged" or "elsidron2_dp" are skipped. The dots in the
# patterns are escaped and the prefix is anchored so that only file names of
# exactly this form are picked up: the extracted name is later pasted back
# into `lippold_{ind}.vcf.gz`.
samples_lippold <- list.files(here("data/vcf"), "^lippold_.*\\.vcf\\.gz$") %>%
    str_subset("mez2_dp|modern|merged|elsidron2_dp", negate = TRUE) %>%
    str_replace("^lippold_(.*)\\.vcf\\.gz$", "\\1")

In [18]:
# Overrides the sample list derived from the VCF file names: only
# "elsidron2" is processed for this capture.
samples_lippold <- "elsidron2"

In [19]:
# Path to the BED file of the 560 kb capture; used both to load the regions
# and as the `bed_filter` argument of read_vcf().
bed_lippold <- here("data/coord/capture_lippold.bed")

In [20]:
# Load the 560 kb capture regions from the BED file.
capture_lippold <- bed_lippold %>% import.bed()

In [21]:
# Total width (summed over all regions) of the 560 kb capture.
capture_lippold %>% width() %>% sum()

In [22]:
# Per-sample site counts in the 560 kb capture with mindp = 0 and maxdp = 1
# (labelled "unfiltered" in the summary table further down).
# For every sample the VCF is read twice -- once as is and once with
# `nodmg = TRUE` -- and the non-NA entries in the sample's column are counted.
# The `*_prop` columns are those counts divided by the total capture width.
# Returns one row per individual.

# Total capture width is the same for every sample, so compute it once
# instead of once per worker call.
total_lippold_bp <- sum(width(capture_lippold))

counts_lippold <- future_map_dfr(samples_lippold, function(ind) {
    vcf_path <- here(glue("data/vcf/lippold_{ind}.vcf.gz"))

    gt_all <- read_vcf(vcf_path, mindp = 0, maxdp = 1, bed_filter = bed_lippold)
    gt_nodmg <- read_vcf(
        vcf_path,
        mindp = 0, maxdp = 1, nodmg = TRUE, bed_filter = bed_lippold
    )

    tibble(
        individual = ind,
        total = total_lippold_bp,
        all = sum(!is.na(gt_all[[ind]])),
        all_prop = all / total,
        nodmg = sum(!is.na(gt_nodmg[[ind]])),
        nodmg_prop = nodmg / total
    )
})

In [23]:
# Per-sample site counts in the 560 kb capture with mindp = 3 and maxdp = 1
# (labelled "min DP >= 3" in the summary table further down).
# For every sample the VCF is read twice -- once as is and once with
# `nodmg = TRUE` -- and the non-NA entries in the sample's column are counted.
# The `*_prop` columns are those counts divided by the total capture width.
# Returns one row per individual.

# Total capture width is the same for every sample, so compute it once
# instead of once per worker call.
total_lippold_bp <- sum(width(capture_lippold))

mindp_counts_lippold <- future_map_dfr(samples_lippold, function(ind) {
    vcf_path <- here(glue("data/vcf/lippold_{ind}.vcf.gz"))

    gt_all <- read_vcf(vcf_path, mindp = 3, maxdp = 1, bed_filter = bed_lippold)
    gt_nodmg <- read_vcf(
        vcf_path,
        mindp = 3, maxdp = 1, nodmg = TRUE, bed_filter = bed_lippold
    )

    tibble(
        individual = ind,
        total = total_lippold_bp,
        all = sum(!is.na(gt_all[[ind]])),
        all_prop = all / total,
        nodmg = sum(!is.na(gt_nodmg[[ind]])),
        nodmg_prop = nodmg / total
    )
})

In [24]:
# Per-sample site counts in the 560 kb capture with mindp = 3 and
# maxdp = 0.98 (labelled "min DP >= 3 and max DP <= 98% quantile" in the
# summary table further down).
# For every sample the VCF is read twice -- once as is and once with
# `nodmg = TRUE` -- and the non-NA entries in the sample's column are counted.
# The `*_prop` columns are those counts divided by the total capture width.
# Returns one row per individual.

# Total capture width is the same for every sample, so compute it once
# instead of once per worker call.
total_lippold_bp <- sum(width(capture_lippold))

maxdp_mindp_counts_lippold <- future_map_dfr(samples_lippold, function(ind) {
    vcf_path <- here(glue("data/vcf/lippold_{ind}.vcf.gz"))

    gt_all <- read_vcf(vcf_path, mindp = 3, maxdp = 0.98, bed_filter = bed_lippold)
    gt_nodmg <- read_vcf(
        vcf_path,
        mindp = 3, maxdp = 0.98, nodmg = TRUE, bed_filter = bed_lippold
    )

    tibble(
        individual = ind,
        total = total_lippold_bp,
        all = sum(!is.na(gt_all[[ind]])),
        all_prop = all / total,
        nodmg = sum(!is.na(gt_nodmg[[ind]])),
        nodmg_prop = nodmg / total
    )
})

In [25]:
# Keep only individuals whose name starts with Spy/Den/Mez (first letter in
# either case), "shotgun" or "elsidron".
counts_lippold_archaic_df <- filter(
    counts_lippold,
    str_detect(individual, "^([Ss]py|[Dd]en|[Mm]ez|shotgun|elsidron)")
)

In [26]:
# Keep only individuals whose name starts with Spy/Den/Mez (first letter in
# either case), "shotgun" or "elsidron".
mindp_counts_lippold_archaic_df <- filter(
    mindp_counts_lippold,
    str_detect(individual, "^([Ss]py|[Dd]en|[Mm]ez|shotgun|elsidron)")
)

In [27]:
# Keep only individuals whose name starts with Spy/Den/Mez (first letter in
# either case), "shotgun" or "elsidron".
maxdp_mindp_counts_lippold_archaic_df <- filter(
    maxdp_mindp_counts_lippold,
    str_detect(individual, "^([Ss]py|[Dd]en|[Mm]ez|shotgun|elsidron)")
)

In [28]:
# Display table for the unfiltered 560 kb capture counts:
#   - rows sorted by the number of counted sites (`all`),
#   - `*_prop` columns rendered as percentages with one decimal,
#   - sample ID replaced by the display name returned by fix_name(),
#   - counts rendered with "," as thousands separator,
#   - `filter` column labelling which filtering the rows belong to.
# mutate_at() is kept (rather than across()) because the session above runs
# dplyr 0.8.5.
lippold_unfiltered <- counts_lippold_archaic_df %>%
    arrange(all) %>%
    mutate_at(vars(contains("prop")), ~ sprintf("%.1f%%", 100 * .)) %>%
    mutate(name = fix_name(individual)) %>%
    select(name, everything(), -individual, -total) %>%
    mutate(
        all = format(all, big.mark = ",", scientific = FALSE),
        nodmg = format(nodmg, big.mark = ",", scientific = FALSE),
        filter = "unfiltered"
    )

In [29]:
# Display table for the 560 kb capture counts obtained with mindp = 3:
#   - rows sorted by the number of counted sites (`all`),
#   - `*_prop` columns rendered as percentages with one decimal,
#   - sample ID replaced by the display name returned by fix_name(),
#   - counts rendered with "," as thousands separator,
#   - `filter` column labelling which filtering the rows belong to.
# mutate_at() is kept (rather than across()) because the session above runs
# dplyr 0.8.5.
lippold_mindp <- mindp_counts_lippold_archaic_df %>%
    arrange(all) %>%
    mutate_at(vars(contains("prop")), ~ sprintf("%.1f%%", 100 * .)) %>%
    mutate(name = fix_name(individual)) %>%
    select(name, everything(), -individual, -total) %>%
    mutate(
        all = format(all, big.mark = ",", scientific = FALSE),
        nodmg = format(nodmg, big.mark = ",", scientific = FALSE),
        filter = "min DP >= 3"
    )

In [30]:
# Display table for the 560 kb capture counts obtained with mindp = 3 and
# maxdp = 0.98:
#   - rows sorted by the number of counted sites (`all`),
#   - `*_prop` columns rendered as percentages with one decimal,
#   - sample ID replaced by the display name returned by fix_name(),
#   - counts rendered with "," as thousands separator,
#   - `filter` column labelling which filtering the rows belong to.
# mutate_at() is kept (rather than across()) because the session above runs
# dplyr 0.8.5.
lippold_maxdp <- maxdp_mindp_counts_lippold_archaic_df %>%
    arrange(all) %>%
    mutate_at(vars(contains("prop")), ~ sprintf("%.1f%%", 100 * .)) %>%
    mutate(name = fix_name(individual)) %>%
    select(name, everything(), -individual, -total) %>%
    mutate(
        all = format(all, big.mark = ",", scientific = FALSE),
        nodmg = format(nodmg, big.mark = ",", scientific = FALSE),
        filter = "min DP >= 3 and max DP <= 98% quantile"
    )

In [31]:
# Stack the three 560 kb capture tables (one per filtering setting).
list(lippold_unfiltered, lippold_mindp, lippold_maxdp) %>%
    bind_rows()

name,all,all_prop,nodmg,nodmg_prop,filter
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
El Sidrón 1253 (560 kb),530172,95.3%,530046,95.3%,unfiltered
El Sidrón 1253 (560 kb),477397,85.8%,477293,85.8%,min DP >= 3
El Sidrón 1253 (560 kb),467563,84.1%,467461,84.0%,min DP >= 3 and max DP <= 98% quantile
