In [1]:
library(tidyverse)
library(here)
library(glue)
library(furrr)
suppressPackageStartupMessages(library(rtracklayer))
devtools::load_all(".")

# Evaluate the per-sample futures below in up to 20 forked worker processes.
# `multiprocess` is deprecated (and defunct in recent releases of the future
# package); it resolved to `multicore` wherever forking is supported, so
# `multicore` is named explicitly. Forked workers share the parent session's
# state, including the `ychr` functions loaded by devtools::load_all() above.
# NOTE(review): where forking is unavailable (Windows, RStudio) the future
# package falls back to sequential evaluation for `multicore`.
plan(multicore, workers = 20)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   1.0.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
here() starts at /mnt/expressions/mp/archaic-ychr

Attaching package: ‘glue’

The following object is masked from ‘package:dplyr’:

    collapse

Loading required package: future
Loading ychr


# Full 6.9 Mb capture

In [2]:
# Sample identifiers for the full capture, taken from the VCF file names
# `full_<sample>.vcf.gz` found in data/vcf. File names containing "mez2_dp",
# "modern" or "merged" are left out.
# Both patterns are anchored and have their dots escaped, so only files named
# exactly `full_<sample>.vcf.gz` are picked up and "." cannot stand in for an
# arbitrary character.
samples_full <- list.files(here("data/vcf"), "^full_.*\\.vcf\\.gz$") %>%
    str_subset("mez2_dp|modern|merged", negate = TRUE) %>%
    str_replace("^full_(.*)\\.vcf\\.gz$", "\\1")

In [3]:
# Display the sample identifiers collected for the full capture.
samples_full

In [4]:
# Path to the full-capture BED file, built with here().
bed_full <- here("data/coord/capture_full.bed")

In [5]:
# Read the intervals of the full-capture BED file with rtracklayer's import.bed().
capture_full <- import.bed(bed_full)

In [6]:
# Summed width of all intervals in the full-capture BED file.
# NOTE(review): presumably the capture size in bp; overlapping intervals, if
# the BED file has any, would be counted more than once -- confirm.
sum(width(capture_full))

In [7]:
# For every full-capture sample, count the rows that read_vcf() returns with
# and without `nodmg = TRUE`, and express both counts as a proportion of the
# summed width of the capture intervals. The result has one row per sample.
# NOTE(review): `mindp`, `maxdp`, `nodmg` and `bed_filter` are handed straight
# to ychr's read_vcf(); their exact meaning is defined there.

# The summed interval width is the same for every sample, so it is computed
# once here rather than once per sample inside the workers.
total_full <- sum(width(capture_full))

full_counts <- future_map_dfr(samples_full, function(ind) {
    # Both reads use the same VCF file; build its path only once.
    vcf <- here(glue("data/vcf/full_{ind}.vcf.gz"))

    gt_all <- read_vcf(
        vcf,
        mindp = 3, maxdp = 0.98,
        bed_filter = bed_full
    )
    gt_nodmg <- read_vcf(
        vcf,
        mindp = 3, maxdp = 0.98, nodmg = TRUE,
        bed_filter = bed_full
    )

    # tibble() evaluates its arguments in order, so the `*_prop` columns can
    # refer to the `all`, `nodmg` and `total` columns defined just before them.
    tibble(
        individual = ind,
        total = total_full,
        all = nrow(gt_all),
        all_prop = all / total,
        nodmg = nrow(gt_nodmg),
        nodmg_prop = nodmg / total
    )
})

In [8]:
# Summary table for the full capture: leave out the listed samples, order the
# remaining rows by the `all` count, print the proportions as percentages with
# two decimals, and label each row with the name returned by fix_name().
full_counts %>%
    filter(!individual %in% c("den", "den_snpad", "a00_1", "a00_2", "chimp")) %>%
    arrange(all) %>%
    mutate_at(
        vars(contains("prop")),
        function(p) sprintf("%.2f%%", 100 * p)
    ) %>%
    mutate(name = fix_name(individual)) %>%
    select(name, everything(), -individual, -total)

name,all,all_prop,nodmg,nodmg_prop
<chr>,<int>,<chr>,<int>,<chr>
Spy 94a (shotgun),2664973,38.55%,2636589,38.14%
Spy 94a,3502527,50.67%,3470817,50.21%
Mezmaiskaya 2 (shotgun),3822247,55.29%,3762906,54.43%
Denisova 4,4731419,68.44%,4705977,68.07%
Denisova 8,5851442,84.64%,5828466,84.31%
Mezmaiskaya 2,6349143,91.84%,6346879,91.81%
Mezmaiskaya 2 (snpAD),6670132,96.49%,6667661,96.45%
A00,6873362,99.43%,6872468,99.41%
Ust'-Ishim,6888333,99.64%,6888248,99.64%
S_Yoruba-2,6897403,99.77%,6897122,99.77%


# 560 kb capture of regions from Lippold et al.

In [9]:
# Sample identifiers for the Lippold capture, taken from the VCF file names
# `lippold_<sample>.vcf.gz` found in data/vcf. File names containing "mez2_dp",
# "modern" or "merged" are left out.
# Both patterns are anchored and have their dots escaped, so only files named
# exactly `lippold_<sample>.vcf.gz` are picked up and "." cannot stand in for
# an arbitrary character.
samples_lippold <- list.files(here("data/vcf"), "^lippold_.*\\.vcf\\.gz$") %>%
    str_subset("mez2_dp|modern|merged", negate = TRUE) %>%
    str_replace("^lippold_(.*)\\.vcf\\.gz$", "\\1")

In [10]:
# Display the sample identifiers collected for the Lippold capture.
samples_lippold

In [11]:
# Path to the Lippold-capture BED file, built with here().
bed_lippold <- here("data/coord/capture_lippold.bed")

In [12]:
# Read the intervals of the Lippold-capture BED file with rtracklayer's import.bed().
capture_lippold <- import.bed(bed_lippold)

In [13]:
# Summed width of all intervals in the Lippold-capture BED file.
# NOTE(review): presumably the capture size in bp; overlapping intervals, if
# the BED file has any, would be counted more than once -- confirm.
sum(width(capture_lippold))

In [14]:
# For every Lippold-capture sample, count the rows that read_vcf() returns
# with and without `nodmg = TRUE`, and express both counts as a proportion of
# the summed width of the capture intervals. The result has one row per sample.
# NOTE(review): `mindp`, `maxdp`, `nodmg` and `bed_filter` are handed straight
# to ychr's read_vcf(); their exact meaning is defined there.

# The summed interval width is the same for every sample, so it is computed
# once here rather than once per sample inside the workers.
total_lippold <- sum(width(capture_lippold))

counts_lippold <- future_map_dfr(samples_lippold, function(ind) {
    # Both reads use the same VCF file; build its path only once.
    vcf <- here(glue("data/vcf/lippold_{ind}.vcf.gz"))

    gt_all <- read_vcf(
        vcf,
        mindp = 3, maxdp = 0.98,
        bed_filter = bed_lippold
    )
    gt_nodmg <- read_vcf(
        vcf,
        mindp = 3, maxdp = 0.98, nodmg = TRUE,
        bed_filter = bed_lippold
    )

    # tibble() evaluates its arguments in order, so the `*_prop` columns can
    # refer to the `all`, `nodmg` and `total` columns defined just before them.
    tibble(
        individual = ind,
        total = total_lippold,
        all = nrow(gt_all),
        all_prop = all / total,
        nodmg = nrow(gt_nodmg),
        nodmg_prop = nodmg / total
    )
})

In [15]:
# Summary table for the Lippold capture: leave out the listed samples, order
# the remaining rows by the `all` count, print the proportions as percentages
# with two decimals, and label each row with the name returned by fix_name().
counts_lippold %>%
    filter(
        !individual %in%
            c("den", "den_snpad", "mez2_snpad", "a00_1", "a00_2", "chimp")
    ) %>%
    arrange(all) %>%
    mutate_at(
        vars(contains("prop")),
        function(p) sprintf("%.2f%%", 100 * p)
    ) %>%
    mutate(name = fix_name(individual)) %>%
    select(name, everything(), -individual, -total)

name,all,all_prop,nodmg,nodmg_prop
<chr>,<int>,<chr>,<int>,<chr>
Spy 94a,275626,49.55%,272798,49.04%
Denisova 4,408775,73.49%,406508,73.08%
Denisova 8,484377,87.08%,482608,86.76%
Mezmaiskaya 2,509145,91.53%,508962,91.50%
El Sidrón 1253 (560 kb),530172,95.31%,530046,95.29%
S_Yoruba-2,553203,99.45%,553112,99.43%
S_Mandenka-1,553348,99.48%,553257,99.46%
S_Gambian-1,553365,99.48%,553275,99.46%
S_Mbuti-1,553381,99.48%,553288,99.47%
A00,553618,99.53%,553517,99.51%
