In [1]:
library(tidyverse)
library(here)
library(glue)
library(furrr)
suppressPackageStartupMessages(library(rtracklayer))
devtools::load_all(".")

# Register the parallel backend used by the future_map_* calls below,
# with 20 workers.
# NOTE(review): `multiprocess` is deprecated in newer releases of the future
# package in favour of `multisession` / `multicore` — confirm against the
# installed version before upgrading.
plan(multiprocess, workers = 20)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   1.0.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
here() starts at /mnt/expressions/mp/archaic-ychr

Attaching package: ‘glue’

The following object is masked from ‘package:dplyr’:

    collapse

Loading required package: future
Loading ychr


# Full 6.9 Mb capture

In [2]:
# Sample names that have a full-capture VCF in data/vcf: list the
# full_<name>.vcf.gz files, drop those whose name contains "mez2_dp",
# "modern" or "merged", and keep only the <name> part.
# The patterns are anchored and the dots escaped so that only files literally
# named full_<name>.vcf.gz are picked up.
samples_full <- list.files(here("data/vcf"), "^full_.*\\.vcf\\.gz$") %>%
    str_subset("mez2_dp|modern|merged", negate = TRUE) %>%
    str_replace("^full_(.*)\\.vcf\\.gz$", "\\1")

In [3]:
# Print the sample names found for the full capture.
samples_full

In [4]:
# Path to the BED file with the full-capture target regions; it is passed to
# read_vcf() as `bed_filter` and imported below.
bed_full <- here("data/coord/capture_full.bed")

In [5]:
# Load the full-capture target regions from the BED file
# (import.bed is presumably rtracklayer's BED importer — loaded above).
capture_full <- import.bed(bed_full)

In [6]:
# Summed width of all full-capture regions; the same quantity is used below
# as the denominator for the per-individual proportions.
sum(width(capture_full))

In [7]:
# For every individual in `samples_full`, count the rows read_vcf() returns
# inside the full capture regions, once with default settings and once with
# `nodmg = TRUE`, and express both counts as a fraction of the summed width
# of the capture regions. One row per individual is returned.

# Summed capture width is identical for every individual, so compute it once
# here instead of inside every worker call.
total_full <- sum(width(capture_full))

counts_full <- future_map_dfr(samples_full, function(ind) {
    vcf_path <- here(glue("data/vcf/full_{ind}.vcf.gz"))

    # Same file and filter arguments for both calls; only `nodmg` differs.
    gt_all <- read_vcf(vcf_path, mindp = 3, maxdp = 0.98, bed_filter = bed_full)
    gt_nodmg <- read_vcf(vcf_path, mindp = 3, maxdp = 0.98, nodmg = TRUE, bed_filter = bed_full)

    tibble(
        individual = ind,
        total = total_full,
        all = nrow(gt_all),
        all_prop = all / total,
        nodmg = nrow(gt_nodmg),
        nodmg_prop = nodmg / total
    )
})

In [8]:
# Split the per-individual counts by sample-name prefix.
# "Archaic" rows: names starting with Spy/spy, Den/den, Mez/mez or shotgun.
full_archaic_df <- filter(
    counts_full,
    str_detect(individual, "^([Ss]py|[Dd]en|[Mm]ez|shotgun)")
)
# "Modern" rows: everything that matches neither those prefixes nor "chimp".
full_modern_df <- filter(
    counts_full,
    !str_detect(individual, "^([Ss]py|[Dd]en|[Mm]ez|chimp|shotgun)")
)

In [9]:
# Display table for the archaic samples (full capture), sorted by the number
# of sites: readable sample name, counts with thousands separators and
# proportions as percentages with one decimal place.
full_archaic_df %>%
    arrange(all) %>%
    transmute(
        name = fix_name(individual),
        all = format(all, big.mark = ",", scientific = FALSE),
        all_prop = sprintf("%.1f%%", 100 * all_prop),
        nodmg = format(nodmg, big.mark = ",", scientific = FALSE),
        nodmg_prop = sprintf("%.1f%%", 100 * nodmg_prop)
    )

name,all,all_prop,nodmg,nodmg_prop
<chr>,<chr>,<chr>,<chr>,<chr>
Spy 94a (shotgun),2664787,38.5%,2636405,38.1%
Spy 94a,3502380,50.7%,3470671,50.2%
Spy 94a (snpAD),3557124,51.5%,3522759,51.0%
Mezmaiskaya 2 (shotgun),3822106,55.3%,3762768,54.4%
Denisova 4,4731302,68.4%,4705860,68.1%
Denisova 4 (snpAD),4824242,69.8%,4798037,69.4%
Denisova 8,5851332,84.6%,5828356,84.3%
Denisova 8 (snpAD),6260242,90.6%,6236272,90.2%
Mezmaiskaya 2,6348948,91.8%,6346684,91.8%
Mezmaiskaya 2 (snpAD),6669912,96.5%,6667441,96.5%


In [10]:
# Display table for the modern samples (full capture), sorted by the number
# of sites: readable sample name, counts with thousands separators and
# proportions as percentages with two decimal places.
# No extra filter on "den" / "den_snpad" is needed: full_modern_df is built
# by excluding every name that starts with "den", so such rows cannot occur.
full_modern_df %>%
    arrange(all) %>%
    transmute(
        name = fix_name(individual),
        all = format(all, big.mark = ",", scientific = FALSE),
        all_prop = sprintf("%.2f%%", 100 * all_prop),
        nodmg = format(nodmg, big.mark = ",", scientific = FALSE),
        nodmg_prop = sprintf("%.2f%%", 100 * nodmg_prop)
    )

name,all,all_prop,nodmg,nodmg_prop
<chr>,<chr>,<chr>,<chr>,<chr>
A00-1,6852077,99.12%,6851175,99.11%
A00-2,6867616,99.35%,6866716,99.33%
A00,6873078,99.43%,6872184,99.41%
Ust'-Ishim,6888071,99.64%,6887986,99.64%
S_Yoruba-2,6897101,99.77%,6896820,99.77%
S_Mandenka-1,6898162,99.79%,6897879,99.79%
S_Mbuti-1,6898694,99.80%,6898400,99.79%
S_Gambian-1,6899210,99.80%,6898929,99.80%
S_Papuan-2,6904393,99.88%,6904208,99.88%
S_Thai-1,6905023,99.89%,6904846,99.89%


# 560 kb capture of regions from Lippold et al.

In [11]:
# Sample names that have a Lippold-capture VCF in data/vcf: list the
# lippold_<name>.vcf.gz files, drop those whose name contains "mez2_dp",
# "modern", "merged" or "elsidron2_dp", and keep only the <name> part.
# The patterns are anchored and the dots escaped so that only files literally
# named lippold_<name>.vcf.gz are picked up.
samples_lippold <- list.files(here("data/vcf"), "^lippold_.*\\.vcf\\.gz$") %>%
    str_subset("mez2_dp|modern|merged|elsidron2_dp", negate = TRUE) %>%
    str_replace("^lippold_(.*)\\.vcf\\.gz$", "\\1")

In [12]:
# Print the sample names found for the Lippold capture.
samples_lippold

In [13]:
# Path to the BED file with the Lippold-capture target regions; it is passed
# to read_vcf() as `bed_filter` and imported below.
bed_lippold <- here("data/coord/capture_lippold.bed")

In [14]:
# Load the Lippold-capture target regions from the BED file
# (import.bed is presumably rtracklayer's BED importer — loaded above).
capture_lippold <- import.bed(bed_lippold)

In [15]:
# Summed width of all Lippold-capture regions; the same quantity is used
# below as the denominator for the per-individual proportions.
sum(width(capture_lippold))

In [16]:
# For every individual in `samples_lippold`, count the rows read_vcf()
# returns inside the Lippold capture regions, once with default settings and
# once with `nodmg = TRUE`, and express both counts as a fraction of the
# summed width of the capture regions. One row per individual is returned.

# Summed capture width is identical for every individual, so compute it once
# here instead of inside every worker call.
total_lippold <- sum(width(capture_lippold))

counts_lippold <- future_map_dfr(samples_lippold, function(ind) {
    vcf_path <- here(glue("data/vcf/lippold_{ind}.vcf.gz"))

    # Same file and filter arguments for both calls; only `nodmg` differs.
    gt_all <- read_vcf(vcf_path, mindp = 3, maxdp = 0.98, bed_filter = bed_lippold)
    gt_nodmg <- read_vcf(vcf_path, mindp = 3, maxdp = 0.98, nodmg = TRUE, bed_filter = bed_lippold)

    tibble(
        individual = ind,
        total = total_lippold,
        all = nrow(gt_all),
        all_prop = all / total,
        nodmg = nrow(gt_nodmg),
        nodmg_prop = nodmg / total
    )
})

In [17]:
# Split the per-individual counts by sample-name prefix.
# "Archaic" rows: names starting with Spy/spy, Den/den, Mez/mez, shotgun or
# elsidron.
lippold_archaic_df <- filter(
    counts_lippold,
    str_detect(individual, "^([Ss]py|[Dd]en|[Mm]ez|shotgun|elsidron)")
)
# "Modern" rows: everything that matches neither those prefixes nor "chimp".
lippold_modern_df <- filter(
    counts_lippold,
    !str_detect(individual, "^([Ss]py|[Dd]en|[Mm]ez|chimp|shotgun|elsidron)")
)

In [18]:
# Display table for the archaic samples (Lippold capture), sorted by the
# number of sites: readable sample name, counts with thousands separators and
# proportions as percentages with two decimal places.
# Rows named exactly "den" or "den_snpad" are left out of the table.
lippold_archaic_df %>%
    filter(!individual %in% c("den_snpad", "den")) %>%
    arrange(all) %>%
    transmute(
        name = fix_name(individual),
        all = format(all, big.mark = ",", scientific = FALSE),
        all_prop = sprintf("%.2f%%", 100 * all_prop),
        nodmg = format(nodmg, big.mark = ",", scientific = FALSE),
        nodmg_prop = sprintf("%.2f%%", 100 * nodmg_prop)
    )

name,all,all_prop,nodmg,nodmg_prop
<chr>,<chr>,<chr>,<chr>,<chr>
Spy 94a,275626,49.55%,272798,49.04%
Denisova 4,408775,73.49%,406508,73.08%
Denisova 8,484377,87.08%,482608,86.76%
Mezmaiskaya 2,509145,91.53%,508962,91.50%
El Sidrón 1253 (560 kb),530172,95.31%,530046,95.29%


In [19]:
# Display table for the modern samples (Lippold capture), sorted by the
# number of sites: readable sample name, counts with thousands separators and
# proportions as percentages with two decimal places.
# No extra filter on "den" / "den_snpad" is needed: lippold_modern_df is built
# by excluding every name that starts with "den", so such rows cannot occur.
lippold_modern_df %>%
    arrange(all) %>%
    transmute(
        name = fix_name(individual),
        all = format(all, big.mark = ",", scientific = FALSE),
        all_prop = sprintf("%.2f%%", 100 * all_prop),
        nodmg = format(nodmg, big.mark = ",", scientific = FALSE),
        nodmg_prop = sprintf("%.2f%%", 100 * nodmg_prop)
    )

name,all,all_prop,nodmg,nodmg_prop
<chr>,<chr>,<chr>,<chr>,<chr>
A00-1,552557,99.33%,552456,99.32%
S_Yoruba-2,553203,99.45%,553112,99.43%
S_Mandenka-1,553348,99.48%,553257,99.46%
S_Gambian-1,553365,99.48%,553275,99.46%
S_Mbuti-1,553381,99.48%,553288,99.47%
A00-2,553422,99.49%,553321,99.47%
A00,553618,99.53%,553517,99.51%
Ust'-Ishim,554575,99.70%,554550,99.69%
S_Thai-1,555830,99.92%,555787,99.92%
S_Dinka-1,555842,99.93%,555785,99.91%
