In [1]:
library(tidyverse)
library(magrittr)
library(here)
library(furrr)

# Load the package under development from the current directory.
devtools::load_all(".")

# Run futures in forked worker processes. `multiprocess` is deprecated in the
# future package; in this session it resolved to the multicore backend (the
# worker error recorded further down reports a MulticoreFuture), so that
# backend is named explicitly here.
plan(multicore)

# Allow up to 600 MiB of global objects per future.
options(future.globals.maxSize = 600 * 1024^2)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.1.1     [32m✔[39m [34mpurrr  [39m 0.3.0
[32m✔[39m [34mtibble [39m 2.1.1     [32m✔[39m [34mdplyr  [39m 0.7.8
[32m✔[39m [34mtidyr  [39m 0.8.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.3.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract

here() starts at /mnt/expressions/mp/ychr
Loading required package: future
Loading ychr
“no function found corresponding to methods exports from ‘GenomicRanges’ for: ‘concatenateObjects’”

# Lineage assignment (Mez2 vs A00 vs chimp)

In [2]:
# Genotypes for the Mez2 / chimp / A00 comparison: keep only the site
# coordinates, alleles and the three samples, and drop every site that has a
# missing value in any of those columns.
# (`tv_only = TRUE` presumably restricts to transversions -- confirm in
# read_genotypes().)
df <- local({
    mez2_gt <- read_genotypes("mez2", "full", mindp = 5, maxdp = 0.975, tv_only = TRUE)
    site_cols <- select(mez2_gt, chrom, pos, REF, ALT, mez2, chimp, a00)
    filter(site_cols, complete.cases(site_cols))
})

In [3]:
head(df)

chrom,pos,REF,ALT,mez2,chimp,a00
Y,2649827,G,,0,0,0
Y,2649828,T,,0,0,0
Y,2649829,T,,0,0,0
Y,2649830,A,,0,0,0
Y,2649831,A,,0,0,0
Y,2649832,T,,0,0,0


In [6]:
nrow(df)

In [7]:
# For each additional archaic sample, count the sites at which its allele
# places it on one of three branches defined by Mez2, A00 and chimp, and
# express each count as a proportion of all such sites.
# Returns one row per (sample, lineage) with columns name, total, lineage,
# count and prop.
props <- future_map_dfr(c("den4", "den8", "spy1", "elsidron2", "shotgun_spy1", "shotgun_mez2"), function(arch) {
    # `arch` is a single sample name, so a scalar if/else is the right tool
    # (ifelse() is meant for vectors). Only elsidron2 is read from the
    # "lippold" VCF; every other sample uses the "full" one.
    capture <- if (arch == "elsidron2") "lippold" else "full"

    arch_df <- read_vcf(here(paste0("data/vcf/", capture, "_", arch, ".vcf.gz")), mindp = 3, maxdp = 0.975)

    # Keep sites present in both tables where the archaic VCF either has no
    # ALT allele or has the same ALT allele as the reference panel.
    joined <- inner_join(df, arch_df, by = c("chrom", "pos", "REF")) %>%
        filter(ALT.y == "" | ALT.x == ALT.y)

    # Drop sites with a missing value in any column of the joined table.
    joined <- filter(joined, complete.cases(joined))

    tibble(
        # Mez2 matches chimp, A00 differs, and the archaic matches A00.
        hum   = sum(joined[["mez2"]] == joined[["chimp"]] & joined[["a00"]] != joined[["mez2"]]  & joined[[arch]] == joined[["a00"]]),
        # A00 matches chimp, Mez2 differs, and the archaic matches Mez2.
        neand = sum(joined[["a00"]]  == joined[["chimp"]] & joined[["a00"]] != joined[["mez2"]]  & joined[[arch]] == joined[["mez2"]]),
        # Mez2 and A00 agree but differ from chimp, and the archaic matches chimp.
        anc   = sum(joined[["mez2"]] == joined[["a00"]]   & joined[["a00"]] != joined[["chimp"]] & joined[[arch]] == joined[["chimp"]]),
        total = hum + neand + anc
    ) %>%
        gather(lineage, count, -total) %>%
        mutate(name = arch, prop = count / total) %>%
        select(name, everything())
})

In [8]:
# Per-sample site counts, one column per lineage.
spread(select(props, -prop), lineage, count)

name,total,anc,hum,neand
den4,112,110,1,1
den8,355,339,6,10
elsidron2,24,0,0,24
shotgun_mez2,30,0,0,30
shotgun_spy1,9,0,0,9
spy1,18,0,1,17


In [9]:
# Per-sample site proportions, one column per lineage.
spread(select(props, -count), lineage, prop)

name,total,anc,hum,neand
den4,112,0.9821429,0.008928571,0.008928571
den8,355,0.9549296,0.016901408,0.028169014
elsidron2,24,0.0,0.0,1.0
shotgun_mez2,30,0.0,0.0,1.0
shotgun_spy1,9,0.0,0.0,1.0
spy1,18,0.0,0.055555556,0.944444444


Both Denisovan Y chromosomes fall almost entirely (98% of informative sites for Denisova 4, 95% for Denisova 8) on a branch ancestral to the split between anatomically modern humans (AMH) and Neanderthals.

The few sites that fall on the human or Neanderthal branch are easily explained by DNA damage: a truly ancestral site carrying a damage-induced substitution is equally likely (50%) to be assigned to one branch or the other, which is consistent with the roughly even split we see for both Denisova 4 (1 vs. 1) and Denisova 8 (6 vs. 10).

In [32]:
# Full genotype table for Denisova 8, restricted to sites with no missing
# value in any column.
gt <- local({
    den8_gt <- read_genotypes("den8", "full", mindp = 5, maxdp = 0.975, tv_only = TRUE)
    filter(den8_gt, complete.cases(den8_gt))
})

In [33]:
head(gt)

chrom,pos,REF,ALT,den8,chimp,ustishim,a00,S_BedouinB_1,S_Turkish_1,⋯,S_Punjabi_1,S_Saami_2,S_Papuan_2,S_Karitiana_1,S_Dinka_1,S_Mbuti_1,S_Yoruba_2,S_Gambian_1,S_Mandenka_1,S_Ju_hoan_North_1
Y,2649899,A,,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Y,2649901,A,,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Y,2649902,A,,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Y,2649903,A,,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Y,2649908,T,,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Y,2649910,T,,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [34]:
# Take the calculate_tafr() rows for A00, expand the nested `counts_afr`
# column, and average every numeric column so that A00 ends up as one row.
tafr <- local({
    a00_rows <- filter(calculate_tafr(gt), afr == "a00")
    a00_counts <- unnest(a00_rows, counts_afr)
    summarise_if(group_by(a00_counts, afr), is.numeric, mean)
})

In [35]:
tafr

afr,tmrca_afr,tmrca_ad,tmrca_f,mut_rate,age,a,b,c,d,e,f,total
a00,239496,190500.2,288491.8,4.535624e-10,45000,57,0,0.6923077,20.07692,1,115,934671


In [31]:
tafr

afr,tmrca_afr,tmrca_ad,tmrca_f,mut_rate,age,a,b,c,d,e,f,total
a00,237587.3,207793.4,267381.3,4.432091e-10,45000,177.2308,0,1.153846,51.84615,1.769231,294,2510825


In [24]:
# Branch counts for Mez2 against A00 and chimp.
# NOTE(review): this cell needs a `gt` that has a `mez2` column; the `gt`
# loaded earlier in this notebook is the Denisova 8 table, so confirm which
# table is in memory before re-running.
counts <- local({
    a00_differs <- gt$a00 != gt$chimp
    mez2_differs <- gt$mez2 != gt$chimp

    tibble(
        # A00 matches chimp while Mez2 does not.
        arch = sum(gt$a00 == gt$chimp & mez2_differs),
        # Mez2 matches chimp while A00 does not.
        hum = sum(gt$mez2 == gt$chimp & a00_differs),
        # Mez2 and A00 agree with each other but not with chimp.
        anc = sum(gt$mez2 == gt$a00 & a00_differs),
        # All sites in the table.
        total = nrow(gt)
    )
})

In [25]:
counts

neand,hum,anc,total
259,403,65347,2510825


In [27]:
# `hum` count divided by (number of sites x mutation rate). The rate literal
# equals the `mut_rate` value printed in one of the `tafr` outputs above.
with(counts, hum / (total * 4.432091e-10))

In [38]:
# Name of the archaic sample column to compare against A00 and chimp.
archaic <- "den8"

# Same three branch counts as for Mez2, but with the archaic column chosen
# by name.
counts <- local({
    arch_gt <- gt[[archaic]]
    a00_differs <- gt$a00 != gt$chimp

    tibble(
        # A00 matches chimp while the archaic does not.
        arch = sum(gt$a00 == gt$chimp & arch_gt != gt$chimp),
        # The archaic matches chimp while A00 does not.
        hum = sum(arch_gt == gt$chimp & a00_differs),
        # The archaic and A00 agree with each other but not with chimp.
        anc = sum(arch_gt == gt$a00 & a00_differs),
        # All sites in the table.
        total = nrow(gt)
    )
})

In [39]:
counts

arch,hum,anc,total
125,234,23809,934671


In [40]:
# `hum` count divided by (number of sites x mutation rate), with the rate
# taken from `tafr` instead of a typed-in constant.
with(counts, hum / (total * tafr$mut_rate))

In [2]:
#' Branch counts and a direct TMRCA estimate for one archaic sample
#'
#' Reads the "full" genotype table for `archaic`, drops sites with any missing
#' value, takes the mutation rate from the A00 rows of `calculate_tafr()`, and
#' counts sites by how the archaic, A00 and chimp alleles compare.
#'
#' @param archaic Name of the archaic sample; also used as the name of its
#'   genotype column.
#' @param mindp Minimum depth passed on to `read_genotypes()`.
#' @return A tibble with columns `name`, `arch`, `hum`, `anc`, `total`,
#'   `mut_rate` and `tmrca`, where `tmrca = hum / (total * mut_rate)`.
calculate_direct <- function(archaic, mindp) {
    sites <- read_genotypes(archaic, "full", mindp = mindp, maxdp = 0.975, tv_only = TRUE)
    sites <- filter(sites, complete.cases(sites))

    # Average the numeric columns of the unnested A00 rows so that a single
    # mutation-rate value is available.
    a00_rows <- filter(calculate_tafr(sites), afr == "a00")
    a00_tafr <- summarise_if(group_by(unnest(a00_rows, counts_afr), afr), is.numeric, mean)
    rate <- a00_tafr$mut_rate

    arch_gt <- sites[[archaic]]
    a00_differs <- sites$a00 != sites$chimp

    # A00 matches chimp while the archaic does not.
    n_arch <- sum(sites$a00 == sites$chimp & arch_gt != sites$chimp)
    # The archaic matches chimp while A00 does not.
    n_hum <- sum(arch_gt == sites$chimp & a00_differs)
    # The archaic and A00 agree with each other but not with chimp.
    n_anc <- sum(arch_gt == sites$a00 & a00_differs)
    n_total <- nrow(sites)

    tibble(
        name = archaic,
        arch = n_arch,
        hum = n_hum,
        anc = n_anc,
        total = n_total,
        mut_rate = rate,
        tmrca = n_hum / (n_total * rate)
    )
}

In [82]:
# Direct TMRCA estimates for Mez2 at minimum-depth cutoffs 1 to 8, one future
# per cutoff.
# NOTE(review): the recorded run of this cell failed because a forked worker
# died; the cause is not visible here (possibly memory use from loading the
# table in several workers at once) -- confirm before re-running.
direct_tmrca <- future_map_dfr(
    1:8,
    function(min_depth) calculate_direct("mez2", mindp = min_depth)
)

ERROR: Error: Failed to retrieve the result of MulticoreFuture (<none>) from the forked worker (on localhost; PID 46428). Post-mortem diagnostic: No process exists with this PID, i.e. the forked localhost worker is no longer alive.


In [None]:
direct_tmrca

In [80]:
calculate_direct("mez2", 5)

name,arch,hum,anc,total,mut_rate,tmrca
mez2,259,403,65347,2510825,4.432091e-10,362142.9


In [None]:
# Direct TMRCA estimates for six archaic samples at minimum-depth cutoffs
# 1 to 8. Cutoffs are processed one after another; within a cutoff the
# samples are evaluated as futures. Each row is tagged with its cutoff in `dp`.
tmrca_df <- map_dfr(1:8, function(min_depth) {
    samples <- c("den4", "den8", "spy1", "mez2", "shotgun_spy1", "shotgun_mez2")
    per_depth <- future_map_dfr(samples, calculate_direct, mindp = min_depth)
    mutate(per_depth, dp = min_depth)
})

In [None]:
tmrca_df

In [68]:
# Show the estimates ordered by increasing TMRCA.
arrange(tmrca_df, tmrca)

arch,tmrca
mez2,362489.6
shotgun_spy1,390549.3
shotgun_mez2,429080.6
spy1,435584.4
den4,809932.9
den8,847429.9


In [65]:
# Show the estimates ordered by increasing TMRCA.
arrange(tmrca_df, tmrca)

arch,tmrca
shotgun_spy1,146250.0
spy1,240157.9
shotgun_mez2,342931.0
mez2,366157.2
den4,541189.4
den8,695579.3
