In [1]:
suppressPackageStartupMessages({

    library(VariantAnnotation)
    library(tidyverse)
    library(magrittr)
    library(here)
    library(bdkn)
    library(rtracklayer)

})

In [2]:
source(here("src/utils.R"))

In [3]:
# Load the genotype table from the merged VCF. read_gt() is not defined in
# this notebook -- presumably it comes from src/utils.R or an attached
# package; confirm.
gt <- read_gt(here("data/vcf/merged_full.vcf.gz"))

In [4]:
# Names of the genotype columns (samples) selected by name pattern for the
# divergence calculations below.
refnames <- str_subset(
    colnames(gt),
    "reference|kk1|mota|bichon|loschbour|ustishim|^S_"
)

In [3]:
# Read the merged VCF file into memory with readVcf().
vcf <- readVcf(here("data/vcf/merged_full.vcf.gz"))

In [4]:
# Pull out the GT genotype matrix and recode missing calls (".") as NA.
gt <- geno(vcf)$GT %>% replace(. == ".", NA)

In [5]:
# Coerce the genotype codes to numeric so they can be compared with ==/!=.
mode(gt) <- "numeric"

In [6]:
# Convert to a tibble and append a constant all-zero column named `ref`.
# NOTE(review): the sample-name pattern used to build `refnames` matches
# "reference", not "ref" -- confirm which column name is intended here.
gt <- as_tibble(gt) %>% mutate(ref = 0)

In [10]:
# Rebuild the list of sample columns from the current `gt` (same pattern as
# the earlier refnames cell).
refnames <- colnames(gt) %>% str_subset("reference|kk1|mota|bichon|loschbour|ustishim|^S_")

In [8]:
# Number of rows in the genotype table.
nrow(gt)

In [6]:
# Mutation rate used to convert mutation counts into time.
# NOTE(review): presumably per base pair per year -- confirm units and source.
mutation_rate <- 7.4e-10

In [7]:
# Total length of sequence considered: the summed widths of all intervals in
# the capture BED file.
capture_regions <- here("data/coord/capture_full.bed")
total_seq <- sum(width(import.bed(capture_regions)))

# A00 divergence time ($T_{AR}$)

In [8]:
#' Count the site patterns used for the A00 divergence estimate.
#'
#' Compares the `Chimp`, `a00` and `ustishim` genotype columns against the
#' column named by `ref`, site by site, and counts four patterns.
#'
#' @param gt Data frame or tibble of genotypes, one column per sample. Must
#'   contain `Chimp`, `a00`, `ustishim` and the column selected by `ref`.
#' @param ref Column of `gt` that the other samples are compared against
#'   (normally a column name).
#' @return A named list of integer counts:
#'   `a`: ustishim equals ref, Chimp equals a00, Chimp differs from ref;
#'   `d`: Chimp, a00 and ustishim all differ from ref;
#'   `e`: Chimp and a00 equal ref, ustishim differs from ref;
#'   `f`: Chimp and ustishim equal ref, a00 differs from ref.
#'   Sites whose pattern is undecidable because of missing genotypes are not
#'   counted.
divergence_a00 <- function(gt, ref) {
    # A missing column would otherwise come back as NULL, every comparison
    # would collapse to logical(0) and all counts would silently be 0.
    required <- c("Chimp", "a00", "ustishim", if (is.character(ref)) ref)
    absent <- setdiff(required, colnames(gt))
    if (length(absent) > 0) {
        stop(
            "genotype table is missing column(s): ",
            paste(absent, collapse = ", "),
            call. = FALSE
        )
    }

    # Extract each column once instead of once per comparison.
    chimp <- gt[["Chimp"]]
    a00 <- gt[["a00"]]
    ust <- gt[["ustishim"]]
    ref_gt <- gt[[ref]]

    list(
        a = sum((ust == ref_gt) & (chimp == a00) & (chimp != ref_gt), na.rm = TRUE),
        d = sum((chimp != ref_gt) & (a00 != ref_gt) & (ust != ref_gt), na.rm = TRUE),
        e = sum((chimp == ref_gt) & (a00 == ref_gt) & (ust != ref_gt), na.rm = TRUE),
        f = sum((chimp == ref_gt) & (a00 != ref_gt) & (ust == ref_gt), na.rm = TRUE)
    )
}

In [9]:
# Per-sample divergence estimate based on the a + d counts, scaled by the
# total sequence length and the mutation rate; named by sample.
t_AR_ad <- refnames %>%
    map_dbl(function(sample) {
        counts <- divergence_a00(gt, sample)
        (counts$a + counts$d) / (total_seq * mutation_rate)
    }) %>%
    setNames(refnames)

In [67]:
# Recompute t_AR_ad (same computation as the earlier cell, re-run in a later
# session): (a + d) / (total_seq * mutation_rate) for every sample in refnames.
t_AR_ad <- map_dbl(refnames, ~ divergence_a00(gt, .x) %>% { (.$a + .$d) / (total_seq * mutation_rate) }) %>% setNames(refnames)

In [10]:
# Show the per-sample (a + d)-based estimates.
t_AR_ad

In [12]:
# Average of the (a + d)-based estimates over all samples in refnames.
mean(t_AR_ad)

In [69]:
# Average of the (a + d)-based estimates (re-run of the earlier cell).
mean(t_AR_ad)

In [13]:
#' Count site patterns for a pair of samples relative to Chimp and A00.
#'
#' Compares the `Chimp` and `a00` genotype columns and the column selected by
#' `e` against the column selected by `d`, site by site, and counts four
#' patterns.
#'
#' @param gt Data frame or tibble of genotypes, one column per sample. Must
#'   contain `Chimp`, `a00` and the columns selected by `d` and `e`.
#' @param d Column of `gt` that the other samples are compared against
#'   (normally a column name).
#' @param e Column of `gt` holding the second sample of the pair (normally a
#'   column name).
#' @return A named list of integer counts:
#'   `a`: e equals d, Chimp equals a00, Chimp differs from d;
#'   `d`: Chimp, a00 and e all differ from d;
#'   `e`: Chimp and a00 equal d, e differs from d;
#'   `f`: Chimp and e equal d, a00 differs from d.
#'   Sites whose pattern is undecidable because of missing genotypes are not
#'   counted.
tAR <- function(gt, d, e) {
    # A missing column would otherwise come back as NULL, every comparison
    # would collapse to logical(0) and all counts would silently be 0.
    required <- c(
        "Chimp", "a00",
        if (is.character(d)) d,
        if (is.character(e)) e
    )
    absent <- setdiff(required, colnames(gt))
    if (length(absent) > 0) {
        stop(
            "genotype table is missing column(s): ",
            paste(absent, collapse = ", "),
            call. = FALSE
        )
    }

    # Extract each column once instead of once per comparison.
    chimp <- gt[["Chimp"]]
    a00 <- gt[["a00"]]
    d_gt <- gt[[d]]
    e_gt <- gt[[e]]

    list(
        a = sum((e_gt == d_gt) & (chimp == a00) & (chimp != d_gt), na.rm = TRUE),
        d = sum((chimp != d_gt) & (a00 != d_gt) & (e_gt != d_gt), na.rm = TRUE),
        e = sum((chimp == d_gt) & (a00 == d_gt) & (e_gt != d_gt), na.rm = TRUE),
        f = sum((chimp == d_gt) & (a00 != d_gt) & (e_gt == d_gt), na.rm = TRUE)
    )
}

In [16]:
# Divergence estimates for every ordered pair of distinct samples (d, e),
# where d is restricted to sample names matching "ref|S_".
#   t_da: (a + d) / (total_seq * mutation_rate)
#   t_f:  f / (total_seq * mutation_rate)
# tAR() scans the whole genotype table, so the counts for each pair are
# computed once and both estimates are derived from them; the helper column
# is dropped so the result keeps the columns d, e, t_da, t_f.
df_tAR <- crossing(d = str_subset(refnames, "ref|S_"), e = refnames) %>%
    filter(d != e) %>%
    mutate(counts = map2(d, e, ~ tAR(gt, .x, .y)),
           t_da = map_dbl(counts, ~ (.x$a + .x$d) / (total_seq * mutation_rate)),
           t_f = map_dbl(counts, ~ .x$f / (total_seq * mutation_rate))) %>%
    select(-counts)

In [17]:
# Distribution of the two estimates across all (d, e) pairs.
summary(df_tAR$t_da)
summary(df_tAR$t_f)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 171239  241416  251776  243202  254513  260377 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 147391  207207  217763  209746  218545  221672 

In [18]:
# Print every pair, ordered by increasing (a + d)-based estimate.
df_tAR %>%
    arrange(t_da) %>%
    print(n = nrow(.))

# A tibble: 165 x 4
    d             e                t_da     t_f
    <chr>         <chr>           <dbl>   <dbl>
  1 S_French-1    bichon        171239. 149150.
  2 S_Han-2       bichon        172021. 148954.
  3 S_Dinka-1     bichon        172607. 148759.
  4 S_Karitiana-1 bichon        172607. 148954.
  5 reference     bichon        173585. 149736.
  6 S_Yoruba-2    bichon        174758. 148368.
  7 S_Mandenka-1  bichon        175735. 148563.
  8 S_Papuan-2    bichon        175735. 149150.
  9 S_Dai-2       bichon        175930. 148954.
 10 S_Sardinian-1 bichon        178081. 147391.
 11 S_Mbuti-1     bichon        178667. 148172.
 12 S_French-1    kk1           216785. 193132.
 13 S_Dinka-1     kk1           219327. 191764.
 14 S_Han-2       kk1           219718. 192351.
 15 S_Karitiana-1 kk1           220304. 192937.
 16 reference     kk1           221086. 193523.
 17 S_Papuan-2    kk1           221672. 192351.
 18 S_Yoruba-2    kk1           222063. 192155.
 19 S_Mandenka-1  kk

In [70]:
# Per-sample divergence estimate based on the f count alone, scaled by the
# total sequence length and the mutation rate; named by sample.
t_AR_f <- refnames %>%
    map_dbl(function(sample) {
        counts <- divergence_a00(gt, sample)
        counts$f / (total_seq * mutation_rate)
    }) %>%
    setNames(refnames)

In [71]:
# Show the per-sample f-based estimates.
t_AR_f

In [72]:
# Average of the f-based estimates over all samples in refnames.
mean(t_AR_f)

In [75]:
# Hand calculation with hard-coded numbers, same form as the (a + d) estimate.
# NOTE(review): 1434 and 305 look like a and d counts and 7.83e6 like a
# sequence length, but their source is not shown in this notebook -- confirm.
(1434 + 305) / (7.83e6 * mutation_rate)

In [76]:
# Hand calculation with hard-coded numbers, same form as the f-based estimate.
# NOTE(review): 1591 looks like an f count and 7.83e6 like a sequence length,
# but their source is not shown in this notebook -- confirm.
1591 / (7.83e6 * mutation_rate)