In [1]:
suppressPackageStartupMessages({

    library(VariantAnnotation)
    library(tidyverse)
    library(magrittr)
    library(here)
    library(bdkn)
    library(rtracklayer)

})

In [2]:
source(here("src/utils.R"))

In [3]:
# Load the merged genotype table from the project's data directory.
# NOTE(review): read_gt() is not defined in this notebook -- presumably it comes
# from src/utils.R; the exact meaning of `var_only = TRUE` should be confirmed
# there.
vcf <- here("data/vcf/merged_full.vcf.gz") %>%
    read_gt(var_only = TRUE)

In [4]:
# Sample-to-population lookup table.
# `name` holds every column of `vcf` whose name starts with "S_", plus "a00".
# `pop` is "EastEur" or "WestEur" for the samples listed explicitly below and
# "Africa" for everything else (which includes "a00", since it is in neither
# list).
pops <- tibble(name = c(str_subset(colnames(vcf), "^S_"), "a00")) %>%
    mutate(
        pop = case_when(
            name %in% c(
                "S_Burmese_1", "S_Thai_1", "S_Han_2", "S_Dai_2",
                "S_Punjabi_1", "S_Papuan_2", "S_Karitiana_1"
            ) ~ "EastEur",
            name %in% c(
                "S_BedouinB_1", "S_Turkish_1", "S_French_1",
                "S_Finnish_2", "S_Sardinian_1", "S_Saami_2"
            ) ~ "WestEur",
            TRUE ~ "Africa"
        )
    )

In [5]:
# Remove the columns that are not used in the rest of the analysis, leaving
# only the per-sample columns (including `chimp`) in `vcf`.
vcf <- vcf %>%
    select(
        -chrom, -pos, -REF, -ALT,
        -reference, -a00_1, -a00_2, -denisova8sub
    )

In [6]:
# Per-row sum over every column except `chimp`; missing values are skipped, so
# a row with NAs is summed over its non-missing entries only.
counts <- vcf %>%
    select(-chimp) %>%
    rowSums(na.rm = TRUE)

In [7]:
# Keep rows whose count lies strictly between 0 and the number of non-chimp
# columns (`ncol(vcf) - 1`, i.e. the columns that were summed into `counts`).
# NOTE(review): because `counts` ignores NAs, a row where every non-missing
# entry is 1 but some entries are missing still passes the upper bound --
# confirm this is intended.
gt <- filter(
    vcf,
    counts > 0,
    counts < (ncol(vcf) - 1)
)

In [8]:
# Recode every column to an integer flag: 1 where the value differs from the
# `chimp` column of `gt` in the same row, 0 where it matches, NA where either
# value is missing. The `chimp` column itself therefore becomes all 0 (or NA).
derived <- mutate_all(
    gt,
    function(column) as.integer(column != gt$chimp)
)

In [9]:
# Preview the first rows in which a00 is flagged 1 (differs from chimp).
head(filter(derived, a00 == 1))

chimp,spy1,mez2,comb_neand,denisova8,S_BedouinB_1,S_Turkish_1,S_French_1,S_Burmese_1,S_Thai_1,⋯,S_Yoruba_2,S_Gambian_1,S_Mandenka_1,S_Ju_hoan_North_1,ustishim,a00,kk1,mota,bichon,loschbour
0,,,,,0,0,0,0,0,⋯,0,0,0,0,0.0,1,0.0,0,,0
0,,,,0.0,1,1,1,1,1,⋯,1,1,1,1,1.0,1,1.0,1,,1
0,,1.0,1.0,0.0,1,1,1,1,1,⋯,1,1,1,1,1.0,1,1.0,1,,1
0,,,,,0,0,0,0,0,⋯,0,0,0,0,,1,0.0,0,,0
0,,,,0.0,1,1,1,1,1,⋯,1,1,1,1,1.0,1,,1,1.0,1
0,,,,,0,0,0,0,0,⋯,0,0,0,0,0.0,1,0.0,0,,0


### What is the rate of back mutations/errors estimated using the high-quality samples?

In [37]:
# Split the sample names in `pops` by their population label: everything
# labelled "Africa" versus everything else.
africans <- pops$name[pops$pop == "Africa"]
nonafricans <- pops$name[pops$pop != "Africa"]

In [39]:
# Redefine the two groups: only a00 and S_Ju_hoan_North_1 remain in `africans`;
# the other listed African samples are moved into `nonafricans`.
africans <- c("a00", "S_Ju_hoan_North_1")
# union() instead of c(): this statement extends `nonafricans` in terms of
# itself, so re-running the cell with c() would append the same five names
# again and inflate length(nonafricans). union() keeps the original order and
# makes the cell safe to re-execute.
nonafricans <- union(
    nonafricans,
    c("S_Dinka_1", "S_Mbuti_1", "S_Yoruba_2", "S_Gambian_1", "S_Mandenka_1")
)

In [68]:
# TRUE for rows where none of the `africans` columns is flagged 1.
# Rows with a missing value in any of those columns yield NA (no na.rm here).
afr_anc <- rowSums(derived[africans]) == 0

In [69]:
# TRUE for rows where every one of the `nonafricans` columns is flagged 1.
# Rows with a missing value in any of those columns yield NA (no na.rm here).
nonafr_der <- rowSums(derived[nonafricans]) == length(nonafricans)

In [70]:
# Rows where all `africans` columns are 0 and all `nonafricans` columns are 1,
# restricted to the columns that are NOT listed in `pops$name`.
# which() is used because both masks come from rowSums() without na.rm and can
# be NA; indexing rows with an NA logical would insert all-NA rows into `fixed`.
# which() keeps only the rows where the combined condition is definitely TRUE.
fixed <- derived[which(afr_anc & nonafr_der), ] %>%
    select(-one_of(pops$name))

In [71]:
# Column-wise mean of the 0/1 flags in `fixed`, ignoring missing values.
summarise_all(fixed, mean, na.rm = TRUE)

chimp,spy1,mez2,comb_neand,denisova8,ustishim,kk1,mota,bichon,loschbour
0,0,0,0,0,1,1,1,1,1


In [38]:
# Per-column mean of the 0/1 flags over the filtered rows (NAs ignored),
# reshaped to long (ind, prop) form and sorted from highest to lowest `prop`.
# NOTE(review): `vars()` is called with no arguments, so this filter() call
# carries no actual row condition -- it looks like an unfinished edit and will
# most likely fail when run. Restore the intended predicate before relying on
# this cell.
# NOTE(review): the output recorded under this cell (execution count 38, labels
# such as `S_Turkish-1` rather than `S_Turkish_1`) appears to come from an
# earlier version of the cell/data and does not match the code as written --
# confirm and re-run.
filter(derived, vars()) %>%
    summarise_all(mean, na.rm = TRUE) %>%
    gather(ind, prop) %>%
    arrange(desc(prop))

ind,prop
a00,1.0
denisova8,0.02863436
spy1,0.02702703
comb_neand,0.02586207
mez2,0.02173913
bichon,0.01870748
S_Turkish-1,0.01704036
S_Karitiana-1,0.01697945
S_Sardinian-1,0.01626016
S_Burmese-1,0.01623084
