In [1]:
library(tidyverse)
library(magrittr)
library(here)
library(phangorn)
library(furrr)

# load the package that lives in the current directory (reported as "Loading ychr" in the output below)
devtools::load_all(".")

# evaluate the future_map_* calls used in this notebook in parallel
# NOTE(review): `multiprocess` is reportedly deprecated in newer releases of the
# future package -- confirm against the installed version before upgrading
plan(multiprocess)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.2.1     [32m✔[39m [34mpurrr  [39m 0.3.2
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 1.0.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract

here() starts at /mnt/expressions/mp/archaic-ychr
Loading required package: ape
Loading required package: future
Loading ychr


### Total number of informative non-African derived sites?

In [2]:
# read the genotypes stored in the modern-sample VCF; `mindp`, `maxdp` and
# `var_only` are interpreted by read_vcf (presumably defined in the loaded ychr
# package -- their exact semantics are not visible here)
highcov <- read_vcf(here("data/vcf/full_modern.vcf.gz"), mindp = 3, maxdp = 0.98, var_only = TRUE)

In [3]:
# sample annotation plus the two groups of samples that are compared
samples <- read_info(highcov)
outgroup <- c("a00", "S_Ju_hoan_North_1")
ingroup <- filter(samples, pop %in% c("WestEur", "EastEur"))$name

# define diagnostic positions (outgroup pops different from ingroup pops):
# the mean allele state has to be exactly 0 in one group and exactly 1 in the
# other; missing ingroup genotypes are ignored, missing outgroup genotypes
# make the comparison NA
outgroup_pos <- rowMeans(highcov[, outgroup])
ingroup_pos <- rowMeans(highcov[, ingroup], na.rm = TRUE)
# indexing with an NA comparison result yields all-NA rows, which the
# filter on `chrom` removes again
info_highcov <- highcov[abs(outgroup_pos - ingroup_pos) == 1, ] %>% filter(!is.na(chrom))

In [4]:
# number of diagnostic positions found in the high-coverage genotypes
nrow(info_highcov)

### Count non-African derived SNPs in each individual

In [5]:
# Count the ancestral and derived alleles carried by one individual at sites
# that are fixed differences between the outgroup and the ingroup samples.
#
# Arguments:
#   individual: name of the genotype column of `gt` that is tested
#   gt: genotype table with the columns chrom, pos, REF, ALT and one column of
#       0/1 allele states per sample; it must contain both outgroup samples
#
# Returns a one-row data frame with the columns name, ancestral, derived,
# total and `derived/total`.
#
# The ancestral state of every site is taken from the outgroup, so an allele
# counts as derived when it differs from the outgroup allele, no matter
# whether that allele happens to be REF or ALT.
estimate_cont <- function(individual, gt) {
    # get names of outgroup and ingroup individuals
    samples <- read_info(gt)
    outgroup <- c("a00", "S_Ju_hoan_North_1")
    ingroup <- samples$name[samples$pop %in% c("WestEur", "EastEur")]

    # subset only to relevant individuals
    gt <- gt[, c("chrom", "pos", "REF", "ALT", unique(c(individual, outgroup, ingroup)))]

    # define diagnostic positions (outgroup pops different from ingroup pops);
    # drop = FALSE keeps a data frame even if a group has a single sample
    outgroup_pos <- rowMeans(gt[, outgroup, drop = FALSE])
    ingroup_pos <- rowMeans(gt[, ingroup, drop = FALSE], na.rm = TRUE)
    # which() discards sites whose comparison is NA (missing outgroup call)
    informative <- which(abs(outgroup_pos - ingroup_pos) == 1)

    # alleles of the tested individual and the matching outgroup (ancestral)
    # states, restricted to sites where the individual has a call
    alleles <- gt[[individual]][informative]
    ancestral_state <- outgroup_pos[informative]
    called <- !is.na(alleles)
    alleles <- alleles[called]
    ancestral_state <- ancestral_state[called]

    # at a diagnostic site the outgroup state is exactly 0 or 1, so the
    # derived state is its complement
    result <- data.frame(
        name = fix_name(individual),
        ancestral = sum(alleles == ancestral_state),
        derived = sum(alleles == 1 - ancestral_state),
        total = length(alleles),
        stringsAsFactors = FALSE
    )
    result[["derived/total"]] <- result$derived / result$total
    result
}

In [6]:
# allele counts at the diagnostic sites for every archaic individual,
# combined into one table (one row per individual)
counts <- future_map_dfr(
    c("spy1", "mez2", "den4", "den8", "elsidron2"),
    function(id) {
        # only "elsidron2" is read with the "lippold" capture label
        capture <- if (id == "elsidron2") "lippold" else "full"
        estimate_cont(id, read_genotypes(id, capture, mindp = 3, maxdp = 0.98, var_only = TRUE))
    }
)

In [7]:
# show the ancestral/derived allele counts of the archaic individuals
counts

name,ancestral,derived,total,derived/total
<chr>,<int>,<int>,<int>,<dbl>
Spy 94a,15,1,16,0.0625
Mezmaiskaya 2,189,0,189,0.0
Denisova 4,14,0,14,0.0
Denisova 8,90,0,90,0.0
El Sidrón 1253 (560 kb),29,0,29,0.0


### Sanity check - derived-allele counts in present-day SGDP samples

In [8]:
# run the same allele counting for every sample of the high-coverage genotype table
counts_sgdp <- future_map_dfr(samples$name, function(sample_name) estimate_cont(sample_name, highcov))

In [9]:
# list the samples ordered by increasing number of derived alleles
arrange(counts_sgdp, derived)

name,ancestral,derived,total,derived/total
<chr>,<int>,<int>,<int>,<dbl>
S_Ju_hoan_North_1,268,0,268,0.0
A00,268,0,268,0.0
A00-1,249,0,249,0.0
A00-2,265,0,265,0.0
S_Dinka_1,106,153,259,0.5907336
S_Mandenka_1,107,156,263,0.5931559
S_Gambian_1,106,158,264,0.5984848
S_Yoruba_2,107,159,266,0.5977444
S_Mbuti_1,105,160,265,0.6037736
S_Burmese_1,0,260,260,1.0


# Estimate contamination based on a pileup

In [33]:
# coordinates and alleles of the diagnostic positions, without genotype columns
info_pos <- select(info_highcov, chrom, pos, REF, ALT)

In [54]:
# For every archaic individual: read its pileup table, restrict it to the
# diagnostic positions and compute, per position, the proportion of observed
# bases that match the REF allele. The per-individual tables are row-bound.
df_pileups <- future_map_dfr(c("spy1", "mez2", "den4", "den8", "elsidron2"), function(individual) {
    # only "elsidron2" uses the "lippold" file prefix
    capture <- if (individual == "elsidron2") "lippold" else "full"
    # `col_types` is spelled out in full instead of relying on partial
    # argument matching; the eight columns are chrom, pos, ref, pileup and
    # the four base counts A, C, G, T
    df_pileup <- read_tsv(here(paste0("data/pileup/", capture, "_", individual, ".txt.gz")),
                          col_types = "cicciiii")

    left_join(info_pos, df_pileup, by = c("chrom", "pos")) %>%
        select(-ref) %>%
        # keep positions with a pileup string of at least three characters;
        # positions absent from the pileup file have NA here and are dropped
        filter(nchar(pileup) >= 3) %>%
        mutate(
            # proportion of A/C/G/T counts that carry the REF allele
            prop = case_when(
                REF == "A" ~ A / (A + C + G + T),
                REF == "C" ~ C / (A + C + G + T),
                REF == "G" ~ G / (A + C + G + T),
                REF == "T" ~ T / (A + C + G + T)
            ),
            name = fix_name(individual)
        )
})

In [55]:
# average REF-allele proportion over all retained positions of each individual
summarise(group_by(df_pileups, name), cont = mean(prop))

name,cont
<chr>,<dbl>
Denisova 4,0.19479167
Denisova 8,0.03354257
El Sidrón 1253 (560 kb),0.02967033
Mezmaiskaya 2,0.03860624
Spy 94a,0.10625


In [56]:
# inspect the individual positions behind the Denisova 4 estimate
filter(df_pileups, name == "Denisova 4")

chrom,pos,REF,ALT,pileup,A,C,G,T,prop,name
<chr>,<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<dbl>,<chr>
Y,2707859,G,C,TCC,0,2,0,1,0.0,Denisova 4
Y,2757670,T,C,TCC,0,2,0,1,0.3333333,Denisova 4
Y,7173143,A,G,GGG,0,0,3,0,0.0,Denisova 4
Y,7533511,A,G,GGAA,2,0,2,0,0.5,Denisova 4
Y,7740081,A,G,GGG,0,0,3,0,0.0,Denisova 4
Y,7778691,T,C,CCCT,0,3,0,1,0.25,Denisova 4
Y,7796647,T,C,CCT,0,2,0,1,0.3333333,Denisova 4
Y,7887815,T,A,TAA,2,0,0,1,0.3333333,Denisova 4
Y,8474189,C,A,AAAA,4,0,0,0,0.0,Denisova 4
Y,8526565,A,G,GAG,1,0,2,0,0.3333333,Denisova 4


In [59]:
# Same per-position REF-allele proportion as computed for the archaic
# individuals, here for the "chag" samples whose pileup files are read from
# the neighbouring chag-ychr directory. The per-sample tables are row-bound.
chag_pileups <- future_map_dfr(paste0("chag", c("01", "02", "06", "07", "09", "1141", "13", "14", "19", "41")),
                               function(individual) {
    # all of these samples use the "full" file prefix (none of the ids can
    # equal "elsidron2", so the former conditional always chose "full");
    # `col_types` is spelled out in full instead of relying on partial
    # argument matching
    df_pileup <- read_tsv(paste0("../../chag-ychr/data/pileup/full_", individual, ".txt.gz"),
                          col_types = "cicciiii")

    left_join(info_pos, df_pileup, by = c("chrom", "pos")) %>%
        select(-ref) %>%
        # keep positions with a pileup string of at least three characters;
        # positions absent from the pileup file have NA here and are dropped
        filter(nchar(pileup) >= 3) %>%
        mutate(
            # proportion of A/C/G/T counts that carry the REF allele
            prop = case_when(
                REF == "A" ~ A / (A + C + G + T),
                REF == "C" ~ C / (A + C + G + T),
                REF == "G" ~ G / (A + C + G + T),
                REF == "T" ~ T / (A + C + G + T)
            ),
            name = fix_name(individual)
        )
})

In [60]:
# average REF-allele proportion over all retained positions of each chag sample
summarise(group_by(chag_pileups, name), cont = mean(prop))

name,cont
<chr>,<dbl>
chag01,0.25528455
chag02,0.0
chag07,0.01960784
chag09,0.125
chag1141,0.0
chag13,0.01666667
chag19,0.07692308
chag41,0.06435971
