In [1]:
library(tidyverse)
library(magrittr)
library(here)
library(phangorn)
library(furrr)

# Load the development version of the package found in the current directory
# (the session log reports it as `ychr`).
devtools::load_all(".")

# Select the futures backend used by the `future_map_*()` calls in this notebook.
# NOTE(review): `multiprocess` is deprecated in newer releases of the future
# package — confirm the installed version before re-running.
plan(multiprocess)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   1.0.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract

here() starts at /mnt/expressions/mp/archaic-ychr
Loading required package: ape
Loading required package: future
Loading ychr


### Total number of informative non-African derived sites?

In [2]:
# Read the genotype table from the VCF located under the project root.
# NOTE(review): `mindp`/`maxdp` look like depth filters and `var_only` like a
# "variable sites only" switch — confirm against `read_vcf()`.
highcov <- read_vcf(here("data/vcf/full_modern.vcf.gz"), mindp = 3, maxdp = 0.98, var_only = TRUE)

In [3]:
samples <- read_info(highcov)
outgroup <- c("a00", "S_Ju_hoan_North_1")
ingroup <- filter(samples, pop %in% c("WestEur", "EastEur"))$name

# Diagnostic positions: every outgroup sample carries one allele and every
# called ingroup sample carries the other, i.e. the mean genotypes of the two
# groups differ by exactly 1.
outgroup_pos <- rowMeans(highcov[, outgroup])
ingroup_pos <- rowMeans(highcov[, ingroup], na.rm = TRUE)

# `which()` skips positions whose comparison is NA (a missing outgroup call)
# instead of producing all-NA rows that have to be filtered out afterwards.
info_highcov <- highcov[which(abs(outgroup_pos - ingroup_pos) == 1), ]

In [4]:
# Number of diagnostic positions retained in `info_highcov`.
nrow(info_highcov)

### Count non-African derived SNPs in each individual

In [5]:
#' Count ancestral and derived alleles of one individual at diagnostic sites
#'
#' Diagnostic sites are positions where all outgroup samples ("a00" and
#' "S_Ju_hoan_North_1") carry one allele and all called ingroup samples
#' (populations "WestEur" and "EastEur") carry the other. At those sites an
#' allele of the tested individual is counted as derived when it matches the
#' ingroup state and as ancestral when it matches the outgroup state, so the
#' result does not depend on which of the two states happens to be REF.
#'
#' @param individual Name of the genotype column of the tested individual.
#' @param gt Genotype table with one 0/1 genotype column per sample; it is
#'   also passed to `read_info()` to look up the population of each sample.
#' @return A one-row data frame with the columns `name`, `ancestral`,
#'   `derived`, `total` and `derived/total`.
estimate_cont <- function(individual, gt) {
    # get names of outgroup and ingroup individuals
    samples <- read_info(gt)
    outgroup <- c("a00", "S_Ju_hoan_North_1")
    ingroup <- samples$name[samples$pop %in% c("WestEur", "EastEur")]

    # fail early with a readable message if a required sample is absent
    missing_cols <- setdiff(unique(c(individual, outgroup, ingroup)), names(gt))
    if (length(missing_cols) > 0) {
        stop("genotype columns not found: ",
             paste(missing_cols, collapse = ", "), call. = FALSE)
    }

    # define diagnostic positions (outgroup pops different from ingroup pops);
    # `drop = FALSE` keeps a single-sample group as a table for rowMeans()
    outgroup_pos <- rowMeans(gt[, outgroup, drop = FALSE])
    ingroup_pos <- rowMeans(gt[, ingroup, drop = FALSE], na.rm = TRUE)

    # which() ignores positions where the comparison is NA (missing outgroup
    # call) rather than turning them into all-NA rows
    diagnostic <- which(abs(outgroup_pos - ingroup_pos) == 1)

    # alleles of the tested individual, restricted to called genotypes
    alleles <- gt[[individual]][diagnostic]
    called <- !is.na(alleles)
    alleles <- alleles[called]

    # polarise against the group states at each site: at a diagnostic site the
    # group means are exactly 0 or 1, so they are the group alleles themselves
    n_derived <- sum(alleles == ingroup_pos[diagnostic][called])
    n_ancestral <- sum(alleles == outgroup_pos[diagnostic][called])
    n_total <- length(alleles)

    result <- data.frame(
        name = fix_name(individual),
        ancestral = n_ancestral,
        derived = n_derived,
        total = n_total,
        stringsAsFactors = FALSE
    )
    # column name kept as produced by the original unnamed mutate() expression
    result[["derived/total"]] <- n_derived / n_total
    result
}

In [6]:
# Run `estimate_cont()` on each listed individual and row-bind the results.
counts <- future_map_dfr(c("spy1", "mez2", "den4", "den8", "elsidron2"), function(individual) {
    # "elsidron2" is read from the "lippold" capture, every other sample from
    # "full"; `individual` is a scalar here, so a plain if/else is sufficient
    capture <- if (individual == "elsidron2") "lippold" else "full"

    gt <- read_genotypes(individual, capture, mindp = 3, maxdp = 0.98, var_only = TRUE)

    estimate_cont(individual, gt)
})

In [7]:
# Display the per-individual allele counts.
counts

name,ancestral,derived,total,derived/total
<chr>,<int>,<int>,<int>,<dbl>
Spy 94a,15,1,16,0.0625
Mezmaiskaya 2,189,0,189,0.0
Denisova 4,14,0,14,0.0
Denisova 8,90,0,90,0.0
El Sidrón 1253 (560 kb),29,0,29,0.0


### Sanity check - derived-allele counts in present-day SGDP samples

In [8]:
# Apply the same counting procedure to every sample listed in `samples`.
counts_sgdp <- future_map_dfr(
    samples$name,
    function(sample_name) estimate_cont(sample_name, highcov)
)

In [9]:
# Show the samples ordered by increasing number of derived alleles.
arrange(counts_sgdp, derived)

name,ancestral,derived,total,derived/total
<chr>,<int>,<int>,<int>,<dbl>
S_Ju_hoan_North_1,268,0,268,0.0
A00,268,0,268,0.0
A00-1,249,0,249,0.0
A00-2,265,0,265,0.0
S_Dinka_1,106,153,259,0.5907336
S_Mandenka_1,107,156,263,0.5931559
S_Gambian_1,106,158,264,0.5984848
S_Yoruba_2,107,159,266,0.5977444
S_Mbuti_1,105,160,265,0.6037736
S_Burmese_1,0,260,260,1.0


# Estimate contamination based on a pileup

In [None]:
highcov_derived <- filter_derived(highcov)

# Site masks: every outgroup sample is 0 / every ingroup sample is 1.
out_anc <- rowMeans(highcov_derived[, outgroup]) == 0
in_der <- rowMeans(highcov_derived[, ingroup]) == 1

# Number of sites passing both masks that have a non-missing `chimp` value.
nrow(filter(highcov_derived[out_anc & in_der, ], !is.na(chimp)))

In [74]:
# Keep the sites where both masks are TRUE. Mask entries that are NA (any
# missing genotype in either group) are returned by `[` as all-NA rows.
x <- highcov_derived[out_anc & in_der, ]

In [75]:
# NOTE(review): `chimp` is displayed as a <dbl> column, not a logical one, so
# it is unclear what this predicate is meant to select; the recorded output
# also contains all-NA rows, which `filter()` would drop — the output may be
# stale. Confirm the intended condition (e.g. `!is.na(chimp)`).
filter(x, chimp)

chrom,pos,REF,ALT,chimp,ustishim,a00,a00_1,a00_2,S_BedouinB_1,⋯,S_Punjabi_1,S_Saami_2,S_Papuan_2,S_Karitiana_1,S_Ju_hoan_North_1,S_Dinka_1,S_Mbuti_1,S_Yoruba_2,S_Gambian_1,S_Mandenka_1
<chr>,<int>,<chr>,<chr>,<dbl>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
,,,,,,,,,,⋯,,,,,,,,,,
,,,,,,,,,,⋯,,,,,,,,,,
Y,2707859.0,G,C,1.0,1.0,0.0,,0.0,1.0,⋯,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Y,2722506.0,A,G,1.0,1.0,0.0,0.0,0.0,1.0,⋯,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Y,2733618.0,A,C,1.0,1.0,0.0,0.0,0.0,1.0,⋯,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
,,,,,,,,,,⋯,,,,,,,,,,


In [23]:
# Keep only the site coordinates and the two alleles.
xder <- select(x, chrom, pos, REF, ALT)

In [25]:
# Preview the first rows of the site table.
head(xder)

chrom,pos,REF,ALT
<chr>,<int>,<chr>,<chr>
Y,2707859,G,C
Y,2722506,A,G
Y,2733618,A,C
Y,2756471,T,A
Y,2863665,G,A
Y,6744622,G,C


In [None]:
df_der <- 

In [69]:
# Read the pileup table; the path is anchored at the project root with
# `here()`, like the VCF path earlier in this notebook, so that the cell does
# not depend on the current working directory.
df_pileup <- read_tsv(here("data/pileup/full_spy1.txt.gz"))

Parsed with column specification:
cols(
  chrom = col_character(),
  pos = col_double(),
  ref = col_character(),
  pileup = col_character(),
  A = col_double(),
  C = col_double(),
  G = col_double(),
  T = col_double()
)


In [65]:
# Attach the pileup columns to the selected sites. The join keys are spelled
# out so that a future column shared by both tables cannot silently change
# the join.
df <- left_join(xder, df_pileup, by = c("chrom", "pos")) %>%
    select(-ref) %>%
    # keep sites whose pileup string has at least 3 characters; sites without
    # a pileup entry (NA) are dropped as well
    filter(nchar(pileup) >= 3)

Joining, by = c("chrom", "pos")


In [66]:
# Per-site proportion of the A/C/G/T counts that support the REF allele.
cont <- mutate(
    df,
    prop = case_when(
        REF == "A" ~ A,
        REF == "C" ~ C,
        REF == "G" ~ G,
        REF == "T" ~ T
    ) / (A + C + G + T)
)

In [68]:
# Display the per-site table including the `prop` column.
cont

chrom,pos,REF,ALT,pileup,A,C,G,T,prop
<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Y,2707859,G,C,CCC,0,3,0,0,0.0
Y,7173143,A,G,GGG,0,0,3,0,0.0
Y,14079528,T,C,TTT,0,0,0,3,1.0
Y,14149010,A,G,GGG,0,0,3,0,0.0
Y,14324714,T,C,TTC,0,1,0,2,0.6666667
Y,16364286,A,C,CCC,0,3,0,0,0.0
Y,17186912,T,C,CCC,0,3,0,0,0.0
Y,17464197,C,T,CTT,0,1,0,2,0.3333333
Y,17493513,T,C,CCC,0,3,0,0,0.0
Y,17753650,G,A,AAA,3,0,0,0,0.0


In [67]:
# Mean REF-allele proportion across sites, ignoring missing values.
mean(cont$prop, na.rm = TRUE)

In [54]:
# Mean REF-allele proportion across sites, ignoring missing values.
mean(cont$prop, na.rm = TRUE)