# Comparing various genotyping strategies

In [1]:
library(tidyverse)
library(here)

devtools::load_all(".")

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   1.0.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
here() starts at /mnt/expressions/mp/ychr
Loading ychr


# A00

In [2]:
# Row count of the A00 call set produced with BAQ. mindp/maxdp are forwarded
# to read_vcf() unchanged (presumably depth filters -- TODO confirm in ychr).
nrow(read_vcf(here("test/a00_baq.vcf.gz"), mindp = 3, maxdp = 0.98))

In [3]:
# Same count for the call set produced without BAQ.
nrow(read_vcf(here("test/a00_nobaq.vcf.gz"), mindp = 3, maxdp = 0.98))

In [4]:
# Same count for the consensus call set.
nrow(read_vcf(here("test/a00_consensus.vcf.gz"), mindp = 3, maxdp = 0.98))

In [5]:
# Same count for the tolerance call set.
nrow(read_vcf(here("test/a00_tolerance.vcf.gz"), mindp = 3, maxdp = 0.98))

In [18]:
# Read the combined A00 table and keep only rows where at least one of the
# four genotype columns (baq, nobaq, cons, tol) is not NA.
a00 <- here("test/genotyping_a00.vcf.gz") %>%
    read_vcf(mindp = 3, maxdp = 0.98) %>%
    filter(!(is.na(baq) & is.na(nobaq) & is.na(cons) & is.na(tol)))

In [19]:
# Number of rows with a call from at least one strategy.
nrow(a00)

In [23]:
# Rows where the baq and nobaq genotypes differ. filter() drops rows whose
# condition is NA, so rows missing either value are not listed.
a00 %>% filter(baq != nobaq)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Y,2805552,G,A,0,1,,
Y,3715248,A,G,0,1,,
Y,8561368,C,T,0,1,,
Y,8561376,C,T,0,1,,
Y,17029481,G,A,0,1,,
Y,17029484,T,A,0,1,,


In [38]:
# Count the baq/nobaq disagreements by direction (0 -> 1 versus 1 -> 0).
a00 %>%
    filter(baq != nobaq) %>%
    group_by(baq, nobaq) %>%
    tally()

baq,nobaq,n
<dbl>,<dbl>,<int>
0,1,6


In [74]:
# Rows with a tol value but no cons value. Grouping on `ALT == ""` makes
# sample_n() draw 10 rows separately from rows with and without an ALT allele;
# the helper column is dropped again before printing.
# NOTE: sample_n() samples without replacement, so each group needs >= 10 rows.
a00 %>%
    filter(is.na(cons), !is.na(tol)) %>%
    group_by(alts = ALT == "") %>%
    sample_n(10) %>%
    ungroup() %>%
    select(-alts)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Y,19128936,A,C,,1,,1
Y,10035969,C,T,1.0,1,,1
Y,14243754,C,T,1.0,1,,1
Y,10031638,C,G,1.0,1,,1
Y,23466468,T,C,1.0,1,,1
Y,22549620,G,A,1.0,1,,1
Y,8388616,A,C,,1,,1
Y,7759364,T,C,1.0,1,,1
Y,9930635,C,G,1.0,1,,1
Y,17566434,C,A,1.0,1,,1


Based on the table directly above, the 90% tolerance cutoff produces calls that are consistent with the bcftools calls.

# Mezmaiskaya 2 (high-coverage archaic, ideal case)

In [28]:
# Row count of the Mezmaiskaya 2 call set produced with BAQ. mindp/maxdp are
# forwarded to read_vcf() unchanged (presumably depth filters -- TODO confirm).
nrow(read_vcf(here("test/mez2_baq.vcf.gz"), mindp = 3, maxdp = 0.98))

In [29]:
# Same count for the call set produced without BAQ.
nrow(read_vcf(here("test/mez2_nobaq.vcf.gz"), mindp = 3, maxdp = 0.98))

In [30]:
# Same count for the consensus call set.
nrow(read_vcf(here("test/mez2_consensus.vcf.gz"), mindp = 3, maxdp = 0.98))

In [31]:
# Same count for the tolerance call set.
nrow(read_vcf(here("test/mez2_tolerance.vcf.gz"), mindp = 3, maxdp = 0.98))

In [32]:
# Read the combined Mezmaiskaya 2 table and keep only rows where at least one
# of the four genotype columns (baq, nobaq, cons, tol) is not NA.
mez2 <- here("test/genotyping_mez2.vcf.gz") %>%
    read_vcf(mindp = 3, maxdp = 0.98) %>%
    filter(!(is.na(baq) & is.na(nobaq) & is.na(cons) & is.na(tol)))

In [35]:
# Number of rows with a call from at least one strategy.
nrow(mez2)

In [39]:
# Random sample of 20 rows where the baq and nobaq genotypes differ. filter()
# drops rows whose condition is NA, so rows missing either value are excluded.
mez2 %>%
    filter(baq != nobaq) %>%
    sample_n(20)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Y,21153166,T,A,0,1,,
Y,9417686,C,T,0,1,,
Y,19564220,G,A,0,1,,
Y,15787832,G,A,1,0,,
Y,15516284,C,T,0,1,,
Y,8316824,A,G,0,1,,
Y,8455563,T,C,1,0,,
Y,9984843,G,A,0,1,,
Y,7280606,T,C,0,1,,
Y,23540925,C,T,0,1,,


In [40]:
# Count the baq/nobaq disagreements by direction (0 -> 1 versus 1 -> 0).
mez2 %>%
    filter(baq != nobaq) %>%
    group_by(baq, nobaq) %>%
    tally()

baq,nobaq,n
<dbl>,<dbl>,<int>
0,1,35
1,0,5


In [41]:
# Rows where both cons and tol are present but disagree; rows with an NA in
# either column are dropped because filter() discards NA conditions.
mez2 %>% filter(cons != tol)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>


In [75]:
# Rows with a tol value but no cons value. Grouping on `ALT == ""` makes
# sample_n() draw 10 rows separately from rows with and without an ALT allele;
# the helper column is dropped again before printing.
# NOTE: sample_n() samples without replacement, so each group needs >= 10 rows.
mez2 %>%
    filter(is.na(cons), !is.na(tol)) %>%
    group_by(alts = ALT == "") %>%
    sample_n(10) %>%
    ungroup() %>%
    select(-alts)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Y,15304343,T,C,1,1,,1
Y,3718196,C,T,1,1,,1
Y,17028173,C,T,1,1,,1
Y,22646080,A,G,1,1,,1
Y,23575707,T,C,1,1,,1
Y,17032088,T,C,1,1,,1
Y,18109555,A,C,1,1,,1
Y,22889018,A,G,1,1,,1
Y,18025230,A,C,1,1,,1
Y,18113272,T,C,1,1,,1


# Denisova 8 (low coverage archaic, extreme case)

At a 90% consensus cutoff, we expect the lower-coverage genotypes to be the same as with the strict 100% consensus.

In [51]:
# Row count of the Denisova 8 call set produced with BAQ. mindp/maxdp are
# forwarded to read_vcf() unchanged (presumably depth filters -- TODO confirm).
nrow(read_vcf(here("test/den8_baq.vcf.gz"), mindp = 3, maxdp = 0.98))

In [52]:
# Same count for the call set produced without BAQ.
nrow(read_vcf(here("test/den8_nobaq.vcf.gz"), mindp = 3, maxdp = 0.98))

In [53]:
# Same count for the consensus call set.
nrow(read_vcf(here("test/den8_consensus.vcf.gz"), mindp = 3, maxdp = 0.98))

In [54]:
# Same count for the tolerance call set.
nrow(read_vcf(here("test/den8_tolerance.vcf.gz"), mindp = 3, maxdp = 0.98))

In [55]:
# Read the combined Denisova 8 table and keep only rows where at least one of
# the four genotype columns (baq, nobaq, cons, tol) is not NA.
den8 <- here("test/genotyping_den8.vcf.gz") %>%
    read_vcf(mindp = 3, maxdp = 0.98) %>%
    filter(!(is.na(baq) & is.na(nobaq) & is.na(cons) & is.na(tol)))

In [56]:
# Number of rows with a call from at least one strategy.
nrow(den8)

In [57]:
# Random sample of 20 rows where the baq and nobaq genotypes differ. filter()
# drops rows whose condition is NA, so rows missing either value are excluded.
den8 %>%
    filter(baq != nobaq) %>%
    sample_n(20)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Y,18822816,C,T,0,1,,
Y,2664276,C,T,0,1,,
Y,8516147,G,A,0,1,,
Y,18650271,C,T,0,1,,
Y,6426410,C,T,0,1,,
Y,16462198,G,A,0,1,,
Y,22786283,C,T,0,1,,
Y,15524224,C,T,0,1,,
Y,21373591,C,T,0,1,,
Y,14997141,C,T,0,1,,


In [58]:
# Count the baq/nobaq disagreements by direction (0 -> 1 versus 1 -> 0).
den8 %>%
    filter(baq != nobaq) %>%
    group_by(baq, nobaq) %>%
    tally()

baq,nobaq,n
<dbl>,<dbl>,<int>
0,1,224
1,0,11


In [59]:
# Rows where both cons and tol are present but disagree; rows with an NA in
# either column are dropped because filter() discards NA conditions.
den8 %>% filter(cons != tol)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>


In [83]:
# Rows with a tol value but no cons value. Grouping on `ALT == ""` makes
# sample_n() draw 3 rows separately from rows with and without an ALT allele;
# the helper column is dropped again before printing.
# NOTE: sample_n() samples without replacement, so each group needs >= 3 rows.
den8 %>%
    filter(is.na(cons), !is.na(tol)) %>%
    group_by(alts = ALT == "") %>%
    sample_n(3) %>%
    ungroup() %>%
    select(-alts)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Y,17517362,A,G,1,1,,1
Y,8396636,A,G,1,1,,1
Y,18759298,A,G,1,1,,1
Y,17243535,C,,0,0,,0
Y,21489690,C,,0,0,,0
Y,9415633,C,,0,0,,0


# Spy 1 (low coverage archaic, even more extreme case)

At a 90% consensus cutoff, we expect the lower-coverage genotypes to be the same as with the strict 100% consensus.

In [84]:
# Row count of the Spy 1 call set produced with BAQ. mindp/maxdp are
# forwarded to read_vcf() unchanged (presumably depth filters -- TODO confirm).
nrow(read_vcf(here("test/spy1_baq.vcf.gz"), mindp = 3, maxdp = 0.98))

In [85]:
# Same count for the call set produced without BAQ.
nrow(read_vcf(here("test/spy1_nobaq.vcf.gz"), mindp = 3, maxdp = 0.98))

In [86]:
# Same count for the consensus call set.
nrow(read_vcf(here("test/spy1_consensus.vcf.gz"), mindp = 3, maxdp = 0.98))

In [87]:
# Same count for the tolerance call set.
nrow(read_vcf(here("test/spy1_tolerance.vcf.gz"), mindp = 3, maxdp = 0.98))

In [88]:
# Read the combined Spy 1 table and keep only rows where at least one of the
# four genotype columns (baq, nobaq, cons, tol) is not NA.
spy1 <- here("test/genotyping_spy1.vcf.gz") %>%
    read_vcf(mindp = 3, maxdp = 0.98) %>%
    filter(!(is.na(baq) & is.na(nobaq) & is.na(cons) & is.na(tol)))

In [90]:
# Number of rows with a call from at least one strategy.
nrow(spy1)

In [91]:
# Random sample of 20 rows where the baq and nobaq genotypes differ. filter()
# drops rows whose condition is NA, so rows missing either value are excluded.
spy1 %>%
    filter(baq != nobaq) %>%
    sample_n(20)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Y,21242313,C,T,0,1,,
Y,8458374,C,T,0,1,,
Y,14621853,G,A,0,1,,
Y,8586539,C,T,1,0,,
Y,21398150,C,G,1,0,,
Y,15551255,G,A,0,1,,
Y,8566258,C,T,0,1,,
Y,18126700,G,A,0,1,,
Y,19519649,C,T,0,1,,
Y,23509937,C,T,0,1,,


In [92]:
# Count the baq/nobaq disagreements by direction (0 -> 1 versus 1 -> 0).
spy1 %>%
    filter(baq != nobaq) %>%
    group_by(baq, nobaq) %>%
    tally()

baq,nobaq,n
<dbl>,<dbl>,<int>
0,1,61
1,0,14


In [93]:
# Rows where both cons and tol are present but disagree; rows with an NA in
# either column are dropped because filter() discards NA conditions.
spy1 %>% filter(cons != tol)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>


In [100]:
# Rows with a tol value but no cons value. Grouping on `ALT == ""` makes
# sample_n() draw 10 rows separately from rows with and without an ALT allele;
# the helper column is dropped again before printing.
# NOTE: sample_n() samples without replacement, so each group needs >= 10 rows.
spy1 %>%
    filter(is.na(cons), !is.na(tol)) %>%
    group_by(alts = ALT == "") %>%
    sample_n(10) %>%
    ungroup() %>%
    select(-alts)

chrom,pos,REF,ALT,baq,nobaq,cons,tol
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
