In [1]:
library(tidyverse)
library(here)

suppressPackageStartupMessages(library(VariantAnnotation))

devtools::load_all(".")

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1       ✔ purrr   0.3.2  
✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
here() starts at /Users/martin_petr/projects/ychr
Loading ychr


### Read Mez2 genotypes generated by `bam-sample`

In [2]:
# Load the bam-sample genotype calls for Mez2 via the project's read_vcf().
# NOTE(review): mindp/maxdp presumably mirror the depth filter applied by hand
# to the snpAD VCF (depth >= 3, depth <= 97.5% quantile) -- confirm in read_vcf.
bamsample <- read_vcf(
    here("data/vcf/full_mez2.vcf.gz"),
    mindp = 3,
    maxdp = 0.975
)

### Read Mez2 genotypes generated by snpAD

In [49]:
# Read the snpAD genotype calls and flatten them into one tibble (`df`) with
# one row per VCF record: chrom, pos, REF, ALT, then one genotype column per
# sample. Genotypes failing the depth filter are set to NA.
path <- here("data/vcf/snpad.vcf.gz")

vcf <- VariantAnnotation::readVcf(path)
gr <- GenomicRanges::granges(vcf)
dp <- VariantAnnotation::geno(vcf)$DP

# Per-sample depth filter: a call passes when its depth is at least 3 and no
# higher than that sample's own 97.5% depth quantile. The comparison already
# yields a logical vector, so no ifelse() wrapper is needed.
mask <- apply(dp, 2, function(depth) {
    depth >= 3 & depth <= quantile(depth, probs = 0.975, na.rm = TRUE)
})
# A "chimp" column, when present, is exempt from the depth filter.
if ("chimp" %in% colnames(mask)) {
    mask[, "chimp"] <- TRUE
}

# Missing calls (".") and calls failing the depth filter both become NA.
gt <- VariantAnnotation::geno(vcf)$GT %>%
    replace(. == ".", NA) %>%
    replace(!mask, NA)

# All TRUE, so every record is kept; no site is dropped by this selector.
biallelic_pos <- rep(TRUE, length(gr))
gt_df <- tibble::as_tibble(gt)
# NOTE(review): unlist() on ALT yields one value per record only if every
# record lists exactly one ALT entry -- TODO confirm for this VCF.
info_df <- tibble::tibble(
    chrom = as.character(GenomicRanges::seqnames(gr))[biallelic_pos],
    pos = GenomicRanges::start(gr)[biallelic_pos],
    REF = as.character(gr$REF)[biallelic_pos],
    ALT = as.character(unlist(gr$ALT[biallelic_pos, ]))
)
df <- dplyr::bind_cols(info_df, gt_df)
# Replace dashes in column names with underscores.
colnames(df) <- str_replace_all(colnames(df), "-", "_")

In [50]:
# Keep the assembled snpAD genotype table under a descriptive name.
snpad <- df

### Read pileups

In [5]:
# Load the per-site pileup table for Mez2 and rename `ref` to `REF` so it can
# serve as a join key alongside chrom and pos.
# Only `pos` is parsed as integer; the base-count columns (A, C, G, T) are
# read as character.
pileups <- here("data/pileup/full_mez2.txt.gz") %>%
    read_tsv(col_types = "cicccccc") %>%
    rename(REF = ref)

In [6]:
# Preview the first rows of the pileup table.
pileups %>% head()

chrom,pos,REF,pileup,A,C,G,T
<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Y,2649811,A,A,1,0,0,0
Y,2649812,A,AA,2,0,0,0
Y,2649813,A,AAA,3,0,0,0
Y,2649814,A,AAAA,4,0,0,0
Y,2649815,A,AAAA,4,0,0,0
Y,2649816,A,AAAA,4,0,0,0


### Merge all three tables into one

In [75]:
# Combine the three per-site tables on (chrom, pos, REF):
#   - full join of the bam-sample and snpAD calls keeps sites present in either;
#   - left join then attaches the pileup counts where available.
# Both call tables carry an ALT column, so the join yields ALT.x (bam-sample)
# and ALT.y (snpAD). The snpAD table is referenced by its saved name `snpad`
# rather than the temporary `df`.
merged <- bamsample %>%
    full_join(snpad, by = c("chrom", "pos", "REF")) %>%
    left_join(pileups, by = c("chrom", "pos", "REF"))

In [76]:
# Number of sites in the merged table.
merged %>% nrow()

In [77]:
# Preview the first rows of the merged table.
merged %>% head()

chrom,pos,REF,ALT.x,mez2,ALT.y,full_mez2,pileup,A,C,G,T
<chr>,<int>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Y,2649811,A,,,,,A,1,0,0,0
Y,2649812,A,,,,,AA,2,0,0,0
Y,2649813,A,,0.0,,,AAA,3,0,0,0
Y,2649814,A,,0.0,,,AAAA,4,0,0,0
Y,2649815,A,,0.0,,,AAAA,4,0,0,0
Y,2649816,A,,0.0,,,AAAA,4,0,0,0


In [78]:
# Check that the two ALT columns agree at every row where the comparison is
# not NA (NA comparisons are dropped by na.rm).
all(merged$ALT.x == merged$ALT.y, na.rm = TRUE)

In [80]:
# Keep a single ALT column (taken from ALT.y), drop the two join-suffixed
# copies, and add `total`, the sum of the four base counts. The counts are
# stored as character, hence the as.numeric() conversions.
merged <- merged %>%
    mutate(
        ALT = ALT.y,
        total = as.numeric(A) + as.numeric(C) + as.numeric(G) + as.numeric(T)
    ) %>%
    select(-ALT.x, -ALT.y)

# Miscalled ALTs?

In [81]:
# Sites where bam-sample calls ALT (mez2 is 1) but snpAD reports something
# other than a homozygous-ALT genotype. mez2 is numeric, so it is compared
# with a number rather than a string.
merged %>%
    filter(mez2 == 1 & full_mez2 != "1/1") %>%
    head()

chrom,pos,REF,mez2,full_mez2,pileup,A,C,G,T,ALT,total
<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>


# Miscalled REFs?

In [82]:
# Sites where bam-sample calls REF (mez2 is 0) but snpAD reports something
# other than a homozygous-REF genotype. mez2 is numeric, so it is compared
# with a number rather than a string.
merged %>%
    filter(mez2 == 0 & full_mez2 != "0/0") %>%
    head()

chrom,pos,REF,mez2,full_mez2,pileup,A,C,G,T,ALT,total
<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>


# Investigate snpAD hets (despite this being Y chromosome)

How many?

In [83]:
# Count sites whose snpAD genotype is neither homozygous REF nor homozygous
# ALT. Rows with a missing snpAD genotype are dropped by filter().
merged %>%
    filter(full_mez2 != "0/0" & full_mez2 != "1/1") %>%
    nrow()

Do I even call something at snpAD het sites?

In [84]:
# snpAD "0/1" sites at which bam-sample also produced a (non-NA) call.
merged %>%
    filter(full_mez2 == "0/1", !is.na(mez2))

chrom,pos,REF,mez2,full_mez2,pileup,A,C,G,T,ALT,total
<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>


Nope — my genotyper makes no call at any of the snpAD het sites.

Write out all snpAD het sites:

In [85]:
# List every site that snpAD genotypes as "0/1".
merged %>%
    filter(full_mez2 == "0/1")

chrom,pos,REF,mez2,full_mez2,pileup,A,C,G,T,ALT,total
<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
Y,2841055,G,,0/1,GGGGGGGGAAAAAAAGGG,7,0,11,0,A,18
Y,2854738,G,,0/1,GAGGGAAGGGGGAAA,6,0,9,0,A,15
Y,3405761,T,,0/1,TTTTTTTACATTTTAT,3,1,0,12,A,16
Y,3405762,A,,0/1,AAAAAAATATAAAATAA,14,0,0,3,T,17
Y,3406061,G,,0/1,GGAAGAAGGGGGGGGAGG,5,0,13,0,A,18
Y,3406277,C,,0/1,CCTTCCCCCCCCCCTTCTCCCTCCTCCTCC,0,22,0,8,T,30
Y,3406292,T,,0/1,CTTTTTCTTTTTTTTTCTTCTTCTTT,0,5,0,21,C,26
Y,3406365,G,,0/1,GGGAAAGGGGGGAAGGGAGG,6,0,14,0,A,20
Y,3406710,C,,0/1,CCCCCAACCCCCCACCCCA,4,15,0,0,A,19
Y,3406853,T,,0/1,TTTTTTTTTCTCTTCTCCTTCTCT,0,7,0,17,C,24


Is there a mixture of bases at sites that my genotyper skips but snpAD calls?

In [87]:
# Sites with pileup data that snpAD calls but bam-sample does not, ordered by
# base counts and restricted to fewer than 4 observed bases in total.
# A/C/G/T are character columns, so desc() orders them as strings; with
# total < 4 every count is a single digit, so this matches numeric order.
merged %>%
    filter(is.na(mez2), !is.na(full_mez2), !is.na(A)) %>%
    arrange(desc(A), desc(C), desc(G), desc(T)) %>%
    filter(total < 4)

chrom,pos,REF,mez2,full_mez2,pileup,A,C,G,T,ALT,total
<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
Y,2652955,A,,0/0,AAC,2,1,0,0,,3
Y,7637931,A,,0/0,CAA,2,1,0,0,,3
Y,8668392,A,,0/0,ACA,2,1,0,0,,3
Y,14852078,A,,0/0,ACA,2,1,0,0,,3
Y,16375336,A,,0/0,AAC,2,1,0,0,,3
Y,18696792,A,,0/0,ACA,2,1,0,0,,3
Y,21642771,A,,0/0,AAC,2,1,0,0,,3
Y,23387307,A,,0/0,ACA,2,1,0,0,,3
Y,2656951,A,,0/0,AAG,2,0,1,0,,3
Y,2693166,A,,0/0,AAG,2,0,1,0,,3
