In [1]:
library(tidyverse)
library(magrittr)
library(here)
suppressPackageStartupMessages(library(rtracklayer))

devtools::load_all(".")

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.2.0     [32m✔[39m [34mpurrr  [39m 0.3.2
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 0.8.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract

here() starts at /mnt/expressions/mp/ychr
Loading ychr


Read functionally important positions from Mendez et al.:

In [2]:
# Download the zipped supplementary table (mmc6) to a temporary file.
temp <- tempfile()
download.file(
    url = "https://ars.els-cdn.com/content/image/1-s2.0-S0002929716300337-mmc6.zip",
    destfile = temp
)

# Parse the table directly out of the archive. `skip = 1` drops the first
# line of the file (presumably its original header row), and the column
# names below are used instead.
df <- read_tsv(
    file = unz(temp, "ajhg2064mmc6_V1.txt"),
    col_names = c(
        "pos", "REF", "mendez_a00", "mendez_elsidron1", "mendez_chimp",
        "branch", "cov_ef", "neanderhal_discordant_base_calls",
        "Filter2", "Filter3", "Filter4",
        "Genomic_location/effect/functional_prediction"
    ),
    skip = 1
)

# Drop the Filter* columns, label every site with chromosome "Y" and move the
# coordinate columns to the front.
df <- select(df, -starts_with("Filter"))
df <- mutate(df, chrom = "Y")
df <- select(df, chrom, pos, everything())

# The archive is not needed once the table is in memory.
unlink(temp)

Parsed with column specification:
cols(
  pos = [32mcol_double()[39m,
  REF = [31mcol_character()[39m,
  mendez_a00 = [31mcol_character()[39m,
  mendez_elsidron1 = [31mcol_character()[39m,
  mendez_chimp = [31mcol_character()[39m,
  branch = [31mcol_character()[39m,
  cov_ef = [32mcol_double()[39m,
  neanderhal_discordant_base_calls = [32mcol_double()[39m,
  Filter2 = [31mcol_character()[39m,
  Filter3 = [31mcol_character()[39m,
  Filter4 = [31mcol_character()[39m,
  `Genomic_location/effect/functional_prediction` = [31mcol_character()[39m
)


In [3]:
# Twelve chromosome Y coordinates, kept in their comma-separated form for
# readability and converted to plain integers by stripping the commas.
positions <- as.integer(gsub(
    ",", "",
    c(
        "2,844,774", "2,847,322", "4,967,724", "5,605,569",
        "6,932,032", "14,832,610", "14,832,620", "14,838,553",
        "15,816,262", "21,868,167", "21,905,071", "23,545,399"
    ),
    fixed = TRUE
))

In [4]:
# Load the Mez2 VCF through the package's own reader.
# NOTE(review): mindp/maxdp are presumably depth-filter settings of read_vcf;
# confirm their exact meaning against its definition.
mez2 <- here("data/vcf/full_mez2.vcf.gz") %>%
    read_vcf(mindp = 1, maxdp = 1)

In [5]:
# Row counts: the Mendez et al. table, then the table read from the Mez2 VCF.
df %>% nrow()
mez2 %>% nrow()

In [16]:
# Row counts: the Mendez et al. table, then the table read from the Mez2 VCF.
df %>% nrow()
mez2 %>% nrow()

### Which positions of functional importance according to Mendez did we capture?

In [23]:
# Regions listed in the full capture BED file, imported as genomic ranges.
capture <- here("data/coord/capture_full.bed") %>% import.bed()

In [24]:
# Represent each position of interest as a single-base range on chromosome Y.
positions_gr <- GRanges(
    seqnames = "Y",
    ranges = IRanges(start = positions, end = positions)
)

In [25]:
# Keep only the positions that overlap a captured region, then display them.
positions_covered <- subsetByOverlaps(positions_gr, capture)
positions_covered

GRanges object with 7 ranges and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]        Y   6932032      *
  [2]        Y  14832610      *
  [3]        Y  14832620      *
  [4]        Y  14838553      *
  [5]        Y  21868167      *
  [6]        Y  21905071      *
  [7]        Y  23545399      *
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [5]:
# Row counts: the Mendez et al. table, then the table read from the Mez2 VCF.
df %>% nrow()
mez2 %>% nrow()

In [27]:
# Attach our genotype calls to every site of the Mendez table. The right join
# keeps all rows of `df`, so sites missing from `mez2` get NA genotypes.
merged <- right_join(mez2, df, by = c("chrom", "pos", "REF"))

# Put the columns that are compared side by side first.
merged <- select(
    merged,
    chrom, pos, REF, ALT,
    chimp,
    mendez_a00, a00,
    mendez_elsidron1, mez2,
    everything()
)

# Recode the Mendez allele columns as 0/1 flags: 1 when the reported allele
# differs from REF, 0 when it matches.
merged <- mutate(
    merged,
    mendez_a00 = as.integer(mendez_a00 != REF),
    mendez_chimp = as.integer(mendez_chimp != REF),
    mendez_elsidron1 = as.integer(mendez_elsidron1 != REF)
)

In [28]:
# Show the merged rows that fall on the positions of interest.
filter(merged, pos %in% positions)

chrom,pos,REF,ALT,chimp,mendez_a00,a00,mendez_elsidron1,mez2,ustishim,⋯,S_Punjabi_1,S_Saami_2,S_Papuan_2,S_Karitiana_1,S_Ju_hoan_North_1,mendez_chimp,branch,cov_ef,neanderhal_discordant_base_calls,Genomic_location/effect/functional_prediction
<chr>,<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<dbl>,<dbl>,<chr>
Y,2844774,T,,,0,,1,,,⋯,,,,,,0,f,1.0,0,ZFY/coding_intronic/Non-synonymous
Y,2847322,T,,,1,,0,,,⋯,,,,,,0,e,19.0,0,ZFY/coding/Non-synonymous
Y,4967724,A,,,0,,1,,,⋯,,,,,,0,f,1.0,0,PCDH11Y/coding/Non-synonymous
Y,5605569,C,,,0,,1,,,⋯,,,,,,0,f,1.0,0,PCDH11Y/coding/Non-synonymous
Y,6932032,G,,0.0,0,0.0,1,,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0,f,5.0,0,TBL1Y/coding/Non-synonymous
Y,14832610,A,,0.0,0,0.0,1,,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0,f,3.0,0,USP9Y/coding/Non-synonymous
Y,14832620,G,T,1.0,1,1.0,1,1.0,0.0,⋯,0.0,0.0,0.0,0.0,1.0,1,d,,0,USP9Y/coding/Non-synonymous
Y,14838553,G,,0.0,0,0.0,1,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0,f,4.0,0,USP9Y/coding/Non-synonymous
Y,15816262,C,,,0,,1,,,⋯,,,,,,0,f,3.0,0,TMSB4Y/coding/Stop
Y,21868167,C,T,1.0,0,0.0,1,1.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,1,a,,0,KDM5D/coding/Non-synonymous


In [34]:
# Write the positions of interest as a headerless, tab-separated BED-style
# file: each 1-based site `pos` becomes the interval start = pos - 1, end = pos.
# `1L` keeps `start` an integer column like `end`, and `FALSE` is spelled out
# because `F` is an ordinary variable that can be reassigned.
tibble(chrom = "Y", start = positions - 1L, end = positions) %>%
    write_tsv("../asd.bed", col_names = FALSE)

In [100]:
# Recode the Mendez allele columns as 0/1 flags (1 = differs from REF).
#
# `merged` may already hold these columns as 0/1 integers, because the cell
# that builds it with right_join() applies the same recoding. Comparing those
# integers with REF again would coerce them to strings ("0" != "T") and mark
# every site as 1. Only columns that still hold allele strings are converted;
# columns that are already recoded pass through unchanged.
recode_vs_ref <- function(allele, ref) {
    if (is.character(allele)) {
        as.integer(allele != ref)
    } else {
        allele
    }
}

x <- merged %>% mutate(
        mendez_elsidron1 = recode_vs_ref(mendez_elsidron1, REF),
        mendez_chimp = recode_vs_ref(mendez_chimp, REF),
        mendez_a00 = recode_vs_ref(mendez_a00, REF)
    )

In [103]:
# Count how many rows of `x` fall on the positions of interest.
nrow(filter(x, pos %in% positions))

In [6]:
# Read the Mez2 pileup table. The column types are: character, integer,
# character, character, then four integer columns.
# The path is resolved with here(), like the other data files in this
# notebook, so it no longer depends on the working directory; the stray
# double slash is also removed.
p <- read_tsv(here("data/pileup/full_mez2.txt.gz"), col_types = "cicciiii")

In [7]:
# Peek at the first rows of the pileup table.
p %>% head()

chrom,pos,ref,pileup,A,C,G,T
<chr>,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>
Y,2649811,A,A,1,0,0,0
Y,2649812,A,AA,2,0,0,0
Y,2649813,A,AAA,3,0,0,0
Y,2649814,A,AAAA,4,0,0,0
Y,2649815,A,AAAA,4,0,0,0
Y,2649816,A,AAAA,4,0,0,0


In [20]:
# Start from the Mendez table and add the pileup columns for each site
# (`ref` in the pileup table is matched to `REF`).
merged <- left_join(df, p, by = c("chrom", "pos", "REF" = "ref"))

# Add our genotype calls for the same sites.
merged <- left_join(merged, mez2, by = c("chrom", "pos", "REF"))

# Put the Mendez El Sidron call, our Mez2 call and the pileup evidence side
# by side at the front.
merged <- select(
    merged,
    chrom, pos, REF, ALT,
    mendez_elsidron1, mez2,
    A, C, G, T, pileup,
    everything()
)

In [31]:
# Show the merged rows for the positions that overlap the capture regions.
filter(merged, pos %in% start(positions_covered))

chrom,pos,REF,ALT,mendez_elsidron1,mez2,A,C,G,T,pileup,mendez_a00,mendez_chimp,branch,cov_ef,neanderhal_discordant_base_calls,Genomic_location/effect/functional_prediction
<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>
Y,6932032,G,,C,,0,15,0,2,CCCCCCCTCCCCCCCTC,G,G,f,5.0,0,TBL1Y/coding/Non-synonymous
Y,14832610,A,,G,,1,0,5,0,GGGGGA,A,A,f,3.0,0,USP9Y/coding/Non-synonymous
Y,14832620,G,T,T,1.0,0,0,0,7,TTTTTTT,T,T,d,,0,USP9Y/coding/Non-synonymous
Y,14838553,G,,A,0.0,0,0,18,0,GGGGGGGGGGGGGGGGGG,G,G,f,4.0,0,USP9Y/coding/Non-synonymous
Y,21868167,C,T,T,1.0,0,0,0,12,TTTTTTTTTTTT,C,T,a,,0,KDM5D/coding/Non-synonymous
Y,21905071,C,T,T,1.0,0,0,0,16,TTTTTTTTTTTTTTTT,C,T,a,,0,KDM5D/coding/Non-synonymous
Y,23545399,G,,G,0.0,0,0,26,0,GGGGGGGGGGGGGGGGGGGGGGGGGG,A,G,e,16.0,0,PRORY/coding/Non-synonymous
