In [46]:
### A wonderful tutorial:
### https://www.bioconductor.org/packages/devel/bioc/vignettes/VariantAnnotation/inst/doc/VariantAnnotation.html#variant-call-format-vcf-files

In [2]:
library(VariantAnnotation)
library(dplyr)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(AnnotationDbi)

In [3]:
# Read the SNP call set into memory; "hg19" is recorded as the genome build.
vcf <- readVcf(file = "./Z.variantCall.SNPs.vcf", genome = "hg19")

In [4]:
# Show the summary of the loaded VCF object (class, dimensions, available columns, header).
vcf

class: CollapsedVCF 
dim: 3559138 1 
rowRanges(vcf):
  GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
info(vcf):
  DataFrame with 22 columns: AC, AF, AN, BaseQRankSum, ClippingRankSum, DP, ...
info(header(vcf)):
                       Number Type    Description                              
   AC                  A      Integer Allele count in genotypes, for each AL...
   AF                  A      Float   Allele Frequency, for each ALT allele,...
   AN                  1      Integer Total number of alleles in called geno...
   BaseQRankSum        1      Float   Z-score from Wilcoxon rank sum test of...
   ClippingRankSum     1      Float   Z-score From Wilcoxon rank sum test of...
   DP                  1      Integer Approximate read depth; some reads may...
   DS                  0      Flag    Were any of the samples downsampled?     
   END                 1      Integer Stop position of the interval            
   FS                  1      Float   Phred-s

In [22]:
# Keep only the records whose sequence name is "22" (the VCF uses un-prefixed names).
chr22_vcf <- vcf[seqnames(rowRanges(vcf)) == "22", ]

In [23]:
# Inspect the genomic ranges of the chromosome-22 subset.
rowRanges(chr22_vcf)

GRanges object with 45592 ranges and 5 metadata columns:
                  seqnames    ranges strand | paramRangeID            REF
                     <Rle> <IRanges>  <Rle> |     <factor> <DNAStringSet>
  22:16050822_G/A       22  16050822      * |           NA              G
  22:16051249_T/C       22  16051249      * |           NA              T
  22:16051347_G/C       22  16051347      * |           NA              G
  22:16051453_A/C       22  16051453      * |           NA              A
  22:16051497_A/G       22  16051497      * |           NA              A
              ...      ...       ...    ... .          ...            ...
  22:51220441_T/C       22  51220441      * |           NA              T
  22:51227937_A/C       22  51227937      * |           NA              A
  22:51230872_T/C       22  51230872      * |           NA              T
  22:51231657_T/C       22  51231657      * |           NA              T
  22:51241080_C/G       22  51241080      * |          

In [47]:
# Failed attempts, kept for reference only (not executed):
#chr22_region = GRanges(seqnames = "22", ranges = IRanges(start = 1, end = 1e9))
#vcf_chr22 = subsetByOverlaps(vcf, chr22_region)
#seqlevels(chr22_vcf, pruning.mode="coarse") = "chr22"

In [25]:
# Restrict the object's sequence levels to "22" only, so that a single level
# remains to be renamed afterwards.
chr22_vcf <- keepSeqlevels(x = chr22_vcf, value = "22", pruning.mode = "coarse")

In [29]:
# Rename the sequence level to the "chr"-prefixed form used by the UCSC hg19 TxDb.
# NOTE(review): an unnamed replacement is positional, so this presumes exactly
# one level is left at this point — confirm if the preceding cells change.
seqlevels(chr22_vcf) <- "chr22"

In [31]:
# Re-inspect the ranges to confirm the seqnames now read "chr22".
rowRanges(chr22_vcf)

GRanges object with 45592 ranges and 5 metadata columns:
                  seqnames    ranges strand | paramRangeID            REF
                     <Rle> <IRanges>  <Rle> |     <factor> <DNAStringSet>
  22:16050822_G/A    chr22  16050822      * |           NA              G
  22:16051249_T/C    chr22  16051249      * |           NA              T
  22:16051347_G/C    chr22  16051347      * |           NA              G
  22:16051453_A/C    chr22  16051453      * |           NA              A
  22:16051497_A/G    chr22  16051497      * |           NA              A
              ...      ...       ...    ... .          ...            ...
  22:51220441_T/C    chr22  51220441      * |           NA              T
  22:51227937_A/C    chr22  51227937      * |           NA              A
  22:51230872_T/C    chr22  51230872      * |           NA              T
  22:51231657_T/C    chr22  51231657      * |           NA              T
  22:51241080_C/G    chr22  51241080      * |          

In [32]:
# Annotate each variant with its location relative to the hg19 knownGene models.
# AllVariants() requests every region type handled by locateVariants().
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
loc <- locateVariants(query = chr22_vcf, subject = txdb, region = AllVariants())

“GRanges object contains 1892 out-of-bound ranges located on sequences
  74442, 73481, 74448, 73495, 73496, 73498, 73504, 74469, 74470, 74507,
  74508, 74509, 74510, 74505, 74506, 73534, 73543, 73547, 74544, 73574,
  73575, 73576, 74549, 74569, 74570, 73622, 73623, 73624, 73625, 74578,
  74579, 73648, 73649, 74600, 74603, 73666, 73667, 73671, 73672, 73673,
  73674, 73675, 73679, 74623, 74624, 74625, 74626, 74627, 74628, 74629,
  74630, 73700, 73701, 73703, 74655, 73713, 73714, 74662, 73735, 73736,
  73737, 73739, 73745, 73746, 73747, 73750, 73751, 73756, 73757, 73758,
  73762, 73763, 74705, 74706, 73776, 73777, 73782, 73793, 73798, 73801,
  73802, 73803, 73804, 73805, 73806, 74714, 74717, 74718, 74715, 74716,
  74719, 74720, 74722, 74723, 74724, 74725, 74742, 73830, 73831, 74756,
  74757, 74762, 74763, 74764, 73852, 74779, 74781, 73872, 73873, 73874,
  73878, 73879, 74800, 74801, 74802, 74803, 74809, 73926, 73929, 73938,
  73939, 73940, 73941, 73942, 74829, 74831, 73946, 73947, 73948, 

In [33]:
# Inspect the variant-location annotations (one row per variant/feature hit).
loc

GRanges object with 116628 ranges and 9 metadata columns:
                  seqnames    ranges strand |   LOCATION  LOCSTART    LOCEND
                     <Rle> <IRanges>  <Rle> |   <factor> <integer> <integer>
  22:16050822_G/A    chr22  16050822      * | intergenic      <NA>      <NA>
  22:16051249_T/C    chr22  16051249      * | intergenic      <NA>      <NA>
  22:16051347_G/C    chr22  16051347      * | intergenic      <NA>      <NA>
  22:16051453_A/C    chr22  16051453      * | intergenic      <NA>      <NA>
  22:16051497_A/G    chr22  16051497      * | intergenic      <NA>      <NA>
              ...      ...       ...    ... .        ...       ...       ...
  22:51231657_T/C    chr22  51231657      + | intron         86817     86817
  22:51231657_T/C    chr22  51231657      + | intron         53549     53549
  22:51231657_T/C    chr22  51231657      + | intron         77621     77621
  22:51231657_T/C    chr22  51231657      + | intron         92082     92082
  22:51241080_C/G 

In [82]:
#splt = split(mcols(loc)$GENEID, mcols(loc)$QUERYID) 
#table(sapply(splt, function(x) length(unique(x)) > 1))

In [35]:
# Group the variant indices (QUERYID) by the gene they fall in (GENEID).
# split() drops NA groups, so rows without a GENEID do not contribute.
splt <- split(loc$QUERYID, loc$GENEID)

# Preview: number of distinct variants for the first three genes.
# vapply() (instead of sapply()) guarantees an integer vector even when
# `splt` is empty.
head(vapply(splt, function(query_ids) length(unique(query_ids)), integer(1)), 3)

In [39]:
# Number of distinct variants per gene, as a named integer vector whose names
# are the gene IDs taken from `splt`.
# vapply() enforces the integer(1) result type; sapply() would silently return
# an empty list for empty input.
variants <- vapply(splt, function(query_ids) length(unique(query_ids)), integer(1))
gene_ids <- names(variants)

In [40]:
# Translate the Entrez gene IDs into gene symbols via the org.Hs.eg.db annotation.
# Both packages are referenced by namespace so that this cell does not attach
# anything mid-notebook.
# multiVals = "first" keeps only the first symbol when an ID maps to several.
symbols <- AnnotationDbi::mapIds(
  x = org.Hs.eg.db::org.Hs.eg.db,
  keys = gene_ids,
  column = "SYMBOL",
  keytype = "ENTREZID",
  multiVals = "first"
)

'select()' returned 1:1 mapping between keys and columns



In [49]:
# Per-gene summary table keyed by Entrez gene ID (row names).
# stringsAsFactors = FALSE keeps Gene_symbol as character on every R version,
# consistent with how df_ft is built later in the notebook.
df <- data.frame(
  row.names = gene_ids,
  Mutation_count = variants,
  Gene_symbol = symbols,
  stringsAsFactors = FALSE
)

# Genes ordered by decreasing variant count, restricted to counts below 100.
# dplyr verbs are namespace-qualified because several attached Bioconductor
# packages export functions with the same names.
df %>%
  dplyr::arrange(dplyr::desc(Mutation_count)) %>%
  dplyr::filter(Mutation_count < 100)

Unnamed: 0_level_0,Mutation_count,Gene_symbol
Unnamed: 0_level_1,<int>,<chr>
9811,98,CTIF
5289,97,PIK3C3
54549,97,SDK2
1000,95,CDH2
56984,95,PSMG2
9625,93,AATK
124565,92,SLC38A10
8710,92,SERPINB7
147429,91,AQP4-AS1
2689,91,GH2


In [62]:
# Look up the genomic range of every gene that received at least one variant.
# With single.strand.genes.only = TRUE, genes that cannot be represented by a
# single range are dropped (reported in a message), so `gene_locs` may hold
# fewer entries than `gene_ids`.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
gene_locs <- genes(
  txdb,
  filter = list(gene_id = gene_ids),
  single.strand.genes.only = TRUE
)

  3 genes were dropped because they have exons located on both strands of
  the same reference sequence or on more than one reference sequence, so
  cannot be represented by a single genomic range.
  Use 'single.strand.genes.only=FALSE' to get all the genes in a
  GRangesList object, or use suppressMessages() to suppress this message.



In [65]:
# Preview the ID -> symbol mapping; head() avoids printing the whole vector.
head(symbols)

In [68]:
# Attach the gene symbol to each range, aligning `symbols` to the ranges through
# the position of each range's gene ID within `gene_ids`.
mcols(gene_locs)$symbol <- symbols[match(mcols(gene_locs)$gene_id, gene_ids)]

In [69]:
# Inspect the gene ranges together with their gene_id and symbol metadata columns.
gene_locs

GRanges object with 896 ranges and 2 metadata columns:
            seqnames            ranges strand |     gene_id       symbol
               <Rle>         <IRanges>  <Rle> | <character>  <character>
       1000    chr18 25530930-25757445      - |        1000         CDH2
  100037417    chr22 24309026-24314748      + |   100037417         DDTL
  100101467    chr18 32831023-32870196      - |   100101467      ZSCAN30
  100126318    chr22 22007270-22007347      + |   100126318      MIR301B
  100128531    chr22 25498384-25508659      - |   100128531 KIAA1671-AS1
        ...      ...               ...    ... .         ...          ...
       9931    chr17 65066554-65241319      - |        9931         HELZ
       9984    chr18     214520-268059      - |        9984        THOC1
       9989    chr18   9546789-9614605      - |        9989       PPP4R1
       9993    chr22 19023795-19109967      - |        9993        DGCR2
       9997    chr22 50961997-50964905      - |        9997         S

In [70]:
# Lookup table with one row per gene range, keyed by gene ID (row names),
# giving the chromosome and symbol of each gene.
df_ft <- data.frame(
  Chromosome = as.character(seqnames(gene_locs)),
  Gene_symbol = mcols(gene_locs)$symbol,
  row.names = mcols(gene_locs)$gene_id,
  stringsAsFactors = FALSE
)
df_ft

Unnamed: 0_level_0,Chromosome,Gene_symbol
Unnamed: 0_level_1,<chr>,<chr>
1000,chr18,CDH2
100037417,chr22,DDTL
100101467,chr18,ZSCAN30
100126318,chr22,MIR301B
100128531,chr22,KIAA1671-AS1
100128893,chr18,GATA6-AS1
100128946,chr22,LINC01310
100130370,chr17,LINC03048
100130418,chr22,CECR7
100130480,chr18,LINC01387


In [78]:
# Add each gene's chromosome to the summary table via an exact gene-ID lookup.
# Indexing a data.frame by character row names (df_ft[rownames(df), ...])
# partial-matches, so an ID missing from df_ft could silently pick up the row of
# another gene whose ID merely starts with the same digits. match() is exact and
# yields NA for IDs that are absent from df_ft.
df$Chromosome <- df_ft[["Chromosome"]][match(rownames(df), rownames(df_ft))]

In [45]:
# Save the chromosome-22 subset (with renamed sequence level) as a new VCF file.
writeVcf(obj = chr22_vcf, filename = "./chr22.vcf")

In [80]:
# Genes located on chr22, ordered by decreasing number of distinct variants.
df %>%
  dplyr::arrange(dplyr::desc(Mutation_count)) %>%
  dplyr::filter(Chromosome == "chr22")
# Optional extra step, currently disabled: %>% filter(Mutation_count<100)

Unnamed: 0_level_0,Mutation_count,Gene_symbol,Chromosome
Unnamed: 0_level_1,<int>,<chr>,<chr>
80832,53,APOL4,chr22
613,37,BCR,chr22
11078,35,TRIOBP,chr22
27341,31,RRP7A,chr22
55007,29,FAM118A,chr22
1727,28,CYB5R3,chr22
23313,28,KIAA0930,chr22
10738,27,RFPL3,chr22
23542,27,MAPK8IP2,chr22
23762,26,OSBP2,chr22
