The vcfppR package implements various powerful functions for fast genomics analyses with VCF/BCF files using the C++ API of vcfpp.h.
- Load/save content of VCF/BCF into R objects with highly customizable options
- Visualize and chracterize variants
- Compare two VCF/BCF files and report various statistics
- Streaming read/write VCF/BCF files with fine control of everything
- Paper shows vcfppR
is 20x faster than
vcfR
. Also, much faster thancyvcf2
## install.package("vcfppR") ## from CRAN
remotes::install_github("Zilong-Li/vcfppR") ## from latest github
If you find it useful, please cite the paper
library(vcfppR)
citation("vcfppR")
All functions in vcfppR support URL as filename of VCF/BCF files.
phasedvcf <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"
rawvcf <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz"
svfile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20210124.SV_Illumina_Integration/1KGP_3202.gatksv_svtools_novelins.freeze_V3.wAF.vcf.gz"
popfile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/20130606_g1k_3202_samples_ped_population.txt"
Want to investigate the concordance between two VCF files? vcfcomp
is
the utility function you need! For example, in benchmarkings, we intend
to calculate the genotype correlation between the test and the truth.
library(vcfppR)
res <- vcfcomp(test = rawvcf, truth = phasedvcf,
stats = "r2", region = "chr21:1-5100000",
formats = c("GT","GT"))
par(mar=c(5,5,2,2), cex.lab = 2)
vcfplot(res, col = 2, cex = 2, lwd = 3, type = "b")
res <- vcfcomp(test = rawvcf, truth = phasedvcf,
stats = "pse",
region = "chr21:5000000-5500000",
samples = "HG00673,NA10840",
return_pse_sites = TRUE)
#> stats F1 or NRC or PSE only uses GT format
vcfplot(res, what = "pse", which=1:2, main = "Phasing switch error", ylab = "HG00673,NA10840")
Check out the vignettes for more!
Want to summarize variants discovered by genotype caller e.g. GATK?
vcfsummary
is the utility function you need!
Small variants
res <- vcfsummary(rawvcf,"chr21:10000000-10010000")
vcfplot(res, pop = popfile, col = 1:5, main = "Number of SNP & INDEL variants per population")
Complex structure variants
res <- vcfsummary(svfile, svtype = TRUE, region = "chr20")
vcfplot(res, main = "Structure Variant Counts", col = 1:7)
vcftable
gives you fine control over what you want to extract from
VCF/BCF files.
Read SNP variants with GT format and two samples
res <- vcftable(phasedvcf, "chr21:1-5100000", samples = "HG00673,NA10840", vartype = "snps")
str(res)
#> List of 10
#> $ samples: chr [1:2] "HG00673" "NA10840"
#> $ chr : chr [1:194] "chr21" "chr21" "chr21" "chr21" ...
#> $ pos : int [1:194] 5030578 5030588 5030596 5030673 5030957 5030960 5031004 5031031 5031194 5031224 ...
#> $ id : chr [1:194] "21:5030578:C:T" "21:5030588:T:C" "21:5030596:A:G" "21:5030673:G:A" ...
#> $ ref : chr [1:194] "C" "T" "A" "G" ...
#> $ alt : chr [1:194] "T" "C" "G" "A" ...
#> $ qual : num [1:194] 2.14e+09 2.14e+09 2.14e+09 2.14e+09 2.14e+09 ...
#> $ filter : chr [1:194] "." "." "." "." ...
#> $ info : chr [1:194] "AC=74;AF=0.0115553;CM=0;AN=6404;AN_EAS=1170;AN_AMR=980;AN_EUR=1266;AN_AFR=1786;AN_SAS=1202;AN_EUR_unrel=1006;AN"| __truncated__ "AC=53;AF=0.00827608;CM=1.78789e-05;AN=6404;AN_EAS=1170;AN_AMR=980;AN_EUR=1266;AN_AFR=1786;AN_SAS=1202;AN_EUR_un"| __truncated__ "AC=2;AF=0.000312305;CM=3.21821e-05;AN=6404;AN_EAS=1170;AN_AMR=980;AN_EUR=1266;AN_AFR=1786;AN_SAS=1202;AN_EUR_un"| __truncated__ "AC=2;AF=0.000312305;CM=0.00016985;AN=6404;AN_EAS=1170;AN_AMR=980;AN_EUR=1266;AN_AFR=1786;AN_SAS=1202;AN_EUR_unr"| __truncated__ ...
#> $ gt : int [1:194, 1:2] 0 0 0 0 0 0 0 0 0 0 ...
#> - attr(*, "class")= chr "vcftable"
Read SNP variants with PL format and drop the INFO column
res <- vcftable(rawvcf, "chr21:1-5100000", vartype = "snps", format = "PL", info = FALSE)
str(res)
#> List of 10
#> $ samples: chr [1:3202] "HG00096" "HG00097" "HG00099" "HG00100" ...
#> $ chr : chr [1:1383] "chr21" "chr21" "chr21" "chr21" ...
#> $ pos : int [1:1383] 5030082 5030088 5030105 5030253 5030278 5030347 5030356 5030357 5030391 5030446 ...
#> $ id : chr [1:1383] "." "." "." "." ...
#> $ ref : chr [1:1383] "G" "C" "C" "G" ...
#> $ alt : chr [1:1383] "A" "T" "A" "T" ...
#> $ qual : num [1:1383] 70.1 2773.1 3897.8 102.6 868.9 ...
#> $ filter : chr [1:1383] "VQSRTrancheSNP99.80to100.00" "VQSRTrancheSNP99.80to100.00" "VQSRTrancheSNP99.80to100.00" "VQSRTrancheSNP99.80to100.00" ...
#> $ info : chr(0)
#> $ PL : int [1:1383, 1:9606] NA 64 64 0 0 0 0 0 0 0 ...
#> - attr(*, "class")= chr "vcftable"
Read INDEL variants with DP format and QUAL>100
res <- vcftable(rawvcf, "chr21:1-5100000", vartype = "indels", format = "DP", qual=100)
str(res)
#> List of 10
#> $ samples: chr [1:3202] "HG00096" "HG00097" "HG00099" "HG00100" ...
#> $ chr : chr [1:180] "chr21" "chr21" "chr21" "chr21" ...
#> $ pos : int [1:180] 5030240 5030912 5030937 5031018 5031125 5031147 5031747 5031881 5031919 5031984 ...
#> $ id : chr [1:180] "." "." "." "." ...
#> $ ref : chr [1:180] "AC" "CA" "T" "C" ...
#> $ alt : chr [1:180] "A" "C" "TGGTGCACGCCTGCAGTCCCGGC" "CCTATGATCACACCGT" ...
#> $ qual : num [1:180] 6872 321 233 270 58361 ...
#> $ filter : chr [1:180] "VQSRTrancheINDEL99.00to100.00" "PASS" "PASS" "PASS" ...
#> $ info : chr [1:180] "AC=82;AF=0.0136439;AN=6010;BaseQRankSum=-0.55;ClippingRankSum=0;DP=18067;FS=0;InbreedingCoeff=0.0664;MLEAC=76;M"| __truncated__ "AC=19;AF=0.0030586;AN=6212;BaseQRankSum=0.358;ClippingRankSum=-0.594;DP=30725;FS=44.58;InbreedingCoeff=-1.1608;"| __truncated__ "AC=1;AF=0.00015753;AN=6348;BaseQRankSum=-1.793;ClippingRankSum=-0.48;DP=32486;FS=27.337;InbreedingCoeff=-0.0212"| __truncated__ "AC=2;AF=0.000314268;AN=6364;BaseQRankSum=1.59;ClippingRankSum=0.358;DP=44916;FS=0;InbreedingCoeff=-0.0092;MLEAC"| __truncated__ ...
#> $ DP : int [1:180, 1:3202] 3 15 13 16 19 20 5 34 29 24 ...
#> - attr(*, "class")= chr "vcftable"
There are two classes i.e. vcfreader and vcfwriter offering the full R-bindings of vcfpp.h. Check out the examples in the tests folder or refer to the manual.
?vcfppR::vcfreader