In [1]:
%load_ext rpy2.ipython

In [2]:
import os
homeDir = os.getcwd()+"/"
fileDir = homeDir+"Analysis/"
workDir = "/storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/"

In [3]:
%%R -i fileDir -i workDir 
#devtools::install_github(repo="knausb/vcfR")
#install.packages('doParallel')
library(vcfR)
library(doParallel)
#setwd("/storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/")
dir.create(fileDir)
setwd(workDir)

# VCF-------------------
vcfDir <- "ensemble/vcf/"
filenames <- dir(vcfDir,"ensemble.snpEff.vcf.gz")
#filenames <- filenames[!grepl(".tbi",filenames)]

vcfRead <- function(filename,vcfDir){
  tumorID <- sub("-ensemble.snpEff.vcf.gz","",filename)
  tmp <- vcfR::read.vcfR(paste0(vcfDir,filename), verbose=FALSE)
  tmpVcf <- tmp@fix
  tmpGt <- tmp@gt
  if(length(colnames(tmpGt))==3){
    if(!grepl("_N\\b",colnames(tmpGt)[3])){ #CHECK THE NAME OF TUMOR AND NORMAL SAMPLES IN VCF!
      tmpGt <- tmpGt[,c(1,3,2)]
    }
  }
  info <- tmpVcf[,"INFO"]
  func <- sapply(info,function(x){
    temp <- unlist(strsplit(x,","))
    paste0(unique(na.omit(unlist(sapply(temp,function(x) unlist(strsplit(x,"|",fixed=TRUE))[2])))),collapse=",")
  })
  func <- unname(func)
  callers <- sapply(info,function(x){
    temp <- unlist(strsplit(x,";"))
    sub("CALLERS=","",temp[grepl("CALLERS=",temp)])
  })
  callers <- unname(callers)
  id <- paste0(tumorID,"-",tmpVcf[,1],":",tmpVcf[,2])
  if(length(colnames(tmpGt))==3){
    tmp <- data.frame(tmpVcf[,c(1,2,4,5,7)],tmpGt,tumorID,info,func,callers,id,stringsAsFactors=FALSE)
  } else {
    tmp <- data.frame(tmpVcf[,c(1,2,4,5,7)],tmpGt,normal=NA,tumorID,info,func,callers,id,stringsAsFactors=FALSE)
  }
  colnames(tmp) <- c("CHROM","POS","REF","ALT","FILTER","FORMAT","Tumor","Normal","Tumor_ID","snpEff","snpEff_func","caller","id")
  rownames(tmp) <- NULL
  return(tmp)
}

vcf <- lapply(filenames,function(x) vcfRead(x,vcfDir))
vcf <- do.call(rbind,vcf)
nrow(vcf) #306237


R[write to console]: 
   *****       ***   vcfR   ***       *****
   This is vcfR 1.12.0 
     browseVignettes('vcfR') # Documentation
     citation('vcfR') # Citation
   *****       *****      *****       *****


R[write to console]: Loading required package: foreach

R[write to console]: Loading required package: iterators

R[write to console]: Loading required package: parallel



[1] 306237


## CORRECT VCF

In [4]:
%%R 
# CORRECT FOR NA ALT
table(is.na(vcf$ALT))
# vcf <- vcf[!is.na(vcf$ALT),]

# CORRECT FOR MULTIPLE ALT
table(grepl(",",vcf$ALT))
tmp <- sapply(vcf$ALT,function(x) unlist(strsplit(x,",",fixed=TRUE))[1])
vcf$ALT <- tmp

# # CORRECT FOR NA LINES
#sum(is.na(vcf))
#vcf <- vcf[!is.na(vcf)]
#sum(is.na(vcf))

## SELECT CODING/SPLICING

In [5]:
%%R 
# SELECT CODING/SPLICING
tmp <- unique(vcf$snpEff_func)
tmp <- unique(unlist(sapply(tmp,function(x) unlist(strsplit(x,",")))))
tmp <- unique(unlist(sapply(tmp,function(x) unlist(strsplit(x,"&")))))
tmp <- tmp[!grepl("ENS",tmp)]
tmp <- tmp[!grepl("=",tmp)]
sort(tmp)
sel <- c("5_prime_UTR_premature_start_codon_gain_variant","5_prime_UTR_truncation", #NEW
         "3_prime_UTR_variant", "5_prime_UTR_variant", # Added
         "bidirectional_gene_fusion","conservative_inframe_deletion","conservative_inframe_insertion",
         "disruptive_inframe_deletion","disruptive_inframe_insertion",
         "exon_loss_variant","initiator_codon_variant","gene_fusion",
         "frameshift_variant","missense_variant","splice_acceptor_variant",
         "splice_donor_variant","splice_region_variant","start_lost","stop_gained",
         "stop_lost","stop_retained_variant")
setdiff(tmp,sel)
setdiff(sel,tmp)
sel <- sapply(sel,function(x) grep(x,vcf$snpEff_func))
sel <- unique(unlist(sel))
sel <- sort(sel)
sum(is.na(sel))
nrow(vcf) #227168
rownames(vcf) <- NULL
length(sel) #36938
vcf <- vcf[sel,]
sort(table(vcf$Tumor_ID))

callerN <- sapply(vcf$caller,function(x) length(unlist(strsplit(x,",",fixed=TRUE))))
callerN <- unname(callerN)
vcf <- data.frame(vcf,callerN,stringsAsFactors = FALSE)
table(vcf$callerN)
#   1     2     3     4     5     6 
# 29690  3753   704   328   438  2025  
rownames(vcf) <- NULL
#----------------------------------------------

save(vcf,file = paste(fileDir,"vcf.rda",sep=""))

## COUNTS and ALLELE FREQUENCY

In [6]:
%%R 
# COUNTS and ALLELE FREQUENCY -----------------

countsVcf <- function(vcf,i){
  Tmp <- rep(NA,4)
  names(Tmp) <- c("t_ref_count","t_alt_count","n_ref_count","n_alt_count")
  vcfLine <- as.character(vcf[i,])
  names(vcfLine) <- colnames(vcf)
  caller <- unlist(strsplit(vcfLine["caller"],",",fixed=TRUE))[1]
  if(caller=="varscan"){
    tmp <- unlist(strsplit(vcfLine["Tumor"],":"))
    names(tmp) <- unlist(strsplit(vcfLine["FORMAT"],":"))
    Tmp["t_alt_count"] <- as.numeric(tmp["AD"])
    Tmp["t_ref_count"] <- as.numeric(tmp["RD"])
    tmp <- unlist(strsplit(vcfLine["Normal"],":"))
    names(tmp) <- unlist(strsplit(vcfLine["FORMAT"],":"))
    Tmp["n_alt_count"] <- as.numeric(tmp["AD"])
    Tmp["n_ref_count"] <- as.numeric(tmp["RD"])
  } else {
    tmp <- unlist(strsplit(vcfLine["Tumor"],":"))
    names(tmp) <- unlist(strsplit(vcfLine["FORMAT"],":"))
    if(tmp["AD"]!=".") Tmp[c("t_ref_count","t_alt_count")] <- as.numeric(unlist(strsplit(tmp["AD"],","))[1:2])
    if(!is.na(vcfLine["Normal"])) {
      tmp <- unlist(strsplit(vcfLine["Normal"],":"))
      names(tmp) <- unlist(strsplit(vcfLine["FORMAT"],":"))[1:length(tmp)]
      if(tmp["AD"]!=".") Tmp[c("n_ref_count","n_alt_count")] <- as.numeric(unlist(strsplit(tmp["AD"],","))[1:2])
    }
  }
  return(Tmp)
}

cl <- makePSOCKcluster(60,outfile="")
registerDoParallel(cl)
tmp <- foreach(i = 1:nrow(vcf)) %dopar% {
  tmp <- countsVcf(vcf,i)
  return(tmp)
}
stopCluster(cl)

tmp <- do.call(rbind,tmp)
dim(tmp) #36938     4

counts <- cbind(tmp[,1:2],NA,NA,tmp[,3:4],NA,NA)
colnames(counts) <- c("t_ref_count","t_alt_count","t_depth","t_vaf","n_ref_count","n_alt_count","n_depth","n_vaf")
counts[,"t_depth"] <- counts[,"t_alt_count"] + counts[,"t_ref_count"]
counts[,"t_vaf"] <- round(counts[,"t_alt_count"]/counts[,"t_depth"],2)
counts[,"n_depth"] <- counts[,"n_alt_count"] + counts[,"n_ref_count"]
counts[,"n_vaf"] <- round(counts[,"n_alt_count"]/counts[,"n_depth"],2)
rownames(counts) <- NULL

#View(counts[sample(1:nrow(counts),100),])

vcf <- data.frame(vcf, counts, stringsAsFactors = FALSE)

table(cond <- is.na(vcf$n_depth)&!is.na(vcf$Normal))
# FALSE  TRUE 
# 27894  9044 
# table(cond <- is.na(vcf$n_depth))

tmp <- vcf[cond,]

# table(tmp$caller)
# freebayes 
# 9044

vcf <- vcf[!cond,]

rownames(vcf) <- NULL
nrow(vcf) #27894
rm(counts)

#----------------------------------------------
# save(vcf,file = "Analysis/vcf.rda")

## FP FILTER

In [41]:
fpfilterDir = fileDir + "fpfilter/"

In [9]:
%%R -i fpfilterDir
# FP FILTER
#system("mkdir Analysis/fpfilter")
#setwd("Analysis/fpfilter/")
dir.create(fpfilterDir)
setwd(fpfilterDir)

vcf <- data.frame(vcf,fpFid="",stringsAsFactors=FALSE)
patients <- unique(vcf$Tumor_ID)
for(i in 1:length(patients)){
  print(patients[i])
  name <- patients[i]
  var <- vcf[vcf$Tumor_ID==name,1:4]
  var[nchar(var$ALT)>1,"ALT"] <- paste0("+",substring(var[nchar(var$ALT)>1,"ALT"],2))
  var[nchar(var$REF)>1,"POS"] <- as.numeric(var[nchar(var$REF)>1,"POS"])+1
  var[nchar(var$REF)>1,"ALT"] <- paste0("-",substring(var[nchar(var$REF)>1,"REF"],2))
  var[nchar(var$REF)>1,"REF"] <- substring(var[nchar(var$REF)>1,"REF"],2,2)
  vcf[vcf$Tumor_ID==name,"fpFid"] <- paste0(name,"-",var[,1],":",var[,2],"-",var[,4])
  write.table(var,file=paste0(name,".var"),sep="\t",quote=FALSE,row.names=FALSE,col.names=FALSE)
  # n <- ceiling(nrow(var)/60)
  # system(paste0("split -l ",n," --additional-suffix .var ",paste0(name,".var")))
  loc <- paste(var$CHROM,var$POS,var$POS)
  write.table(loc,file=paste0(name,".loc"),quote=FALSE,row.names=FALSE,col.names=FALSE)
  # system(paste0("split -l ",n," --additional-suffix .loc ",paste0(name,".loc")))
}


[1] "A1783"
[1] "A9155"
[1] "A9878"
[1] "AT2550"
[1] "AT2822"
[1] "AT4415"
[1] "AT4808"
[1] "C0288"
[1] "C0334"
[1] "C1572"
[1] "C8448"
[1] "s1_3076_D"
[1] "s1_3295_D"
[1] "s1_8588_D"
[1] "s2_6088_D"
[1] "s2_6339_D"
[1] "s2_8811_D"
[1] "s3_5832_D"
[1] "s3_7299_D"
[1] "s3_7979_D"


In [61]:
os.chdir(fpfilterDir)

In [62]:
%%bash
fasta=/storage/gluster/vol1/bcbio/genomes/Hsapiens/hg19/bwa/hg19.fa
bamreadcount=/storage/gluster/vol1/SHARED/NGSTOOLS/bam-readcounts/bam-readcount/build/bin/bam-readcount

#cd {fpfilterDir}

for file in *.loc
do
	echo $file
	name=${file%.loc}
	echo $name
	echo "$name bam-readcount"
	bam=`ls /storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/*/*/${name}_T_recal.bam`
	echo $bam
    echo '\n'
	$bamreadcount -f $fasta -w 1 -l $file $bam  > ${name}.readcount &
done  

A1783.loc
A1783
A1783 bam-readcount
/storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/ABT414_Flank/ABT414_Flank/A1783_T_recal.bam /storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/CNV/tumorbam/A1783_T_recal.bam
\n
A9155.loc
A9155
A9155 bam-readcount
/storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/ABT414_Flank/ABT414_Flank/A9155_T_recal.bam /storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/CNV/tumorbam/A9155_T_recal.bam
\n
A9878.loc
A9878
A9878 bam-readcount
/storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/ABT414_Flank/ABT414_Flank/A9878_T_recal.bam /storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/CNV/tumorbam/A9878_T_recal.bam
\n
AT2550.loc
AT2550
AT2550 bam-readcount
/storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/ABT414_Flank/ABT414_Flank/AT2550_T_recal.bam /storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/CNV/tumorbam/AT2550_T_recal.bam
\n
AT2822.loc
AT2822
AT2822 bam-readcount
/sto

Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0
Minimum mapping quality is set to 0


In [58]:
%%bash
fpfilter=/storage/gluster/vol1/data/PUBLIC/Tools/VARSCAN/fpfilter.pl

for file in *.var
do
	echo $file
	perl $fpfilter ${file} ${file%.var}.readcount --output-basename ${file%.var}.fpfilter > ${file%.var}.log &
done  

A1783.var
A9155.var
A9878.var
AT2550.var
AT2822.var
AT4415.var
AT4808.var
C0288.var
C0334.var
C1572.var
C8448.var
s1_3076_D.var
s1_3295_D.var
s1_8588_D.var
s2_6088_D.var
s2_6339_D.var
s2_8811_D.var
s3_5832_D.var
s3_7299_D.var
s3_7979_D.var


In [63]:
%%R

setwd(fpfilterDir)
failfiles <- dir(path=".",pattern = "fpfilter.fail")
FPfail <- NULL
for(i in 1:length(failfiles)){
  print(i)
  patient <- unlist(strsplit(failfiles[i],".",fixed=TRUE))[1]
  tmp <- readLines(failfiles[i])
  tmp <- sapply(tmp,function(x){
    Tmp <- unlist(strsplit(x,"\t"))
    length(Tmp) <- 9
    Tmp
  })
  tmp <- t(tmp)
  failID <- paste0(patient,"-",tmp[,1],":",tmp[,2],"-",tmp[,4])
  tmp <- cbind(failID,tmp[,9])
  rownames(tmp) <- failID
  FPfail <- rbind(FPfail,tmp)
}

passfiles <- dir(path=".",pattern = "fpfilter.pass")
FPpass <- NULL
for(i in 1:length(passfiles)){
  print(i)
  patient <- unlist(strsplit(passfiles[i],".",fixed=TRUE))[1]
  tmp <- read.delim(passfiles[i],header=FALSE,as.is=TRUE)
  tmp <- paste0(patient,"-",tmp[,1],":",tmp[,2],"-",tmp[,4])
  FPpass <- c(FPpass,tmp)
}

length(FPpass) #8967
nrow(FPfail) #18886

length(FPpass)+nrow(FPfail) #27853

all <- c(FPpass,FPfail[,1])
table(vcf$fpFid%in%all)
#View(vcf[!vcf$fpFid%in%all,])

vcf <- data.frame(vcf,FPfilter=".",FPfail=".",stringsAsFactors=FALSE)
vcf[vcf$fpFid%in%FPpass,"FPfilter"] <- "PASS"
vcf[vcf$fpFid%in%rownames(FPfail),"FPfilter"] <- "FAIL"
vcf[vcf$FPfilter=="FAIL","FPfail"] <- FPfail[vcf[vcf$FPfilter=="FAIL","fpFid"],2]

vcf[is.na(vcf$FPfail),"FPfail"] <- ""
vcf[vcf$FPfail=="","FPfilter"] <- "no_readcounts" # alt absent in the bam readcount file
vcf[vcf$FPfail=="","FPfail"] <- "."
vcf[vcf$FPfilter==".","FPfilter"] <- "no_readcounts"# alt = 0 in the bam readcount file
table(vcf$FPfilter)
# FAIL no_readcounts          PASS 
# 15577          3350          8967  
rm(FPpass,FPfail,all)

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20


In [64]:
%%R

# # VirtualNormalCorrection----------
# system("mkdir VN")
# setwd("VN/")
# ID <- 1:nrow(vcf)
# VCF <- data.frame(vcf[,1:2],ID,vcf[,3:4],stringsAsFactors = FALSE)
# write.table(VCF,sep="\t",row.names = FALSE,quote=FALSE,file = "VNinput.vcf")
# n <- ceiling(nrow(VCF)/60)
# system(paste0("split -l ",n," --additional-suffix .vcf VNinput.vcf"))
# 
# # UNIX
# # parallelVN.sh
# # parallelVN_combineRes.sh
# 
# VNoutput <- read.delim("VNoutput_main.tsv",as.is=TRUE)
# rownames(vcf) <- VCF$ID
# vcf <- data.frame(vcf,idNum=VCF$ID,stringsAsFactors = FALSE)
# vcf <- data.frame(vcf[VNoutput$xRef,],VNoutput[,9:12],stringsAsFactors = FALSE)
# # threshold 5% (~ 22/433) - threshold_highconf 10% (~ 44/433)
# table(vcf$VN_occurrences<22&vcf$VN_fullycalled_count>=44)
# # FALSE   TRUE 
# # 220620  84644 
# table(vcf$VN_occurrences<3&vcf$VN_fullycalled_count>=110)
# # FALSE   TRUE 
# # 245383  59881 
# table(vcf$VN_occurrences<3) # 0.7% (3/433)
# # FALSE   TRUE 
# # 233414  71850 
# # table(is.na(vcf$VN_occurrences))
# 
# vcf$vn <- vcf$VN_occurrences<3
# table(vcf$vn)
# # FALSE   TRUE 
# # 233414  71850 
# 
# rm(VCF,VNoutput,tmp)

setwd("/storage/gluster/vol1/data/PUBLIC/SCAMBIO/ABT414_WES_Analysis/Analysis/")
save(vcf,file = paste(fileDir,"vcf.rda",sep=""))

# STEP 2 - ANNOTATION AND DOWNSTREAM ANALYSIS #

# ANNOVAR Annotation

In [66]:
%%R
ann.in <- vcf[,c(1,2,2,3,4)]
colnames(ann.in) <- c("Chr","Start","End","Ref","Alt")

cond <- nchar(ann.in[,"Ref"])>1&nchar(ann.in[,"Alt"])==1
ann.in[cond,"Start"] <- as.numeric(ann.in[cond,"Start"])+1
ann.in[cond,"Ref"] <- substring(ann.in[cond,"Ref"],2,nchar(ann.in[cond,"Ref"]))
ann.in[cond,"Alt"] <- "-"
ann.in[cond,"End"] <- as.numeric(ann.in[cond,"Start"])+nchar(ann.in[cond,"Ref"])-1

cond <- nchar(ann.in[,"Alt"])>1&nchar(ann.in[,"Ref"])==1
ann.in[cond,"Ref"] <- "-"
ann.in[cond,"Alt"] <- substring(ann.in[cond,"Alt"],2,nchar(ann.in[cond,"Alt"]))

cond <- nchar(ann.in[,"Ref"])>1&nchar(ann.in[,"Alt"])>1
ann.in[cond,"End"] <- as.numeric(ann.in[cond,"Start"])+nchar(ann.in[cond,"Ref"])-1

write.table(ann.in,file="ann.in",col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")

R[write to console]: Error in file(file, ifelse(append, "a", "w")) : 
  cannot open the connection

R[write to console]: In addition: 

R[write to console]: In file(file, ifelse(append, "a", "w")) :
R[write to console]: 
 
R[write to console]:  cannot open file 'ann.in': Permission denied




Error in file(file, ifelse(append, "a", "w")) : 
  cannot open the connection


In [None]:
%%bash
/storage/gluster/vol1/data/PUBLIC/Tools/annovar2017Jul16/table_annovar.pl ann.in \
/storage/gluster/vol1/data/PUBLIC/Tools/annovar2017Jul16/humandb/ -buildver hg19 -out ann.out \
-remove -protocol refGene,avsnp150,snp138NonFlagged,\
exac03,1000g2015aug_all,esp6500siv2_all,kaviar_20150923,hrcr1,\
cosmic80,clinvar_20170905,\
dbnsfp33a,dbscsnv11,dann,eigen,gerp++gt2,cadd  \
-operation g,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f -nastring . --thread 80

In [None]:
ann.out <- read.delim("ann.out.hg19_multianno.txt",as.is=TRUE)
paste0(colnames(ann.out),collapse="','")
tmp <- c('Chr','Start','End','Ref','Alt','Func.refGene','Gene.refGene','GeneDetail.refGene','ExonicFunc.refGene','AAChange.refGene', #refGene
         'avsnp150', #avsnp150
         'snp138NonFlagged', #snp138NonFlagged
         'ExAC_ALL','ExAC_AFR','ExAC_AMR','ExAC_EAS','ExAC_FIN','ExAC_NFE','ExAC_OTH','ExAC_SAS', #exac03
         'X1000g2015aug_all', #1000g2015aug_all
         'esp6500siv2_all', #esp6500siv2_all
         'Kaviar_AF','Kaviar_AC','Kaviar_AN', #kaviar_20150923
         'HRC_AF','HRC_AC','HRC_AN','HRC_non1000G_AF','HRC_non1000G_AC','HRC_non1000G_AN', #hrcr1
         'cosmic80', #cosmic80
         'CLINSIG','CLNDBN','CLNACC','CLNDSDB','CLNDSDBID', #clinvar_20170905
         'SIFT_score','SIFT_converted_rankscore','SIFT_pred', #dbnsfp33a
         'Polyphen2_HDIV_score','Polyphen2_HDIV_rankscore','Polyphen2_HDIV_pred', #dbnsfp33a
         'Polyphen2_HVAR_score','Polyphen2_HVAR_rankscore','Polyphen2_HVAR_pred', #dbnsfp33a
         'LRT_score','LRT_converted_rankscore','LRT_pred', #dbnsfp33a
         'MutationTaster_score','MutationTaster_converted_rankscore','MutationTaster_pred', #dbnsfp33a
         'MutationAssessor_score','MutationAssessor_score_rankscore','MutationAssessor_pred', #dbnsfp33a
         'FATHMM_score','FATHMM_converted_rankscore','FATHMM_pred', #dbnsfp33a
         'PROVEAN_score','PROVEAN_converted_rankscore','PROVEAN_pred', #dbnsfp33a
         'VEST3_score','VEST3_rankscore','MetaSVM_score','MetaSVM_rankscore','MetaSVM_pred', #dbnsfp33a
         'MetaLR_score','MetaLR_rankscore','MetaLR_pred', #dbnsfp33a
         'M.CAP_score','M.CAP_rankscore','M.CAP_pred', #dbnsfp33a
         'CADD_raw','CADD_raw_rankscore','CADD_phred', #dbnsfp33a
         'DANN_score','DANN_rankscore', #dbnsfp33a
         'fathmm.MKL_coding_score','fathmm.MKL_coding_rankscore','fathmm.MKL_coding_pred', #dbnsfp33a
         'Eigen_coding_or_noncoding','Eigen.raw','Eigen.PC.raw', #dbnsfp33a
         'GenoCanyon_score','GenoCanyon_score_rankscore', #dbnsfp33a
         'integrated_fitCons_score','integrated_fitCons_score_rankscore','integrated_confidence_value', #dbnsfp33a
         'GERP.._RS','GERP.._RS_rankscore', #dbnsfp33a
         'phyloP100way_vertebrate','phyloP100way_vertebrate_rankscore','phyloP20way_mammalian','phyloP20way_mammalian_rankscore', #dbnsfp33a
         'phastCons100way_vertebrate','phastCons100way_vertebrate_rankscore','phastCons20way_mammalian','phastCons20way_mammalian_rankscore', #dbnsfp33a
         'SiPhy_29way_logOdds','SiPhy_29way_logOdds_rankscore', #dbnsfp33a
         'Interpro_domain', #dbnsfp33a
         'GTEx_V6_gene','GTEx_V6_tissue', #dbnsfp33a
         'dbscSNV_ADA_SCORE','dbscSNV_RF_SCORE', #dbscsnv11 SPLICE SITE
         'dann', #dann
         'Eigen', #eigen
         'gerp..gt2', #gerp++gt2
         'CADD','CADD_Phred') #cadd

sel <- c('Chr','Start','End','Ref','Alt',
         'Func.refGene','Gene.refGene','GeneDetail.refGene','ExonicFunc.refGene','AAChange.refGene', 'Interpro_domain',
         'avsnp150','snp138NonFlagged',
         'ExAC_ALL','X1000g2015aug_all','esp6500siv2_all','Kaviar_AF','HRC_AF',
         'cosmic80','CLINSIG','CLNDBN','CLNACC','CLNDSDB','CLNDSDBID',
         'SIFT_pred','Polyphen2_HDIV_pred','MutationTaster_pred','PROVEAN_pred',
         'dbscSNV_ADA_SCORE','dbscSNV_RF_SCORE',
         'dann','Eigen','gerp..gt2','CADD','CADD_Phred')

ann.out <- ann.out[,sel]
ann.out[ann.out$ExonicFunc.refGene!=".","Func.refGene"] <- ann.out[ann.out$ExonicFunc.refGene!=".","ExonicFunc.refGene"]
ann.out[ann.out$AAChange.refGene!=".","GeneDetail.refGene"] <- ann.out[ann.out$AAChange.refGene!=".","AAChange.refGene"]
ann.out <- ann.out[,!colnames(ann.out)%in%c("ExonicFunc.refGene","AAChange.refGene")]