### Variant calling module pt 2

**CMM262, Winter 2022**

Kyle Gaulton, kgaulton@gmail.com
<br>
<br>
Variant genotype and annotation filtering and visualization in R
<br>

In [None]:
# load required packages

library(tidyverse)
library(reshape2)


In [None]:
# Read the genotype table from disk with read.table's default parsing
# (fields are split on whitespace; header detection is left to read.table).
geno <- read.table(file = "GMTS.geno.txt")

In [None]:
# Preview the first six rows of the freshly loaded table
head(geno, n = 6L)

In [None]:
# Assign names to the 39 genotype-table columns, in file order.
names(geno) <- c(
  # variant identity and call-level fields
  "chr", "raw_pos", "id", "raw_ref", "raw_alt", "qual", "filter",
  # site-level annotation fields
  "AC", "AF", "AN", "BaseQRankSum", "ClippingRankSum", "DP", "DS", "END",
  "ExcessHet", "FS", "Hrun", "HaplotypeScore", "InbreedingCoeff",
  "MLEAC", "MLEAF", "MQ", "MQRankSum", "QD", "RAW_MQ", "ReadPosRankSum", "SOR",
  # sample name followed by per-sample genotype fields
  "SAMPLE", "AD", "DP.1", "GQ", "GT", "MIN_DP", "PGT", "PID", "PL", "RGQ", "SB"
)

In [None]:
# Preview the first six rows now that the columns are named
head(geno, n = 6L)

In [None]:
# Build a variant identifier of the form chr:pos:ref:alt and store it as varID
geno$varID <- with(geno, paste(chr, raw_pos, raw_ref, raw_alt, sep = ":"))

In [None]:
# Preview the first six rows, including the new varID column
head(geno, n = 6L)

In [None]:
# Label each row by variant type: "SNV" when both the ref and alt alleles are
# at most one character long, "indel" otherwise.
geno <- geno %>%
  mutate(
    type = ifelse(
      nchar(as.character(raw_ref)) <= 1 & nchar(as.character(raw_alt)) <= 1,
      "SNV",
      "indel"
    )
  )


In [None]:
# Preview the first six rows, including the new type column
head(geno, n = 6L)

In [None]:
# Store sample identifiers as character strings
geno <- geno %>% mutate(SAMPLE = as.character(SAMPLE))

In [None]:
# Preview the first six sample identifiers
head(geno[["SAMPLE"]], n = 6L)

In [None]:
# Count the distinct variant IDs in the genotype table
length(unique(geno[["varID"]]))

In [None]:
# Count the distinct samples in the genotype table
length(unique(geno[["SAMPLE"]]))

In [None]:
# Tabulate how many rows carry each genotype (GT) value
table(geno[["GT"]])

In [None]:
# Keep only heterozygous ("0/1") and homozygous-alternate ("1/1") genotypes
geno <- geno %>%
  filter(GT %in% c("0/1", "1/1"))


In [None]:
# Coerce the allele and allelic-depth columns to character, then build geno2
# by dropping every genotype whose AD field contains a "." character.
geno <- geno %>%
  mutate(
    raw_ref = as.character(raw_ref),
    raw_alt = as.character(raw_alt),
    AD = as.character(AD)
  )
geno2 <- geno %>%
  filter(!str_detect(AD, "\\."))


In [None]:
# Compute the allelic fraction for each genotype.
# AD is split on "," into Rcnt and Acnt (converted to numbers by convert = TRUE);
# the original AD column is kept (remove = FALSE).
# NOTE(review): the result is stored in AF, which overwrites the AF column that
# geno2 already carries from the input table -- confirm this is intended.
geno2 <- geno2 %>%
  separate(
    AD,
    into = c("Rcnt", "Acnt"),
    sep = ",",
    remove = FALSE,
    convert = TRUE
  ) %>%
  mutate(AF = Acnt / (Rcnt + Acnt))

In [None]:
# For every sample, count the distinct variants of each genotype and type
geno %>%
  distinct(SAMPLE, GT, type, varID) %>%
  group_by(SAMPLE, GT, type) %>%
  tally()

In [None]:
# Per-sample variant counts by type, summarised across samples as
# median / minimum / maximum for each variant type
geno %>%
  distinct(SAMPLE, type, varID) %>%
  group_by(SAMPLE, type) %>%
  summarize(Nvariants = n()) %>%
  group_by(type) %>%
  summarize(
    medVar = median(Nvariants),
    minVar = min(Nvariants),
    maxVar = max(Nvariants)
  )

In [None]:
# Number of distinct variants per sample and variant type
geno3 <- geno %>%
  distinct(SAMPLE, type, varID) %>%
  group_by(SAMPLE, type) %>%
  summarize(Nvariants = n())

# Variant count per sample, coloured by variant type
ggplot(geno3, aes(x = Nvariants, y = SAMPLE, colour = type)) +
  geom_point() +
  theme_classic()

# Distribution of per-sample SNV counts
geno3 %>%
  filter(type == "SNV") %>%
  ggplot(aes(x = Nvariants)) +
  geom_histogram() +
  theme_classic()

# Distribution of per-sample indel counts
geno3 %>%
  filter(type == "indel") %>%
  ggplot(aes(x = Nvariants)) +
  geom_histogram() +
  theme_classic()

**Filtering and visualizing variant annotations**

In [None]:
# Load the tab-separated variant annotation table (the file has no header row)
# and name its 22 columns.
# NOTE(review): read.delim2 defaults to dec = ",", so values written with a
# decimal point are not parsed as numbers -- confirm this reader is intended.
var <- read.delim2("myanno.21.txt", header = FALSE, sep = "\t")
colnames(var) <- c(
  "chr", "pos", "raw_pos", "ref", "raw_ref", "alt", "raw_alt",
  "mut_type", "function", "region_type", "region_name",
  "refGene.name", "refGene.name2", "dbSNP.name",
  "ExAC_r0_3_sites_vep.name", "ExAC_r0_3_sites_vep.AF",
  "dbNSFP.SIFT_pred", "dbNSFP.Polyphen2_HDIV_pred",
  "dbNSFP.MutationTaster_pred", "dbNSFP.MetaSVM_pred",
  "dbNSFP.clinvar_clnsig", "clinvar.CLNSIG"
)


In [None]:
# Preview the first six rows of the annotation table
head(var, n = 6L)

In [None]:
# Strip leading/trailing whitespace from the key columns; additionally prefix
# the chromosome with "chr" and convert the position to numeric.
var <- var %>%
  mutate(
    chr = paste0("chr", trimws(chr)),
    raw_pos = as.numeric(trimws(raw_pos)),
    raw_ref = trimws(raw_ref),
    raw_alt = trimws(raw_alt),
    refGene.name2 = trimws(refGene.name2)
  )


In [None]:
# Build a variant identifier of the form chr:pos:ref:alt and store it as varID
var$varID <- with(var, paste(chr, raw_pos, raw_ref, raw_alt, sep = ":"))


In [None]:
# Count the distinct variant IDs in the annotation table
length(unique(var[["varID"]]))


In [None]:
# Count annotation rows for each mut_type value
var %>%
  as_tibble() %>%
  count(mut_type)

In [None]:
# Count annotation rows for each mut_type value, restricted to the gene DOPEY2
var %>%
  as_tibble() %>%
  filter(refGene.name2 == "DOPEY2") %>%
  count(mut_type)

In [None]:
# List the unique (dbSNP name, mut_type, gene) combinations among rows whose
# dbNSFP.clinvar_clnsig value equals 5
var %>%
  filter(dbNSFP.clinvar_clnsig == 5) %>%
  select(dbSNP.name, mut_type, refGene.name2) %>%
  unique()

In [None]:
# Keep annotation rows whose mut_type mentions "nonsynonymous"
var2 <- var %>%
  filter(grepl(pattern = "nonsynonymous", x = mut_type))

# Horizontal bar chart of the number of such rows per gene
ggplot(var2, aes(x = refGene.name2)) +
  geom_bar() +
  coord_flip() +
  theme(axis.text.y = element_text(size = 5))


In [None]:
# Pick the 20 genes with the most rows in var2 (ties at the cutoff are kept)
topGenes <- var2 %>%
  group_by(refGene.name2) %>%
  tally() %>%
  top_n(20, n) %>%
  select(refGene.name2)
topGenes

# Pull the var2 rows for those genes. The join key is named explicitly so the
# join does not depend on whichever columns the two tables happen to share.
var_top <- left_join(topGenes, var2, by = "refGene.name2")

# Horizontal bar chart of row counts for the top genes
ggplot(var_top, aes(x = refGene.name2)) +
  geom_bar() +
  coord_flip()

In [None]:
# Join genotype and annotation data on the shared variant ID.
# Only the columns needed downstream are kept from each table.
geno_lite <- geno %>%
  select(varID, SAMPLE, type, qual, GT, DP)
anno_lite <- var %>%
  select(varID, refGene.name2, mut_type, region_type, `function`)

# varID is the only column the two tables have in common; it is named
# explicitly so the join key cannot change silently if columns are added.
# Genotype rows without a matching annotation are kept with NA annotations.
data <- left_join(geno_lite, anno_lite, by = "varID")


In [None]:
# Number of nonsynonymous variants per sample.
# The filter matches "nonsynonymous" (the same pattern used to build var2);
# it previously matched "frameshift", which contradicted both this heading
# and the NS output column.
data %>%
  filter(grepl("nonsynonymous", mut_type)) %>%
  group_by(SAMPLE) %>%
  summarize(NS = n())

In [None]:
# Quality score by variant type, one panel per sample
ggplot(data, aes(x = type, y = qual)) +
  geom_boxplot() +
  facet_wrap(~SAMPLE)

# Same comparison with boxplot outliers hidden and the points overlaid as jitter
ggplot(data, aes(x = type, y = qual)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.1) +
  facet_wrap(~SAMPLE)



In [None]:
# SNV calls for sample 201849403, quality shown per mut_type panel
sdata <- data %>%
  filter(SAMPLE == "201849403", type == "SNV")

ggplot(sdata, aes(x = type, y = qual)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.1) +
  facet_grid(~mut_type)

# Indel calls for the same sample, quality shown per mut_type panel
sdata2 <- data %>%
  filter(SAMPLE == "201849403", type == "indel")

ggplot(sdata2, aes(x = type, y = qual)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.1) +
  facet_grid(~mut_type)


In [None]:
# Distribution of log10-transformed DP for each variant type
ggplot(data, aes(x = type, y = log10(DP))) +
  geom_boxplot()


In [None]:
# Stacked bar chart of rows per sample, filled by mut_type
ggplot(data, aes(x = SAMPLE, fill = mut_type)) +
  geom_bar()

# Restrict to rows whose mut_type starts with "frameshift" or "stop"
del_vars <- data %>%
  filter(grepl("^frameshift|^stop", mut_type))

# Same stacked bar chart for that subset
ggplot(del_vars, aes(x = SAMPLE, fill = mut_type)) +
  geom_bar()

# plot 