### Variant calling module pt 2

**CMM262, Winter 2023**

Kyle Gaulton, kgaulton@health.ucsd.edu

Note: this notebook should be run using the `R [conda env: r-bio]` kernel.
<br>
<br>
<br>
<br>
<b><u>Variant genotype and annotation filtering and visualization in R</u></b>
<br><br>
Load the R packages required for this walkthrough


In [None]:
library(tidyverse)
library(reshape2)


<br>
Load tab-delimited genotype file and store in 'geno' data frame

In [None]:
# Read the genotype table from disk into the `geno` data frame.
# read.table's defaults are used for the field separator and header detection.
geno <- read.table(file = "outputs/GMTS.geno.txt")


<br>
View top part of geno so we can inspect the data frame

In [None]:
# Preview the first six rows of the freshly loaded genotype table.
head(x = geno, n = 6L)

<br>
Assign column names to the data frame

In [None]:
# Name the 39 columns of `geno`, in the order they appear in the file.
# Both "DP" and "DP.1" exist; later cells select the first one ("DP").
names(geno) <- c(
  "chr", "raw_pos", "id", "raw_ref", "raw_alt", "qual", "filter",
  "AC", "AF", "AN", "BaseQRankSum", "ClippingRankSum", "DP", "DS", "END",
  "ExcessHet", "FS", "Hrun", "HaplotypeScore", "InbreedingCoeff",
  "MLEAC", "MLEAF", "MQ", "MQRankSum", "QD", "RAW_MQ", "ReadPosRankSum", "SOR",
  "SAMPLE", "AD", "DP.1", "GQ", "GT", "MIN_DP", "PGT", "PID", "PL", "RGQ", "SB"
)

<br>
Double check that column names look correct

In [None]:
# Preview the first six rows to confirm the column names were applied.
head(x = geno, n = 6L)

<br>
Create a new column with a unique ID for each variant

In [None]:
# Build a variant key of the form chr:pos:ref:alt and store it as `varID`.
# The same key layout is built for the annotation table so the two can be joined.
geno$varID <- paste(
  geno$chr, geno$raw_pos, geno$raw_ref, geno$raw_alt,
  sep = ":"
)

<br>
Double check that the new varID column was added correctly

In [None]:
# Preview the first six rows; `varID` should now be the last column.
head(x = geno, n = 6L)

<br>
Create new column with SNP/indel variant info

In [None]:
# Classify each row by allele length: a variant is an "SNV" only when both the
# reference and the alternate allele are at most one character long; anything
# longer on either side is labelled "indel".
geno$type <- ifelse(
  nchar(as.character(geno$raw_ref)) <= 1 &
    nchar(as.character(geno$raw_alt)) <= 1,
  "SNV",
  "indel"
)


<br>
Double check that the new type column (SNV vs. indel) looks correct

In [None]:
# Preview the first six rows; `type` should now be present.
head(x = geno, n = 6L)

<br>
Set sample column to type 'character' so it doesn't get mistakenly interpreted as numbers

In [None]:
# Store sample identifiers as character strings so they are treated as labels
# rather than as numbers.
geno <- geno %>%
  mutate(SAMPLE = as.character(SAMPLE))


<br>
Double check sample column

In [None]:
# Show the first six sample identifiers.
head(geno[["SAMPLE"]], n = 6L)

<br>
Determine how many unique variants there are

In [None]:
# Count the distinct variant IDs in the genotype table.
length(unique(geno$varID))


<br>
Determine how many unique samples there are

In [None]:
# Count the distinct sample identifiers in the genotype table.
length(unique(geno$SAMPLE))


<br>
How many of each genotype category are there?

In [None]:
# Tabulate how many rows carry each genotype (GT) value.
table(geno[["GT"]])


<br>
Filter only heterozygous or homozygous alternate genotype calls

In [None]:
# Keep only heterozygous ("0/1") and homozygous-alternate ("1/1") calls;
# every other genotype value, including missing ones, is dropped.
geno <- geno %>%
  filter(GT %in% c("0/1", "1/1"))


<br>
Filter only variants that are SNPs and save as new data frame geno2

In [None]:
# Make sure the allele and allele-depth columns are plain character vectors
# before doing string operations on them.
geno$raw_ref <- as.character(geno$raw_ref)
geno$raw_alt <- as.character(geno$raw_alt)
geno$AD <- as.character(geno$AD)

# geno2: SNVs only (as the step description states), additionally excluding
# calls whose allele depth (AD) contains "." so that AD can be split into
# numeric counts in the next step.
geno2 <- geno %>%
  filter(type == "SNV", !str_detect(AD, "\\."))


<br>
Determine allele fraction observed for each SNP

In [None]:
# Split AD ("ref,alt") into integer read counts and compute the observed
# alternate-allele fraction per call.
# - remove = FALSE keeps the original AD column next to Rcnt/Acnt.
# - convert = TRUE turns the split pieces into numbers.
# NOTE: this overwrites the AF column that came from the input file with the
# per-call allele fraction; a call with zero reads for both alleles yields NaN.
geno2 <- geno2 %>%
  separate(
    AD,
    into = c("Rcnt", "Acnt"),
    remove = FALSE,
    convert = TRUE,
    sep = ","
  ) %>%
  mutate(AF = Acnt / (Rcnt + Acnt))


<br>
Summarize variants by genotype and variant type per sample

In [None]:
# Count unique variants for every sample / genotype / variant-type combination
# and show the first 20 rows of the result.
geno %>%
  distinct(SAMPLE, GT, type, varID) %>%
  group_by(SAMPLE, GT, type) %>%
  tally() %>%
  head(n = 20)
    

<br>
Summarize (median, minimum and maximum) and plot the number of variants of each type per sample

In [None]:
# Step 1: count unique variants per sample and variant type.
# Step 2: across samples, report the median, minimum and maximum count
#         for each variant type.
geno %>%
  distinct(SAMPLE, type, varID) %>%
  group_by(SAMPLE, type) %>%
  summarize(Nvariants = n()) %>%
  group_by(type) %>%
  summarize(
    medVar = median(Nvariants),
    minVar = min(Nvariants),
    maxVar = max(Nvariants)
  )

In [None]:
# geno3: number of unique variants per sample and variant type.
geno3 <- geno %>%
  distinct(SAMPLE, type, varID) %>%
  group_by(SAMPLE, type) %>%
  summarize(Nvariants = n())

# One point per sample and variant type, coloured by type.
ggplot(data = geno3, mapping = aes(x = Nvariants, y = SAMPLE, colour = type)) +
  geom_point() +
  theme_classic()

# Distribution of per-sample SNV counts.
geno3 %>%
  filter(type == "SNV") %>%
  ggplot(mapping = aes(x = Nvariants)) +
  geom_histogram() +
  theme_classic()

# Distribution of per-sample indel counts.
geno3 %>%
  filter(type == "indel") %>%
  ggplot(mapping = aes(x = Nvariants)) +
  geom_histogram() +
  theme_classic()

<br>
<b><u>Filtering and visualizing variant annotations</u></b>
<br><br>
Read in variant annotation file into data frame 'var', and add column names

In [None]:
# Read the tab-delimited annotation table (no header row) into `var`.
# read.delim() is used rather than read.delim2(): the latter treats "," as the
# decimal mark, which is meant for comma-decimal locales and can silently turn
# comma-containing fields into numbers.
var <- read.delim("outputs/myanno.21.txt", header = FALSE, sep = "\t")

# Name the 22 columns, in file order. "function" is a reserved word in R, so
# that column has to be referenced with backticks (`function`) later on.
colnames(var) <- c(
  "chr", "pos", "raw_pos", "ref", "raw_ref", "alt", "raw_alt",
  "mut_type", "function", "region_type", "region_name",
  "refGene.name", "refGene.name2", "dbSNP.name",
  "ExAC_r0_3_sites_vep.name", "ExAC_r0_3_sites_vep.AF",
  "dbNSFP.SIFT_pred", "dbNSFP.Polyphen2_HDIV_pred",
  "dbNSFP.MutationTaster_pred", "dbNSFP.MetaSVM_pred",
  "dbNSFP.clinvar_clnsig", "clinvar.CLNSIG"
)


<br>
View top of data frame to make sure all looks good

In [None]:
# Preview the first six rows of the annotation table.
head(x = var, n = 6L)

<br>
Remove leading and trailing whitespace and set column types, updating the same data frame

In [None]:
# Clean up the key columns of `var` so they can be combined into a variant ID:
# - chr: strip surrounding whitespace and add the "chr" prefix
# - raw_pos: strip surrounding whitespace and convert to numeric
# - raw_ref / raw_alt / refGene.name2: strip surrounding whitespace only
var[["chr"]] <- paste0("chr", trimws(var[["chr"]]))
var[["raw_pos"]] <- as.numeric(trimws(var[["raw_pos"]]))
var[c("raw_ref", "raw_alt", "refGene.name2")] <- lapply(
  var[c("raw_ref", "raw_alt", "refGene.name2")],
  trimws
)


<br>
Create a new column with a unique variant ID in same data frame

In [None]:
# Build the chr:pos:ref:alt key for each annotation row.
# raw_pos is a double here, and paste() would render round values in scientific
# notation (e.g. 40000000 -> "4e+07"), producing IDs that cannot match the
# genotype table. format(..., scientific = FALSE) forces plain digits and
# trim = TRUE prevents padding with leading spaces.
var <- var %>%
  mutate(
    varID = paste(
      chr,
      format(raw_pos, scientific = FALSE, trim = TRUE),
      raw_ref,
      raw_alt,
      sep = ":"
    )
  )


<br>
Summarize number of unique variants

In [None]:
# Count the distinct variant IDs in the annotation table.
length(unique(var$varID))


<br>
Summarize number of variants by functional category

In [None]:
# Number of annotation rows in each mut_type category.
tally(group_by(var, mut_type))


<br>
Summarize number of variants in each functional category for a single gene (here, DOPEY2)

In [None]:
# Restrict to annotation rows for the gene DOPEY2, then count rows per
# mut_type category.
var %>%
  filter(refGene.name2 == "DOPEY2") %>%
  group_by(mut_type) %>%
  tally()


<br>
Summarize variants with potential clinical significance in ClinVar

In [None]:
# List the unique dbSNP ID / mut_type / gene combinations among rows whose
# dbNSFP.clinvar_clnsig value equals 5.
# NOTE(review): 5 is presumably the code for "pathogenic" - confirm against the
# annotation source.
var %>%
  filter(dbNSFP.clinvar_clnsig == 5) %>%
  select(dbSNP.name, mut_type, refGene.name2) %>%
  unique()


<br>
Summarize nonsynonymous variants per gene and store in new data frame var2.  Then extract and plot the genes with the largest number of nonsynonymous variants

In [None]:
# var2: annotation rows whose mut_type contains "nonsynonymous".
var2 <- var %>%
  filter(grepl(pattern = "nonsynonymous", x = mut_type))

# Horizontal bar chart of the number of such rows per gene; the gene labels
# are shrunk so that many genes fit on the axis.
ggplot(data = var2, mapping = aes(x = refGene.name2)) +
  geom_bar() +
  coord_flip() +
  theme(axis.text.y = element_text(size = 5))



In [None]:
# Genes with the most nonsynonymous annotation rows. top_n() keeps ties, so
# more than 20 genes can be returned.
topGenes <- var2 %>%
  group_by(refGene.name2) %>%
  tally() %>%
  top_n(20, n) %>%
  select(refGene.name2)
topGenes

# Pull the full annotation rows for those genes. The join key is stated
# explicitly so the join cannot silently pick up other shared columns.
var_top <- left_join(topGenes, var2, by = "refGene.name2")

# Horizontal bar chart of nonsynonymous rows per top gene.
ggplot(var_top, aes(refGene.name2)) +
  geom_bar() +
  coord_flip()


<br>
Combine annotation and genotype data into one data frame

In [None]:
# Slim genotype table: one row per variant call per sample.
# DP is the column named "DP" (not "DP.1").
geno_lite <- geno %>%
  select(varID, SAMPLE, type, qual, GT, DP)

# Slim annotation table. distinct() removes annotation rows that are identical
# in the selected columns, so a repeated annotation does not duplicate genotype
# rows in the join and inflate downstream per-sample counts.
anno_lite <- var %>%
  select(varID, refGene.name2, mut_type, region_type, `function`) %>%
  distinct()

# Attach annotations to every genotype call, matching on the variant ID.
# Calls without an annotation are kept with NA annotation columns.
data <- left_join(geno_lite, anno_lite, by = "varID")


<br>
Summarize number of frameshift mutations per sample

In [None]:
# Count frameshift calls per sample (column NS).
# The pattern is anchored at the start because an unanchored "frameshift" also
# matches "nonframeshift ..." categories. trimws() guards against padded
# mut_type values, since that column is not trimmed on load.
# Samples with no frameshift calls do not appear in the output.
data %>%
  filter(grepl("^frameshift", trimws(mut_type))) %>%
  group_by(SAMPLE) %>%
  summarize(NS = n())


<br>
Visualize variant quality by type of variant, with one panel per sample

In [None]:
# Quality score by variant type, one panel per sample.
ggplot(data = data, mapping = aes(x = type, y = qual)) +
  geom_boxplot() +
  facet_wrap(~SAMPLE)

# Same plot with the individual calls overlaid as jittered points; boxplot
# outlier markers are hidden so those calls are not drawn twice.
ggplot(data = data, mapping = aes(x = type, y = qual)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.1) +
  facet_wrap(~SAMPLE)



<br>
Visualize variant quality for SNPs and indels from a single sample

In [None]:
# SNV calls from sample 201849403.
sdata <- data %>%
  filter(SAMPLE == "201849403", type == "SNV")

# Quality of those SNVs, one panel per mut_type category.
ggplot(data = sdata, mapping = aes(x = type, y = qual)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.1) +
  facet_grid(~mut_type)

# Indel calls from the same sample.
sdata2 <- data %>%
  filter(SAMPLE == "201849403", type == "indel")

# Quality of those indels, one panel per mut_type category.
ggplot(data = sdata2, mapping = aes(x = type, y = qual)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.1) +
  facet_grid(~mut_type)


<br>
Visualize distribution of depth of coverage across variants

In [None]:
# Depth (DP) on a log10 scale, compared between variant types.
ggplot(data = data, mapping = aes(x = type, y = log10(DP))) +
  geom_boxplot()


<br>
Visualize variants per sample by specific functional categories

In [None]:
# Stacked bar per sample: number of calls, coloured by mut_type category.
ggplot(data = data, mapping = aes(x = SAMPLE, fill = mut_type)) +
  geom_bar()

# del_vars: calls whose mut_type starts with "frameshift" or "stop".
del_vars <- data %>%
  filter(grepl(pattern = "^frameshift|^stop", x = mut_type))

# Same stacked bar chart restricted to those categories.
ggplot(data = del_vars, mapping = aes(x = SAMPLE, fill = mut_type)) +
  geom_bar()