The purpose of this notebook is to generate the AF_Cutoff_vs_Gene_Number figure for the manuscript.

In [2]:
# Notebook parameters ----

# Root directory; the cells below read from <working_dir>/filtered_output/
# and write the figure to <working_dir>/figures.
working_dir <- "/home/jupyter/notebooks/Ancestry"

# Name of the sub-directory of filtered_output/ used for input and output.
out_directory <- "avana14"

# Value of the WORKSPACE_BUCKET environment variable.
workspace_bucket <- Sys.getenv("WORKSPACE_BUCKET")

In [3]:
#Set up the environment

#load packages
library(dplyr)
library(tidyverse)
library(stringr)
library(plyr)
library(ggplot2)
library(reshape2)
library(RColorBrewer)
library(viridis)
library(scales)

# Helper functions ----

# Print `x`, then flush the console so that the text is shown right away
# (useful for progress output from inside long-running loops).
show_msg <- function(x) {
  print(x)
  utils::flush.console()
}


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.2     ✔ stringr 1.4.0
✔ tidyr   1.1.3     ✔ forcats 0.5.1
✔ readr   1.4.0     

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

------------------------------------------------------------------------------

You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and 

In [7]:
# Collect the names of the gnomAD VCF files in the filtered output folder.

# Switch to <working_dir>/filtered_output/<out_directory>; the file names
# gathered here are relative to that directory.
setwd(paste0(working_dir, "/filtered_output/", out_directory))

file.list <- list.files(path = ".", pattern = "gnomad.genomes")
file.list

# Sanity check: exactly 24 matching files are expected.
if (length(file.list) != 24) {
  print("There are either too many or too few files")
} else {
  print("All of the files are here!")
}



[1] "All of the files are here!"


In [8]:
# Read the BED file that lists the genomic interval of every guide (sgRNA).
setwd(paste0(working_dir, "/filtered_output/", out_directory))

bedFile <- read.table("Avana14_filtering.bed", sep = "\t")
names(bedFile) <- c("chromosome", "startPosition", "endPosition", "sgrna")

# Discard guides located on chrY.
not.on.chrY <- bedFile$chromosome != "chrY"
bedFile <- bedFile[not.on.chrY, ]

# Keep only the first row for each sgRNA sequence.
first.occurrence <- !duplicated(bedFile$sgrna)
bedFile <- bedFile[first.occurrence, ]

head(bedFile)

Unnamed: 0_level_0,chromosome,startPosition,endPosition,sgrna
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<chr>
1,chr1,69527,69550,TCTCCATTCGGTGAGCCAGT
2,chr1,168646,168669,TGTAGTCCCAGCTACTCAGG
3,chr1,383420,383443,GAGAATCTCTTGAACCCGGG
4,chr1,424270,424293,TGTAATCCCAGTACTTTGGG
5,chr1,451271,451294,ATAATGGTCAGATAGTGGAG
6,chr1,451344,451367,TCTCCACACCACCAACGACG


In [None]:
# Determine which SNPs belong to each guide.
#
# Every VCF file in `file.list` is read and compared against the guides in
# `bedFile` located on the chromosome(s) present in that file. A SNP is
# assigned to a guide when its position lies between the guide's
# startPosition and endPosition (both ends inclusive, as implemented by
# dplyr::between()). The result, `snp.to.guide`, has one row per (guide, SNP)
# pair: the guide sequence followed by the eight VCF columns.
#
# NOTE(review): BED intervals are conventionally 0-based/half-open while VCF
# positions are 1-based. If Avana14_filtering.bed follows that convention the
# inclusive test below is one base too wide at the start -- confirm.

vcf.column.names <- c("chrom", "pos", "id", "ref", "alt", "qual", "filter", "info")

# Return the rows of `snp.data` that overlap each guide in `guides`, with the
# guide sequence prepended as column `guide.sequence`. Returns NULL when
# `guides` has no rows (seq_len() avoids the 1:0 trap of 1:nrow()).
map_snps_to_guides <- function(snp.data, guides) {
  per.guide <- lapply(seq_len(nrow(guides)), function(i) {
    guideOfInterest <- guides[i, ]
    # SNPs whose position falls within this guide's interval
    keep.list <- between(
      snp.data[, 2],
      guideOfInterest$startPosition,
      guideOfInterest$endPosition
    )
    kept.snps <- snp.data[keep.list, ]
    # Repeat the guide sequence once per retained SNP so it can be bound on
    cbind(
      guide.sequence = rep(guideOfInterest$sgrna, nrow(kept.snps)),
      kept.snps
    )
  })
  # Bind once at the end instead of growing a data frame inside the loop
  do.call(rbind, per.guide)
}

per.file <- lapply(seq_along(file.list), function(file) {
  show_msg(file) # progress: index of the file being processed

  snp.data <- read.table(file.list[file])
  colnames(snp.data) <- vcf.column.names

  # Only guides on the chromosome(s) seen in this file can overlap its SNPs
  chromosome.of.interest <- unique(snp.data$chrom)
  bed.file.subset <- bedFile[bedFile$chromosome %in% chromosome.of.interest, ]

  map_snps_to_guides(snp.data, bed.file.subset)
})

snp.to.guide <- do.call(rbind, per.file)

head(snp.to.guide)

[1] 1


In [None]:
## Annotate every (guide, SNP) row with the gene symbol of its guide.
setwd(paste0(working_dir, "/filtered_output/", out_directory))

# Lookup table; its Barcode.Sequence column is matched against the guide
# sequences and its Gene.Symbol column supplies the replacement values.
guide.to.gene <- read.table(
  "avana14_guide_to_gene.chip",
  sep = "\t",
  header = TRUE
)

# Prepend a copy of the guide-sequence column; the copy is then overwritten
# with gene symbols so each row carries both the gene and the guide.
snp.to.guide <- cbind(snp.to.guide$guide.sequence, snp.to.guide)
snp.to.guide[[1]] <- plyr::mapvalues(
  snp.to.guide[[1]],
  from = guide.to.gene$Barcode.Sequence,
  to = guide.to.gene$Gene.Symbol
)

In [None]:
## Write the dataset so that it can be reused later without re-computing it.
setwd(paste0(working_dir, "/filtered_output/", out_directory))

# write.table() stops with an error when col.names = NA is combined with
# row.names = FALSE (NA is only meaningful when row names are written), so
# the header row is requested with col.names = TRUE.
write.table(
  snp.to.guide,
  "snp_to_gene_af_matrix.txt",
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)

In [None]:
# Pull the INFO strings out of the snp.to.guide table as a plain vector and
# define which per-ancestry fields are going to be parsed from them.
info.vector <- as.vector(snp.to.guide$info)

# Ancestry codes, in the order in which their values are extracted.
ancestry.codes <- c(
  "oth", "ami", "sas", "fin", "eas",
  "amr", "afr", "mid", "asj", "nfe"
)

# INFO keys to look for: "AC-<code>=" followed by "AN-<code>=" for each
# ancestry (rbind() + as.vector() interleaves the two series).
values.to.extract <- as.vector(rbind(
  paste0("AC-", ancestry.codes, "="),
  paste0("AN-", ancestry.codes, "=")
))

# Names given to the extracted values, in the same interleaved order.
extracted.value.names <- as.vector(rbind(
  paste0("ac.", ancestry.codes),
  paste0("an.", ancestry.codes)
))

AC-oth	Alternate allele count for samples of Other ancestry
AN-oth	Total number of alleles in samples of Other ancestry
AC-ami	Alternate allele count for samples of Amish ancestry
AN-ami	Total number of alleles in samples of Amish ancestry
AC-sas	Alternate allele count for samples of South Asian ancestry
AN-sas	Total number of alleles in samples of South Asian ancestry
AC-fin	Alternate allele count for samples of Finnish ancestry
AN-fin	Total number of alleles in samples of Finnish ancestry
AC-eas	Alternate allele count for samples of East Asian ancestry
AN-eas	Total number of alleles in samples of East Asian ancestry
AC-amr	Alternate allele count for samples of Latino ancestry
AN-amr	Total number of alleles in samples of Latino ancestry
AC-afr	Alternate allele count for samples of African/African-American ancestry
AN-afr	Total number of alleles in samples of African/African-American ancestry
AC-mid	Alternate allele count for samples of Middle Eastern ancestry
AN-mid	Total number of alleles in samples of Middle Eastern ancestry
AC-asj	Alternate allele count for samples of Ashkenazi Jewish ancestry
AN-asj	Total number of alleles in samples of Ashkenazi Jewish ancestry
AC-nfe	Alternate allele count for samples of Non-Finnish European ancestry
AN-nfe	Total number of alleles in samples of Non-Finnish European ancestry

In [None]:
# Extract the allele count / allele number values from the INFO strings.
#
# For each key in `values.to.extract` (e.g. "AC-afr="), the text between the
# key and the next ";" is taken from every element of `info.vector` and
# converted to numeric. The result, `extracted.value.list`, is a list with
# one numeric vector per key, named by `extracted.value.names`.

# Return the numeric value stored under `key` (written as "NAME=") in each
# element of `info`.
#   info  character vector of ";"-separated INFO strings
#   key   field name including the trailing "="
# The key has to start a field (beginning of the string or right after ";"),
# so a field whose name merely ends in `key` is not picked up. Elements that
# do not contain the key yield NA.
extract_info_value <- function(info, key) {
  info <- as.character(info)
  field.pattern <- paste0("(^|;)", key, "[^;]*")
  hit <- regexpr(field.pattern, info)
  found <- !is.na(hit) & hit != -1L

  value <- rep(NA_character_, length(info))
  # regmatches() returns only the matched fields; strip the key (and the
  # leading ";" when present) to leave the value text.
  value[found] <- sub(paste0("^;?", key), "", regmatches(info, hit))
  as.numeric(value)
}

extracted.value.list <- lapply(seq_along(values.to.extract), function(i) {
  print(i) # progress: index of the key being extracted
  extract_info_value(info.vector, values.to.extract[i])
})
names(extracted.value.list) <- extracted.value.names # assign the names to the list

In [None]:
# Combine the extracted vectors into a single data frame with one column per
# extracted value, in list order.
extracted.value.df <- data.frame(do.call(cbind, extracted.value.list))
colnames(extracted.value.df) <- extracted.value.names

In [None]:
# Compute the totals across all ancestries that feed the overall frequency.
# The odd columns 1, 3, ..., 19 hold the allele counts (ac.*) and the even
# columns 2, 4, ..., 20 the allele numbers (an.*).
ac.columns <- seq(from = 1, to = 19, by = 2)
an.columns <- ac.columns + 1

total.maf.sum <- as.vector(rowSums(extracted.value.df[, ac.columns]))
total.genotyped.sum <- as.vector(rowSums(extracted.value.df[, an.columns]))

# Append both totals as two new columns at the end of the data frame.
extracted.value.df <- cbind(extracted.value.df, total.maf.sum, total.genotyped.sum)


In [None]:
# Build the allele-frequency table: frequency = allele count / allele number.
# Columns 1-20 of extracted.value.df alternate count and number for the ten
# ancestries and columns 21-22 hold the overall totals, so each odd column is
# divided by the even column that follows it.
count.columns <- seq(from = 1, to = 21, by = 2)
number.columns <- count.columns + 1

af.matrix <- extracted.value.df[, count.columns] / extracted.value.df[, number.columns]
colnames(af.matrix) <- c(
  "af.oth", "af.ami", "af.sas", "af.fin", "af.eas", "af.amr",
  "af.afr", "af.mid", "af.asj", "af.nfe", "af.tot"
)


In [None]:
# Put the allele frequencies side by side with the SNP/guide table and give
# every column of the combined data frame its final name.
snp.and.af.names <- c(
  "af.oth", "af.ami", "af.sas", "af.fin", "af.eas", "af.amr",
  "af.afr", "af.mid", "af.asj", "af.nfe", "af.tot",
  "gene", "sgrna", "chr", "pos", "id", "ref", "alt", "qual", "filter", "info"
)
snp.and.af <- data.frame(cbind(af.matrix, snp.to.guide))
colnames(snp.and.af) <- snp.and.af.names

In [None]:
# Count how many genes are affected at each allele-frequency (AF) cutoff.
#
# A gene counts as affected at a cutoff when at least one of its rows in
# `snp.and.af` has an allele frequency >= that cutoff. The count is made
# for the overall frequency (af.tot) and for every ancestry column, giving one
# vector per group (affected.genes.vector.<group>) with one entry per element
# of `af.cutoff.sequence`.
af.cutoff.sequence <- seq(from = 0, to = 1, by = 0.001)

# Number of distinct genes having at least one SNP at or above each cutoff.
#   af       numeric vector of allele frequencies, one per SNP row
#   gene     gene label of each SNP row (same length as `af`)
#   cutoffs  numeric vector of cutoffs
# Returns an integer vector with one count per cutoff.
# Rows with a missing frequency (NA/NaN, e.g. from 0/0) are ignored: used
# directly in a logical row subset they select NA rows, and the resulting NA
# gene would be counted as an extra gene. Rows with a missing gene label are
# not counted either.
count_affected_genes <- function(af, gene, cutoffs) {
  usable <- !is.na(af) & !is.na(gene)
  if (!any(usable)) {
    return(integer(length(cutoffs)))
  }
  # A gene has a SNP at or above the cutoff exactly when its highest AF is at
  # or above the cutoff, so the per-gene maximum is computed once up front.
  gene.max.af <- tapply(af[usable], as.character(gene[usable]), max)
  vapply(cutoffs, function(cutoff) sum(gene.max.af >= cutoff), integer(1))
}

# Counts for one group ("tot" or an ancestry code), read from the column
# af.<group> of snp.and.af. Prints one progress message per group.
affected_genes_for <- function(group) {
  show_msg(paste("Working on", group))
  af.column <- paste0("af.", group)
  stopifnot(af.column %in% colnames(snp.and.af))
  count_affected_genes(snp.and.af[[af.column]], snp.and.af$gene, af.cutoff.sequence)
}

affected.genes.vector.tot <- affected_genes_for("tot")
affected.genes.vector.oth <- affected_genes_for("oth")
affected.genes.vector.ami <- affected_genes_for("ami")
affected.genes.vector.sas <- affected_genes_for("sas")
affected.genes.vector.fin <- affected_genes_for("fin")
affected.genes.vector.eas <- affected_genes_for("eas")
affected.genes.vector.amr <- affected_genes_for("amr")
affected.genes.vector.afr <- affected_genes_for("afr")
affected.genes.vector.mid <- affected_genes_for("mid")
affected.genes.vector.asj <- affected_genes_for("asj")
affected.genes.vector.nfe <- affected_genes_for("nfe")

In [None]:
# Gather the cutoffs and the per-group gene counts into one data frame for
# plotting: the first column is the cutoff, the remaining columns hold the
# counts for each group.
dataframe.for.plotting <- data.frame(cbind(
  af_cutoff = af.cutoff.sequence,
  AFR = affected.genes.vector.afr,
  NFE = affected.genes.vector.nfe,
  AMI = affected.genes.vector.ami,
  SAS = affected.genes.vector.sas,
  FIN = affected.genes.vector.fin,
  EAS = affected.genes.vector.eas,
  AMR = affected.genes.vector.amr,
  ASJ = affected.genes.vector.asj,
  MID = affected.genes.vector.mid,
  OTH = affected.genes.vector.oth,
  Total = affected.genes.vector.tot
))

In [None]:

# Reshape to long format: one row per (cutoff, group) pair, with the group
# label in `Ancestry` and the gene count in `value`.
melted.dataframe.for.plotting <- melt(
  dataframe.for.plotting,
  id.vars = "af_cutoff",
  variable.name = "Ancestry",
  value.name = "value"
)

# Scatter plot of the number of affected genes against the cutoff, coloured
# by group, with a log10 y axis. All theme settings are given in one theme()
# call.
ggplot(
  melted.dataframe.for.plotting,
  aes(x = af_cutoff, y = value, color = Ancestry)
) +
  geom_point() +
  scale_color_brewer(palette = "Spectral") +
  scale_y_continuous(
    breaks = c(0, 1, 10, 100, 1000, 10000),
    trans = log10_trans()
  ) +
  labs(x = "Genotype Frequency", y = "# Affected Genes") +
  theme_bw() +
  theme(
    # no panel frame or grid lines, only black axis lines
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black"),
    # bold axis tick labels and titles
    axis.text = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(face = "bold", size = 14),
    axis.text.y = element_text(face = "bold", size = 14),
    axis.title = element_text(size = 14, face = "bold"),
    axis.title.x = element_text(color = "black", size = 14, face = "bold"),
    axis.title.y = element_text(color = "black", size = 14, face = "bold"),
    # bold legend
    legend.title = element_text(size = 14, face = "bold"),
    legend.text = element_text(size = 10, face = "bold")
  )


In [None]:
# Export the plot: save it under <working_dir>/figures and copy the file to
# the storage bucket with gsutil.

figure.dir <- file.path(working_dir, "figures")
figure.name <- "af_cutoff_vs_gene_number.tiff"
figure.path <- file.path(figure.dir, figure.name)

# NOTE(review): the destination is hard-coded although `workspace_bucket` is
# read from the environment in the parameters cell; presumably they refer to
# the same bucket -- confirm before switching.
destination.bucket <- "gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514"

# Make sure the output folder exists, then save the most recently displayed
# plot (ggsave() defaults to the last plot) using an explicit path rather
# than changing the working directory.
dir.create(figure.dir, showWarnings = FALSE, recursive = TRUE)
ggsave(figure.name, path = figure.dir) # save the plot

# Fail loudly if the figure was not written instead of uploading nothing.
if (!file.exists(figure.path)) {
  stop("The figure was not written to ", figure.path, call. = FALSE)
}

# Upload the file. The exit status of gsutil is checked so that a failed copy
# is reported instead of being silently ignored.
copy.status <- system2("gsutil", c("cp", shQuote(figure.path), destination.bucket))
if (!identical(as.integer(copy.status), 0L)) {
  warning(
    "gsutil cp exited with status ", copy.status,
    "; the figure may not have been uploaded to ", destination.bucket,
    call. = FALSE
  )
}

In [None]:
###This may have errored somewhere in the process. As a next step, check to make sure the figure was output correctly.