# Preprocess Input Files

The purpose of this code is to generate the input files which will be used as inputs for generating figures.

In [1]:
#Arguments/Parameters

working_dir = "/home/jupyter/notebooks/Ancestry"
workspace_bucket = Sys.getenv('WORKSPACE_BUCKET')

filtering_bed = "sanger_filtering.bed" #The name of the bed file that will be used to subset the gnomAD data. This file must be uploaded to the workspace bucket.

out_directory = "sanger" #the output directory name

In [2]:
#Load packages/software 
#Build general functions

#load libraries
library(dplyr)
library(vcfR)
library(tidyverse)

#build functions
show_msg <- function(x){ 
    print(x)
    flush.console()
}


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



   *****       ***   vcfR   ***       *****
   This is vcfR 1.12.0 
     browseVignettes('vcfR') # Documentation
     citation('vcfR') # Citation
   *****       *****      *****       *****


── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mforcats[39m 0.5.1
[32m✔[39m [34mreadr  [39m 1.4.0     

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag(

In [3]:
#Get a list of all of the files that we will be working with

paste(working_dir, "/", "filtered_output/", out_directory, sep = "") %>% setwd() #Get the working directory

file.list <- list.files(path = ".", pattern = paste("vcf.bgz.", out_directory, sep = "")) #get a list of all of the chromosomes

file.list #Print all of the files in the directory so that we can take a quick look at their names

if(length(file.list) != 24) { print("Not all chromosomes are present") } else { print("All chromososomes are present") } #print an error if list.files() can't find all of the chromosomes




[1] "All chromososomes are present"


Process all of the data frames so that they are nice and clean, then merge them all together.

This chunk will process all of the VCF files and combine them together into a single data matrix.

In [None]:
#Loop through all of the chromosome files, trim off the junk that we don't want, and combine them together in a single data frame
combined.df <- NULL
for(i in 1:length(file.list)){
print(paste("Currently working on the following file:", file.list[i], ". This is iteration number ", i, " of the loop.", sep = "")) #Print the status
subset.file <- read.table(file.list[i], sep = "\t") #load in the subset
trimmed.subset.file <- subset.file[ ,-c(3:9)] #Remove the data that we are not interested in.
combined.df <- rbind(combined.df, trimmed.subset.file) %>% data.frame()
}



#Trim down the variant calls so that it removes the complex encoding and convers it to simple 0, 1, 2 calls
combined.df <- lapply(combined.df, function(x) sub(":.*", "", x)) %>% data.frame() #Trim off the extra stuff, leaving just the 0/0, 0/1, 1/1, and ./. encoding

combined.df <- combined.df %>%
                                mutate_all(funs(str_replace(., "0/0", "0"))) %>%
                                    mutate_all(funs(str_replace(., "0/1", "1"))) %>%
                                        mutate_all(funs(str_replace(., "1/0", "1"))) %>%
                                            mutate_all(funs(str_replace(., "1/1", "2"))) %>%
                                                mutate_all(funs(str_replace(., "./.", "0"))) #Now replace the 0/0, 0/1, 1/1, and ./. encoding with 0, 1, 2
                      
                      
                      
#Load in one of the VCF files so that we can extract the sample names from it. Then use those sample names to assign the column names to 'combined.df'
vcf.file <- read.vcfR(file.list[15], verbose = FALSE) #Read in the vcf file
vcf.file <- extract.gt(vcf.file, element = 'GT', as.numeric = TRUE) #Convert it to an actual matrix
sample.names <- colnames(vcf.file) #extract all of the sample names
colnames(combined.df) <- c("chr", "pos", sample.names)

[1] "Currently working on the following file:gnomad.genomes.v3.1.hgdp_1kg_subset.chr1.vcf.bgz.sanger. This is iteration number 1 of the loop."
[1] "Currently working on the following file:gnomad.genomes.v3.1.hgdp_1kg_subset.chr10.vcf.bgz.sanger. This is iteration number 2 of the loop."
[1] "Currently working on the following file:gnomad.genomes.v3.1.hgdp_1kg_subset.chr11.vcf.bgz.sanger. This is iteration number 3 of the loop."
[1] "Currently working on the following file:gnomad.genomes.v3.1.hgdp_1kg_subset.chr12.vcf.bgz.sanger. This is iteration number 4 of the loop."
[1] "Currently working on the following file:gnomad.genomes.v3.1.hgdp_1kg_subset.chr13.vcf.bgz.sanger. This is iteration number 5 of the loop."
[1] "Currently working on the following file:gnomad.genomes.v3.1.hgdp_1kg_subset.chr14.vcf.bgz.sanger. This is iteration number 6 of the loop."
[1] "Currently working on the following file:gnomad.genomes.v3.1.hgdp_1kg_subset.chr15.vcf.bgz.sanger. This is iteration number 7 of the 

In [None]:
#Export the data
output.file.name <- paste("combined_", out_directory, "_subset.txt", sep = "") #Generate the file name
write.table(combined.df, output.file.name, sep = "\t", row.names = F, col.names = T) #Export the file

Next, we need to assign an affected sgrna to each of the variants

In [None]:
#Load in the bed file
bed.file <- read.table(filtering_bed, sep = "\t") #load in a dataset which indicates where each sgRNA binds in the genome
colnames(bed.file) <- c("chr", "start_pos", "end_pos", "sgrna") #and assign the column names

In [None]:
#get a list of all of the chromosome names
chromosomes <- combined.df$chr %>% unique()

#rename combined.df file so that I don't need to change it in the following code.
#I previously had these two as separate scripts, and I am a very lazy person
collapsed.input <- combined.df

In [None]:
#split collapsed.input and bed.file so that they are a list of data frames where each element of the list is a different chromosome
collapsed.input.split <- NULL
bed.file.split <- NULL
for(i in 1:length(chromosomes)){
print(paste("Currently working on: ", chromosomes[i], sep = ""))
collapsed.input.split[[i]] <- collapsed.input[collapsed.input$chr %in% chromosomes[i],] %>% data.frame()
bed.file.split[[i]] <- bed.file[bed.file$chr %in% chromosomes[i],] %>% data.frame()
}
names(collapsed.input.split) <- chromosomes #assign the names so that the name of each list element is the chromosome name
names(bed.file.split) <- chromosomes #assign the names so that the name of each list element is the chromosome name

In [None]:
#Set up a for() loop to iterate through all of the chromosomes
collapsed.with.sgrna <- NULL
for(i in 1:length(chromosomes)){
show_msg(glue::glue("Currently working on: {chromosomes[i]}"))
bfs <- bed.file.split[[chromosomes[i]]] #assign the dataframe from the list of dataframes to a smaller variable to make it easier to code things in the loop
cis <- collapsed.input.split[[chromosomes[i]]] #assign the dataframe from the list of dataframes to a smaller variable to make it easier to code things in the loop

#Scan through all of the guides in the bed file and make an index of all of the variants that map to each of those guides
mapped.snp.list <- apply(bfs, 1, function(x){
    bed.chr <- x[1] #extract the name of the chromosome
    bed.start.pos <- x[2] %>% as.numeric() #extract the start position of the sgrna
    bed.end.pos <- x[3] %>% as.numeric() #extract the end position of the sgrna
    between.vector <- between(as.numeric(cis$pos), bed.start.pos, bed.end.pos)
    
    #now print the kept rows
    cis[between.vector,]
})
show_msg(glue::glue("Finished variant mapping for: {chromosomes[i]}"))

#Each element of mapped.snp.list is a data frame that contains all of the information for each SNP that maps to each sgRNA. Annotate each dataframe with the appropriate guide
eoe.compiled <- NULL
for(j in 1:length(mapped.snp.list)){
eoe <- mapped.snp.list[[j]] #extract the element of the list of interest to this loop. Each element here is all of the snps that map to a specific sgrna
sgrna.vector <- rep(bfs[j,4], nrow(eoe)) %>% as.vector() #create a vector that repeats the name of the sgrna for each row in the eoe list
eoe.sgrna <- cbind(sgrna.vector, eoe) %>% data.frame()
eoe.compiled <- rbind(eoe.compiled, eoe.sgrna)
}
show_msg(glue::glue("Finished assigning sgrnas for: {chromosomes[i]}"))

#Now bind this all together in one big data frame
collapsed.with.sgrna <- rbind(collapsed.with.sgrna, eoe.compiled) %>% data.frame()
}

In [None]:
#Now export the file so that we can use it in future analysis
output.file.name <- paste("collapsed_", out_directory, "_with_sgrna.txt", sep = "") #Generate the file name
write.table(collapsed.with.sgrna, output.file.name, sep = "\t", col.names = T, row.names = F) #Export the file

Create an additional output file that takes the sgRNA-level data matrix and converts it to a gene-level data matrix

In [3]:
#Load in the file that contains the sgrna-to-gene conversion
system(glue::glue("
gsutil cp {workspace_bucket}/{out_directory}_guide_to_gene.chip {working_dir}/filtered_output/{out_directory}
"))

paste(working_dir, "/", "filtered_output/", out_directory, sep = "") %>% setwd() #Get the working directory
show_msg(glue::glue("Loading Files"))
chip.file.name <- paste(out_directory, "_guide_to_gene.chip", sep = "")
guide.to.gene <- read.table(chip.file.name, sep = "\t", header = T)
colnames(guide.to.gene) <- c("sgrna", "gene", "gene_id")
head(guide.to.gene)



#Load in the collapsed dataset
paste(working_dir, "/", "filtered_output/", out_directory, sep = "") %>% setwd() #Get the working directory
output.file.name <- paste("collapsed_", out_directory, "_with_sgrna.txt", sep = "") #Generate the file name
collapsed.with.sgrna <- read.table(output.file.name, sep = "\t", header = T)




#Drop the variant data columns since we don't need it for this version of the matrix
show_msg(glue::glue("Dropping Columns"))
collapsed.with.sgrna <- collapsed.with.sgrna[,-c(2:3)]


#Convert the sgrna data so that it is numeric
collapsed.with.sgrna[,2:ncol(collapsed.with.sgrna)] <- apply(collapsed.with.sgrna[,2:ncol(collapsed.with.sgrna)], 2, as.numeric)


#collapse the data by sgrna sequence
show_msg(glue::glue("Collapsing guide-level data"))
collapsed.matrix <- aggregate(. ~ sgrna.vector, data = collapsed.with.sgrna, FUN = sum)


#Convert all of the numbers that are greater than 1 to be 1. A number greater than 1 means that there are multiple SNPs in that guide. But we just want a binary indicator of whether or not there is a SNP in that guide
show_msg(glue::glue("Recoding guide-level matrix"))
sgrna.labels <- collapsed.matrix$sgrna.vector #Take a vector of the sgrna labels
collapsed.matrix[collapsed.matrix > 1] <- 1 #Convert all of the numbers larger than 1 to be equal to 1
collapsed.matrix$sgrna.vector <- sgrna.labels #Assign the sgrna names back to the matrix


#clean up the dataframe a bit
colnames(collapsed.matrix)[1] <- "sgrna"


#Export the guide_level_matrix
show_msg(glue::glue("Exporting guide-level matrix"))
guide.level.matrix.name <- paste(out_directory, "_guide_level_matrix.txt", sep = "")
write.table(collapsed.matrix, guide.level.matrix.name, sep = "\t", col.names = TRUE, row.names = FALSE)


#Replace the sgrna sequences in column 1 with gene names
collapsed.matrix$sgrna <- plyr::mapvalues(collapsed.matrix$sgrna, from = guide.to.gene$sgrna, to = guide.to.gene$gene)



#Just like we did above, collapse the dataset by gene names
show_msg(glue::glue("Collapsing by gene name"))
collapsed.gene.matrix <- aggregate(. ~ sgrna, data = collapsed.matrix, FUN = sum)


#Convert all of the numbers that are greater than 1 to be 1. A number greater than 1 means that there are multiple SNPs in that guide. But we just want a binary indicator of whether or not there is a SNP in that guide
show_msg(glue::glue("Recoding gene-level matrix"))
gene <- collapsed.gene.matrix$sgrna #Take a vector of the sgrna labels
collapsed.gene.matrix[collapsed.gene.matrix > 1] <- 1 #Convert all of the numbers larger than 1 to be equal to 1
collapsed.gene.matrix$sgrna <- gene #Assign the sgrna names back to the matrix
colnames(collapsed.gene.matrix)[1] <- "gene"


#Now export the collapsed.gene.matrix
show_msg(glue::glue("Exporting guide-level matrix"))
gene.level.matrix.name <- paste(out_directory, "_gene_level_matrix.txt", sep = "")
write.table(collapsed.gene.matrix, gene.level.matrix.name, sep = "\t", col.names = TRUE, row.names = FALSE)

Loading Files


Unnamed: 0_level_0,sgrna,gene,gene_id
Unnamed: 0_level_1,<chr>,<chr>,<int>
1,TCAATGGTCACAGTAGCGC,A1BG,580
2,CTGCAGCTACCGGACCGAT,A1BG,803
3,CGGGGGTGATCCAGGACAC,A1BG,424
4,TGCTGACGGGTGACACCCA,A1BG,368
5,ACATGGTATTGCAGTAGAC,A1CF,234
6,ATCTTATCGGAGATGAAAA,A1CF,3


Dropping Columns
Collapsing guide-level data
Recoding guide-level matrix
Exporting guide-level matrix


The following `from` values were not present in `x`: TCAATGGTCACAGTAGCGC, CTGCAGCTACCGGACCGAT, CGGGGGTGATCCAGGACAC, TGCTGACGGGTGACACCCA, AATATGGTGGCCCTCCACC, AACATGCACCAGGCGTGCA, GAATCCCCAATCACGTCCC, TGCTCATCCGTGGTAGCAT, CAATCCGGGTGTCGGGAGT, GCTGTACGCATATCCAATG, TTCACCGAGCTTCCGGGAG, CACGTAGTACATCACGCTC, CGCCGCGCCCTTAGATACC, GACTGGTACGCGGCCGTGC, GCAGCAGACTCTACGGAAC, TTCTTTCAAGTCCCACCAG, GCCCATCGACAGAGTGCCA, GTGAAGACGGCATCCGGCT, GCTACATTTGTGGAGCTCC, AACTTGTACATTTACGGTA, GGACAGGCCAGATCTGCCC, CCGGGCCAGATAATTGCAC, GTATGTCTCTTAGGGAATC, AATAATGTTGCAGCCCAGT, ACCACCCCGATGATCAGAG, CTGGAACAAACCATAGCAT, ACGCCTTCTACTCAGGCCC, TTCAATTGCACCCCGCCAG, GGCAAAATCATCACTACGA, GGGTAGTCGGACTTGGGAT, CTGCACACCAGGAGAATGC, GGTACTCCACAGTTGATGA, TCGCTGAGCCGCCATACGA, ATACCTTGGCAGTGACCGG, GCAGCGAAACTCACTGGCA, ATTGACTATAACTCCTGGG, GTGGACAAGGCTAATCCGA, GCTGGAGGACCTCAGACGA, CTGACGCGCGCCTTCCAGT, GCGATCTCGAGGCGCGTGC, TACGCTATCGAAGAGCTCC, GCCGCACATCTTGTCAACC, ACCACCAATCCGGTCGTAG, ATCCACGAAGCGAGATGGC, TCATTGACGCTCTGCTCAA, GC

Collapsing by gene name
Recoding gene-level matrix
Exporting guide-level matrix
