The purpose of this code is to construct a matrix which lists which sgRNAs are affected by which SNVs in the DepMap dataset

In [37]:
#Arguments

working_dir = "/home/jupyter/notebooks/Ancestry"
workspace_bucket = Sys.getenv('WORKSPACE_BUCKET')

filtering_bed = "Avana14_filtering.bed" #The name of the bed file that will be used to subset the gnomAD data. This file must be uploaded to the workspace bucket.

out_directory = "avana14" #the output directory name

ccle_vcf_name = "ccle.all.called.vcf"

In [77]:
#Set up the environment
library(dplyr)
library(tidyverse)
library(stringr)
library(plyr)


#Install Picard if it needs to be installed
system(glue::glue("
cd {working_dir}

#Create the directory to store the liftover files
if [ ! -d '{working_dir}/software/picard' ] 
then
mkdir {working_dir}/software/picard
cd {working_dir}/software/picard
wget https://github.com/broadinstitute/picard/releases/download/2.25.7/picard.jar
fi


"))


#Define functions

show_msg <- function(x){ 
    print(x)
    flush.console()
}

In [17]:
#Create the directory structure

system(glue::glue("

cd {working_dir}

#Build the master directory if it doesn't exist
if [ ! -d '{working_dir}/depmap_analysis' ] 
then
mkdir depmap_analysis
fi

#Make the sample-specific sub-directory
mkdir depmap_analysis/{out_directory}

"))



system(glue::glue("

cd {working_dir}

#Build the master directory if it doesn't exist
if [ ! -d '{working_dir}/raw_data/depmap' ] 
then
mkdir raw_data/depmap
fi

#Make the sample-specific sub-directory
mkdir raw_data/depmap/{out_directory}

"))



system(glue::glue("
cd {working_dir}

#Create the directory to store the liftover files
if [ ! -d '{working_dir}/LiftOver' ] 
then
mkdir LiftOver
fi

if [ ! -d '{working_dir}/LiftOver/hg19_to_hg38' ] 
then
mkdir LiftOver/hg19_to_hg38
fi

"))

In [17]:
#Download the DepMap variant calls

system(glue::glue("
cd {working_dir}/raw_data/depmap/{out_directory}

gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/recoded.ccle.all.called.vcf .

"))

#Download the LiftOver files
system(glue::glue("
cd {working_dir}/LiftOver/hg19_to_hg38
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
"))



In [11]:
#Construct the GATK dict file
system(glue::glue("
cd {working_dir}/LiftOver/hg19_to_hg38
java -jar {working_dir}/software/picard/picard.jar CreateSequenceDictionary R=hg38.fa.gz O=hg38.dict
"))

In [5]:
#zip recoded.ccle.all.called.vcf
system(glue::glue("
cd {working_dir}/raw_data/depmap/{out_directory}
bgzip -c recoded.{ccle_vcf_name} > recoded.{ccle_vcf_name}.gz
"))

In [10]:
#LiftOver the variant calls to hg38 because they are currently hg19
system(glue::glue("
java -jar {working_dir}/software/picard/picard.jar LiftoverVcf I={working_dir}/raw_data/depmap/{out_directory}/recoded.{ccle_vcf_name}.gz O={working_dir}/raw_data/depmap/{out_directory}/hg38.{ccle_vcf_name}.gz MAX_RECORDS_IN_RAM=10000 CHAIN={working_dir}/LiftOver/hg19_to_hg38/hg19ToHg38.over.chain.gz REJECT={working_dir}/raw_data/depmap/{out_directory}/liftover_rejected_variants.vcf R={working_dir}/LiftOver/hg19_to_hg38/hg38.fa.gz
"))

In [12]:
#Subset the hg38.ccle.all.called vcf so that it only includes variants that map to avana14 sgrna targeting sequences

system(glue::glue("
cd {working_dir}/raw_data/depmap/{out_directory}
/home/jupyter/notebooks/Ancestry/software/bcftools/bcftools view -R {working_dir}/filtered_output/{out_directory}/{filtering_bed} hg38.{ccle_vcf_name}.gz -x INFO -o hg38.{ccle_vcf_name}.gz.avana.subset -Ov
"))

In [31]:
###Get all of the cds names from the vcf file so that we can re-assign them as the sample headers for the matrix

#Extract the cds names from the vcf file
system(glue::glue("
cd {working_dir}/raw_data/depmap/{out_directory}
echo | head -n 118 recoded.{ccle_vcf_name} | tail -n 1 | cut -c3- > cds_names.txt
"))

#Read in the cds names
paste(working_dir, "/raw_data/depmap/", out_directory, sep = "") %>% setwd()
cds.names <- read.table('cds_names.txt', sep = "\t")

#The cds.names dataset is a dataframe with several rows and 1 column. Each column has a string with multiple samples in it. All rows are the same, so we will want to combine them
complex.strings <- NULL
for(i in 1:nrow(cds.names)){
    row.of.interest <- cds.names[i,] #Set the row of interest as a variable
    split.row <- data.frame(strsplit(row.of.interest, " ")) #split the row on white space
    colnames(split.row) <- "split.row"
    complex.strings <- rbind(complex.strings, split.row) #bind it to the variable in the main environment
}
complex.strings <- as.vector(complex.strings[,1])


#The output is now a data frame where each row is in the format of /asdf/asdf/asdf/asdf/asdf/CDS-XXXXX.vcf.gz, and we want only the last part.
#Each element has a '/' before the CDS, and no slashes after. So we can use gsub to remove all the junk
complex.strings <- gsub(".*/", "", complex.strings)
complex.strings <- gsub('_cnn_filtered.vcf.gz', '', complex.strings) #Now each CDS ID has "_cnn_filtered.vcf.gz" after it, so remove that.


###Oh no, complex.strings is longer than we expect. There are only 1918 samples in the dataset, but the length of comple.strings is 1935.
complex.string.index <- grep("CDS", complex.strings)
simple.strings <- complex.strings[complex.string.index]

In [44]:
#Take the avana subset matrix vcf file and convert it so that is it just a snp x cell line data matrix
paste(working_dir, "/raw_data/depmap/", out_directory, sep = "") %>% setwd()
avana.subset <- read.table('hg38.ccle.all.called.vcf.gz.avana.subset', sep = "\t") #Load in the avana subset dataset
dim(avana.subset)
avana.subset <- select(avana.subset, -c(3:9)) #Remove the columns that we don't need.
dim(avana.subset)

avana.subset <- lapply(avana.subset, function(x) sub(":.*", "", x)) %>% data.frame() #Remove the extra junk from the variant calls


#Replace the 0/0, 0/1, 1/1 notation with 0 1 and 2 to make things easier
print("Converting variant notation to 0, 1, 2 format")
avana.subset <- avana.subset %>%
mutate_all(funs(str_replace(., "0/0", "0"))) %>%
mutate_all(funs(str_replace(., "0/1", "1"))) %>%
mutate_all(funs(str_replace(., "1/0", "1"))) %>%
mutate_all(funs(str_replace(., "1/1", "2"))) %>%
mutate_all(funs(str_replace(., "./.", "0")))

avana.subset <- avana.subset %>% data.frame() #Convert the data to a data frame

[1] "Converting variant notation to 0, 1, 2 format"


ERROR: Error in eval(expr, envir, enclos): object 'sample.names' not found


# #Format/adjust 'avana.subset'

In [46]:
#Assign the correct column names to the data frame
colnames(avana.subset) <- c ("chr", "pos", simple.strings)

#Convert the data so that it is numeric
avana.subset[,2:ncol(avana.subset)] <- apply(avana.subset[,2:ncol(avana.subset)], 2, as.numeric)

#Now convert the cds sample names into the ACH IDs
system(glue::glue("
cd cd {working_dir}/raw_data/depmap/{out_directory}
gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/ccle_sample_tracker.csv .
")) #Download the bridging file from the google bucket

#Load the cds-to-ach bridging file
paste(working_dir, "/raw_data/depmap/", out_directory, sep = "") %>% setwd()
cds.to.ach <- read.table('ccle_sample_tracker.csv', sep = ",", header = T)

#Filter the bridging file so that it only includes the wes and wgs data
cds.to.ach <- cds.to.ach[cds.to.ach$datatype %in% c("wgs", "wes"),]

#Assign the column names of avana.subset so that they are ACH IDs rather than CDS IDs
colnames(avana.subset) <- plyr::mapvalues(colnames(avana.subset), from = cds.to.ach$cds_id, to = cds.to.ach$arxspan_id)

#Remove the duplicated columns
avana.subset <- avana.subset[, !duplicated(colnames(avana.subset))]

#Remove the samples that do not have an ACH ID
avana.subset <- select(avana.subset, -contains("CDS"))

# #Now map the variants in this dataset back to the guides 

In [98]:
#Load in the bed file
system(glue::glue("
cd cd {working_dir}/raw_data/depmap/{out_directory}
gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/Avana14_filtering.bed .
")) #Download the bed file from the google bucket

paste(working_dir, "/raw_data/depmap/", out_directory, sep = "") %>% setwd()
bedFile <- read.table(filtering_bed, sep = "\t")
colnames(bedFile) <- c("chr", "startpos", "endpos", "sgrna")

#Get the chromosome names
chromosome.names <- avana.subset$chr %>% unique()

collapsed.guide.snp.matrix <- NULL
for(i in 1:length(chromosome.names)){

coi <- chromosome.names[i] #Get the name of the chromosome that we are interested in for this iteration of the loop
show_msg(glue::glue("
currently running on chromosome {coi}
")) #Print the status


subset.snp.data <- avana.subset[avana.subset$chr %in% coi, ] #subset the snp data so that it only includes the chromosome that we are interested in
subset.guide.map <- bedFile[bedFile$chr %in% coi, ] #subset the bed file so that it only includes the chromosome that we are interested in


    #Find which SNPs map to each guide
show_msg(glue::glue("
currently mapping snps for chromosome {coi}
")) #Print the status
    
between.list <- apply(subset.guide.map, 1, function(x){
	between.vector <- between(subset.snp.data$pos, x[2], x[3])
	subset.snp.data.filtered <- subset.snp.data[between.vector, ]
	})

#assign the names to the list
names(between.list) <- subset.guide.map$sgrna


#Now convert the list into a dataframe
df.collapsed <- bind_rows(between.list, .id = "column_label")


#Bind the output back to the main matrix
collapsed.guide.snp.matrix <- rbind(collapsed.guide.snp.matrix, df.collapsed)

}

currently running on chromosome chr1
currently mapping snps for chromosome chr1
currently running on chromosome chr2
currently mapping snps for chromosome chr2
currently running on chromosome chr3
currently mapping snps for chromosome chr3
currently running on chromosome chr4
currently mapping snps for chromosome chr4
currently running on chromosome chr5
currently mapping snps for chromosome chr5
currently running on chromosome chr6
currently mapping snps for chromosome chr6
currently running on chromosome chr7
currently mapping snps for chromosome chr7
currently running on chromosome chr8
currently mapping snps for chromosome chr8
currently running on chromosome chr9
currently mapping snps for chromosome chr9
currently running on chromosome chr10
currently mapping snps for chromosome chr10
currently running on chromosome chr11
currently mapping snps for chromosome chr11
currently running on chromosome chr12
currently mapping snps for chromosome chr12
currently running on chromosome ch

In [115]:
#Add the gene name to collapsed.guide.snp.matrix

#First download and load in the chip file to bridge between sgrna and gene symbol
system(glue::glue("
cd cd {working_dir}/raw_data/depmap/{out_directory}
gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/avana14_guide_to_gene.chip .
")) #Download the bed file from the google bucket

paste(working_dir, "/raw_data/depmap/", out_directory, sep = "") %>% setwd()
chipFile <- read.table("avana14_guide_to_gene.chip", sep = "\t", header = T)

head(chipFile)


#Add an additional column to collapsed.guide.snp.matrix
collapsed.guide.snp.matrix <- cbind(collapsed.guide.snp.matrix$column_label, collapsed.guide.snp.matrix) %>% data.frame()
colnames(collapsed.guide.snp.matrix)[1:2] <- c("gene", "sgrna")


#Now replace the extra sgrna column with gene symbols
collapsed.guide.snp.matrix$gene <- plyr::mapvalues(collapsed.guide.snp.matrix$gene, from = chipFile$Barcode.Sequence, to = chipFile$Gene.Symbol)



Unnamed: 0_level_0,Barcode.Sequence,Gene.Symbol,Gene.ID
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,AAAAAAATCCAGCAATGCAG,SHOC2,8036
2,AAAAAACCCGTAGATAGCCT,NDUFA12,55967
3,AAAAAAGAAGAAAAAACCAG,SDAD1,55153
4,AAAAAAGCTCAAGAAGGAGG,FAM98A,25940
5,AAAAAAGGCTGTAAAAGCGT,ZNF253,56242
6,AAAAAAGGGCTCCAAAAAGG,H2BC7,8343


The following `from` values were not present in `x`: AAAAAAATCCAGCAATGCAG, AAAAACAACACATCAGAGCG, AAAAAGATCATGATTGAGCG, AAAAAGCTGGGTTAGAAGCG, AAAAAGCTTCCGCCTGATGG, AAAAATGCTGAATTTCCCAG, AAAAATGTATACTAACCAGG, AAAACAATCTCACCTCTGGG, AAAACAGCTGAGACTTAAAA, AAAACAGGACGATGTGCGGC, AAAACATCGACCGAAAGCGT, AAAACCCAGGAAATTAGCAA, AAAACCCAGGAAATTAGCAA, AAAACCCAGGAAATTAGCAA, AAAACCCAGGAAATTAGCAA, AAAACCTCGGATTTCAGCCG, AAAAGAGCAGGAGTTCCTGG, AAAAGATCACTGAAGTTGGA, AAAAGCAGCCAGCAACTTGA, AAAAGCAGCCAGCAACTTGA, AAAAGGCATGATGCTCACCA, AAAAGGCCTGACATATCTGA, AAAAGGCCTGACATATCTGA, AAAAGGCTGTCCCATCCAGT, AAAAGTATAGAAAAGTCTGG, AAAAGTATTCTCAAACTAGA, AAAAGTCCACCAGAGCCATG, AAAAGTGTCGAGGAGGGCGT, AAAAGTGTGTGTGGGAGCTG, AAAATAGAGGAGGAACAAGG, AAAATAGCAGTAAACTCAAC, AAAATCGATGGGCTGAATCT, AAAATCGCTCAGTTACAGGA, AAAATCGTCCCATCCCCCAG, AAAATCGTCCCATTCCCCAG, AAAATCGTCCCATTCCCCAG, AAAATCGTCCCATTCCCCAG, AAAATCGTCCCATTCCCCAG, AAAATCGTCCCATTCCCCAG, AAAATCGTCCCATTCCCCAG, AAAATCGTCCCATTCCCCAG, AAAATCGTCCCATTCCCCAG, AAAATCGTCCCATTCCCCAG, A

Unnamed: 0_level_0,gene,sgrna,chr,pos,ACH.002214
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>
1,OR4F17,TCTCCATTCGGTGAGCCAGT,chr1,69534,0
2,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939354,0
3,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939355,0
4,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939356,0
5,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939358,0
6,NOC2L,CCATCTCTCAACCATGGCGA,chr1,956930,0
7,KLHL17,GCCCGCCTCCTGCAGCCCCG,chr1,961286,0
8,KLHL17,GCCCGCCTCCTGCAGCCCCG,chr1,961297,0
9,KLHL17,GCGTGGCCCACAACTCCAAG,chr1,961388,0
10,KLHL17,GCGTGGCCCACAACTCCAAG,chr1,961395,0


In [117]:
#Write the dataset so that we can use it in future analysis
paste(working_dir, "/raw_data/depmap/", out_directory, sep = "") %>% setwd()
write.table(collapsed.guide.snp.matrix, "depmap_avana_collapsed_guide_snp_matrix.txt", sep = "\t", col.names = T, row.names = F)