The purpose of this code is to create a heatmap which shows that a majority of the ancestry-associated guides have SNVs in the sgRNA targeting sequence

In [1]:
#Arguments/Parameters

working_dir = "/home/jupyter/notebooks/Ancestry"
workspace_bucket = Sys.getenv('WORKSPACE_BUCKET')
out_directory = "avana14" #the output directory name

In [110]:
#Set up the environment

#load packages
library(dplyr)
library(tidyverse)
library(stringr)
library(plyr)
library(tictoc)

#Define functions
show_msg <- function(x){ 
    print(x)
    flush.console()
}

In [3]:
#Load in the guide-level SNP Matrix
paste({working_dir}, "/raw_data/depmap/", {out_directory}, sep = "") %>% setwd()
snp.guide.matrix = read.table('depmap_avana_collapsed_guide_snp_matrix.txt', sep = "\t", header = T)
colnames(snp.guide.matrix) = gsub("\\.", "-", colnames(snp.guide.matrix))

In [33]:
#Load in the depmap Chronos scores
system(glue::glue("
cd {working_dir}/raw_data/depmap/{out_directory}
gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/public-21q2_v12-achilles-gene-effect-chronos.csv .
"))

paste({working_dir}, "/raw_data/depmap/", {out_directory}, sep = "") %>% setwd()
chronos.scores = read.table('public-21q2_v12-achilles-gene-effect-chronos.csv', sep = ",", header = T)
colnames(chronos.scores) = gsub("\\..*", "", colnames(chronos.scores))

In [5]:
#Load in a list of ancestry-associated genes
#Once I get ADMIXTURE loading it will be necessary to re-run this since I suspect the predicted ancestry will change.
system(glue::glue("
cd {working_dir}/raw_data/depmap/{out_directory}
gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/all_ancestry_associated_genes.txt .
"))

paste({working_dir}, "/raw_data/depmap/", {out_directory}, sep = "") %>% setwd()
ancestry.genes = read.table('all_ancestry_associated_genes.txt', sep = "\t")
ancestry.genes = ancestry.genes[,1] %>% as.vector()

In [73]:
#Load in the sgrna bed file
paste({working_dir}, "/raw_data/depmap/", {out_directory}, sep = "") %>% setwd()
avana.bed = read.table('Avana14_filtering.bed', sep = "\t")
colnames(avana.bed) = c("chr", "start", "end", "sgrna")

In [6]:
#filter the snp.guide.matrix so that it only includes ancestry-associated genes
ancestry.sgm = snp.guide.matrix[snp.guide.matrix$gene %in% ancestry.genes, ]

In [14]:
#Drop the gene name, chromosome, and position
ancestry.sgm.pruned = ancestry.sgm[,-c(1,3,4)]


#Then collapse by sgRNA
#The reason we do this is because there may be multiple SNPs that map to a single sgRNA
collapsed.ancestry.sgm.pruned <- aggregate(. ~ sgrna, data = ancestry.sgm.pruned, FUN = sum)


#Now convert all numbers > 1 to be = 1
show_msg(glue::glue("Recoding gene-level matrix"))
gene <- collapsed.ancestry.sgm.pruned$sgrna #Take a vector of the sgrna labels
collapsed.ancestry.sgm.pruned[collapsed.ancestry.sgm.pruned > 1] <- 1 #Convert all of the numbers larger than 1 to be equal to 1
collapsed.ancestry.sgm.pruned$sgrna <- gene #Assign the sgrna names back to the matrix

Recoding gene-level matrix


In [56]:
#Keep only the samples that we have WES and WGS data for
#Also subset so that it only includes samples that were profiled with CRISPR screens


#Load in the sample annotation file
paste({working_dir}, "/raw_data/depmap/", {out_directory}, sep = "") %>% setwd()
sample.annotation = read.table('ccle_sample_tracker.csv', sep = ",", header = T)


#Slice the sample annotation file so that it only includes the samples where we have WES and WGS data
samples.to.keep = sample.annotation[sample.annotation$datatype %in% c("wgs", "wes"), ]
samples.to.keep = samples.to.keep$arxspan_id %>% as.vector() %>% unique()


#Then subset the dataset so that it only includes the data that we have WES and WGS data for
casp = collapsed.ancestry.sgm.pruned[ ,colnames(collapsed.ancestry.sgm.pruned) %in% c("sgrna", samples.to.keep)]


#Finally, subset the dataframe so that it only contains samples that have CRISPR screen data
crispr.samples.to.keep = chronos.scores$DepMap_ID %>% as.vector %>% unique() #Grab all of the samples that we want to keep
casp = casp[,colnames(casp) %in% c("sgrna", crispr.samples.to.keep)]

In [57]:
casp[1:5, 1:5]

Unnamed: 0_level_0,sgrna,ACH-001020,ACH-000317,ACH-000172,ACH-000527
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,AACAATGCCCGTTTGCTGCA,0,0,0,0
2,AACTACCTATGACAGTGCCG,0,0,0,0
3,AAGCTGCTCGATGAGAACAG,0,0,1,0
4,AATGCTGGCGGCAGCAGCAA,1,1,0,1
5,AATGGTTTGCTGGCTCCATG,0,0,0,0


In [58]:
#Add in the gene name
casp = cbind(casp$sgrna, casp) %>% data.frame()
colnames(casp)[1] <- "gene"
casp$gene = plyr::mapvalues(casp$gene, from = snp.guide.matrix$sgrna, to = snp.guide.matrix$gene, warn_missing = FALSE)

In [60]:
casp[1:10, 1:10]
casp$gene %>% unique %>% length()

Unnamed: 0_level_0,gene,sgrna,ACH.001020,ACH.000317,ACH.000172,ACH.000527,ACH.001494,ACH.000522,ACH.001609,ACH.000183
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,TPSD1,AACAATGCCCGTTTGCTGCA,0,0,0,0,0,0,0,1
2,CLSPN,AACTACCTATGACAGTGCCG,0,0,0,0,0,0,0,0
3,MDN1,AAGCTGCTCGATGAGAACAG,0,0,1,0,1,0,0,0
4,PILRA,AATGCTGGCGGCAGCAGCAA,1,1,0,1,1,1,1,1
5,PSORS1C1,AATGGTTTGCTGGCTCCATG,0,0,0,0,0,0,0,0
6,C2orf80,AATTTGACCCAAAAGGAAGA,0,0,0,0,0,0,0,0
7,ZNF169,ACACAGGTATGGCTTCTCCC,0,0,0,0,0,0,0,0
8,OR4K1,ACAGTACAATTATGAACCGG,0,0,0,0,0,1,0,0
9,UGT2B17,ACAGTTGAGAAGAATGGTGG,0,0,0,0,0,0,0,0
10,ECD,ACATTGAGAGAATAATCACT,0,0,0,1,0,0,0,0


In this next step we will map the snps to the sgrna targeting sequences to find where the snps map in the affected guides

In [74]:
#We have the bed file, but we also need to annotate the bed file with which strand the guide binds to so that we know which side the ngg is on

#Load in the guide annotation data from the gpp website
system(glue::glue("
cd {working_dir}/raw_data/depmap/{out_directory}
gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/CP0033_9606_GRCh38_20181031_ontarget.bed .
gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/CP0033_reference_20141120.csv .
"))

paste({working_dir}, "/raw_data/depmap/", {out_directory}, sep = "") %>% setwd()
brd.to.guide = read.table('CP0033_reference_20141120.csv', sep = ",")
strand.info = read.table('CP0033_9606_GRCh38_20181031_ontarget.bed', sep = "\t")

In [76]:
#Now annotate avana.bed so that it also includes the strand information
avana.bed = cbind(avana.bed, avana.bed[,4]) %>% data.frame()
colnames(avana.bed) = c("chr", "start", "end", "sgrna", "strand")

#map the strand column so that it contains the strand information
avana.bed$strand = plyr::mapvalues(avana.bed$strand, from = brd.to.guide[,1], to = brd.to.guide[,2], warn_missing = FALSE)
avana.bed$strand = plyr::mapvalues(avana.bed$strand, from = strand.info[,4], to = strand.info[,6], warn_missing = FALSE)

In [79]:
snp.guide.matrix[1:10, 1:10]

Unnamed: 0_level_0,gene,sgrna,chr,pos,ACH-002214,ACH-001020,ACH-000317,ACH-000382,ACH-000172,ACH-000527
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,OR4F17,TCTCCATTCGGTGAGCCAGT,chr1,69534,0,0,0,0,0,0
2,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939354,0,0,0,0,0,0
3,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939355,0,0,0,0,0,0
4,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939356,0,0,0,0,0,0
5,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939358,0,0,0,0,0,0
6,NOC2L,CCATCTCTCAACCATGGCGA,chr1,956930,0,0,0,0,0,0
7,KLHL17,GCCCGCCTCCTGCAGCCCCG,chr1,961286,0,0,0,0,0,0
8,KLHL17,GCCCGCCTCCTGCAGCCCCG,chr1,961297,0,0,0,0,0,0
9,KLHL17,GCGTGGCCCACAACTCCAAG,chr1,961388,0,0,0,0,0,0
10,KLHL17,GCGTGGCCCACAACTCCAAG,chr1,961395,0,0,0,0,0,0


In [80]:
head(avana.bed)

Unnamed: 0_level_0,chr,start,end,sgrna,strand
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<chr>,<chr>
1,chr1,69527,69550,TCTCCATTCGGTGAGCCAGT,+
2,chr1,168646,168669,TGTAGTCCCAGCTACTCAGG,-
3,chr1,383420,383443,GAGAATCTCTTGAACCCGGG,+
4,chr1,424270,424293,TGTAATCCCAGTACTTTGGG,+
5,chr1,451271,451294,ATAATGGTCAGATAGTGGAG,+
6,chr1,451344,451367,TCTCCACACCACCAACGACG,+


In [116]:
#Map the position of each SNP to the guides

tic()

dist.vector = apply(snp.guide.matrix, 1, function(x){
    
    gene = x[1] #extract the gene of interest
    sgrna = x[2] #extract the sgrna of interest
    pos = x[4] %>% as.numeric() #extract the position of interest
    
    strand = avana.bed[avana.bed$sgrna %in% sgrna, 5] #Figure out what strand the sgrna is mapping to
    guide.start = avana.bed[avana.bed$sgrna %in% sgrna, 2] %>% as.numeric() #Get the starting position of the guide
    guide.end = avana.bed[avana.bed$sgrna %in% sgrna, 3] %>% as.numeric() #Get the end position of the guide
    
    if(strand %in% "+"){
            dist = pos - guide.start 
    } else {
            dist = guide.end - pos
    }
    
    dist #print the output
    
})


toc()

557.26 sec elapsed


In [120]:
#Add the distance to snp.guide.matrix
snp.guide.matrix = cbind(dist.vector, snp.guide.matrix) %>% data.frame()
colnames(snp.guide.matrix)[1] <- "snp_position"


In [121]:
snp.guide.matrix[1:5, 1:5]

Unnamed: 0_level_0,snp_position,gene,sgrna,chr,pos
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<chr>,<int>
1,7,OR4F17,TCTCCATTCGGTGAGCCAGT,chr1,69534
2,19,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939354
3,20,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939355
4,21,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939356
5,23,SAMD11,GCTTGTGTCAGCACTGAGCG,chr1,939358


In [None]:
###Next step, use ggplot to construct the heatmap
###May need to massage the data to make it into a matrix or something