In [16]:
#Set up the environment
library(tidyverse)
library(reshape2)
library(data.table)

In [17]:
#Read the file back in
#NOTE: setwd() changes the working directory for the rest of the session,
#so the relative file name below is resolved inside ../data
setwd("../data")
final.df = read.table("snv_position_single_guide_finaldf.txt", sep = "\t", header = TRUE) %>%
  #Store position as a factor with levels 0 through 22 (any other value becomes NA)
  mutate(position = factor(position, levels = 0:22))

In [18]:
#Load in the guide -> gene map and convert final.df so that it also has the gene symbol
setwd("../data")
guide.map = read.table("22q1_Achilles_guide_map.csv", sep = ",", check.names = FALSE, header = TRUE) %>%
  #Keep only the first space-delimited word of the gene field
  mutate(gene = word(gene, 1))

#Add the gene column to final.df by translating every sgrna through the guide map
#Guides that are missing from the map keep their sgrna value (mapvalues leaves
#unmatched values unchanged and warn_missing = FALSE silences the notice)
final.df.gene = final.df %>%
  mutate(gene = plyr::mapvalues(sgrna, from = guide.map$sgrna, to = guide.map$gene, warn_missing = FALSE))

In [22]:
#Load in the genotyping data
#First, load in the ccle variants, keep only the rows whose 7th column is "PASS",
#and drop columns 1, 2 and 4-9 so that column 3 plus the per-sample columns remain
setwd("../data")
ccle.variant.calls = read.table("snps.in.all.avana.guides.vcf.gz", sep = "\t") %>%
  filter(V7 %in% "PASS") %>%
  select(-V1, -V2, -V4, -V5, -V6, -V7, -V8, -V9)

#Reformat the genotyping data
#Strip everything from the first ":" onwards, then recode the phased genotypes:
#0|0 -> 0, 0|1 and 1|0 -> 1, 1|1 -> 2 (presumably the alternate allele count)
#The substitutions run in a fixed order, once per column, instead of
#re-coercing the whole data frame to a matrix for every substitution
recode.genotype = function(x){
    x = gsub(":..*", "", x)
    x = gsub("0\\|0", 0, x)
    x = gsub("0\\|1", 1, x)
    x = gsub("1\\|0", 1, x)
    x = gsub("1\\|1", 2, x)
    x
}

#Every column except the first holds per-sample genotypes
genotype.columns = seq_len(ncol(ccle.variant.calls))[-1]
ccle.variant.calls[genotype.columns] = lapply(ccle.variant.calls[genotype.columns], recode.genotype)

#Load in the sample names
#Force them to character so that c() below can never pick up factor integer codes
setwd("../data")
sample.names = read.table("ccle.vcf.sample.names.txt", sep = "\t", stringsAsFactors = FALSE) %>%
  pull(1) %>%
  as.character()

#Assign the sample names
#A name vector that is too short would silently produce NA column names, so check the length first
stopifnot(length(sample.names) == length(genotype.columns))
colnames(ccle.variant.calls) = c("snp", sample.names)

#Melt the data frame into one row per snp/sample pair (columns: snp, variable, value)
#reshape2::melt is called explicitly because data.table masks melt() and would
#only redirect a plain data frame back to reshape2
melted.ccle.variant.calls = ccle.variant.calls %>%
  reshape2::melt(id.vars = "snp")



In [25]:
#Load in the 22q1 chronos scores
#The first column is renamed to "sample"; every other column is later treated as a gene
setwd("../data")
chronos.22q1 = fread("22q1_CRISPR_gene_effect.csv", sep = ",", header = TRUE, check.names = FALSE) %>%
  rename("sample" = 1)

#Get a vector of 22q1 samples
chronos.samples = chronos.22q1 %>% pull(sample)

#Melt the chronos scores into one row per sample/gene pair
#The melted column name is reduced to its first space-delimited word so that it
#is processed the same way as the gene column of the guide map
melted.chronos.22q1 = chronos.22q1 %>%
  melt(id.vars = "sample") %>%
  rename("gene" = variable) %>%
  mutate(gene = word(gene, 1))

melted.chronos.22q1 %>% head()

In [None]:
#Filter the ccle variant calls so that it only includes samples with chronos scores
#all_of() is a tidyselect helper for column selection and must not be used inside
#filter(); a plain %in% against the sample vector is the intended membership test
chronos.only.melted.ccle.variant.calls = melted.ccle.variant.calls %>%
  filter(variable %in% chronos.samples) %>%
  #Genotype codes were stored as text; anything that is not a number becomes NA here
  mutate(value = as.numeric(value))

In [None]:
#Loop through all of the guides in final.df.gene and calculate the signed difference in
#mean chronos score between samples that carry the snp and samples that do not
#(mean with variant - mean without variant)

#Pull the lookup columns once, outside of the loop
#NOTE(review): the columns are addressed by position, exactly as before: column 2 is
#assumed to hold the snp id and column 6 the gene symbol - confirm against final.df
snp.per.guide = final.df.gene[[2]]
gene.per.guide = final.df.gene[[6]]

#seq_len() makes this a no-op for an empty data frame, and vapply() builds the
#result vector in one go instead of growing it on every iteration
differential.vector = vapply(seq_len(nrow(final.df.gene)), function(i){

    #Get some variables from final.df.gene and store them so the code is easier to read
    snp.of.interest = snp.per.guide[i]
    gene.of.interest = gene.per.guide[i]


    #Get two vectors of samples with and without the variant
    #The genotype table is filtered down to the snp only once and then split by genotype
    calls.for.snp = chronos.only.melted.ccle.variant.calls %>%
    filter(snp %in% snp.of.interest)

    samples.with.variant = calls.for.snp %>%
    filter(value >= 1) %>%
    pull(variable)

    samples.without.variant = calls.for.snp %>%
    filter(value == 0) %>%
    pull(variable)


    #Get two vectors of chronos scores for samples with and without the snp
    scores.for.gene = melted.chronos.22q1 %>%
    filter(gene %in% gene.of.interest)

    chronos.with.variant = scores.for.gene %>%
    filter(sample %in% samples.with.variant) %>%
    pull(value)

    chronos.without.variant = scores.for.gene %>%
    filter(sample %in% samples.without.variant) %>%
    pull(value)


    #Calculate the differential between the two group means
    #An empty group gives NaN and a missing score gives NA, as before
    mean(chronos.with.variant) - mean(chronos.without.variant)

}, numeric(1))

In [None]:
#Attach the per-guide chronos differential to the guide table as a new column
final.df.gene.with.differential = mutate(final.df.gene, differential = differential.vector)

#Show the first rows as a quick sanity check
head(final.df.gene.with.differential)

In [None]:
#Plot the chronos differential (x) against -log10 of the fdr (y), one point per row
#The plot is stored in a variable so that ggsave() saves exactly this plot
#instead of relying on whichever plot was drawn last
differential.plot = final.df.gene.with.differential %>%
  ggplot(aes(x = differential, y = -log10(fdr))) +
  geom_point() +

  #Dashed reference line at a differential of zero
  geom_vline(xintercept = 0, linetype = 2) +

  theme_bw() +

  #Remove the grid, draw black axis lines and use size 12 text everywhere
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(color = "black"),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 12)
  ) +

  ylab("Associations between SNV and sgRNA score (FDR)") +
  xlab("Chronos score differential (with SNP - without SNP)")

#Display the plot
differential.plot


#Export it as a 5 x 5 pdf into the ../output directory
setwd('../output')
ggsave("supplemental_figure_4.pdf", plot = differential.plot, width = 5, height = 5)

In [None]:
#Save the table behind the figure as a tab-separated text file in ../output
#(header row included, no row names, no quoting)
setwd("../output")
write.table(
  x = final.df.gene.with.differential,
  file = "supplemental_figure_4_differential_df.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)