In [1]:
#Set up the environment
library(tidyverse)
library(reshape2)
library(data.table)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.0     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘reshape2’


The following object is masked from ‘package:tidyr’:

    smiths



Attaching package: ‘data.table’


The following objects are masked f

In [2]:
#replicate map
setwd("../data")
avana.replicate.map = fread("22q1_Achilles_replicate_map.csv", sep = ",", check.names = FALSE, header = T)

In [3]:
#guide map
setwd("../data")
avana.guide.map = fread("22q1_Achilles_guide_map.csv", sep = ",", check.names = FALSE, header = T) %>%
mutate(gene = word(gene, 1))

In [4]:
#Load in the lineage-corrected ancestry-associated p-value data frame
setwd("../data")
lineage.corrected.pvals = read.table("lm_ancestry_associated_dependency_pvals.txt", sep = "\t", header = T)

#Get a vector of ancestry-associated genes
ancestry.associated.genes = lineage.corrected.pvals %>%
filter(afr_fdr < 0.05 | amr_fdr < 0.05 | eas_fdr < 0.05 | eur_fdr < 0.05 | sas_fdr < 0.05) %>%
pull(gene) %>%
unique()

In [5]:
#Load in the ancestry-associated pval matrix
setwd("../data")
all.data = fread("merged.pvals.txt", sep = "\t", header = TRUE)

#geta vector of snps
snp_vector = all.data$snp

In [6]:
#FDR correct the entire dataset
all.data.fdr = all.data %>%
select(-snp) %>%
apply(2, function(x) { p.adjust(x, method = "BY", n = length(x)) }) %>%
data.frame() %>%
mutate("snp" = all.data$snp, .before = 1)

#Remove the original dataset to free up memory
rm(all.data)

: 

In [None]:
#Melt and subset to only include the significant SNPs
all.fdr.melted = all.data.fdr %>%
melt(id = "snp")

#Remove the original dataset to free up memory
rm(all.data.fdr)

In [None]:
#guide logfc data
setwd("../data")
avana.logfc = fread("22q1_Achilles_logfold_change.csv", sep = ",", header = T, check.names = FALSE)

In [None]:
#Filter to only include significant variants
#Also split the snp column
significant.fdr = all.fdr.melted %>%
filter(value < 0.05) %>%
separate(snp, sep = ":", into = c("chr", "pos", "ref", "alt"))

In [None]:
#Compute which SNPs intersect with a guide
in.guide.vector = significant.fdr %>%
apply(1, function(x){
    
    #Store information as variables
    chr.of.interest = x[1]
    pos.of.interest = x[2]
    
    #subset avana.bed to only include guides that target the same chromosome
    #then annotate the data frame if the variant falls within a guide
    avana.bed.filtered = avana.bed %>% 
    filter(chr %in% all_of(chr.of.interest)) %>%
    mutate("in_guide" = ifelse(pos.of.interest >= start & pos.of.interest <= end, 1, 0))
    
    #Get the guides which that snp maps to
    guide.logical = is.element(1, avana.bed.filtered$in_guide)
    
    if(guide.logical == TRUE){
        guide.to.return = avana.bed.filtered %>%
        filter(in_guide %in% "1") %>%
        pull(sgrna)
    } else {
        guide.to.return = 0
    }
    
    #Return the data outside of the loop
    return(guide.to.return) 
})

In [None]:
#Merge the guide into the main data frame
significant.fdr.with.guide = significant.fdr %>%
mutate("affected_guide" = in.guide.vector)

In [None]:
#Compute all of the ancestry-associated guides
ancestry.associated.guides = avana.guide.map %>%
filter(gene %in% all_of(ancestry.associated.genes)) %>%
pull(sgrna)

#Filter the logfc data to only include ancestry-associated guides
#Then collapse replicates into cell lines
ancestry.avana.logfc = avana.logfc %>%
data.frame(check.names = FALSE) %>%
rownames_to_column("sgrna") %>%
filter(sgrna %in% ancestry.associated.guides) %>%
melt(id = "sgrna") %>%
mutate("sample" = plyr::mapvalues(from = avana.replicate.map$replicate_ID, to = avana.replicate.map$DepMap_ID, variable, warn_missing = FALSE)) %>%
group_by(sgrna, sample) %>%
summarise("sgrna_depletion" = mean(value)) %>%
mutate("gene" = plyr::mapvalues(from = avana.guide.map$sgrna, to = avana.guide.map$gene, sgrna, warn_missing = FALSE))


[1m[22m[36mℹ[39m In argument: `sgrna_depletion = mean(value)`.
[33m![39m argument is not numeric or logical: returning NA”
[1m[22m`summarise()` has grouped output by 'sgrna'. You can override using the
`.groups` argument.


In [None]:
#Get a vector of guides that have a snp
guide.with.snp = significant.fdr.with.guide %>%
filter(!affected_guide %in% "0") %>%
pull(affected_guide) %>%
unique()

#Group by guide and calculate the median across all cell lines
#Then annotate with if the guide has a snp in the targeting sequence
collapsed.median.guide.depletion = ancestry.avana.logfc %>%
group_by(sgrna) %>%
summarise("median_sgrna_depletion" = median(sgrna_depletion)) %>%
mutate("gene" = plyr::mapvalues(from = ancestry.avana.logfc$sgrna, to = ancestry.avana.logfc$gene, sgrna, warn_missing = FALSE)) %>%
mutate("snp_flag" = ifelse(sgrna %in% all_of(guide.with.snp), 1, 0))

ERROR: Error in eval(expr, envir, enclos): object 'significant.fdr.with.guide' not found


In [None]:
###Represent the data in a different way for proximal genes
collapsed.median.guide.depletion %>%
filter(gene %in% all_of(ancestry.associated.genes)) %>%
filter(gene %in% all_of(proximal.genes)) %>%
group_by(gene) %>%
summarise("num_affected_guides" = sum(snp_flag)) %>%
group_by(num_affected_guides) %>%
summarise("class" = n()) %>%
ggplot(aes(x = num_affected_guides, y = class)) +
geom_bar(stat = "identity") +

theme_bw() +

theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), 
axis.line = element_line(color = "black")
) +

theme(
axis.title.x = element_text(size = 7),
axis.title.y = element_text(size = 7)
) +

theme(
axis.text=element_text(size = 7),
axis.title=element_text(size = 7)
) +

theme(
legend.title = element_text(size = 7),
legend.text = element_text(size = 7)
) +

ylab("Number of genes") +
xlab("Number of affected guides")

#Export it to the google bucket
setwd('../output')
ggsave("supplemental_figure_3_left.pdf", width = 3, height = 5)

In [None]:
###Represent the data in a different way for non-proximal genes
collapsed.median.guide.depletion %>%
filter(gene %in% all_of(ancestry.associated.genes)) %>%
filter(!gene %in% all_of(proximal.genes)) %>%
group_by(gene) %>%
summarise("num_affected_guides" = sum(snp_flag)) %>%
group_by(num_affected_guides) %>%
summarise("class" = n()) %>%
ggplot(aes(x = num_affected_guides, y = class)) +
geom_bar(stat = "identity") +

theme_bw() +

theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), 
axis.line = element_line(color = "black")
) +

theme(
axis.title.x = element_text(size = 7),
axis.title.y = element_text(size = 7)
) +

theme(
axis.text=element_text(size = 7),
axis.title=element_text(size = 7)
) +

theme(
legend.title = element_text(size = 7),
legend.text = element_text(size = 7)
) +

ylab("Number of genes") +
xlab("Number of affected guides")

#Export it to the google bucket
setwd('../output')
ggsave("supplemental_figure_3_right.pdf", width = 3, height = 5)

In [None]:
#Write the dataset used to make supplemental figure 3 
setwd("../output")
write.table(collapsed.median.guide.depletion, "supplemental_figure_3_df.txt", sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)