In [None]:
# Install the R packages required by this notebook.
# BiocManager itself is only installed when it is missing, so re-running this
# cell does not reinstall it every time. AnnotationHub is listed explicitly
# because a later cell loads it to retrieve the CTCF motif annotations.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  suppressMessages(install.packages("BiocManager"))
}
suppressMessages(BiocManager::install(c(
  "GenomicRanges", "BiocParallel", "CTCF", "AnnotationHub", "tidyr", "dplyr",
  "edgeR", "ggplot2", "variancePartition"
)))

In [None]:
suppressMessages(library("CTCF"))
suppressMessages(library("tidyr"))
suppressMessages(library("dplyr"))
suppressMessages(library("edgeR"))
suppressMessages(library("ggplot2"))
suppressMessages(library('variancePartition'))
suppressMessages(library('BiocParallel'))
suppressMessages(library('GenomicRanges'))

# Default size of plots rendered in the notebook.
options(repr.plot.width = 15, repr.plot.height = 10)

# Set the working directory to ./chipseq, creating it on first use.
# The guard makes the cell safe to re-run: without it a second execution
# would warn that the directory exists and then descend into a nested
# chipseq/chipseq directory, breaking the relative file reads further down.
if (basename(getwd()) != "chipseq") {
  dir.create("chipseq", showWarnings = FALSE)
  setwd("chipseq")
}


In [None]:
#download the required count matrix (GEO accession ID to be listed here)

#

In [None]:
#retrieve CTCF motifs from JASPAR (via AnnotationHub)

suppressMessages(library(AnnotationHub))
# connect to AnnotationHub
ah <- AnnotationHub()
# keep only the hub records whose preparer class is "CTCF", then list them
query_data <- subset(ah, preparerclass == "CTCF")
query_data
# fetch record AH104729 and display it
# NOTE(review): presumably the hg38 JASPAR CTCF motif set, going by the
# variable name - confirm against the record listing printed above
CTCF_hg38 <- query_data[["AH104729"]]
CTCF_hg38

In [None]:
# Build the edgeR count object and a CPM table used for the scatterplots.

x <- read.table("~/ctcf_motif_counts_all_reps_sg4.tsv", sep = "\t")
# Sample columns: two Safe Harbor replicates, then two sg10mer replicates.
ctcf_grp <- c(
  "Safe.Harbor.CTCF.rep1.bam", "Safe.Harbor.CTCF.rep2.bam",
  "sg10mer.CTCF.rep1.bam", "sg10mer.CTCF.rep2.bam"
)

y <- DGEList(counts = x[, ctcf_grp])
# CPM of every row, computed before any filtering is applied.
norm_counts <- as.data.frame(cpm(y, log = FALSE))

# Drop low-count rows (samples 1-2 form one group, samples 3-4 the other)
# and recompute the library sizes on the rows that remain.
keep <- filterByExpr(y, min.count = 1, group = rep(c(0, 1), each = 2))
y <- y[keep, , keep.lib.sizes = FALSE]
# Rows with CPM >= 5 in at least two samples; applied by later cells.
isexpr <- rowSums(cpm(y) >= 5) >= 2


In [None]:
# Annotate each row of norm_counts with the kind of guide site it overlaps.

sites_10mer <- read.table("sg4_10mer_seq.bed", sep = "\t")
sites_13mer <- read.table("sg4_13mer_seq.bed", sep = "\t")

# Columns V1 / V2 / V3 are used as chromosome / start / end.
# NOTE(review): BED starts are normally 0-based, but the starts are taken
# as-is here - confirm this is intended.
grsites_10mer <- makeGRangesFromDataFrame(
  sites_10mer,
  seqnames.field = "V1", start.field = "V2", end.field = "V3"
)
grsites_13mer <- makeGRangesFromDataFrame(
  sites_13mer,
  seqnames.field = "V1", start.field = "V2", end.field = "V3"
)

# 13mer sites (query) against 10mer sites (subject).
ov_13mer <- findOverlaps(grsites_13mer, grsites_10mer)

# 10mer sites against CTCF motifs resized to 500 bp around their centre.
ov <- findOverlaps(grsites_10mer, resize(CTCF_hg38, 500, fix = "center"))
# Same overlap against the unresized motifs; not used again in the visible cells.
ov_strict <- findOverlaps(grsites_10mer, CTCF_hg38)

# Every site starts as "10mer"; those overlapping a 13mer site are relabelled.
sites_10mer$site <- "10mer"
sites_10mer[subjectHits(ov_13mer), "site"] <- "13mer"

# Rows default to "no_site"; rows hit by a guide site inherit that site's label.
# NOTE(review): norm_counts is indexed by the CTCF_hg38 hit index, so this
# assumes the count rows are in the same order as CTCF_hg38 - confirm.
norm_counts$site <- "no_site"
norm_counts[subjectHits(ov), "site"] <- sites_10mer[queryHits(ov), "site"]

norm_counts$motif <- "sg4"
norm_counts <- arrange(norm_counts, desc(site))

nrow(norm_counts)
head(norm_counts)

In [None]:
# Filter the CPM matrix to the rows flagged by isexpr and attach annotations.

counts_filt_df <- as.data.frame(cpm(y[isexpr, ]))
# Look up the site label using the row names of the filtered table itself.
# Using the row names of the unfiltered cpm(y) yields a vector longer than
# the table whenever isexpr drops rows, which makes the column assignment fail.
counts_filt_df$site <- norm_counts[rownames(counts_filt_df), "site"]
counts_filt_df$guide <- "sg4"

# Sort by descending site label, then report the row count and label counts.
counts_filt_df <- counts_filt_df %>% arrange(desc(site))
nrow(counts_filt_df)
table(counts_filt_df$site)

In [None]:
# Scatterplot: Safe Harbor vs 10mer normalized counts, replicate 1.

# Shared upper limit so both axes span the same range.
lim_rep1 <- max(c(
  counts_filt_df$Safe.Harbor.CTCF.rep1.bam,
  counts_filt_df$sg10mer.CTCF.rep1.bam
))

# Points are drawn in row order, so the rows are sorted explicitly by
# descending site label: "no_site" rows are drawn first and the guide-site
# rows last (on top). The previous arrange(rev(site)) merely reversed the
# column positionally and only produced this order when the table happened
# to be pre-sorted.
# The single theme() call below holds the final text sizes; earlier theme()
# calls that it overrode have been removed.
p <- ggplot(data = counts_filt_df %>% arrange(desc(site))) +
  geom_point(
    aes(x = Safe.Harbor.CTCF.rep1.bam, y = sg10mer.CTCF.rep1.bam, color = site),
    alpha = 1, size = 3
  ) +
  geom_abline(linetype = "dashed") +
  xlab("Safe Harbor Normalized CTCF Counts rep1") +
  ylab("sg4 10mer Normalized CTCF Counts rep1") +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  xlim(c(-1, lim_rep1)) +
  ylim(c(-1, lim_rep1)) +
  scale_color_manual(
    name = "Sites",
    values = c("black", "#BBBBBB", "#BBBBBB"),
    labels = c("Perfect Match", "Partial Match", "Partial Match")
  ) +
  theme(
    axis.text = element_text(size = 18),
    axis.text.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), colour = "black"),
    axis.text.y = element_text(colour = "black"),
    axis.title = element_text(size = 20),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0))
  )

p

In [None]:
# Scatterplot: Safe Harbor vs 10mer normalized counts, replicate 2.

# Shared upper limit so both axes span the same range.
lim_rep2 <- max(c(
  counts_filt_df$Safe.Harbor.CTCF.rep2.bam,
  counts_filt_df$sg10mer.CTCF.rep2.bam
))

# Points are drawn in row order, so the rows are sorted explicitly by
# descending site label: "no_site" rows are drawn first and the guide-site
# rows last (on top). The previous arrange(rev(site)) merely reversed the
# column positionally and only produced this order when the table happened
# to be pre-sorted.
# The single theme() call below holds the final text sizes; earlier theme()
# calls that it overrode have been removed.
p <- ggplot(data = counts_filt_df %>% arrange(desc(site))) +
  geom_point(
    aes(x = Safe.Harbor.CTCF.rep2.bam, y = sg10mer.CTCF.rep2.bam, color = site),
    alpha = 1, size = 3
  ) +
  geom_abline(linetype = "dashed") +
  xlab("Safe Harbor Normalized CTCF Counts rep2") +
  ylab("sg4 10mer Normalized CTCF Counts rep2") +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  xlim(c(-1, lim_rep2)) +
  ylim(c(-1, lim_rep2)) +
  scale_color_manual(
    name = "Sites",
    values = c("black", "#BBBBBB", "#BBBBBB"),
    labels = c("Perfect Match", "Partial Match", "Partial Match")
  ) +
  theme(
    axis.text = element_text(size = 18),
    axis.text.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), colour = "black"),
    axis.text.y = element_text(colour = "black"),
    axis.title = element_text(size = 20),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0))
  )

p

In [None]:
# TMM normalisation of the filtered count object, for the dream analysis.
# y is overwritten with its isexpr subset, and isexpr was computed on the
# pre-subset y, so this cell is meant to be run once per session.

y <- calcNormFactors(y[isexpr, ])

# Number of rows that go into the model.
nrow(y$counts)

In [None]:
# Sample sheet for the models: one row per sample, keyed by sample id.
# "SH" is the first (reference) level of cond, so the fitted contrast is
# 10mer relative to SH.

design_df <- data.frame(
  id = ctcf_grp,
  cond = factor(c("SH", "SH", "10mer", "10mer"), levels = c("SH", "10mer")),
  replicate = factor(rep(c(1, 2), times = 2))
)
rownames(design_df) <- design_df$id
design_df

In [None]:
# Parallel backend shared by the model-fitting cells: 6 socket workers,
# with a progress bar.

param <- SnowParam(workers = 6, type = "SOCK", progressbar = TRUE)

In [None]:
# Variance partition: fit condition and replicate as random effects on the
# CPM values, then plot the per-row variance fractions.

vp_form <- ~ (1 | cond) + (1 | replicate)
varPart <- fitExtractVarPartModel(cpm(y), vp_form, design_df, BPPARAM = param)
vp <- sortCols(varPart)
plotVarPart(vp)

In [None]:
# Estimate voom precision weights for the dream model.

gc()
# Fixed effect for condition, random intercept for replicate.
form <- ~ cond + (1 | replicate)
# voomWithDreamWeights expects raw counts (a count matrix or a DGEList), not
# CPM values. The DGEList y is passed directly so that the mean-variance
# trend is estimated from the real counts and the TMM normalisation factors
# stored in y are used; passing cpm(y) would treat already-normalised values
# as counts and distort the weights.
vobjDream <- voomWithDreamWeights(y, form, design_df, BPPARAM = param)

In [None]:
# Fit the dream mixed model on the voom object, reusing the formula and
# sample sheet from the previous cells.

gc()
fitmm <- dream(vobjDream, form, design_df, BPPARAM = param)


In [None]:
# Apply eBayes to the dream fit.

fitmm <- eBayes(fitmm)

In [None]:
##Uncomment if you would like to save the fitmm object

#saveRDS(fitmm,"~/fitmm_sg4_chipseq_ctcf.rds")
#fitmm = readRDS("~/fitmm_sg4_chipseq_ctcf.rds")

In [None]:
# Pull the full result table for the 10mer-vs-Safe-Harbor coefficient
# (all rows, no cutoff) and save it as a TSV.

var_df_res <- as.data.frame(
  topTable(fitmm, coef = "cond10mer", number = Inf)
)

write.table(var_df_res, file = "sg4-ctcf-dream-pvals.tsv", sep = "\t")
#var_df_res = read.table("sg4-ctcf-dream-pvals.tsv", sep = "\t")

In [None]:
# Attach the site label (looked up by row name in norm_counts) and the guide
# name to the result table, then sort by descending site label.
var_df_res$site <- norm_counts[rownames(var_df_res), "site"]
var_df_res$guide <- "sg4"
var_df_res <- arrange(var_df_res, desc(site))

head(var_df_res)

In [None]:
# Volcano plots (quick versions, default theme sizes).

# Thresholds: adjusted p-value below p_cut and |log2 fold-change| above fc_cut.
p_cut <- 1e-5
fc_cut <- 0.5

# Plot 1: all rows in grey, rows passing both thresholds overdrawn in red.
# Adjusted p-values are never negative, so they are compared to p_cut
# directly (the former abs() around them had no effect). subset() drops rows
# where the condition is NA, as the previous which()-based indexing did.
ggplot(data = var_df_res, aes(x = logFC, y = -log10(adj.P.Val))) +
  geom_point(colour = "grey80", size = 2) +
  geom_point(
    data = subset(var_df_res, adj.P.Val < p_cut & abs(logFC) > fc_cut),
    colour = "red"
  ) +
  geom_hline(yintercept = c(-log10(p_cut)), colour = "black", linetype = "dashed") +
  geom_vline(xintercept = c(-fc_cut, fc_cut), colour = "black", linetype = "dashed") +
  xlab("log2 fold-change") +
  ylab("-log10(adj.P.Val)") +
  theme_classic()

# Plot 2: same axes, points coloured by site label.
ggplot(data = var_df_res, aes(x = logFC, y = -log10(adj.P.Val))) +
  geom_point(aes(colour = site), size = 2) +
  geom_hline(yintercept = c(-log10(p_cut)), colour = "black", linetype = "dashed") +
  geom_vline(xintercept = c(-fc_cut, fc_cut), colour = "black", linetype = "dashed") +
  xlab("log2 fold-change") +
  ylab("-log10(adj.P.Val)") +
  theme_classic()


In [None]:
# Volcano plot with points coloured by site label.

# Thresholds drawn as dashed guide lines.
p_cut <- 1e-5
fc_cut <- 0.5

# NOTE(review): the manual colour scale supplies two values/labels, so it
# presumably expects exactly two distinct site labels in var_df_res - confirm
# (table(var_df_res$site)) before relying on the legend.
p <- ggplot(data = var_df_res, aes(x = logFC, y = -log10(adj.P.Val))) +
  geom_point(aes(colour = site), size = 2) +
  geom_hline(yintercept = c(-log10(p_cut)), colour = "black", linetype = "dashed") +
  geom_vline(xintercept = c(-fc_cut, fc_cut), colour = "black", linetype = "dashed") +
  labs(x = "log2 fold-change", y = "-log10(adj.P.Val)") +
  scale_color_manual(
    name = "Sites",
    values = c("#b30000", "#BBBBBB"),
    labels = c("On target", "Not targeted")
  ) +
  theme_classic() +
  theme(
    axis.text = element_text(size = 18),
    axis.text.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), colour = "black"),
    axis.text.y = element_text(colour = "black"),
    axis.title = element_text(size = 20),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0))
  )

p

In [None]:
# Volcano plot with rows passing both thresholds highlighted in red.

# Thresholds: adjusted p-value below p_cut and |log2 fold-change| above fc_cut.
p_cut <- 1e-5
fc_cut <- 0.5

# Adjusted p-values are never negative, so they are compared to p_cut
# directly (the former abs() around them had no effect). subset() drops rows
# where the condition is NA, as the previous which()-based indexing did.
p <- ggplot(data = var_df_res, aes(x = logFC, y = -log10(adj.P.Val))) +
  geom_point(colour = "grey80", size = 2) +
  geom_point(
    data = subset(var_df_res, adj.P.Val < p_cut & abs(logFC) > fc_cut),
    colour = "red"
  ) +
  geom_hline(yintercept = c(-log10(p_cut)), colour = "black", linetype = "dashed") +
  geom_vline(xintercept = c(-fc_cut, fc_cut), colour = "black", linetype = "dashed") +
  xlab("log2 fold-change") +
  ylab("-log10(adj.P.Val)") +
  theme_classic() +
  theme(
    axis.text = element_text(size = 18),
    axis.text.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), colour = "black"),
    axis.text.y = element_text(colour = "black"),
    axis.title = element_text(size = 20),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0))
  )

p

In [None]:
# Count the rows passing both thresholds, broken down by site label.
# Adjusted p-values are never negative, so they are compared to p_cut directly
# (the former abs() around them had no effect); rows where the condition is NA
# are dropped by subset(), as they were by which().
table(subset(var_df_res, adj.P.Val < p_cut & abs(logFC) > fc_cut)$site)