In [None]:
## This analysis script is used to create Supp Fig. 7a

In [None]:
# Install the R packages this notebook uses.
# BiocManager itself is only installed when it is missing, so re-running the
# cell does not reinstall it every time.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  suppressMessages(install.packages("BiocManager"))
}
# 'edgeR' supplies DGEList(), filterByExpr(), cpm() and calcNormFactors(), which
# the filtering cell calls; 'BiocParallel' is attached in the library cell.
# Both were missing from the original install list.
suppressMessages(BiocManager::install(c('tidyr','dplyr','ggplot2','GenomicRanges','ggsignif','edgeR','BiocParallel')))

In [None]:
suppressMessages(library('tidyr'))
suppressMessages(library('dplyr'))
suppressMessages(library('ggplot2'))
suppressMessages(library('BiocParallel'))
suppressMessages(library('GenomicRanges'))
# edgeR provides DGEList(), filterByExpr(), cpm() and calcNormFactors()
suppressMessages(library('edgeR'))
suppressMessages(library('ggsignif'))

# Size of the plots rendered inline by the notebook (repr options).
options(repr.plot.width = 15, repr.plot.height = 10)

# Set the working directory to ./chipseq, creating it if needed.
# The guard makes the cell safe to re-run: without it, a second execution
# warns that the directory exists and then descends into a nested
# chipseq/chipseq directory.
if (basename(getwd()) != "chipseq") {
  dir.create("chipseq/", showWarnings = FALSE)
  setwd("chipseq/")
}

In [None]:
# Download the required count matrix from GEO and place it in the working directory (chipseq/) before running the next cell.

# GEO accession: GSE246939 - file: sg8_CTCF_500bp_counts.tsv

In [None]:
# Filter CTCF count rows: keep those with CPM >= 5 in at least 2 samples.

# Raw count table; only the four CTCF libraries listed in ctcf_grp are used.
x <- read.table("sg8_CTCF_500bp_counts.tsv", sep = "\t")
ctcf_grp <- c(
  'Safe.Harbor.CTCF.rep1.bam', 'Safe.Harbor.CTCF.rep2.bam',
  'sg10mer.CTCF.rep1.bam', 'sg10mer.CTCF.rep2.bam'
)

# First pass: filterByExpr() with two groups of two replicates
# (Safe Harbor = 0, sg10mer = 1); library sizes are recomputed afterwards.
y <- DGEList(counts = x[, ctcf_grp])
keep <- filterByExpr(y, min.count = 1, group = c(0, 0, 1, 1))
y <- y[keep, , keep.lib.sizes = FALSE]

# Rows remaining after the first pass.
nrow(y$counts)

# Second pass: require CPM >= 5 in two or more of the four samples.
isexpr <- rowSums(cpm(y) >= 5) >= 2

# Subset to the retained rows and compute normalization factors.
y <- calcNormFactors(y[isexpr, ])

# Rows remaining after the CPM filter.
nrow(y$counts)

In [None]:
# Set up annotations in the norm_counts data frame.

# CPM values (not log-transformed) for the rows kept in y.
norm_counts <- as.data.frame(cpm(y, log = FALSE))

# Genomic intervals of the 10mer sites; columns V1/V2/V3 are used as
# chromosome/start/end.
# NOTE(review): if this file is 0-based BED, makeGRangesFromDataFrame() may need
# starts.in.df.are.0based = TRUE - confirm.
sites_10mer <- read.table("sg4_10mer_seq.bed", sep = "\t")

grsites_10mer <- makeGRangesFromDataFrame(sites_10mer, seqnames.field = "V1", start.field = "V2", end.field = "V3")

# CTCF_hg38 is not created anywhere in this notebook; it must already exist in
# the session. Fail with a clear message instead of "object not found".
if (!exists("CTCF_hg38")) {
  stop("CTCF_hg38 is not defined: load the GRanges of CTCF regions before running this cell.",
       call. = FALSE)
}
# subjectHits(ov) is used below as a ROW INDEX into norm_counts, which is only
# meaningful when element i of CTCF_hg38 corresponds to row i of norm_counts.
# A length mismatch means rows would be mislabeled (or NA rows appended).
if (length(CTCF_hg38) != nrow(norm_counts)) {
  stop("CTCF_hg38 has ", length(CTCF_hg38), " ranges but norm_counts has ",
       nrow(norm_counts), " rows; subset/reorder CTCF_hg38 to match the filtered count rows.",
       call. = FALSE)
}

# Overlap the 10mer sites with 500 bp windows centered on each CTCF region.
ov <- findOverlaps(grsites_10mer, resize(CTCF_hg38, 500, fix = "center"))
sites_10mer$site <- "10mer"

# Default label, then mark every row whose window overlaps a 10mer site.
norm_counts$site <- "no_site"
norm_counts[subjectHits(ov), "site"] <- sites_10mer[queryHits(ov), "site"]

norm_counts$motif <- "sg4"
# dplyr::desc is spelled out because IRanges (attached with GenomicRanges after
# dplyr) masks desc().
norm_counts <- norm_counts %>% arrange(dplyr::desc(site))
nrow(norm_counts)

In [None]:
# Boxplot of mean normalized CTCF counts at 10mer sites: Safe Harbor vs 10mer.

# Rows annotated as 10mer sites that are also present in the filtered counts.
mat_check <- norm_counts %>% filter(site == "10mer", rownames(norm_counts) %in% rownames(y$counts))

# Per-row mean of the two replicates of each condition.
mat_check$mean_safe <- rowMeans(mat_check[, c("Safe.Harbor.CTCF.rep1.bam", "Safe.Harbor.CTCF.rep2.bam")])
mat_check$mean_10mer <- rowMeans(mat_check[, c("sg10mer.CTCF.rep1.bam", "sg10mer.CTCF.rep2.bam")])

# Long format with one row per (site, condition): columns `name` and `value`.
# The two mean columns are selected by name rather than by position (7, 8), so
# the reshape does not silently pick the wrong columns if one is added upstream.
plot_df <- pivot_longer(mat_check, cols = all_of(c("mean_safe", "mean_10mer")))
#plot_df$group = "sig_sites"

colorder <- c("red", "blue")

# reorder(name, -value) puts the condition with the larger mean value first on
# the x axis. The legend is hidden (legend.position = "none").
# NOTE(review): geom_signif() is called without `test`, so it uses its default
# test; the notebook's final cell reports a t.test p-value - confirm that both
# are meant to use the same test.
ggplot(data = plot_df, aes(x = reorder(name, -value), y = value, fill = name)) +
  geom_boxplot() +
  geom_point(position = position_jitter(seed = 1, width = 0.1)) +
  geom_signif(comparisons = list(c("mean_safe", "mean_10mer")), map_signif_level = TRUE, size = 1, textsize = 8) +
  ylab("Normalized CTCF counts") +
  scale_x_discrete(name = "", labels = c("mean_safe" = "Safe Harbor", 'mean_10mer' = '10mer')) +
  scale_fill_manual(labels = c('10mer', "Safe Harbor"), values = c("#b30000", "#00BFC4")) +
  theme_classic() +
  theme(axis.text = element_text(size = 18),
        axis.text.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), colour = "black"),
        axis.text.y = element_text(colour = "black"),
        axis.title = element_text(size = 20),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 15),
        legend.position = "none",
        axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0)))




In [None]:
# Two-sample t-test (t.test() defaults) comparing the per-site mean counts of
# the Safe Harbor and 10mer conditions; only the p-value is displayed.
t.test(mat_check[["mean_safe"]], mat_check[["mean_10mer"]])[["p.value"]] #1x1
