In [None]:
# Install the R packages this notebook needs, skipping any that are already
# available so that re-running the cell does not reinstall everything.
# BiocManager::install() handles both CRAN and Bioconductor packages.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  suppressMessages(install.packages("BiocManager"))
}

required_pkgs <- c(
  "tidyr", "dplyr", "ggplot2", "GenomicRanges", "ggsignif",
  # Also needed by later cells: str_split_fixed() (stringr),
  # DGEList()/cpm() (edgeR) and AnnotationHub().
  "stringr", "edgeR", "AnnotationHub"
)
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  suppressMessages(BiocManager::install(missing_pkgs))
}

In [None]:
library("tidyr")
library("dplyr")
library("stringr")
library("ggplot2")
library("ggsignif")
library("GenomicRanges")
library("edgeR")
# Default size of the plots rendered inline by the notebook.
options(repr.plot.width = 15, repr.plot.height = 10)

# Set the working directory; later cells read their input files relative to it.
# The guard makes the cell safe to re-run: without it a second run would warn
# that the folder already exists and then create and enter chipseq/chipseq.
work_dir <- "chipseq"
if (basename(getwd()) != work_dir) {
  dir.create(work_dir, showWarnings = FALSE)
  setwd(work_dir)
}

In [None]:
#download required count matrix. GEO ID given here

#

In [None]:
# Retrieve CTCF motif sites (JASPAR-derived, per the original note) from
# AnnotationHub.

suppressMessages(library(AnnotationHub))
# Connect to the hub; this may download/refresh the hub metadata cache.
ah <- AnnotationHub()
# Keep only the hub records whose preparer class is "CTCF" and list them.
query_data <- subset(ah, preparerclass == "CTCF")
query_data
# Fetch record AH104729 and display it. NOTE(review): presumably the hg38
# CTCF motif ranges, going by the variable name -- confirm against the listing.
CTCF_hg38 <- query_data[["AH104729"]]
CTCF_hg38

In [None]:
# Annotation set-up used later to label rows of the h3_norm_counts data frame.

# Site table: tab-separated and read without a header, so the columns are
# named V1, V2, V3, ... (used below as chromosome, start and end).
sites_10mer <- read.table("sg4_10mer_seq.bed", sep = "\t")

# NOTE(review): BED files normally carry 0-based starts, while
# makeGRangesFromDataFrame() treats starts as 1-based unless
# starts.in.df.are.0based = TRUE -- confirm which convention this file uses.
grsites_10mer <- makeGRangesFromDataFrame(
  sites_10mer,
  seqnames.field = "V1",
  start.field = "V2",
  end.field = "V3"
)

# Overlaps between the sites and 500 bp windows centred on each CTCF range.
ctcf_windows <- resize(CTCF_hg38, width = 500, fix = "center")
ov <- findOverlaps(grsites_10mer, ctcf_windows)

# Label carried over to the count table for windows overlapping a site.
sites_10mer$site <- "10mer"

In [None]:
# Read H3K9me3 counts in 5 kbp windows around each motif, convert them to
# counts per million (CPM) and label the windows that overlap a 10mer site.

h3x <- read.table("hg38_5kb_h3_counts.tsv", sep = "\t")
y <- DGEList(counts = h3x)
# cpm() returns one row per input row, in the same order and with the same
# row names, so no re-indexing by rownames(h3x) is needed afterwards.
h3_norm_counts <- as.data.frame(cpm(y, log = FALSE))

# Row names encode each window as "<chr>.<start>.<end>". Split them once and
# store the coordinates as integers instead of character strings.
window_parts <- str_split_fixed(rownames(h3_norm_counts), "[.]", n = 3)
h3_norm_counts$chr <- window_parts[, 1]
h3_norm_counts$start <- as.integer(window_parts[, 2])
h3_norm_counts$end <- as.integer(window_parts[, 3])

h3_windows <- makeGRangesFromDataFrame(
  h3_norm_counts,
  seqnames.field = "chr",
  start.field = "start",
  end.field = "end"
)
h3ov <- findOverlaps(grsites_10mer, h3_windows)

# Every window starts as "no_site"; windows hit by a site take that site's
# label.
h3_norm_counts$site <- "no_site"
h3_norm_counts[subjectHits(h3ov), "site"] <-
  sites_10mer[queryHits(h3ov), "site"]

# Sort by site label, descending. The dplyr:: prefix is deliberate: IRanges
# (attached with GenomicRanges) also exports desc() and masks dplyr's.
h3_norm_counts <- h3_norm_counts %>% dplyr::arrange(dplyr::desc(site))


In [None]:
# Preview the first rows of the CPM table with its chr/start/end/site columns.
head(h3_norm_counts)

In [None]:
# Filter H3K9me3 windows on signal strength.
# NOTE(review): the original note describes a 90th-percentile cutoff in at
# least 3 samples, but the code applies a fixed cutoff of 1 CPM in at least 3
# of the 4 samples. Confirm that 1 is the intended value using the quantiles
# printed below.

subcol <- c(
  "Safe.Harbor.H3k9me3.rep1.bam", "sg10mer.H3k9me3.rep1.bam",
  "Safe.Harbor.H3k9me3.rep2.bam", "sg10mer.H3k9me3.rep2.bam"
)
h3_counts_cpm <- as.matrix(h3_norm_counts[, subcol])
hist(h3_counts_cpm, breaks = 100)
quantile(h3_counts_cpm, c(0.9, 0.95))

min_cpm <- 1
min_samples <- 3
# Build the row filter once and reuse it for both the count and the subset.
keep_rows <- rowSums(h3_counts_cpm >= min_cpm) >= min_samples
sum(keep_rows) # number of windows retained
# drop = FALSE keeps a matrix even when a single window passes the filter.
h3_counts_cpm_filt <- h3_counts_cpm[keep_rows, , drop = FALSE]

In [None]:
# Restrict the filtered CPM matrix to windows labelled with a site, reshape it
# to long format (one row per window and sample) and plot the per-sample
# distributions.
site_rows <- rownames(h3_norm_counts %>% filter(site != "no_site"))
h3_counts_df <- as.data.frame(h3_counts_cpm_filt) %>%
  filter(rownames(h3_counts_cpm_filt) %in% site_rows)

plot_df <- pivot_longer(h3_counts_df, cols = 1:4)
plot_df$group <- "sg4_sites"

ggplot(plot_df, aes(x = name, y = value)) +
  geom_boxplot() +
  geom_point(position = position_jitter(seed = 1, width = 0.1))

In [None]:
# Cross-condition t-test p-values (Safe Harbor vs sg10mer) for every pairing
# of the two replicates; the trailing tag is <Safe Harbor rep>x<sg10mer rep>.
# h3_counts_df only holds rep1 and rep2 columns. Referencing a column that
# does not exist yields NULL, which makes t.test() either run a one-sample
# test or fail, so the replicate-2 comparisons use the rep2 columns.

t.test(h3_counts_df$Safe.Harbor.H3k9me3.rep1.bam, h3_counts_df$sg10mer.H3k9me3.rep1.bam)$p.value # 1x1
t.test(h3_counts_df$Safe.Harbor.H3k9me3.rep1.bam, h3_counts_df$sg10mer.H3k9me3.rep2.bam)$p.value # 1x2
t.test(h3_counts_df$Safe.Harbor.H3k9me3.rep2.bam, h3_counts_df$sg10mer.H3k9me3.rep1.bam)$p.value # 2x1
t.test(h3_counts_df$Safe.Harbor.H3k9me3.rep2.bam, h3_counts_df$sg10mer.H3k9me3.rep2.bam)$p.value # 2x2

In [None]:
# Cross-replicate t-test p-values (replicate 1 vs replicate 2 within each
# condition). h3_counts_df only holds rep1 and rep2 columns; a non-existent
# column would be NULL and turn the call into a one-sample test, so rep2 is
# used as the second replicate.

t.test(h3_counts_df$Safe.Harbor.H3k9me3.rep1.bam, h3_counts_df$Safe.Harbor.H3k9me3.rep2.bam)$p.value # Safe Harbor 1x2
t.test(h3_counts_df$sg10mer.H3k9me3.rep1.bam, h3_counts_df$sg10mer.H3k9me3.rep2.bam)$p.value # sg10mer 1x2


In [None]:
# Per-window mean CPM for each condition (average of the two replicates),
# compared in a box plot with a significance bracket.
site_rows <- rownames(h3_norm_counts %>% filter(site != "no_site"))
mat_check <- as.data.frame(h3_counts_cpm_filt) %>%
  filter(rownames(h3_counts_cpm_filt) %in% site_rows)

safe_cols <- c("Safe.Harbor.H3k9me3.rep1.bam", "Safe.Harbor.H3k9me3.rep2.bam")
sg10mer_cols <- c("sg10mer.H3k9me3.rep1.bam", "sg10mer.H3k9me3.rep2.bam")
mat_check$mean_safe <- rowMeans(mat_check[, safe_cols])
mat_check$mean_10mer <- rowMeans(mat_check[, sg10mer_cols])

# Long format over the two mean columns (columns 5 and 6 of mat_check).
plot_df <- pivot_longer(mat_check, cols = c(mean_safe, mean_10mer))
#plot_df$group = "sig_sites"

colorder <- c("red", "blue")

# Text sizes, colours and margins shared by the axes; the legend is hidden.
box_theme <- theme(
  axis.text = element_text(size = 18),
  axis.text.x = element_text(
    margin = margin(t = 10, r = 0, b = 0, l = 0),
    colour = "black"
  ),
  axis.text.y = element_text(colour = "black"),
  axis.title = element_text(size = 20),
  legend.text = element_text(size = 15),
  legend.title = element_text(size = 15),
  legend.position = "none",
  axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0))
)

ggplot(plot_df, aes(x = reorder(name, value), y = value, fill = name)) +
  geom_boxplot() +
  geom_point(position = position_jitter(seed = 1, width = 0.1)) +
  geom_signif(
    comparisons = list(c("mean_safe", "mean_10mer")),
    map_signif_level = TRUE,
    size = 1,
    textsize = 8
  ) +
  ylab("Normalized H3K9me3 counts") +
  scale_x_discrete(
    name = "",
    labels = c("mean_safe" = "Safe Harbor", "mean_10mer" = "10mer")
  ) +
  scale_fill_manual(
    labels = c("10mer", "Safe Harbor"),
    values = c("#b30000", "#00BFC4")
  ) +
  theme_classic() +
  box_theme

In [None]:
# Two-sample t-test (t.test() defaults) on the per-window condition means;
# only the p-value is shown.
t.test(x = mat_check$mean_safe, y = mat_check$mean_10mer)$p.value


In [None]:
# Same box plot of the per-window condition means, drawn with ggplot2's
# default discrete fill palette.

# Text sizes, colours and margins shared by the axes; the legend is hidden.
box_theme_default <- theme(
  axis.text = element_text(size = 18),
  axis.text.x = element_text(
    margin = margin(t = 10, r = 0, b = 0, l = 0),
    colour = "black"
  ),
  axis.text.y = element_text(colour = "black"),
  axis.title = element_text(size = 20),
  legend.text = element_text(size = 15),
  legend.title = element_text(size = 15),
  legend.position = "none",
  axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0))
)

ggplot(plot_df, aes(x = reorder(name, value), y = value, fill = name)) +
  geom_boxplot() +
  geom_point(position = position_jitter(seed = 1, width = 0.1)) +
  geom_signif(
    comparisons = list(c("mean_safe", "mean_10mer")),
    map_signif_level = TRUE,
    size = 1,
    textsize = 8
  ) +
  ylab("Normalized H3K9me3 counts") +
  scale_x_discrete(
    name = "",
    labels = c("mean_safe" = "Safe Harbor", "mean_10mer" = "10mer")
  ) +
  scale_fill_discrete(labels = c("10mer", "Safe Harbor")) +
  theme_classic() +
  box_theme_default
