In [None]:
## This analysis script is used to create Supp Fig. 7e

In [None]:
# Install the packages this notebook needs, skipping any that are already
# present so that re-running the cell does not reinstall everything.
# BiocManager::install() resolves both Bioconductor and CRAN packages.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  suppressMessages(install.packages("BiocManager"))
}
required_pkgs <- c(
  "pheatmap", "stringr", "ggseqlogo", "BSgenome.Hsapiens.UCSC.hg38",
  "GenomicRanges", "CTCF", "dplyr", "patchwork", "ggplot2",
  # Used further down (library(AnnotationHub), ggrastr::rasterize()) but
  # absent from the original install list.
  "AnnotationHub", "ggrastr"
)
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  suppressMessages(BiocManager::install(missing_pkgs))
}

In [None]:
suppressMessages(library("CTCF"))
suppressMessages(library("dplyr"))
suppressMessages(library("ggplot2"))
suppressMessages(library("patchwork"))
suppressMessages(library("GenomicRanges"))
suppressMessages(library("stringr"))
suppressMessages(library("ggseqlogo"))
suppressMessages(library("BSgenome.Hsapiens.UCSC.hg38"))
suppressMessages(library("pheatmap"))

# Default size of the plots rendered by the notebook front end.
options(repr.plot.width = 15, repr.plot.height = 10)

# All relative input/output paths below are resolved inside chipseq/.
# Only create the directory when it is missing, so that re-running the cell
# does not raise an "already exists" warning.
if (!dir.exists("chipseq/")) {
  dir.create("chipseq/")
}
setwd("chipseq/")


In [None]:
#helper functions

#functions to find differences in motif and their position

# Positions at which two strings differ.
#
# x, y: single character strings; `x` is compared character by character
#       against the leading nchar(x) characters of `y`.
# Returns a list with one integer element per differing position (1-based),
# or an empty list when there is no difference or `x` is empty.
# Stops with an error when `x` is longer than `y`, because the extra
# characters of `x` would have nothing to be compared with.
find_diff_pos <- function(x, y) {
  chars_x <- strsplit(x, "")[[1]]
  chars_y <- strsplit(y, "")[[1]]
  if (length(chars_x) > length(chars_y)) {
    stop("`y` must be at least as long as `x`", call. = FALSE)
  }
  # Only the first length(chars_x) characters of `y` take part.
  as.list(which(chars_x != chars_y[seq_along(chars_x)]))
}

# Characters of `x` at the positions where it differs from `y`.
#
# x, y: single character strings; `x` is compared character by character
#       against the leading nchar(x) characters of `y`.
# Returns a list with one single-character element per differing position,
# in order, or an empty list when there is no difference or `x` is empty.
# Stops with an error when `x` is longer than `y`, because the extra
# characters of `x` would have nothing to be compared with.
find_diff_seq <- function(x, y) {
  chars_x <- strsplit(x, "")[[1]]
  chars_y <- strsplit(y, "")[[1]]
  if (length(chars_x) > length(chars_y)) {
    stop("`y` must be at least as long as `x`", call. = FALSE)
  }
  # Only the first length(chars_x) characters of `y` take part.
  differs <- which(chars_x != chars_y[seq_along(chars_x)])
  as.list(chars_x[differs])
}

#function to write fasta file 

# Write one FASTA record per row of `data` to `filename`.
#
# data:     data frame with columns `chr`, `new_start`, `new_end` (used to
#           build the ">chr:start-end" header) and `seq` (the sequence line).
# filename: path of the file to create or overwrite.
# Returns NULL invisibly. A zero-row `data` produces an empty file.
writeFasta <- function(data, filename) {
  if (nrow(data) == 0L) {
    fastaLines <- character(0)
  } else {
    headers <- paste0(
      ">", data[["chr"]], ":", data[["new_start"]], "-", data[["new_end"]]
    )
    sequences <- as.character(data[["seq"]])
    # rbind() + as.vector() interleaves header1, seq1, header2, seq2, ...
    fastaLines <- as.vector(rbind(headers, sequences))
  }
  fileConn <- file(filename)
  # Close the connection even if writing fails.
  on.exit(close(fileConn), add = TRUE)
  writeLines(fastaLines, fileConn)
  invisible(NULL)
}

In [None]:
# Fetch the CTCF motif annotation (record AH104729) through AnnotationHub.
suppressMessages(library(AnnotationHub))
ah <- AnnotationHub()
query_data <- subset(ah, preparerclass == "CTCF")
# Show the records that matched
query_data
CTCF_hg38 <- query_data[["AH104729"]]

# 500 bp windows centred on each annotated site, as a plain table with an
# identifier of the form "<chr>.<start>.<end>".
ctcf <- as.data.frame(resize(CTCF_hg38, width = 500, fix = "center"))
ctcf[["id"]] <- with(ctcf, paste0(seqnames, ".", start, ".", end))
ctcf <- setNames(
  ctcf[, c("id", "seqnames", "start", "end", "strand")],
  c("GeneID", "Chr", "Start", "End", "Strand")
)

In [None]:
# Input: p-values from dream, produced by 08-chipseq-sg8-ctcf-dream.ipynb
# (run that notebook first).

dream_test_res <- read.table(
  file = "sg4-ctcf-dream-pvals.tsv",
  sep = "\t",
  stringsAsFactors = FALSE
)
head(dream_test_res)
nrow(dream_test_res)

In [None]:
# Keep the sites that pass both the fold-change and adjusted p-value cut-offs
fc_cut <- 0.5
p_cut <- 1e-5

sig_df <- dream_test_res %>%
  filter(abs(logFC) > fc_cut, abs(adj.P.Val) < p_cut)

# Row names have the form "<chr>.<start>.<end>"; split them once and reuse.
site_parts <- str_split_fixed(rownames(sig_df), "[.]", n = 3)
sig_df$chr <- site_parts[, 1]
sig_df$start <- as.double(site_parts[, 2])
sig_df$end <- as.double(site_parts[, 3])
nrow(sig_df)

In [None]:
# Turn the significant hits into a GRanges object and label each range in a
# "site" column: "10mer" when it overlaps an sg4 10-mer site, else "no_site".

grsites_sig <- makeGRangesFromDataFrame(
  sig_df,
  keep.extra.columns = TRUE,
  seqnames.field = "chr",
  start.field = "start",
  end.field = "end"
)

sites_10mer <- read.table("~/data/sites/sg4_10mer_seq.bed", sep = "\t")
grsites_10mer <- makeGRangesFromDataFrame(
  sites_10mer,
  seqnames.field = "V1",
  start.field = "V2",
  end.field = "V3"
)

ov <- findOverlaps(query = grsites_10mer, subject = grsites_sig)
grsites_sig$site <- "no_site"
grsites_sig[subjectHits(ov)]$site <- "10mer"


In [None]:
# Split the significant hits into perfect-match ("10mer") and partial-match
# ("no_site") ranges, then pull the overlapping motif records (including the
# motif sequence) from the CTCF_hg38 annotation for each group.

site_label <- mcols(grsites_sig)[, "site"]
grsites_sig_target <- grsites_sig[site_label %in% "10mer"]
grsites_sig_nottarget <- grsites_sig[site_label %in% "no_site"]

ov1 <- findOverlaps(grsites_sig_target, CTCF_hg38)
ov2 <- findOverlaps(grsites_sig_nottarget, CTCF_hg38)

# Columns taken from the annotation, and the names writeFasta() expects.
motif_cols <- c("seqnames", "start", "end", "sequence")
fasta_cols <- c("chr", "new_start", "new_end", "seq")

targetsdf <- setNames(
  as.data.frame(CTCF_hg38[subjectHits(ov1)])[, motif_cols],
  fasta_cols
)
nontargetsdf <- setNames(
  as.data.frame(CTCF_hg38[subjectHits(ov2)])[, motif_cols],
  fasta_cols
)


In [None]:
##write out fasta files of sequences.

#writeFasta(targetsdf, "sg4_sig_targets_pval_5_ctcfmotifs.fa")
#writeFasta(nontargetsdf, "sg4_sig_not_targets_pval_5_ctcfmotifs.fa")

In [None]:
## Use mafft to locally align sequences to each other

# !mafft.bat --adjustdirectionaccurately sg4_sig_targets_pval_5_ctcfmotifs.fa > sg4_sig_targets_pval_5_ctcfmotifs_mafft.fa
# !mafft.bat --adjustdirectionaccurately sg4_sig_not_targets_pval_5_ctcfmotifs.fa > sg4_sig_not_targets_pval_5_ctcfmotifs_mafft.fa

In [None]:
# Sequence logos from the MAFFT-aligned motif sequences:
# perfect-match sites on top, partial-match sites underneath.

logo_bases <- c("A", "C", "G", "T")

bs <- reverseComplement(
  Biostrings::readDNAStringSet("sg4_sig_targets_pval_5_ctcfmotifs_mafft.fa")
)
p11 <- ggseqlogo(consensusMatrix(bs)[logo_bases, ])

bs <- reverseComplement(
  Biostrings::readDNAStringSet("sg4_sig_not_targets_pval_5_ctcfmotifs_mafft.fa")
)
p12 <- ggseqlogo(consensusMatrix(bs)[logo_bases, ])

p11 + p12 + plot_layout(ncol = 1)

In [None]:
# Compare every partial-match motif with the sg4 target sequence: count the
# mismatches and record the positions at which they occur.

sg4_target <- "TGCCATCTAC"

# Two candidate 10-mers per motif: bases 2-11 as given, and the reverse
# complement of bases 7-16.
nontargetsdf$subseq <- toupper(substr(nontargetsdf$seq, 2, 11))
nontargetsdf$subseq_rev <- stringi::stri_reverse(
  chartr("ATGC", "TACG", toupper(substr(nontargetsdf$seq, 7, 16)))
)

# Edit distance to the target, keeping the better of the two orientations.
# adist() is already vectorised over its first argument.
nontargetsdf$mismatches <- pmin(
  drop(adist(nontargetsdf$subseq, sg4_target)),
  drop(adist(nontargetsdf$subseq_rev, sg4_target))
)

# NOTE(review): the heatmap, histogram and volcano cells further down read
# nontargetsdf$diff_pos and nontargetsdf$diff_length, but no cell in this
# notebook creates them. They are derived here with find_diff_pos(), using
# whichever orientation disagrees with the target at fewer positions --
# confirm that this matches the analysis behind the published figure.
# Unlike `mismatches` (an edit distance), diff_length is a position-wise
# count, so the two can differ when an insertion/deletion is cheaper.
fwd_pos <- lapply(nontargetsdf$subseq, find_diff_pos, y = sg4_target)
rev_pos <- lapply(nontargetsdf$subseq_rev, find_diff_pos, y = sg4_target)
nontargetsdf$diff_pos <- Map(
  function(fwd, rev) if (length(rev) < length(fwd)) rev else fwd,
  fwd_pos,
  rev_pos
)
nontargetsdf$diff_length <- lengths(nontargetsdf$diff_pos)

head(nontargetsdf)
nrow(nontargetsdf)

In [None]:
# Heat map of mismatch position (columns) against the number of mismatches in
# the motif (rows), for the partial-match sites.

# One row of diff_meta per individual mismatch: the mismatch count of the
# motif it belongs to, and its position. Built in one step instead of filling
# the table row by row, which also copes with motifs that have zero
# mismatches (1:0 would iterate twice).
n_diff <- nontargetsdf$diff_length
stopifnot(all(n_diff == lengths(nontargetsdf$diff_pos)))

mismatch_positions <- as.numeric(unlist(nontargetsdf$diff_pos))
diff_meta <- data.frame(
  id = seq_along(mismatch_positions),
  length = rep(n_diff, times = n_diff),
  positions = mismatch_positions
)

output <- as.matrix(table(diff_meta[, c("length", "positions")]))
output
pheatmap(
  mat = output, fontsize = 14, cluster_rows = FALSE, cluster_cols = FALSE,
  color = hcl.colors(50, "OrRd", rev = TRUE),
  fontsize_col = 20, fontsize_row = 20
)

In [None]:
# Bar chart of how many significant motifs carry each number of mismatches.

df <- as.data.frame(table(nontargetsdf$diff_length), stringsAsFactors = FALSE)
# The perfect-match sites are appended as the zero-mismatch bar.
df[nrow(df) + 1, ] <- c(0, nrow(targetsdf))

p <- ggplot(
  df,
  aes(
    x = Var1,
    y = Freq,
    fill = factor(ifelse(Var1 == "0", "Targets", "Off-targets"))
  )
) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(
    name = "# of mismatches in significant motifs",
    values = c("grey60", "black")
  ) +
  labs(
    x = "# of mismatches to sg4 target sequence",
    y = "# of significant motifs"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(size = 20),
    axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0)),
    axis.text = element_text(size = 18),
    axis.text.x = element_text(
      margin = margin(t = 10, r = 0, b = 0, l = 0), colour = "black"
    ),
    axis.text.y = element_text(colour = "black"),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 15)
  )
p



In [None]:
# Volcano plot of every tested site; significant sites are coloured by their
# number of mismatches to the sg4 target.

# Perfect-match (10mer) peaks are given zero mismatches.
mcols(grsites_sig_target)$mismatch <- 0

# nontargetsdf has one row per (peak, motif) overlap in ov2, so it does not
# have to line up row-for-row with grsites_sig_nottarget. Map every motif back
# to its peak through queryHits(ov2): a peak with several motifs takes its
# smallest mismatch count, a peak without any motif stays NA.
stopifnot(nrow(nontargetsdf) == length(ov2))
best_per_peak <- tapply(nontargetsdf$diff_length, queryHits(ov2), min)
peak_mismatch <- rep(NA_real_, length(grsites_sig_nottarget))
peak_mismatch[as.integer(names(best_per_peak))] <- as.numeric(best_per_peak)
mcols(grsites_sig_nottarget)$mismatch <- peak_mismatch

df <- rbind(
  as.data.frame(grsites_sig_nottarget),
  as.data.frame(grsites_sig_target)
)
df$mismatch <- as.character(df$mismatch)
head(df)

p_cut <- 1e-5
fc_cut <- 0.5

# Colours are keyed by mismatch count, so each count keeps its colour and
# legend label even when some counts do not occur in the data. (Unnamed
# values/labels are matched by order and shift when a level is missing.)
mismatch_colours <- c(
  "0" = "black", "1" = "red", "2" = "orange", "3" = "gold",
  "4" = "lightblue", "5" = "darkgreen", "6" = "pink"
)

p <- ggplot(data = dream_test_res, aes(x = logFC, y = -log10(adj.P.Val))) +
  ggrastr::rasterize(geom_point(colour = "grey80", size = 2)) +
  # NOTE(review): this layer needs a `site` column in dream_test_res; no cell
  # in this notebook adds one, so confirm that the TSV provides it.
  geom_point(
    data = dream_test_res %>% filter(site == "10mer"),
    aes(x = logFC, y = -log10(adj.P.Val)),
    colour = "black", size = 3
  ) +
  # Peaks without a mismatch count are left to the grey background layer.
  geom_point(
    data = df[!is.na(df$mismatch), ],
    aes(x = logFC, y = -log10(adj.P.Val), color = mismatch),
    size = 3
  ) +
  geom_hline(
    yintercept = c(-log10(p_cut)), colour = "black", linetype = "dashed"
  ) +
  geom_vline(
    xintercept = c(-fc_cut, fc_cut), colour = "black", linetype = "dashed"
  ) +
  xlab("log2 fold-change") +
  ylab("-log10(adj.P.Val)") +
  scale_color_manual(name = "Mismatches", values = mismatch_colours) +
  theme_classic() +
  theme(
    axis.text = element_text(size = 18),
    axis.text.x = element_text(
      margin = margin(t = 10, r = 0, b = 0, l = 0), colour = "black"
    ),
    axis.text.y = element_text(colour = "black"),
    axis.title = element_text(size = 20),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0))
  )

p
