In [None]:
#This notebook is used to create Supp. Fig 10

In [None]:
library("CTCF")
library("Rsubread")
library("dplyr")
library("edgeR")
library("ggplot2")
library("patchwork")
library("GenomicRanges")
library("stringr")
library("ggseqlogo")
library("BSgenome.Hsapiens.UCSC.hg38")
library("pheatmap")


# Default width/height for plots rendered inline by the notebook (repr.plot.* options).
options(repr.plot.width=15, repr.plot.height=10)

In [None]:
# Build the edgeR count object for the four CTCF libraries and keep a CPM
# table of the unfiltered counts for the scatterplots.

# Libraries used in this analysis: two Safe Harbor controls, two sg4 replicates.
ctcf_grp <- c(
  "Jurkat_Safe_Harbor_CTCF_rep1", "Jurkat_Safe_Harbor_CTCF_rep2",
  "Jurkat_sg4_CTCF_rep1", "Jurkat_sg4_CTCF_rep2"
)

# Count table; only the columns named in ctcf_grp are passed to edgeR.
x <- read.table("sg4_CTCF_500bp_counts.tsv", sep = "\t")
y <- DGEList(counts = x[, ctcf_grp])

# CPM is taken before any filtering, so norm_counts has one row per input row.
norm_counts <- as.data.frame(cpm(y, log = FALSE))

# First pass: edgeR's expression filter with the two conditions as groups;
# library sizes are recomputed from the rows that remain.
keep <- filterByExpr(y, min.count = 1, group = c(0, 0, 1, 1))
y <- y[keep, , keep.lib.sizes = FALSE]

# Second pass: require CPM >= 5 in at least two libraries, then compute
# normalization factors on the surviving rows.
isexpr <- rowSums(cpm(y) >= 5) >= 2
y <- calcNormFactors(y[isexpr, ])

In [None]:
# Turn the filtered count matrix into a GRanges of bound CTCF windows.
# Row names are split on "." into at most three fields, read as chr, start, end.
bound_ctcf <- as.data.frame(y$counts)

coord_fields <- str_split_fixed(rownames(bound_ctcf), "[.]", n = 3)
bound_ctcf$chr <- coord_fields[, 1]
bound_ctcf$start <- as.double(coord_fields[, 2])
bound_ctcf$end <- as.double(coord_fields[, 3])

# The per-library counts are carried along as metadata columns.
grsites_bound_ctcf <- makeGRangesFromDataFrame(
  bound_ctcf,
  seqnames.field = "chr",
  start.field = "start",
  end.field = "end",
  keep.extra.columns = TRUE
)

grsites_bound_ctcf

In [None]:
#The CasOff txt file (casoff_sg4_0-4MM_hg38.txt) is generated by running the portable version of http://www.rgenome.net/cas-offinder/portable 
#with the input file to the program stored in references/input_sg4_10mer_1-4MM.txt

# Columns are unnamed in the file, so read.table labels them V1, V2, ...;
# below V2 is used as chromosome, V3 as position and V6 as the mismatch count.
mismatch0_4_df = read.table("casoff_sg4_0-4MM_hg38.txt", sep = "\t")
# NOTE(review): V3 is used as-is for the range start and V3+1 for the end;
# confirm this matches the coordinate convention of the Cas-OFFinder output.
mismatch0_4_df$end = mismatch0_4_df$V3+1

# unique() on a GRanges compares only seqnames/start/end/strand and keeps the
# first record of each duplicated range, ignoring the metadata columns. V5 is
# not used as strand here, so several records can share one range. Sort by
# mismatch count first (order() is stable) so that the record kept for each
# position is the one with the fewest mismatches instead of whichever came
# first in the file.
mismatch_sorted = mismatch0_4_df[order(mismatch0_4_df$V6), ]
# Drop the permuted row names so they are not propagated as range names.
rownames(mismatch_sorted) = NULL

grsites_0_4mm = makeGRangesFromDataFrame(mismatch_sorted, seqnames.field = "V2", start.field = "V3", end.field = "end", keep.extra.columns = TRUE)
grsites_0_4mm = unique(grsites_0_4mm)


In [None]:
# Ranges of the 10-mer target sequence, read from the reference BED file.
# NOTE(review): V2 is used directly as the range start; if the file follows
# the 0-based BED convention the starts are shifted by one -- confirm.
sites_10mer <- read.table("references/sg4_10mer_seq.bed", sep = "\t")
grsites_10mer <- makeGRangesFromDataFrame(
  sites_10mer,
  seqnames.field = "V1",
  start.field = "V2",
  end.field = "V3"
)

# Cas-OFFinder records with zero mismatches (V6 == 0), one range per position.
perfect_match_df <- filter(mismatch0_4_df, V6 == 0)
grsites_0mm <- unique(
  makeGRangesFromDataFrame(
    perfect_match_df,
    seqnames.field = "V2",
    start.field = "V3",
    end.field = "end",
    keep.extra.columns = TRUE
  )
)


In [None]:
#get total number of bound CTCF sites with n mismatches to target sequence where n = 0-4

# One row per (bound CTCF window, overlapping Cas-OFFinder site) pair.
ov = findOverlaps(grsites_bound_ctcf, grsites_0_4mm)
df = as.data.frame(grsites_bound_ctcf[queryHits(ov),], row.names = NULL)
# V6 holds the mismatch count; address it by name rather than by position.
df$mismatch = grsites_0_4mm$V6[subjectHits(ov)]

# Keep, for every distinct window (seqnames/start/end/strand), the single row
# with the fewest mismatches. This replaces a per-window loop that rescanned
# df and grew the result with rbind(); the selected rows, their order (first
# appearance of each window) and their row names are the same.
site_key = paste(df$seqnames, df$start, df$end, df$strand, sep = "\t")
site_id = match(site_key, unique(site_key))

# order() is stable, so ties on mismatch keep the first row in df, which is
# what which.min() selected. na.last = NA drops rows with a missing mismatch,
# which which.min() also ignored.
by_site_then_mismatch = order(site_id, df$mismatch, na.last = NA)
is_best_hit = !duplicated(site_id[by_site_then_mismatch])
result_df = df[by_site_then_mismatch[is_best_hit], ]

# Display the resulting dataframe and the number of windows per mismatch count
head(result_df)
table(result_df$mismatch)

In [None]:
#get total number of bound CTCF sites with significant CTCF loss with n mismatches to target sequence where n = 0-4

# Differential-binding results; rows passing both thresholds are kept.
# NOTE(review): the filter is on abs(logFC), so it keeps changes in either
# direction, while the header above says "loss" -- confirm which is intended.
dream_test_res = read.table("sg4-ctcf-dream-pvals.tsv", sep = "\t", stringsAsFactors = F)
sig_df = dream_test_res %>% filter( (abs(logFC) > 0.5) & (abs(adj.P.Val) < 1e-5) )

# Row names are split on "." into at most three fields, read as chr, start, end.
sig_coord_fields = str_split_fixed(rownames(sig_df), "[.]", n = 3)
sig_df$chr = sig_coord_fields[, 1]
sig_df$start = as.double(sig_coord_fields[, 2])
sig_df$end = as.double(sig_coord_fields[, 3])

grsites_sig = makeGRangesFromDataFrame(sig_df, seqnames.field = "chr", start.field = "start", end.field = "end", keep.extra.columns = TRUE)

# One row per (significant window, overlapping Cas-OFFinder site) pair.
ov = findOverlaps(grsites_sig, grsites_0_4mm)

df = as.data.frame(grsites_sig[queryHits(ov),], row.names = NULL)
# V6 holds the mismatch count; address it by name rather than by position.
df$mismatch = grsites_0_4mm$V6[subjectHits(ov)]

# Keep, for every distinct window (seqnames/start/end/strand), the single row
# with the fewest mismatches. This replaces a per-window loop that rescanned
# df and grew the result with rbind(); the selected rows, their order (first
# appearance of each window) and their row names are the same.
site_key = paste(df$seqnames, df$start, df$end, df$strand, sep = "\t")
site_id = match(site_key, unique(site_key))

# order() is stable, so ties on mismatch keep the first row in df, which is
# what which.min() selected. na.last = NA drops rows with a missing mismatch,
# which which.min() also ignored.
by_site_then_mismatch = order(site_id, df$mismatch, na.last = NA)
is_best_hit = !duplicated(site_id[by_site_then_mismatch])
result_df_sig = df[by_site_then_mismatch[is_best_hit], ]

# Display the resulting dataframe and the number of windows per mismatch count
head(result_df_sig)
table(result_df_sig$mismatch)
