## CTPeaks annotation 

In [None]:
## Running peak-to-gene link analysis on each brain region
# loading
library(Seurat)
library(Signac)

library(RColorBrewer)
library(ComplexHeatmap)
library(circlize)
library(stringr)
library(GenomicRanges)
library(GenomicFeatures)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)

# library for plotting
library(ggplot2)
library(dplyr)
library(tidyr)
library(viridis)
library(tidyverse)
library(patchwork)

##
library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
# Transcript database (hg38 known genes) kept in `txdb`.
# The packaged TxDb object is used; building it from a downloaded GTF is the
# commented-out alternative below.
# downloaded hg38 known gene annotation from UCSC: https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/
# read into txdb format
#txdb <- makeTxDbFromGFF("./Data/hg38.knownGene.gtf.gz")
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

In [None]:
## In-house cell-type (CT) peak table, converted to a GRanges object.
## The last column lists the cell types in which each peak was called.
ctpeak_cols <- c("chr", "start", "end", "width", "strand", "peak_called_in")
ctpeaks <- read.csv("./Results/CTpeaks_annotated.csv", row.names = 1)
names(ctpeaks) <- ctpeak_cols
ctpeaks <- GRanges(ctpeaks)
ctpeaks

In [None]:
## Load reference datasets
ENCODE_cCREs <- import.bb("./Data/encodeCcreCombined.bb")

# Read a headerless, tab-delimited BED-like file into a GRanges object.
#   path      : file to read
#   col_names : column names to assign (must include chr/start/end)
# BED start coordinates are 0-based while GRanges is 1-based, so the starts
# are shifted on import; passing the raw table to GRanges() would make every
# interval 1 bp too long on its left side.
# NOTE(review): assumes these .bed files follow the standard BED convention.
read_bed_granges <- function(path, col_names) {
  bed <- read.delim(path, header = FALSE, col.names = col_names)
  makeGRangesFromDataFrame(
    bed,
    keep.extra.columns = TRUE,
    starts.in.df.are.0based = TRUE
  )
}

SCREEN_cCREs <- read_bed_granges(
  "./Data/GRCh38-cCREs.bed",
  c("chr", "start", "end", "assession_ID1", "assession_ID2", "annotation")
)
PsychSCREEN_bCREs <- read_bed_granges(
  "./Data/all_bCREs.bed",
  c("chr", "start", "end", "assession_ID1", "assession_ID2", "annotation")
)
neuron_bCREs <- read_bed_granges(
  "./Data/neuron_bCREs.bed",
  c("chr", "start", "end", "assession_ID1", "annotation")
)
glia_bCREs <- read_bed_granges(
  "./Data/glia_bCREs.bed",
  c("chr", "start", "end", "assession_ID1", "annotation")
)
brain_peaks_Li <- read_bed_granges(
  "./Data/cCREs.bed",
  c("chr", "start", "end", "annotation")
)

In [None]:
# Named collection of the reference peak sets loaded above
reference_sets <- setNames(
  list(
    ENCODE_cCREs, SCREEN_cCREs, PsychSCREEN_bCREs,
    neuron_bCREs, glia_bCREs, brain_peaks_Li
  ),
  c(
    "ENCODE_cCREs", "SCREEN_cCREs", "PsychSCREEN_bCREs",
    "neuron_bCREs", "glia_bCREs", "brain_peaks_Li"
  )
)

# Fraction of ranges in `query` that overlap at least one range in `subject`.
#   query, subject : range objects accepted by findOverlaps()
# Returns a single number: (# distinct query ranges with a hit) / length(query).
compute_overlap_fraction <- function(query, subject) {
  hit_idx <- queryHits(findOverlaps(query, subject))
  n_query_hit <- length(unique(hit_idx))
  n_query_hit / length(query)
}

In [None]:
# Cell types from ctpeaks: `peak_called_in` is split on "," into one label
# per cell type; `peak_index` maps every label back to its peak.
called_in <- strsplit(as.character(ctpeaks$peak_called_in), ",")
peak_index <- rep(seq_along(called_in), lengths(called_in))
called_ct <- unlist(called_in)
ct_list <- unique(called_ct[!is.na(called_ct)])

# Build one summary row: the overlap fraction of `peaks` against every
# reference set, as a one-row data frame with numeric columns.
overlap_row <- function(label, peaks) {
  fractions <- vapply(
    reference_sets,
    function(ref) compute_overlap_fraction(peaks, ref),
    numeric(1)
  )
  data.frame(
    CellType = label, as.list(fractions),
    check.names = FALSE, stringsAsFactors = FALSE
  )
}

# Cell type-specific rows (not unique-only): a peak contributes to every cell
# type it was called in.
ct_rows <- lapply(ct_list, function(ct) {
  # Exact label match, so one cell-type name can never match inside another
  # (a regex grep on the raw string could).
  ct_peaks <- ctpeaks[unique(peak_index[which(called_ct == ct)])]
  ct_peaks <- ct_peaks[!duplicated(ct_peaks)]
  overlap_row(ct, ct_peaks)
})

# Overall row (all ctpeaks) first, then one row per cell type
overlap_summary <- do.call(rbind, c(list(overlap_row("All", ctpeaks)), ct_rows))
rownames(overlap_summary) <- NULL
overlap_summary <- overlap_summary[
  !overlap_summary$CellType %in% c("VLMC_Per", "Endothelial"),
]

# View results
print(overlap_summary)

In [None]:
# # Convert to long format
# col1 = c(
#   "All" = "#FCB905",
#   'Astrocyte' = '#F06719',
#   'Excitatory' = '#33A65C',
#   'Inhibitory' = '#23767C',
#   'Microglia' = '#E03426',
#   'Oligodendrocyte' = '#1BA3C6',
#   'OPC' = "#A26DC2",
#   'Endothelial' = "#FCB905",
#   'VLMC_Per' = "#EB73B3"
# )

# # Step 1: reshape
# overlap_long <- overlap_summary %>%
#   pivot_longer(cols = -CellType, names_to = "ReferenceSet", values_to = "Known_cCREs")

# # Step 2: make long with Known + Novel
# plot_data <- overlap_long %>%
#   mutate('Potential_Novel_cCREs' = 1 - Known_cCREs) %>%
#   pivot_longer(cols = c("Known_cCREs", "Potential_Novel_cCREs"), 
#                names_to = "Category", values_to = "OverlapFraction")

# # Ensure stack order: Novel on bottom, Known on top
# plot_data$Category <- factor(plot_data$Category, levels = c("Potential_Novel_cCREs", "Known_cCREs"))

# # Step 3: set fill based on Category + CellType
# plot_data <- plot_data %>%
#   mutate(FillKey = ifelse(Category == "Known_cCREs", "Known", CellType))

# # Construct fill color map
# fill_colors <- c("Known" = "#4d4d4d", col1)

In [None]:
# options(repr.plot.width=6, repr.plot.height=8)
# p = ggplot(plot_data, aes(x = "", y = OverlapFraction, fill = FillKey)) +
#       geom_bar(stat = "identity", width = 0.6, color = "black") +  # ← add border
#       geom_text(aes(label = paste0(round(OverlapFraction * 100,2), "%")),
#                 position = position_stack(vjust = 0.5),
#                 size = 5, color = "black") +
#       facet_grid(CellType ~ ReferenceSet, switch = "both") +
#       scale_fill_manual(values = fill_colors) +
#       labs(x = NULL, y = "Fraction of Peaks", fill = NULL) +
#       theme_minimal(base_size = 11) +
#       theme(
#         axis.text.x = element_blank(),
#         axis.ticks.x = element_blank(),
#         strip.text = element_text(size = 16),
#         panel.grid = element_blank(),
#         legend.position = "none",
#         panel.spacing.x = unit(-1, "lines"),   # ← reduce horizontal space
#         panel.spacing.y = unit(0.1, "lines")     # optional: vertical spacing
#       )

In [None]:
# p
# ggsave(p,filename = "./Results/Revision/LINK/CTpeaks_annotation.pdf", width = 10, height = 18)

In [None]:
# Colour per cell type (used for the annotation bar next to the main plot)
col1 <- c(
  All = "#FCB905",
  Astrocyte = "#F06719",
  Excitatory = "#33A65C",
  Inhibitory = "#23767C",
  Microglia = "#E03426",
  Oligodendrocyte = "#1BA3C6",
  OPC = "#A26DC2"
)

# Desired bottom-to-top order of cell types
celltype_order <- c(
  "OPC", "Oligodendrocyte",
  "Microglia", "Inhibitory", "Excitatory", "Astrocyte", "All"
)

# Fixed colour map: one colour for novel, one for known
fill_colors <- c(Potential_Novel_cCREs = "#aa0000", Known_cCREs = "#4d4d4d")

# Step 1: one row per (cell type, reference set) with the known fraction
overlap_long <- pivot_longer(
  overlap_summary,
  cols = -CellType,
  names_to = "ReferenceSet",
  values_to = "Known_cCREs"
)

# Step 2: add the complementary "novel" fraction, go long again, drop the
# excluded cell types, and fix the factor orders used for stacking/faceting
plot_data <- overlap_long %>%
  mutate(Potential_Novel_cCREs = 1 - Known_cCREs) %>%
  pivot_longer(
    cols = c("Known_cCREs", "Potential_Novel_cCREs"),
    names_to = "Category",
    values_to = "OverlapFraction"
  ) %>%
  dplyr::filter(!CellType %in% c("Endothelial", "VLMC_Per")) %>%
  mutate(
    Category = factor(Category, levels = c("Potential_Novel_cCREs", "Known_cCREs")),
    CellType = factor(CellType, levels = rev(celltype_order))
  )

In [None]:
# Theme shared by the faceted stacked-bar panel
main_theme <- theme_minimal(base_size = 14) +
  theme(
    panel.grid = element_blank(),
    legend.position = "top",
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    strip.text.y.left = element_blank(),
    strip.text.x = element_text(size = 17, face = "bold"),
    panel.spacing.y = unit(0, "lines"),
    panel.spacing.x = unit(0.05, "lines")
  )

# Stacked known/novel fractions, one facet per cell type x reference set
main_plot <- ggplot(plot_data, aes(x = "", y = OverlapFraction, fill = Category)) +
  geom_bar(stat = "identity", color = "black", width = 0.6) +
  geom_text(
    aes(label = paste0(round(OverlapFraction * 100, 2), "%")),
    position = position_stack(vjust = 0.5),
    color = "black", size = 5
  ) +
  facet_grid(CellType ~ ReferenceSet, switch = "both") +
  scale_fill_manual(values = fill_colors) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.25)
  ) +
  labs(x = NULL, y = "Fraction of Peaks", fill = NULL) +
  main_theme

# Coloured side bar labelling each facet row with its cell type
annot_bar_data <- data.frame(CellType = celltype_order, x = 1)

annot_bar <- ggplot(annot_bar_data, aes(x = x, y = CellType, fill = CellType)) +
  geom_tile(height = 0.95, width = 0.6) +
  scale_fill_manual(values = col1) +
  scale_y_discrete(limits = celltype_order) +
  theme_void(base_size = 14) +
  theme(
    axis.text.y = element_text(size = 16, hjust = 1),
    plot.margin = margin(t = 5, r = 5, b = 5, l = 5),
    legend.position = "none"
  )

# Side bar on the left, main panel on the right
final_plot <- patchwork::wrap_plots(
  annot_bar, main_plot,
  ncol = 2, widths = c(0.06, 1)
)

In [None]:
# Display the combined figure, then write it to PDF
final_plot
ggsave(
  filename = "./Results/Revision/LINK/CTpeaks_annotation_alt.pdf",
  plot = final_plot,
  width = 12, height = 20
)

## Annotating the DEG linked peaks again

In [None]:
# Previously annotated linked-peak tables, one per brain region
hip_linked_peaks <- read.csv("./Results/LINK/HIP_linkpeaks_all_annotated_1.23.csv")
ec_linked_peaks <- read.csv("./Results/LINK/EC_linkpeaks_all_annotated_1.23.csv")
pfc_linked_peaks <- read.csv("./Results/LINK/PFC_linkpeaks_all_annotated_1.23.csv")

In [None]:
# Flag rows of a linked-peak table that overlap SCREEN cCREs.
#   linked_peaks_df : data frame with `seqnames`, `start` and `end` columns
#   screen_cCREs_gr : GRanges with an `annotation` metadata column
# Returns `linked_peaks_df` with two added columns:
#   in_SCREEN_cCREs  - TRUE when the row's range overlaps any cCRE
#   SCREEN_cCRE_type - `annotation` of an overlapping cCRE, NA otherwise.
#                      When several cCREs overlap one row, a single value is
#                      kept (the one written last, in findOverlaps() order).
annotate_with_SCREEN_cCRE <- function(linked_peaks_df, screen_cCREs_gr) {
  peaks_gr <- makeGRangesFromDataFrame(
    linked_peaks_df,
    keep.extra.columns = TRUE,
    seqnames.field = "seqnames",
    start.field = "start",
    end.field = "end"
  )

  ov <- findOverlaps(peaks_gr, screen_cCREs_gr)
  peak_idx <- queryHits(ov)
  ccre_idx <- subjectHits(ov)

  linked_peaks_df$in_SCREEN_cCREs <- FALSE
  linked_peaks_df$in_SCREEN_cCREs[peak_idx] <- TRUE

  linked_peaks_df$SCREEN_cCRE_type <- NA_character_
  linked_peaks_df$SCREEN_cCRE_type[peak_idx] <- screen_cCREs_gr$annotation[ccre_idx]

  linked_peaks_df
}

In [None]:
# Add the SCREEN cCRE columns to each region's linked-peak table
hip_linked_peaks <- annotate_with_SCREEN_cCRE(
  linked_peaks_df = hip_linked_peaks, screen_cCREs_gr = SCREEN_cCREs
)
ec_linked_peaks <- annotate_with_SCREEN_cCRE(
  linked_peaks_df = ec_linked_peaks, screen_cCREs_gr = SCREEN_cCREs
)
pfc_linked_peaks <- annotate_with_SCREEN_cCRE(
  linked_peaks_df = pfc_linked_peaks, screen_cCREs_gr = SCREEN_cCREs
)

In [None]:
# Tag every table with its region, then stack them (HIP, EC, PFC order)
hip_linked_peaks$Region <- "HIP"
ec_linked_peaks$Region <- "EC"
pfc_linked_peaks$Region <- "PFC"

combined <- do.call(
  rbind,
  list(hip_linked_peaks, ec_linked_peaks, pfc_linked_peaks)
)


In [None]:
# Rows without an overlapping cCRE get an explicit "Unannotated" label
combined$SCREEN_cCRE_type <- replace(
  combined$SCREEN_cCRE_type,
  is.na(combined$SCREEN_cCRE_type),
  "Unannotated"
)

# Count rows per region x cCRE type, then convert to within-region proportions
region_type_counts <- combined %>%
  group_by(Region, SCREEN_cCRE_type) %>%
  summarise(Count = n(), .groups = "drop")

plot_data <- region_type_counts %>%
  group_by(Region) %>%
  mutate(Proportion = Count / sum(Count))

In [None]:
# Colour per SCREEN cCRE category; the order of this vector is also the
# category order used for the factor levels below (extend as needed)
ccre_colors <- c(
  "PLS" = "#FF0000",
  "pELS" = "#FFA700",
  "dELS" = "#FFCD00",
  "CA" = "#C7E9B4",
  "CA-H3K4me3" = "#41B6C4",
  "CA-CTCF" = "#2C7FB8",
  "TF" = "#6A51A3",
  "CA-TF" = "#9E9AC8",
  "DNase-H3K4me3" = "#00B0F0",
  "Other" = "#706f6f",
  "Unannotated" = "#D9D9D9"
)

# Fix the display order of regions and categories
plot_data$Region <- factor(plot_data$Region, levels = c("HIP", "EC", "PFC"))
plot_data$SCREEN_cCRE_type <- factor(
  plot_data$SCREEN_cCRE_type,
  levels = names(ccre_colors)
)


In [None]:
# Horizontal stacked bars: share of each SCREEN cCRE type per region
ggplot(plot_data, aes(x = Region, y = Proportion, fill = SCREEN_cCRE_type)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(name = "SCREEN cCRE Type", values = ccre_colors) +
  xlab("Region") +
  ylab("Proportion of Linked Peaks") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 11),
    legend.position = "right"
  ) +
  coord_flip()

# Raw counts per category (including NA) for each region
table(pfc_linked_peaks$SCREEN_cCRE_type, useNA = "always")
table(ec_linked_peaks$SCREEN_cCRE_type, useNA = "always")
table(hip_linked_peaks$SCREEN_cCRE_type, useNA = "always")

## Annotate with PsychSCREEN_bCREs

In [None]:
## Annotate with PsychSCREEN_bCREs
# Flag rows of a linked-peak table that overlap PsychSCREEN bCREs.
#   linked_peaks_df : data frame with `seqnames`, `start` and `end` columns
#   psych_bCREs_gr  : GRanges with an `annotation` metadata column
# Returns `linked_peaks_df` with two added columns:
#   in_PsychSCREEN_bCREs  - TRUE when the row's range overlaps any bCRE
#   PsychSCREEN_bCRE_type - `annotation` of an overlapping bCRE, NA otherwise.
#                           When several bCREs overlap one row, a single value
#                           is kept (the one written last, in findOverlaps()
#                           order).
annotate_with_PsychSCREEN_bCRE <- function(linked_peaks_df, psych_bCREs_gr) {
  peaks_gr <- makeGRangesFromDataFrame(
    linked_peaks_df,
    keep.extra.columns = TRUE,
    seqnames.field = "seqnames",
    start.field = "start",
    end.field = "end"
  )

  ov <- findOverlaps(peaks_gr, psych_bCREs_gr)
  peak_idx <- queryHits(ov)
  bcre_idx <- subjectHits(ov)

  linked_peaks_df$in_PsychSCREEN_bCREs <- FALSE
  linked_peaks_df$in_PsychSCREEN_bCREs[peak_idx] <- TRUE

  linked_peaks_df$PsychSCREEN_bCRE_type <- NA_character_
  linked_peaks_df$PsychSCREEN_bCRE_type[peak_idx] <- psych_bCREs_gr$annotation[bcre_idx]

  linked_peaks_df
}

In [None]:
# Add the PsychSCREEN bCRE columns to each region's linked-peak table
hip_linked_peaks <- annotate_with_PsychSCREEN_bCRE(
  linked_peaks_df = hip_linked_peaks, psych_bCREs_gr = PsychSCREEN_bCREs
)
ec_linked_peaks <- annotate_with_PsychSCREEN_bCRE(
  linked_peaks_df = ec_linked_peaks, psych_bCREs_gr = PsychSCREEN_bCREs
)
pfc_linked_peaks <- annotate_with_PsychSCREEN_bCRE(
  linked_peaks_df = pfc_linked_peaks, psych_bCREs_gr = PsychSCREEN_bCREs
)

# Raw counts per category (including NA) for each region
table(pfc_linked_peaks$PsychSCREEN_bCRE_type, useNA = "always")
table(ec_linked_peaks$PsychSCREEN_bCRE_type, useNA = "always")
table(hip_linked_peaks$PsychSCREEN_bCRE_type, useNA = "always")

### Draw stacked plot on the PsychSCREEN_bCREs annotation

In [None]:
# Tag every table with its region, then rebuild the stacked table so that it
# carries the PsychSCREEN columns (HIP, EC, PFC order)
hip_linked_peaks$Region <- "HIP"
ec_linked_peaks$Region <- "EC"
pfc_linked_peaks$Region <- "PFC"

combined <- do.call(
  rbind,
  list(hip_linked_peaks, ec_linked_peaks, pfc_linked_peaks)
)


In [None]:
# Rows without an overlapping bCRE get an explicit "Unannotated" label
combined$PsychSCREEN_bCRE_type <- replace(
  combined$PsychSCREEN_bCRE_type,
  is.na(combined$PsychSCREEN_bCRE_type),
  "Unannotated"
)

# Count rows per region x bCRE type, then convert to within-region proportions
region_type_counts <- combined %>%
  group_by(Region, PsychSCREEN_bCRE_type) %>%
  summarise(Count = n(), .groups = "drop")

plot_data <- region_type_counts %>%
  group_by(Region) %>%
  mutate(Proportion = Count / sum(Count))

In [None]:
# List the bCRE categories present, to check them against the factor levels
# and colour map defined in the next cell
unique(plot_data$PsychSCREEN_bCRE_type)

In [None]:
# Colour per bCRE category; the order of this vector is also the category
# order used for the factor levels below (extend as needed)
ccre_colors <- c(
  "PLS" = "#FF0000",
  "pELS" = "#FFA700",
  "dELS" = "#FFCD00",
  "CA" = "#C7E9B4",
  "CA-H3K4me3" = "#41B6C4",
  "CA-CTCF" = "#2C7FB8",
  "TF" = "#6A51A3",
  "CA-TF" = "#9E9AC8",
  "DNase-H3K4me3" = "#00B0F0",
  "Unannotated" = "#706f6f"
)

# Fix the display order of regions and categories
plot_data$Region <- factor(plot_data$Region, levels = c("HIP", "EC", "PFC"))
plot_data$PsychSCREEN_bCRE_type <- factor(
  plot_data$PsychSCREEN_bCRE_type,
  levels = names(ccre_colors)
)


In [None]:
# Horizontal stacked bars: share of each PsychSCREEN bCRE type per region
p1 <- ggplot(plot_data, aes(x = Region, y = Proportion, fill = PsychSCREEN_bCRE_type)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(name = "SCREEN cCRE Type", values = ccre_colors) +
  xlab("Region") +
  ylab("Proportion of Linked Peaks") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 11),
    legend.position = "right"
  ) +
  coord_flip()

ggsave(
  filename = "./Results/Revision/LINK/psychENCODE_linked_peaks.pdf",
  plot = p1,
  width = 10, height = 3.5
)

### Draw the stacked plot for each region but split by cell types. 

In [None]:
# Select the region whose linked peaks are plotted per cell type below.
# Switch regions by (un)commenting exactly one of these lines and re-running
# the following cells.
# data_oi = pfc_linked_peaks
# data_oi = ec_linked_peaks
data_oi = hip_linked_peaks

In [None]:
# Cell types kept for the per-cell-type plot
ct <- c(
  "Astrocyte", "Excitatory", "Inhibitory",
  "Microglia", "Oligodendrocyte", "OPC"
)

# Keep those cell types and label rows without a bCRE as "Unannotated"
data_filtered <- data_oi %>%
  dplyr::filter(celltype %in% ct) %>%
  mutate(
    PsychSCREEN_bCRE_type = ifelse(
      is.na(PsychSCREEN_bCRE_type), "Unannotated", PsychSCREEN_bCRE_type
    )
  )

# Count rows per cell type x bCRE type, then convert to within-cell-type
# proportions
celltype_type_counts <- data_filtered %>%
  group_by(celltype, PsychSCREEN_bCRE_type) %>%
  summarise(Count = n(), .groups = "drop")

plot_data_ct <- celltype_type_counts %>%
  group_by(celltype) %>%
  mutate(Proportion = Count / sum(Count))

In [None]:
# Colour per bCRE category; the order of this vector is also the category
# order used for the factor levels below (extend as needed)
label_colors <- c(
  "PLS" = "#FF0000",
  "pELS" = "#FFA700",
  "dELS" = "#FFCD00",
  "CA" = "#C7E9B4",
  "CA-H3K4me3" = "#41B6C4",
  "CA-CTCF" = "#2C7FB8",
  "TF" = "#6A51A3",
  "CA-TF" = "#9E9AC8",
  "DNase-H3K4me3" = "#00B0F0",
  "Unannotated" = "#706f6f"
)

# Fix the display order of cell types and categories
plot_data_ct$celltype <- factor(plot_data_ct$celltype, levels = ct)
plot_data_ct$PsychSCREEN_bCRE_type <- factor(
  plot_data_ct$PsychSCREEN_bCRE_type,
  levels = names(label_colors)
)

# Stacked bars: share of each bCRE type per cell type
p <- ggplot(plot_data_ct, aes(x = Proportion, y = celltype, fill = PsychSCREEN_bCRE_type)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(name = "cCRE Type", values = label_colors) +
  labs(y = "Cell Type", x = "Proportion of Linked Peaks") +
  theme_classic(base_size = 14) +
  theme(
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 12),
    legend.position = "right"
  )

In [None]:
# Save the per-cell-type plot.
# NOTE(review): the file name hard-codes "HIP"; it needs to be changed by hand
# whenever `data_oi` is switched to another region.
ggsave(p, filename = "./Results/Revision/LINK/CTpeaks_annotation_psychSCREEN_HIP.pdf", width = 10, height = 4)

## draw stacked bar plot for ENCODE annotation

In [None]:
# Count rows per region x ENCODE label, then convert to within-region
# proportions
region_label_counts <- combined %>%
  group_by(Region, encodeLabel) %>%
  summarise(Count = n(), .groups = "drop")

plot_data <- region_label_counts %>%
  group_by(Region) %>%
  mutate(Proportion = Count / sum(Count))
# plot_data

In [None]:
# Colour per ENCODE label; the order of this vector is also the category
# order used for the factor levels below (extend as needed)
ccre_colors <- c(
  "PLS" = "#FF0000",
  "pELS" = "#FFA700",
  "dELS" = "#FFCD00",
  "CTCF-only" = "#00B0F0",
  "DNase-H3K4me3" = "#FFAAAA",
  "Other" = "#706f6f"
)

# Fix the display order of regions and categories
plot_data$Region <- factor(plot_data$Region, levels = c("HIP", "EC", "PFC"))
plot_data$encodeLabel <- factor(
  plot_data$encodeLabel,
  levels = names(ccre_colors)
)

In [None]:
# Vertical stacked bars: share of each ENCODE label per region
p <- ggplot(plot_data, aes(x = Region, y = Proportion, fill = encodeLabel)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(name = "SCREEN cCRE Type", values = ccre_colors) +
  xlab("Region") +
  ylab("Proportion of Linked Peaks") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 11),
    legend.position = "right"
  )

ggsave(
  filename = "./Results/Revision/LINK/ENCODE_linked_peaks.pdf",
  plot = p,
  width = 5, height = 8
)

## Draw distribution of score in each region

In [None]:
# Histogram of the `score` column of a linked-peak table.
#   peaks    : data frame with a `score` column
#   binwidth : histogram bin width
#   fill     : bar fill colour
#   color    : bar outline colour
score_histogram <- function(peaks, binwidth, fill, color) {
  ggplot(peaks, aes(x = score)) +
    geom_histogram(
      binwidth = binwidth, fill = fill, color = color,
      alpha = 0.8, position = "dodge", size = 1
    ) +
    scale_fill_viridis(discrete = TRUE) +
    labs(x = "Score", y = "Count") +
    theme_minimal(base_size = 14) +
    theme(
      plot.title = element_text(face = "bold", hjust = 0.5),
      axis.title = element_text(size = 14),
      axis.text = element_text(size = 12),
      panel.grid.major = element_blank(),        # remove major grid lines
      panel.grid.minor = element_blank(),        # remove minor grid lines
      axis.line = element_line(color = "black")  # show axes
    )
}

# One histogram per region, each in its region colour
p1 <- score_histogram(
  pfc_linked_peaks, binwidth = 0.01,
  fill = "#c25757ff", color = "#c25757ff"
)
p2 <- score_histogram(
  ec_linked_peaks, binwidth = 0.02,
  fill = "#825ca6ff", color = "#825ca6ff"
)
p3 <- score_histogram(
  hip_linked_peaks, binwidth = 0.02,
  fill = "#3f78c199", color = "#3f78c1"
)


In [None]:
# Lay the three histograms out side by side and save them as one PDF
score_panels <- patchwork::wrap_plots(p1, p2, p3, ncol = 3)
ggsave(
  filename = "./Results/Revision/LINK/peak_score_distribution.pdf",
  plot = score_panels,
  width = 13, height = 4
)

In [None]:
# Five-number summary (plus mean) of the link score in each region
summary(pfc_linked_peaks$score)
summary(ec_linked_peaks$score)
summary(hip_linked_peaks$score)

In [None]:
# PFC score distribution, one panel per ENCODE label with free y scales
ggplot(pfc_linked_peaks, aes(x = score)) +
  geom_histogram(
    binwidth = 0.01,
    color = "black", fill = "#E69F00", alpha = 0.85
  ) +
  facet_wrap(~ encodeLabel, scales = "free_y") +
  labs(title = "Score Distribution by cCRE Type", x = "Score", y = "Count") +
  theme_light(base_size = 13)

In [None]:
# Fraction of linked peaks labelled "dELS", per region, under the ENCODE
# label and under the PsychSCREEN annotation. For the PsychSCREEN column,
# rows without an annotation (NA) are excluded from the denominator.
mean(pfc_linked_peaks$encodeLabel == "dELS")
mean(pfc_linked_peaks$PsychSCREEN_bCRE_type == "dELS", na.rm = TRUE)

mean(ec_linked_peaks$encodeLabel == "dELS")
mean(ec_linked_peaks$PsychSCREEN_bCRE_type == "dELS", na.rm = TRUE)

mean(hip_linked_peaks$encodeLabel == "dELS")
mean(hip_linked_peaks$PsychSCREEN_bCRE_type == "dELS", na.rm = TRUE)

In [None]:
# Same comparison for the "PLS" label
mean(pfc_linked_peaks$encodeLabel == "PLS")
mean(pfc_linked_peaks$PsychSCREEN_bCRE_type == "PLS", na.rm = TRUE)

mean(ec_linked_peaks$encodeLabel == "PLS")
mean(ec_linked_peaks$PsychSCREEN_bCRE_type == "PLS", na.rm = TRUE)

mean(hip_linked_peaks$encodeLabel == "PLS")
mean(hip_linked_peaks$PsychSCREEN_bCRE_type == "PLS", na.rm = TRUE)

In [None]:
# Persist the re-annotated linked-peak tables, one CSV per region
write.csv(
  pfc_linked_peaks,
  file = "./Results/LINK/PFC_linkpeaks_all_annotated_7.19.csv",
  row.names = FALSE
)
write.csv(
  ec_linked_peaks,
  file = "./Results/LINK/EC_linkpeaks_all_annotated_7.19.csv",
  row.names = FALSE
)
write.csv(
  hip_linked_peaks,
  file = "./Results/LINK/HIP_linkpeaks_all_annotated_7.19.csv",
  row.names = FALSE
)

## Annotate the linked peaks with new eQTL data

In [None]:
library(data.table)
list.files("./Data")

# Load eQTL data: one summary-statistics file per cell type
fl <- c(
  "celltype-eqtl-sumstats.Exc.tsv.gz",
  "celltype-eqtl-sumstats.Inh.tsv.gz",
  "celltype-eqtl-sumstats.Mic.tsv.gz",
  "celltype-eqtl-sumstats.Oli.tsv.gz",
  "celltype-eqtl-sumstats.Ast.tsv.gz",
  "celltype-eqtl-sumstats.OPC.tsv.gz",
  "celltype-eqtl-sumstats.End.tsv.gz"
)

# Read every file, keep only the rows flagged significant by the two-step FDR,
# and bind all tables in a single call (columns matched by name) instead of
# growing the result with rbind() inside a loop.
beqtl <- rbindlist(
  lapply(fl, function(i) {
    df <- fread(paste0("./Data/", i))
    df[df$significant_by_2step_FDR == "Yes", ]
  }),
  use.names = TRUE
)

# Plain data frame, so the base-R subsetting used downstream behaves as usual
beqtl <- as.data.frame(beqtl)

In [None]:
## Clean up the celltype names
# Short label -> full cell-type name. The recoding is done with one vectorised
# lookup, so a label that has no rows left (e.g. after the significance
# filter) is simply skipped instead of breaking a per-label assignment.
celltype_names <- c(
  Ast = "Astrocyte",
  Inh = "Inhibitory",
  Exc = "Excitatory",
  Mic = "Microglia",
  Oli = "Oligodendrocyte",
  OPC = "OPC",
  End = "Endothelial"
)

# Labels not listed above are left untouched
known_label <- beqtl$celltype %in% names(celltype_names)
beqtl$celltype[known_label] <- unname(
  celltype_names[as.character(beqtl$celltype[known_label])]
)

table(beqtl$celltype)

In [None]:
## Save the combined eQTL table, then reload it from disk so that later
## sessions can start from this file
write.csv(beqtl, file = "./Data/beqtl.csv", row.names = FALSE)
beqtl <- read.csv("./Data/beqtl.csv")
head(beqtl)

In [None]:
# Size of the combined eQTL table and number of distinct genes in it
dim(beqtl)
length(unique(beqtl$gene_symbol))

In [None]:
# NOTE(review): `linked_peaks` is not assigned in this cell; the reads below
# are commented out, so it must already exist in the session. The
# `in_brain_sc_eqlt` column tabulated here is (re)computed further down.
# linked_peaks = read.csv("./Results/LINK/PFC_linkpeaks_all_annotated_7.19.csv")
# linked_peaks = read.csv("./Results/LINK/HIP_linkpeaks_all_annotated_7.19.csv")
table(linked_peaks$in_brain_sc_eqlt,linked_peaks$celltype)

In [None]:
# Columns available in the linked-peak table
colnames(linked_peaks)

In [None]:
## One single-base GRanges entry per eQTL SNP (hg38 position), carrying the
## gene, SNP id, effect size, p-value and cell type as metadata columns
snp_positions <- IRanges(start = beqtl$pos38, end = beqtl$pos38)

eqtl_gr <- GRanges(
  seqnames = beqtl$chr38,
  ranges = snp_positions,
  strand = "*",
  gene_name = beqtl$gene_symbol,
  RSID = beqtl$snps,
  beta = beqtl$beta,
  pvalue = beqtl$pvalue,
  celltype = beqtl$celltype
)

eqtl_gr

In [None]:
## Peak coordinates of the linked peaks as GRanges, then the overlaps between
## those peaks (query) and the eQTL SNPs (subject)
peak_coord_cols <- c("seqnames", "peak.start", "peak.end")
atac_peaks <- GRanges(linked_peaks[, peak_coord_cols])
atac_peaks

overlap_eqtl <- findOverlaps(query = atac_peaks, subject = eqtl_gr)

In [None]:
# ATAC-seq peaks that contain eQTL SNPs: one row per (peak link, SNP) overlap,
# with the SNP's id, eGene, cell type and effect size attached
peak_hit <- queryHits(overlap_eqtl)
snp_hit <- subjectHits(overlap_eqtl)

peaks_contain_snps <- linked_peaks[peak_hit, ]
peaks_contain_snps$RSID <- eqtl_gr$RSID[snp_hit]
peaks_contain_snps$eGene <- eqtl_gr$gene_name[snp_hit]
peaks_contain_snps$cell_type <- eqtl_gr$celltype[snp_hit]
peaks_contain_snps$beta <- eqtl_gr$beta[snp_hit]

# Keep only overlaps where the linked gene equals the eGene and the cell types
# agree. which() drops comparisons that evaluate to NA (e.g. a missing gene
# symbol); plain logical indexing would turn them into all-NA rows.
same_gene_and_ct <- peaks_contain_snps$gene == peaks_contain_snps$eGene &
  peaks_contain_snps$celltype == peaks_contain_snps$cell_type
peaks_contain_snps <- peaks_contain_snps[which(same_gene_and_ct), ]

# Flag every linked peak whose `comb` key has at least one supporting eQTL
linked_peaks$in_brain_sc_eqlt <- linked_peaks$comb %in% peaks_contain_snps$comb

In [None]:
## Check and save the results
head(peaks_contain_snps)

# eQTL-supported (TRUE) vs unsupported (FALSE) links per cell type
table(linked_peaks$in_brain_sc_eqlt, linked_peaks$celltype)

In [None]:
# NOTE(review): both output names hard-code the EC region; presumably the
# cells above are re-run per region with these names edited by hand — confirm
# before re-running so another region's files are not overwritten.
write.csv(peaks_contain_snps,"./Results/Revision/LINK/peaks_contain_snps_ec.csv", row.names = F)
write.csv(linked_peaks,"./Results/Revision/LINK/EC_linkpeaks_all_annotated_7.19.csv", row.names = F)

In [None]:
# Reload the saved table and repeat the tabulation as a sanity check
linked_peaks = read.csv("./Results/Revision/LINK/EC_linkpeaks_all_annotated_7.19.csv")
table(linked_peaks$in_brain_sc_eqlt, linked_peaks$celltype)

## Visualize the results

In [None]:
# Per-region tables of linked peaks that contain a matching eQTL SNP
out_hip <- read.csv("./Results/Revision/LINK/peaks_contain_snps_hip.csv")
out_ec <- read.csv("./Results/Revision/LINK/peaks_contain_snps_ec.csv")
out_pfc <- read.csv("./Results/Revision/LINK/peaks_contain_snps_pfc.csv")

# Combined "<celltype>_<REGION>" key, used as heatmap column
out_hip$ct_rg <- paste0(out_hip$celltype, "_HIP")
out_ec$ct_rg <- paste0(out_ec$celltype, "_EC")
out_pfc$ct_rg <- paste0(out_pfc$celltype, "_PFC")

In [None]:
# Stack the three regions and count rows per gene (rows of `mat`) and
# cell type/region key (columns of `mat`)
temp <- rbind(out_pfc, out_ec, out_hip)
ct_rg_by_gene <- table(temp$ct_rg, temp$gene)
mat <- t(ct_rg_by_gene)

mat
dim(mat)

In [None]:
# Counts for APOE across all cell type/region columns
mat[c("APOE"),]

In [None]:
# Keep genes with more than 10 rows in total across all columns.
# drop = FALSE keeps `mat` two-dimensional even if only one gene passes, so
# the dim()/rownames() calls and the heatmap below still receive a matrix.
mat <- mat[rowSums(mat) > 10, , drop = FALSE]
dim(mat)

In [None]:
rownames(mat)

In [None]:
# Base palettes: one colour per brain region and one per cell type
region_palette <- c(EC = "#825ca6ff", HIP = "#3f78c1ff", PFC = "#c25757ff")
celltype_palette <- c(
  Astrocyte = "#F06719",
  Excitatory = "#33A65C",
  Inhibitory = "#23767C",
  Microglia = "#E03426",
  Oligodendrocyte = "#1ba3c6ff",
  OPC = "#A26DC2"
)

# Expand both palettes to every "<celltype>_<REGION>" column key
combos <- expand.grid(
  celltype = names(celltype_palette),
  region = names(region_palette),
  stringsAsFactors = FALSE
)
combo_keys <- paste(combos$celltype, combos$region, sep = "_")
region_cols <- setNames(unname(region_palette[combos$region]), combo_keys)
celltype_cols <- setNames(unname(celltype_palette[combos$celltype]), combo_keys)

# Column annotations: cell type on top, brain region underneath
region_annot <- HeatmapAnnotation(
  Region = colnames(mat),
  col = list(Region = region_cols),
  show_legend = FALSE,
  annotation_label = "Brain region"
)
ha2 <- HeatmapAnnotation(
  Celltype = colnames(mat),
  col = list(Celltype = celltype_cols),
  show_legend = FALSE,
  annotation_label = "Cell type"
)
ha <- c(ha2, region_annot)

# Gene x (cell type, region) count heatmap; rows clustered, columns kept in
# their existing order
ht <- Heatmap(
  mat,
  col = colorRamp2(c(0, 10, 30), c("grey99", "red", "red4")),
  cluster_rows = TRUE,
  cluster_columns = FALSE,
  row_names_side = "left",
  row_names_gp = gpar(fontface = "italic"),
  top_annotation = ha,
  show_column_names = FALSE,
  show_row_dend = FALSE
)

In [None]:
# Min/max count in `mat`, to compare against the colour-scale breaks (0, 10, 30)
range(mat)

In [None]:
options(repr.plot.width = 6, repr.plot.height = 15)

# Render the heatmap into the PDF with an explicit draw() call: relying on
# auto-printing `ht` only works at the top level of an interactive session and
# leaves an empty file when this code is run through source() or a function.
pdf(file = "./Results/Revision/LINK/sceQTL_heatmap_updated.pdf", height = 15, width = 6)
draw(ht)
dev.off()

## Check if DEG-linked peaks are also differentially accessible in the same cell type

In [None]:
# Linked-peak tables with the eQTL flag, one per region.
# NOTE(review): these are read from ./Results/Revision/LINK/, whereas the
# annotated tables are written to ./Results/LINK/ earlier in this notebook —
# confirm that all three files exist at this location.
hip_linked_peaks <- read.csv("./Results/Revision/LINK/HIP_linkpeaks_all_annotated_7.19.csv")
ec_linked_peaks <- read.csv("./Results/Revision/LINK/EC_linkpeaks_all_annotated_7.19.csv")
pfc_linked_peaks <- read.csv("./Results/Revision/LINK/PFC_linkpeaks_all_annotated_7.19.csv")

In [None]:
## read in the DA analysis results
da_pfc = read.csv("./Results/DA/DA_major.cell.type_PFC.csv")
da_ec = read.csv("./Results/DA/DA_major.cell.type_EC.csv")
da_hip = read.csv("./Results/DA/DA_major.cell.type_HIP.csv")

In [None]:
# Summarise, per cell type, how many linked peaks are also significant DARs.
#   linked_peaks : linked-peak table with `peak` and `celltype` columns
#   dar_results  : DA table with `peaks` and `p_val_adj` columns
#   alpha        : adjusted p-value cut-off for calling a DAR significant
# Returns one row per cell type with:
#   n_total_linked    - number of linked-peak rows
#   n_linked_da_peaks - rows whose peak is a significant DAR (0 if none)
#   proportion        - n_linked_da_peaks / n_total_linked
# NOTE(review): a peak counts as DA when it is significant in ANY cell type of
# `dar_results`; the match is on the peak only, not on peak + cell type.
summarise_linked_dars <- function(linked_peaks, dar_results, alpha = 0.05) {
  # Ensure consistent naming/type of the peak identifiers
  linked_peaks$peak <- as.character(linked_peaks$peak)
  dar_results$peaks <- as.character(dar_results$peaks)

  # Filter DARs for significance
  sig_dars <- dar_results %>% dplyr::filter(p_val_adj < alpha)

  # Count overlapping peaks per cell type
  overlap_counts <- linked_peaks %>%
    dplyr::filter(peak %in% sig_dars$peaks) %>%
    group_by(celltype) %>%
    summarise(n_linked_da_peaks = n())

  # Total linked peaks per cell type
  total_linked <- linked_peaks %>%
    group_by(celltype) %>%
    summarise(n_total_linked = n())

  # Cell types without any DA peak get a count of 0 instead of NA
  left_join(total_linked, overlap_counts, by = "celltype") %>%
    mutate(n_linked_da_peaks = ifelse(is.na(n_linked_da_peaks), 0, n_linked_da_peaks),
           proportion = n_linked_da_peaks / n_total_linked)
}

In [None]:
# Keep the PFC inputs available under the generic names used by later cells
linked_peaks <- pfc_linked_peaks
dar_results <- da_pfc
linked_peaks$peak <- as.character(linked_peaks$peak)
dar_results$peaks <- as.character(dar_results$peaks)

# Significant PFC DARs (adj p < 0.05)
sig_dars <- dar_results %>% dplyr::filter(p_val_adj < 0.05)
dim(sig_dars)

In [None]:
# Compute the summary for every region in one pass, so that each result is
# stored under the name of the region it was actually computed from.
# NOTE(review): assumes the EC and HIP tables share the PFC column names
# (`peak`, `celltype`, `peaks`, `p_val_adj`) — confirm.
pfc_summary <- summarise_linked_dars(pfc_linked_peaks, da_pfc)
ec_summary <- summarise_linked_dars(ec_linked_peaks, da_ec)
hip_summary <- summarise_linked_dars(hip_linked_peaks, da_hip)

# `overlap_summary` keeps the PFC result, matching the inputs selected above
overlap_summary <- pfc_summary

# Output
print(pfc_summary)
print(ec_summary)
print(hip_summary)

In [None]:
# Label each summary with its region BEFORE stacking, so that `df` contains
# the `region` column the plot below maps to the x axis
pfc_summary$region <- "PFC"
ec_summary$region <- "EC"
hip_summary$region <- "HIP"

In [None]:
df <- rbind(pfc_summary, ec_summary, hip_summary)

In [None]:
# Dot plot: point size = number of linked peaks that are significant DARs,
# colour = their proportion among all linked peaks of that cell type/region
options(repr.plot.width = 6, repr.plot.height = 4)

p <- ggplot(df, aes(x = region, y = celltype)) +
  geom_point(aes(color = proportion, size = n_linked_da_peaks)) +
  scale_color_viridis_c() +
  scale_size_continuous(range = c(3, 10)) +
  labs(
    x = "Brain Region", y = "Cell Type",
    color = "Proportion", size = "Number of significant DARs"
  ) +
  theme_minimal()

p
ggsave(
  filename = "./Results/Revision/LINK/linked_peaks_DARs_summary.pdf",
  plot = p,
  width = 6, height = 4
)

In [None]:
# Significant PFC DARs (adj p < 0.05). Defined first because the key built
# below depends on it; which() keeps rows with a missing p-value from
# turning into all-NA rows.
da_pfc_sig <- da_pfc[which(da_pfc$p_val_adj < 0.05), ]

In [None]:
# "<peak>_<celltype>" keys on both sides, so that a linked peak only matches a
# DAR called in the same cell type
linked_peaks$comb <- paste(linked_peaks$peak, linked_peaks$celltype, sep = "_")
da_pfc_sig$comb <- paste(da_pfc_sig$peaks, da_pfc_sig$cell.type, sep = "_")

table(linked_peaks$comb %in% da_pfc_sig$comb)

In [None]:
# Same check computed directly on the PFC table. The two tables have different
# numbers of rows, so the cell-type condition is expressed through the
# combined key rather than an element-wise `==` (which would silently recycle).
pfc_comb <- paste(pfc_linked_peaks$peak, pfc_linked_peaks$celltype, sep = "_")
table(pfc_comb %in% da_pfc_sig$comb)