In [None]:
# overlap linked peaks to GWAS
suppressPackageStartupMessages({
library(stringr)
library(GenomicRanges)
library(GenomicFeatures)
library(LDlinkR)

## Loading
library(Seurat)
library(Signac)
library(dplyr)
library(patchwork)
library(future)
library(stringr)
library(tidydr)
library(tidyverse)
library(viridis)
library(ggplot2)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)

# For motif analysis
library(JASPAR2020)
library(TFBSTools)
library(rtracklayer)
library(ComplexHeatmap)
library(circlize)
})

In [None]:
## Read the lead SNPs reported by four AD GWAS and pool their rsIDs.
# Each CSV keeps the rsID under a different column name, so the column is
# extracted right after reading.
# NOTE(review): the Kunkle file is read from ./AD_GWAS/ while the other three
# come from ./Data/ -- confirm this path is intended.
Bellenguez_rsid <- read.csv("./Data/AD_GWAS_Bellenguez.csv")$Variant
Jansen_rsid <- read.csv("./Data/AD_GWAS_Jansen.csv")$SNP
Wightman_rsid <- read.csv("./Data/AD_GWAS_Wightman.csv")$Lead.variant
Kunkle_rsid <- read.csv("./AD_GWAS/Kunkle_index_rsid.csv")$RSID

# Number of lead SNPs per study
length(Bellenguez_rsid)
length(Jansen_rsid)
length(Wightman_rsid)
length(Kunkle_rsid)

# Union of the lead SNPs across the four studies
rsid_list <- unique(c(Bellenguez_rsid, Jansen_rsid, Wightman_rsid, Kunkle_rsid))
length(rsid_list)

#list_pop()

In [None]:
# Index SNPs for EOAD, taken from the abstract at
# https://alz-journals.onlinelibrary.wiley.com/doi/full/10.1002/alz.064268
# This assignment replaces whatever rsid_list currently holds.
rsid_list <- c(
    "rs9268852",
    "rs73199790",
    "rs2075895",
    "rs7108663",
    "rs6656401"
)

In [None]:
# Query LDlink for LD proxies of every SNP in rsid_list
# (population CEU, r2 statistic, GRCh38 coordinates).
#
# The API token is read from the LDLINK_TOKEN environment variable when it is
# set; the token that used to be hard-coded here is kept only as a fallback so
# existing sessions keep working. Because that token has been committed to the
# source, it should be rotated and then removed from this file.
#
# NOTE(review): LDproxy_batch is documented to write one result file per SNP
# into the current working directory, whereas the next cell reads from
# ./Results/AD_GWAS_SNP/batch/ -- confirm the files end up there.
#setwd("./Results/AD_GWAS_SNP/batch/")
ldlink_token <- Sys.getenv("LDLINK_TOKEN", unset = "764d73fdebf7")
LDproxy_batch(
    snp = rsid_list,
    pop = "CEU",
    r2d = "r2",
    token = ldlink_token,
    genome_build = "grch38"
)

In [None]:
# Read every LD proxy table stored in the batch directory, keep the rows with
# R2 strictly above 0.8, and stack them into one data frame.
batch_dir <- "./Results/AD_GWAS_SNP/batch/"
snp_file <- list.files(batch_dir)

read_strong_proxies <- function(file_name) {
    proxies <- read.table(paste0(batch_dir, file_name))
    proxies[proxies$R2 > 0.8, ]
}

# Left fold starting from an empty data frame: the same sequence of rbind()
# calls as appending one table at a time.
final <- Reduce(rbind, lapply(snp_file, read_strong_proxies), data.frame())

In [None]:
# Number of distinct rsIDs in the stacked proxy table
length(unique(final$RS_Number))
# Keep the current table under a separate name; `final` is reassigned later
eur_list <- final

In [None]:
# Expand each index SNP to its LD proxies with R2 >= 0.8 (CEU, GRCh38),
# querying LDlink one variant at a time.
#
# The API token is read from the LDLINK_TOKEN environment variable when set;
# the previously hard-coded token remains only as a fallback and should be
# rotated and removed from the source.
# NOTE(review): this cell keeps R2 >= 0.8 while the batch-file cell keeps
# R2 > 0.8 -- confirm which threshold is intended.
ldlink_token <- Sys.getenv("LDLINK_TOKEN", unset = "764d73fdebf7")

proxy_tables <- lapply(rsid_list, function(index_snp) {
    proxies <- as.data.frame(LDproxy(
        snp = index_snp,
        pop = "CEU",
        r2d = "r2",
        token = ldlink_token,
        genome_build = "grch38"
    ))
    # which() drops rows whose R2 is NA instead of turning them into all-NA rows
    proxies[which(proxies$R2 >= 0.8), , drop = FALSE]
})

# Bind once at the end; the leading empty data frame keeps `final` a data frame
# even when rsid_list is empty.
final <- do.call(rbind, c(list(data.frame()), proxy_tables))
final #with 148 index snp from four AD GWAS, expanded to xx snps, r2>0.8
#final #with 23 index snp, expanded to 736 snps with R2>0.8

In [None]:
## Load the saved table of LD-expanded GWAS SNPs (first CSV column becomes the
## row names) and drop duplicated rows.
final <- unique(
    read.csv("./Results/AD_GWAS_SNP/EOAD_GWAS_SNP.csv", row.names = 1)
)
dim(final)
head(final)
# Number of distinct rsIDs after expansion
length(unique(final$RS_Number))

#out <- final[final$RS_Number %in% rsid_list,]
#write.csv(out, file = "./Results/AD_GWAS_SNP/rsid_list.csv")

#rsid_list <- read.csv("./Results/AD_GWAS_SNP/rsid_list.csv")

In [None]:
# Turn the expanded SNP table into a GRanges object.
# `Coord` is split at ":" into two fields; the first is used as the chromosome
# and the second as both start and end, giving one single-position range per SNP.
coord_parts <- str_split_fixed(final$Coord, pattern = ":", n = 2)
snp_coords <- data.frame(
    chr = coord_parts[, 1],
    start = coord_parts[, 2],
    end = coord_parts[, 2],
    RSID = final$RS_Number,
    Alleles = final$Alleles
)
#snp_coords$MAF <- final$MAF
#snp_coords$Distance <- final$Distance
#snp_coords$R2 <- final$R2

# Convert to GRanges and collapse duplicated ranges
snp_coords <- unique(GRanges(snp_coords))
snp_coords

In [None]:
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
# downloaded hg38 known gene annotation from UCSC: https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
txdb

# Coding-sequence ranges grouped by gene, then flattened into a single GRanges
cdsByGene <- unlist(cdsBy(txdb, by = "gene"))

In [None]:
## Find SNPs located in coding regions.
cg_hit <- findOverlaps(snp_coords, cdsByGene)
cg_hit # overlaps between SNPs (query) and coding ranges (subject)

## Keep only the non-coding expanded SNPs.
# Select by position instead of negative indexing: when there are no coding
# hits, `snp_coords[-queryHits(cg_hit)]` indexes with an empty vector and
# returns an EMPTY GRanges, silently discarding every SNP.
coding_idx <- unique(queryHits(cg_hit))
noncoding_snp_coords <- snp_coords[setdiff(seq_along(snp_coords), coding_idx)]
noncoding_snp_coords

In [None]:
# Show the current working directory; the relative "./Results/..." and
# "./Figures/..." paths used in this notebook are resolved against it.
getwd()

### Overlapping linked cCREs with GWAS signals; Figure 6a

In [None]:
# Brain region processed in this pass. The input link table and the output
# overlap table are both derived from this single value, so a PFC run can no
# longer be written into the HIP result file (the previous state of this
# section read the PFC links but wrote AD_GWAS_overlap_HIP_1.23.csv).
overlap_region <- "PFC" # one of "PFC", "EC", "HIP"
linked_peaks <- read.csv(
    paste0("./Results/LINK/", overlap_region, "_linkpeaks_all_annotated_1.23.csv")
)

#linked_peaks <- linked_peaks[linked_peaks$score > 0.1,]
linked_peaks <- unique(linked_peaks)
dim(linked_peaks)

In [None]:
# Peak coordinates of every link as GRanges, one range per row of linked_peaks
atac_peaks <- GRanges(linked_peaks[, c("seqnames", "peak.start", "peak.end")])
atac_peaks

In [None]:
# Find overlaps between non-coding SNPs (query) and linked peaks (subject)
#overlaps <- findOverlaps(snp_coords,atac_peaks)
overlaps <- findOverlaps(noncoding_snp_coords, atac_peaks)
overlaps

# SNPs within ATAC-seq peaks: one entry per SNP-peak hit
#snps_within_peaks <- snp_coords[queryHits(overlaps)]
snps_within_peaks <- noncoding_snp_coords[queryHits(overlaps)]
snps_within_peaks

# Link rows whose peak contains a SNP; row k pairs with snps_within_peaks[k]
peaks_contain_snps <- linked_peaks[subjectHits(overlaps), ]
#peaks_contain_snps <- unique(peaks_contain_snps)
dim(peaks_contain_snps)


#unique(peaks_contain_snps$comb)
table(peaks_contain_snps$celltype)
length(unique(peaks_contain_snps$comb))
length(unique(peaks_contain_snps$gene))

In [None]:
## Cleaning and writing
# Put each link row side by side with the SNP that falls inside its peak
out <- cbind(peaks_contain_snps, as.data.frame(snps_within_peaks))
out <- unique(out)
table(out$celltype)
table(out$cluster)
dim(out)

write.csv(
    out,
    file = paste0("./Results/LINK/AD_GWAS_overlap_", overlap_region, "_1.23.csv"),
    row.names = FALSE
)
#write.csv(out,file = "./Results/LINK/AD_GWAS_overlap_DAR_CT_2.9.csv",row.names = F)

### Organizing all the GWAS overlap results

In [None]:
# Load the per-region SNP/linked-peak overlap tables
pfc_out <- read.csv("./Results/LINK/AD_GWAS_overlap_PFC_1.23.csv")
ec_out <- read.csv("./Results/LINK/AD_GWAS_overlap_EC_1.23.csv")
hip_out <- read.csv("./Results/LINK/AD_GWAS_overlap_HIP_1.23.csv")

dim(pfc_out)
dim(ec_out)
dim(hip_out)

# Key identifying one link: peak, gene and cell type joined by "_"
link_key <- function(tbl) {
    paste(tbl$peak, tbl$gene, tbl$celltype, sep = "_")
}
pfc_out$comb <- link_key(pfc_out)
ec_out$comb <- link_key(ec_out)
hip_out$comb <- link_key(hip_out)

l1 <- pfc_out$comb
l2 <- ec_out$comb
l3 <- hip_out$comb

#unique(l1)
#unique(l2)
#unique(l3)
# Distinct links across the three regions, and how many there are
unique(c(l1, l2, l3))
length(unique(c(l1, l2, l3)))

# Distinct peaks per region
length(unique(pfc_out$peak))
length(unique(ec_out$peak))
length(unique(hip_out$peak))

# Distinct SNPs per region
length(unique(pfc_out$RSID))
length(unique(ec_out$RSID))
length(unique(hip_out$RSID))

## unique rsid that overlap with peaks linked to DEGs
length(unique(c(pfc_out$RSID, ec_out$RSID, hip_out$RSID)))

In [None]:
# Distinct peaks, then distinct genes, pooled over PFC, EC and HIP
all_region_peaks <- c(pfc_out$peak, ec_out$peak, hip_out$peak)
length(unique(all_region_peaks))
all_region_genes <- c(pfc_out$gene, ec_out$gene, hip_out$gene)
length(unique(all_region_genes))

### Figure 6b: lollipop plot

In [None]:
## In-house cell-type peak table, converted to GRanges
ctpeaks <- read.csv("./Results/CTpeaks_annotated.csv", row.names = 1)
#head(ctpeaks)
# Rename the six columns to the names GRanges() expects, plus the
# peak_called_in metadata column
names(ctpeaks) <- c("chr", "start", "end", "width", "strand", "peak_called_in")
ctpeaks <- GRanges(ctpeaks)
ctpeaks

# Start from an empty accumulator; per-region results are appended to it below
final <- data.frame()

In [None]:
# Select the region to test by leaving exactly one read.csv() line active
# linked_peaks <- read.csv("./Results/LINK/PFC_linkpeaks_all_annotated_1.23.csv")
#linked_peaks <- read.csv("./Results/LINK/EC_linkpeaks_all_annotated_1.23.csv")
linked_peaks <- read.csv("./Results/LINK/HIP_linkpeaks_all_annotated_1.23.csv")

# Benjamini-Hochberg adjustment of the link p-values
# table(linked_peaks$pvalue<0.05)
linked_peaks[["p.adj"]] <- p.adjust(linked_peaks$pvalue, method = "BH")
table(linked_peaks$p.adj<0.05)

In [None]:
# Result skeleton: one row per cell type in linked_peaks, labelled with the
# region; the count and p-value columns start as NA and are filled in later.
res <- data.frame(
    celltype = unique(linked_peaks$celltype),
    region = "HIP"
)
result_columns <- c(
    "N_EOAD_DEG_linked_peaks",
    "N_peaks_contain_LOAD_SNPs",
    "N_interactions",
    "N_non_EOAD_DEG_ctpeaks",
    "N_other_ctpeaks_contain_SNPs",
    "Fishers_p"
)
res[result_columns] <- NA

In [None]:
## Per cell type: overlap the non-coding GWAS SNPs with (a) significantly
## DEG-linked peaks and (b) the other peaks called in that cell type, then test
## whether linked peaks contain SNPs more often (one-sided Fisher's exact test).
## Fills the matching row of `res` and appends `res` to `final` once at the end.
for (i in unique(linked_peaks$celltype)) {
    print(i)
    row_i <- which(res$celltype == i)

    # which() drops rows where score, p.adj or celltype is NA; a plain logical
    # index would turn them into all-NA rows that GRanges() cannot convert.
    keep <- which(
        linked_peaks$score >= 0.05 &
            linked_peaks$p.adj < 0.05 &
            linked_peaks$celltype == i
    )
    temp <- linked_peaks[keep, ]
    message("Number of significantly correlated peaks to DEGs")
    print(dim(temp))

    # Linked peaks as GRanges, one range per link row.
    # NOTE(review): if one peak is linked to several genes it appears several
    # times here and is counted once per row below -- confirm this is intended.
    atac_peaks <- GRanges(temp[, c("seqnames", "peak.start", "peak.end")])

    ## Peaks called in this cell type that are not among its linked peaks.
    # fixed = TRUE matches the cell type name literally rather than as a regex.
    ctpeaks_ct <- ctpeaks[grep(i, ctpeaks$peak_called_in, fixed = TRUE)]
    temp2 <- ctpeaks_ct[!ctpeaks_ct %in% atac_peaks]

    overlaps_ct <- findOverlaps(noncoding_snp_coords, atac_peaks)
    overlaps_all <- findOverlaps(noncoding_snp_coords, temp2)

    ## 2x2 table: peaks with / without a SNP, for linked vs other peaks
    n_linked_hit <- length(unique(subjectHits(overlaps_ct)))
    n_linked_miss <- length(atac_peaks) - n_linked_hit
    n_other_hit <- length(unique(subjectHits(overlaps_all)))
    n_other_miss <- length(temp2) - n_other_hit
    contingency <- matrix(
        c(n_linked_hit, n_linked_miss, n_other_hit, n_other_miss),
        nrow = 2,
        dimnames = list(
            c("contain_LOAD_GWAS_SNPs", "not_contain_LOAD_GWAS_SNPs"),
            c("AD_DEG_linked_peaks", "other_peaks")
        )
    )
    fisher_res <- fisher.test(contingency, alternative = "greater")

    ## Write the values for this cell type
    res$N_EOAD_DEG_linked_peaks[row_i] <- length(atac_peaks)
    res$N_non_EOAD_DEG_ctpeaks[row_i] <- length(temp2)
    res$Fishers_p[row_i] <- fisher_res$p.value
    res$N_interactions[row_i] <- length(overlaps_ct)
    res$N_other_ctpeaks_contain_SNPs[row_i] <- length(overlaps_all)
    res$N_peaks_contain_LOAD_SNPs[row_i] <- n_linked_hit
}
final <- rbind(final, res)
final

In [None]:
# Adjust the Fisher p-values for multiple testing and save the table.
# "holm" is what p.adjust() uses when no method is given; it is spelled out
# here (the link p-values elsewhere in this notebook use "BH").
final[["adj.p"]] <- p.adjust(final$Fishers_p, method = "holm")
# final
write.csv(
    final,
    file = "./Results/LINK/overlapping_with_LOAD_GWAS.csv",
    row.names = FALSE
)

In [None]:
### Lollipop plot of -log10(adjusted Fisher p) per cell type and region
final <- final[order(final$celltype), ]

# One colour per region. Looking colours up by name works for any subset of
# regions; assigning into `final[final$region == "PFC", ]$color` fails with
# "replacement has 1 row, data has 0" when a region is absent from `final`.
region_colors <- c(PFC = "#825ca6ff", EC = "#3f78c199", HIP = "#c25757ff")
final$color <- unname(region_colors[as.character(final$region)])
final$ct_rg <- paste(final$celltype, final$region, sep = "_")

# Keep the current row order on the y axis; unique() guards against duplicated
# factor levels if a cell type / region pair occurs more than once.
final$order <- factor(final$ct_rg, levels = unique(final$ct_rg))
#final

# The named palette maps each region to its own colour. Passing the per-row
# `final$color` vector made ggplot hand out colours by position, so regions
# could receive another region's colour.
p_lollipop <- ggplot(final, aes(y = order, x = -log10(adj.p))) +
    geom_segment(
        aes(x = -1, xend = -log10(adj.p), yend = order, colour = region),
        size = 1
    ) +
    geom_point(aes(size = N_peaks_contain_LOAD_SNPs, colour = region)) +
    scale_size_continuous(range = c(1, 6)) +
    geom_vline(xintercept = -log10(0.05), linetype = "dashed", color = "gray50") +
    scale_color_manual(values = region_colors) +
    theme_classic() +
    scale_y_discrete(limits = rev)

pdf(file = "./Figures/LINK/GWAS_hits_Fisher.pdf", width = 8, height = 4.5)
# Print explicitly so the figure is drawn on the PDF device in any context
print(p_lollipop)
dev.off()

In [None]:
# Same ordering / colouring preparation as for the lollipop plot, applied to
# `temp`.
# NOTE(review): this only makes sense when `temp` is a results table with
# celltype, region and ct_rg columns. When the notebook is run top to bottom,
# `temp` is the link subset left over from the Fisher loop and has no such
# columns -- confirm what `temp` is meant to be, or remove this cell.
required_cols <- c("celltype", "region", "ct_rg")
if (!is.data.frame(temp) || !all(required_cols %in% colnames(temp))) {
    stop(
        "`temp` must be a data frame with columns: ",
        paste(required_cols, collapse = ", "),
        call. = FALSE
    )
}

temp <- temp[order(temp$celltype), ]

# Name-based lookup works for any subset of regions; row-subset assignment
# errors when a region has no rows.
region_colors <- c(PFC = "#825ca6ff", EC = "#3f78c199", HIP = "#c25757ff")
temp$color <- unname(region_colors[as.character(temp$region)])

temp$order <- factor(temp$ct_rg, levels = unique(temp$ct_rg))
# temp
# temp

### Supplementary Figure 9a

In [None]:
# Per-region SNP/linked-peak overlap tables (PFC, EC, HIP)
out_pfc <- read.csv(file = "./Results/LINK/AD_GWAS_overlap_PFC_1.23.csv")
out_ec <- read.csv(file = "./Results/LINK/AD_GWAS_overlap_EC_1.23.csv")
out_hip <- read.csv(file = "./Results/LINK/AD_GWAS_overlap_HIP_1.23.csv")

In [None]:
# Load the per-region overlap tables
out_pfc <- read.csv("./Results/LINK/AD_GWAS_overlap_PFC_1.23.csv")
out_ec <- read.csv("./Results/LINK/AD_GWAS_overlap_EC_1.23.csv")
out_hip <- read.csv("./Results/LINK/AD_GWAS_overlap_HIP_1.23.csv")

# Keep links with score >= 0.05 and unadjusted p-value < 0.05
keep_nominal_links <- function(tbl) {
    tbl[tbl$score >= 0.05 & tbl$pvalue < 0.05, ]
}
out_pfc <- keep_nominal_links(out_pfc)
out_ec <- keep_nominal_links(out_ec)
out_hip <- keep_nominal_links(out_hip)

# Label every row with "<celltype>_<region>"
out_pfc$ct_rg <- paste0(out_pfc$celltype, "_PFC")
out_ec$ct_rg <- paste0(out_ec$celltype, "_EC")
out_hip$ct_rg <- paste0(out_hip$celltype, "_HIP")

# Stack the regions and count rows per (cell type_region, gene) pair
temp <- rbind(out_pfc, out_ec, out_hip)
#temp <- unique(temp[,c("gene","RSID","ct_rg")])
mat <- table(temp$ct_rg, temp$gene)
#pfc_mat <- table(out_pfc$celltype, out_pfc$gene)
#ec_mat <- table(out_ec$celltype, out_ec$gene)
#hip_mat <- table(out_hip$celltype, out_hip$gene)
# mat


In [None]:
#temp[temp$RSID %in% rsid_list,]
# For every gene keep the row(s) with the largest value of -score, i.e. the
# lowest score. The result stays grouped by gene.
df <- top_n(group_by(temp, gene), n = 1, wt = -score)
# df

In [None]:
# Cap the counts at 10, then transpose so genes are rows and
# cell type_region labels are columns
above_cap <- mat > 10
mat[above_cap] <- 10
#rownames(mat)<-substr(rownames(mat),start=1,stop=3)
mat <- t(mat)
# mat <- mat[,c(9:11,1:8,12:15)]
# mat
dim(mat)

In [None]:
# Colour maps for the two column annotations, keyed by "<celltype>_<region>".
# They are generated from one palette per region and one per cell type, so
# every combination gets an entry. The hand-written lists named
# "Oligodendrocyte_PFC" twice, had no "Oligodendrocyte_HIP" entry, and gave
# "Excitatory_PFC" the HIP colour.
# NOTE(review): the region colours here (EC purple, HIP blue, PFC red) differ
# from the lollipop plot palette -- confirm which one is intended.
region_colors <- c(EC = "#825ca6ff", HIP = "#3f78c1ff", PFC = "#c25757ff")
celltype_colors <- c(
    Astrocyte = "#F06719",
    Excitatory = "#33A65C",
    Inhibitory = "#23767C",
    Microglia = "#E03426",
    Oligodendrocyte = "#1ba3c6ff",
    OPC = "#A26DC2"
)
combos <- expand.grid(
    celltype = names(celltype_colors),
    region = names(region_colors),
    stringsAsFactors = FALSE
)
ct_rg_levels <- paste(combos$celltype, combos$region, sep = "_")
region_col_map <- setNames(unname(region_colors[combos$region]), ct_rg_levels)
celltype_col_map <- setNames(unname(celltype_colors[combos$celltype]), ct_rg_levels)

# Column annotations: both take the column label as value and differ only in
# the colour map applied to it
ha <- HeatmapAnnotation(
    Region = colnames(mat),
    col = list(Region = region_col_map),
    show_legend = FALSE,
    annotation_label = "Brain region"
)
ha2 <- HeatmapAnnotation(
    Celltype = colnames(mat),
    col = list(Celltype = celltype_col_map),
    show_legend = FALSE,
    annotation_label = "Cell type"
)
# Cell type bar on top of the region bar
ha <- c(ha2, ha)

ht <- Heatmap(
    mat,
    cluster_rows = TRUE,
    cluster_columns = FALSE,
    col = colorRamp2(c(0, 5, 10), c("grey99", "red", "red4")),
    row_names_side = "left",
    row_names_gp = gpar(fontface = "italic"),
    top_annotation = ha,
    show_column_names = FALSE,
    show_row_dend = FALSE
)

pdf(file = "./Figures/LINK/GWAS_heatmap.pdf", height = 6, width = 4.5)
# Draw explicitly so the heatmap is rendered on the PDF device in any context
draw(ht)
dev.off()

In [None]:
# Peaks linked to HLA-DRB1 in the EC overlap table: distinct peaks, then the
# number of rows per peak
is_hla_drb1 <- out_ec$gene == "HLA-DRB1"
unique(out_ec$peak[is_hla_drb1])
table(out_ec$peak[is_hla_drb1])
#out_hip[out_hip$RSID=="rs9271058",]

In [None]:
library(Seurat)
library(Signac)
## Load the saved EC object used for the expression and coverage plots of the
## gene of interest; the PFC and HIP objects are left commented out.
ec <- readRDS(file = "./cellbender_EC_object_10.31.rds")
#pfc <- readRDS("./cellbender_PFC_object_10.31.rds")
#hip <- readRDS("./cellbender_HIP_object_10.31.rds")

In [None]:
# Annotated peak-gene link tables, one per brain region
pfc_linked_peaks_annotated <- read.csv(file = "./Results/LINK/PFC_linkpeaks_all_annotated_1.23.csv")
ec_linked_peaks_annotated <- read.csv(file = "./Results/LINK/EC_linkpeaks_all_annotated_1.23.csv")
hip_linked_peaks_annotated <- read.csv(file = "./Results/LINK/HIP_linkpeaks_all_annotated_1.23.csv")

# pfc_linked_peaks_annotated[pfc_linked_peaks_annotated$in_brain_sc_eqlt == T,]
# pfc_linked_peaks_annotated[pfc_linked_peaks_annotated$in_brain_sc_eqlt == T,]$celltype
# ec_linked_peaks_annotated[ec_linked_peaks_annotated$in_brain_sc_eqlt == T,]
# ec_linked_peaks_annotated[ec_linked_peaks_annotated$in_brain_sc_eqlt == T,]$celltype
# hip_linked_peaks_annotated[hip_linked_peaks_annotated$in_brain_sc_eqlt == T,]
# hip_linked_peaks_annotated[hip_linked_peaks_annotated$in_brain_sc_eqlt == T,]$celltype

In [None]:
### Box plot of HLA-DRB1 expression in microglia, split by diagnosis
# working on EC/HIP/PFC
object <- ec
# Retrieve the cell metadata and keep microglia only. which() drops cells whose
# cluster_celltype is NA instead of producing all-NA rows (and an "NA" cell
# name) that would then be passed on to LayerData().
meta <- object@meta.data
table(meta$cluster_celltype)
meta <- meta[which(meta$cluster_celltype == "Microglia"), ]

# Expression values for the selected cells, reshaped to one row per cell
temp <- as.data.frame(LayerData(
    object,
    layer = "data",
    assay = "PC",
    features = "HLA-DRB1",
    cells = rownames(meta)
))
temp <- as.data.frame(t(temp))

# Add the per-cell variables used for plotting
colnames(temp)[1] <- "expression"
temp$diagnosis <- meta[rownames(temp), ]$diagnosis
temp$individual_ID <- meta[rownames(temp), ]$individual_ID
temp$individual_ID <- factor(
    temp$individual_ID,
    levels = c("NIH02", "NIH05", "NIH29", "NIH11", "NIH14", "NIH17")
)

#temp$individual_ID <- with(temp,reorder(individual_ID,diagnosis))
# `trim` and `scale` are geom_violin() parameters; geom_boxplot() ignores them
# with a warning, so they are not passed here.
p2 <- ggplot(temp, aes(x = diagnosis, y = expression)) +
    geom_boxplot(aes(fill = diagnosis)) +
    theme_classic() +
    scale_fill_manual(values = c("#d40000ff", "#005980ff")) +
    theme(legend.position = "none")

#pdf("./Figures/DEG/SupF2/EC_Mic_HLADRB1_boxplot.pdf",width = 2,height = 4)
p2
#dev.off()

In [None]:
# Gene and cell type to display
gene_oi <- "CLU"
celltype <- "Astrocyte"

links <- ec_linked_peaks_annotated
obj <- ec
# Identity = "<cell type>_<diagnosis>", so tracks can be split by diagnosis
obj$cell_diag <- paste(obj$cluster_celltype, obj$diagnosis, sep = "_")
Idents(obj) <- obj$cell_diag
#Idents(obj) <- obj$cluster_celltype
table(Idents(obj))

#Idents(object) <- object$cluster_celltype
#table(Idents(object))
#idents.plot <- c(
#  "Microglia_EOAD","Microglia_NCI","Astrocyte_EOAD","Astrocyte_NCI",
#  "Excitatory_EOAD","Excitatory_NCI","Oligodendrocyte_EOAD","Oligodendrocyte_NCI",
#  "OPC_EOAD","OPC_NCI","Inhibitory_NCI","Inhibitory_EOAD")
#idents.plot <- c("Astrocyte","Excitatory","Microglia")
#idents.plot <- c("Microglia_EOAD","Microglia_NCI")
#idents.plot <- c("Inhibitory_EOAD","Inhibitory_NCI")
#idents.plot <- c("Excitatory_EOAD","Excitatory_NCI")
idents.plot <- c("Astrocyte_EOAD", "Astrocyte_NCI")
#idents.plot <- c("Oligodendrocyte_EOAD","Oligodendrocyte_NCI")

## common
# Attach only the links of the chosen cell type and gene to the object
is_target_link <- links$celltype == celltype & links$gene == gene_oi
#df_gr <- links[links$gene == gene_oi,]
df_gr <- GRanges(links[is_target_link, ])
Links(obj) <- df_gr

# Two fixed chr8 intervals to highlight in the plot
highlight_regions <- GRanges(
    c("chr8:27608514-27609305", "chr8:27610292-27612312")
)

p1 <- CoveragePlot(
  object = obj,
  window = 600,
  region = gene_oi,
  features = gene_oi,
  expression.assay = "SCT",
  idents = idents.plot,
  extend.upstream = 2000,
  extend.downstream = 5000,
  region.highlight = highlight_regions,
  annotation = TRUE, #,heights = 2
  peaks = TRUE,
  links = TRUE
)
p1
