In [1]:
library(Seurat)
library(Signac)
library(genomation)
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg38)
library(EnsDb.Hsapiens.v86)
library(Matrix)
library(parallel)
library(foreach)

Loading required package: SeuratObject

Loading required package: sp

‘SeuratObject’ was built under R 4.3.2 but the current version is
4.3.3; it is recomended that you reinstall ‘SeuratObject’ as the ABI
for R may have changed

‘SeuratObject’ was built with package ‘Matrix’ 1.6.3 but the current
version is 1.6.5; it is recomended that you reinstall ‘SeuratObject’ as
the ABI for ‘Matrix’ may have changed


Attaching package: ‘SeuratObject’


The following object is masked from ‘package:base’:

    intersect


Loading required package: grid

“replacing previous import ‘Biostrings::pattern’ by ‘grid::pattern’ when loading ‘genomation’”
Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following object is masked from ‘package:SeuratObject’:

    intersect


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, ap

Specify file paths

In [2]:
# Gene annotation inputs consumed by map_gene_names(): a GTF file and the
# ABC gene-bounds BED file.
gene_gtf_path <- "/maps/projects/ralab/data/genome/hg38/gencode.v43.chr_patch_hapl_scaff.annotation.gtf"
abc_genes_path <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/241203/scE2G/ENCODE_rE2G/ABC/reference/hg38/CollapsedGeneBounds.hg38.TSS500bp.bed"

# Gzipped CSV count matrices for the ATAC and RNA modalities.
path.matrix.atac_count <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_Xu/K562/Kendall/atac_matrix.csv.gz"
path.matrix.rna_count <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_Xu/1.prepare_data/1.seurat_pipeline.240507/rna_count_matrix.csv.gz"

# Directory that receives links.rds and pairs.E2G.res.tsv.gz.
dir.output <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_Xu/3.Genome_wide_prediction/Signac.default/Signac.250210/"

Import ATAC matrix

In [3]:
# Load the ATAC count table and store it as a sparse matrix.
# The first CSV column supplies the row names; check.names = FALSE keeps the
# column headers exactly as they appear in the file.
matrix.atac <- Matrix(
  as.matrix(
    read.csv(
      path.matrix.atac_count,
      row.names = 1,
      check.names = FALSE
    )
  ),
  sparse = TRUE
)

Import RNA matrix

In [4]:
# Load the RNA count table and store it as a sparse matrix.
# The first CSV column supplies the row names; check.names = FALSE keeps the
# column headers exactly as they appear in the file.
matrix.rna_count <- read.csv(
  path.matrix.rna_count,
  row.names = 1,
  check.names = FALSE
)
matrix.rna_count <- Matrix(as.matrix(matrix.rna_count), sparse = TRUE)

# Reorder the RNA columns to match the ATAC columns. Check first so that a
# mismatch is reported with the offending names instead of a bare
# "subscript out of bounds" error.
missing_in_rna <- !(colnames(matrix.atac) %in% colnames(matrix.rna_count))
if (any(missing_in_rna)) {
  stop(
    sum(missing_in_rna),
    " ATAC column name(s) are absent from the RNA matrix, e.g. ",
    paste(head(colnames(matrix.atac)[missing_in_rna], 5), collapse = ", "),
    call. = FALSE
  )
}
# drop = FALSE keeps a matrix even if only one column is selected.
matrix.rna_count <- matrix.rna_count[, colnames(matrix.atac), drop = FALSE]

Map gene names

In [5]:
# Look up the value of one key in a GTF attribute string.
#
# gtf_attributes:  attribute string, tokenised on single spaces.
# att_of_interest: key to search for (e.g. "gene_name").
#
# Returns the token that follows each occurrence of the key, with double
# quotes and semicolons removed, or NA when the key is not present.
extract_attributes <- function(gtf_attributes, att_of_interest) {
  tokens <- unlist(strsplit(gtf_attributes, " "))

  # Guard clause: key absent from the attribute string.
  if (!(att_of_interest %in% tokens)) {
    return(NA)
  }

  # The value sits in the token directly after the key.
  value_pos <- which(tokens %in% att_of_interest) + 1
  gsub("\"|;", "", tokens[value_pos])
}
# Restrict an RNA count matrix to the ABC gene list and rename its rows to
# the ABC gene names.
#
# rna_matrix:     matrix-like object whose row names are matched against the
#                 GTF `gene_name` attribute.
# gene_gtf_path:  path to a GTF file; only rows with type == "gene" are used.
# abc_genes_path: path to an 8-column table read as chr, start, end, name,
#                 score, strand, Ensembl_ID, gene_type.
#
# Returns the rows of `rna_matrix` whose names have an ABC counterpart
# (joined through the version-stripped Ensembl ID), with the row names
# replaced by the ABC names. The result keeps its matrix shape even when
# zero or one gene matches.
map_gene_names <- function(rna_matrix, gene_gtf_path, abc_genes_path) {
  # Retained from the original implementation: both packages are attached as
  # a side effect, and `%>%` below relies on dplyr being attached.
  library(dplyr)
  library(data.table)

  # Extract one attribute per GTF row as a character vector of the same
  # length as the input. Missing attributes become NA_character_, and only
  # the first value is kept if a key occurs more than once.
  get_attribute <- function(attributes, key) {
    vapply(
      attributes,
      function(a) as.character(extract_attributes(a, key))[1L],
      character(1),
      USE.NAMES = FALSE
    )
  }

  gene_ref <- data.table::fread(gene_gtf_path, header = FALSE, sep = "\t") %>%
    setNames(c("chr", "source", "type", "start", "end", "score", "strand",
               "phase", "attributes")) %>%
    dplyr::filter(type == "gene")
  gene_ref$gene_ref_name <- get_attribute(gene_ref$attributes, "gene_name")
  gene_ref$Ensembl_ID <- get_attribute(gene_ref$attributes, "gene_id")
  gene_ref <- dplyr::select(gene_ref, gene_ref_name, Ensembl_ID) %>%
    # remove the version suffix (".<digits>") from the Ensembl ID
    dplyr::mutate(Ensembl_ID = sub("\\.\\d+$", "", Ensembl_ID)) %>%
    dplyr::distinct()

  # Verbs are namespace-qualified because S4Vectors, IRanges and ensembldb
  # export functions with the same names (rename, filter, select, ...), so
  # the unqualified calls depended on dplyr being attached last.
  abc_genes <- data.table::fread(
    abc_genes_path,
    col.names = c("chr", "start", "end", "name", "score", "strand",
                  "Ensembl_ID", "gene_type")
  ) %>%
    dplyr::select(name, Ensembl_ID) %>%
    dplyr::rename(abc_name = name) %>%
    dplyr::left_join(gene_ref, by = "Ensembl_ID") %>%
    # keep only Ensembl IDs that occur exactly once after the join
    dplyr::group_by(Ensembl_ID) %>%
    dplyr::filter(dplyr::n() == 1) %>%
    dplyr::ungroup() %>%
    # ABC genes without a GTF match have no reference name to look up
    dplyr::filter(!is.na(gene_ref_name))

  # Lookup table: GTF gene name -> ABC gene name. If a GTF name occurs more
  # than once, name-based indexing below uses its first entry.
  gene_key <- abc_genes$abc_name
  names(gene_key) <- abc_genes$gene_ref_name

  # remove genes not in our gene universe
  row_sub <- base::intersect(rownames(rna_matrix), names(gene_key))
  # drop = FALSE: a single matching gene must still yield a one-row matrix,
  # otherwise the row-name assignment below fails on a plain vector.
  rna_matrix_filt <- rna_matrix[row_sub, , drop = FALSE]
  rownames(rna_matrix_filt) <- unname(gene_key[row_sub])

  rna_matrix_filt
}

In [6]:
# Keep only RNA rows that map to an ABC gene and rename them to ABC names.
matrix.rna_count.rename <- map_gene_names(
  rna_matrix = matrix.rna_count,
  gene_gtf_path = gene_gtf_path,
  abc_genes_path = abc_genes_path
)


Attaching package: 'dplyr'


The following objects are masked from 'package:ensembldb':

    filter, select


The following object is masked from 'package:AnnotationDbi':

    select


The following object is masked from 'package:Biobase':

    combine


The following objects are masked from 'package:Biostrings':

    collapse, intersect, setdiff, setequal, union


The following object is masked from 'package:XVector':

    slice


The following objects are masked from 'package:GenomicRanges':

    intersect, setdiff, union


The following object is masked from 'package:GenomeInfoDb':

    intersect


The following objects are masked from 'package:IRanges':

    collapse, desc, intersect, setdiff, slice, union


The following objects are masked from 'package:S4Vectors':

    first, intersect, rename, setdiff, setequal, union


The following objects are masked from 'package:BiocGenerics':

    combine, intersect, setdiff, union


The following objects are masked from 'package:stats':



Create the Seurat object, following the Signac multiomic vignette: https://stuartlab.org/signac/articles/pbmc_multiomic

In [7]:
# Build the gene annotation ranges from the EnsDb.Hsapiens.v86 database, then
# prefix every sequence level with "chr".
annotation <- GetGRangesFromEnsDb(
  ensdb = EnsDb.Hsapiens.v86
)
seqlevels(annotation) <- paste0("chr", seqlevels(annotation))

"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence

In [8]:
# Create a Seurat object holding the renamed RNA counts in an assay named "RNA".
obj.seurat <- CreateSeuratObject(counts = matrix.rna_count.rename, assay = "RNA")

In [9]:
# Build a chromatin assay from the ATAC counts and attach it to the object
# under the name "ATAC". Row names are split on "-" at both positions
# (chromosome-start-end), and the gene annotation is stored with the assay.
obj.seurat[["ATAC"]] <- CreateChromatinAssay(
  counts = matrix.atac,
  annotation = annotation,
  sep = c("-", "-")
)

In [10]:
# RNA processing: make "RNA" the active assay, run SCTransform, then PCA.
# LinkPeaks() later reads expression from the assay named "SCT".
DefaultAssay(object = obj.seurat) <- "RNA"
obj.seurat <- SCTransform(object = obj.seurat)
obj.seurat <- RunPCA(object = obj.seurat)

Running SCTransform on assay: RNA

Running SCTransform on layer: counts

vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.

`vst.flavor` is set to 'v2' but could not find glmGamPoi installed.
Please install the glmGamPoi package for much faster estimation.
--------------------------------------------
install.packages('BiocManager')
BiocManager::install('glmGamPoi')
--------------------------------------------
Falling back to native (slower) implementation.


Variance stabilizing transformation of count matrix of size 16128 by 7821

Model formula is y ~ log_umi

Get Negative Binomial regression parameters per gene

Using 2000 genes, 5000 cells

"iteration limit reached"
"iteration limit reached"
"iteration limit reached"
"iteration limit reached"
"iteration limit reached"
"iteration limit reached"
"iteration limit reached"
"iteration limit reached"
"iteration limit reached"
"iteration limit reached"
"iteration limit reached"
"iteration limit reached"
"iterat

In [11]:
# ATAC processing: make "ATAC" the active assay, select top features with
# min.cutoff = 5, apply TF-IDF normalization, then run SVD.
DefaultAssay(object = obj.seurat) <- "ATAC"
obj.seurat <- FindTopFeatures(object = obj.seurat, min.cutoff = 5)
obj.seurat <- RunTFIDF(object = obj.seurat)
obj.seurat <- RunSVD(object = obj.seurat)

Performing TF-IDF normalization

Running SVD

Scaling cell embeddings



Linking peaks to genes

In [12]:
# Work on the ATAC assay.
DefaultAssay(object = obj.seurat) <- "ATAC"

# Compute per-peak sequence statistics (including GC content) against hg38.
obj.seurat <- RegionStats(
  object = obj.seurat,
  genome = BSgenome.Hsapiens.UCSC.hg38
)

In [13]:
# Link peaks to genes, using ATAC peaks and SCT expression.
# Only genes with a non-zero total count in the renamed RNA matrix are tested.
# The names are taken by indexing the row-name vector directly: subsetting the
# matrix first would copy it, and would return NULL row names if exactly one
# gene passed the filter (the one-row result drops to a plain vector).
obj.seurat <- LinkPeaks(
  object = obj.seurat,
  peak.assay = "ATAC",
  expression.assay = "SCT",
  genes.use = rownames(matrix.rna_count.rename)[
    rowSums(matrix.rna_count.rename) > 0
  ]
)

Testing 14980 genes and 157600 peaks

Found gene coordinates for 14753 genes



Save results

In [14]:
# Create the output directory (including parents), save the peak-gene links
# of the ATAC assay as an RDS file, and print them.
dir.create(path = dir.output, recursive = TRUE)
saveRDS(
  object = Links(obj.seurat[["ATAC"]]),
  file = file.path(dir.output, "links.rds")
)
Links(obj.seurat[["ATAC"]])

GRanges object with 21911 ranges and 5 metadata columns:
          seqnames              ranges strand |     score        gene
             <Rle>           <IRanges>  <Rle> | <numeric> <character>
      [1]     chr1      778761-1000172      * | 0.0508541        HES4
      [2]     chr1      820756-1020123      * | 0.0511646        AGRN
      [3]     chr1      827285-1116361      * | 0.0546499    C1orf159
      [4]     chr1      940540-1000172      * | 0.0505713        HES4
      [5]     chr1       951869-959309      * | 0.0530122       NOC2L
      ...      ...                 ...    ... .       ...         ...
  [21907]     chrX 154762742-154799249      * | 0.0512082        DKC1
  [21908]     chrX 154763237-155071420      * | 0.0604306       BRCC3
  [21909]     chrX 154776030-154821007      * | 0.0527089        MPP1
  [21910]     chrX 155264589-155334730      * | 0.0516050      RAB39B
  [21911]     chrX 155334657-155334730      * | 0.0606970       CLIC2
                            peak 

In [15]:
# Flatten the peak-gene links into a data frame and write them as a gzipped
# TSV with one row per peak-gene pair.
df.output <- as.data.frame(obj.seurat@assays$ATAC@links)

# Rename columns by name rather than by position, so a changed column order
# or an extra metadata column cannot silently mislabel the output. Fail early
# if any column the output format relies on is missing.
expected_link_cols <- c("seqnames", "start", "end", "width", "strand",
                        "score", "gene", "peak", "zscore", "pvalue")
missing_link_cols <- expected_link_cols[
  !(expected_link_cols %in% colnames(df.output))
]
if (length(missing_link_cols) > 0) {
  stop(
    "Link table is missing expected column(s): ",
    paste(missing_link_cols, collapse = ", "),
    call. = FALSE
  )
}
link_col_map <- c(seqnames = "chr", gene = "TargetGene", peak = "PeakName")
colnames(df.output)[match(names(link_col_map), colnames(df.output))] <-
  unname(link_col_map)

# NOTE(review): start/end come from the link ranges. In the printed output
# they differ from the coordinates embedded in PeakName (e.g. 778761-1000172
# vs chr1-778120-779401) - confirm that downstream consumers expect the link
# span and not the peak interval.
df.output[, "CellType"] <- "K562"
df.output[, "PairName"] <- paste(
  df.output[, "PeakName"],
  df.output[, "TargetGene"],
  sep = "_"
)

data.table::fwrite(
  df.output,
  file = file.path(dir.output, "pairs.E2G.res.tsv.gz"),
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)
df.output

chr,start,end,width,strand,score,TargetGene,PeakName,zscore,pvalue,CellType,PairName
<fct>,<int>,<int>,<int>,<fct>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>
chr1,778761,1000172,221412,*,0.05085411,HES4,chr1-778120-779401,2.343423,9.553860e-03,K562,chr1-778120-779401_HES4
chr1,820756,1020123,199368,*,0.05116465,AGRN,chr1-819819-821692,3.093901,9.877161e-04,K562,chr1-819819-821692_AGRN
chr1,827285,1116361,289077,*,0.05464988,C1orf159,chr1-826642-827928,1.926267,2.703555e-02,K562,chr1-826642-827928_C1orf159
chr1,940540,1000172,59633,*,0.05057131,HES4,chr1-939995-941085,1.799467,3.597238e-02,K562,chr1-939995-941085_HES4
chr1,951869,959309,7441,*,0.05301224,NOC2L,chr1-950610-953128,3.453538,2.766421e-04,K562,chr1-950610-953128_NOC2L
chr1,959309,1080049,120741,*,0.05341145,NOC2L,chr1-1079353-1080745,3.384546,3.564810e-04,K562,chr1-1079353-1080745_NOC2L
chr1,959309,1331427,372119,*,0.06348661,NOC2L,chr1-1330543-1332311,4.451339,4.266819e-06,K562,chr1-1330543-1332311_NOC2L
chr1,960587,1000388,39802,*,0.05638424,KLHL17,chr1-999557-1001218,2.216227,1.333799e-02,K562,chr1-999557-1001218_KLHL17
chr1,960587,1308322,347736,*,0.05029331,KLHL17,chr1-1307532-1309112,2.078736,1.882081e-02,K562,chr1-1307532-1309112_KLHL17
chr1,960857,1000172,39316,*,0.05322021,HES4,chr1-960264-961450,2.362046,9.087205e-03,K562,chr1-960264-961450_HES4


In [16]:
# Print the R version, platform, locale and attached package versions for
# this session.
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux 8.10 (Ootpa)

Matrix products: default
BLAS/LAPACK: /maps/projects/ralab/people/lpm537/software/anaconda3/envs/Notebook_E2G_240505/lib/libopenblasp-r0.3.27.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Copenhagen
tzcode source: system (glibc)

attached base packages:
 [1] parallel  stats4    grid      stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] data.table_1.15.2                 dplyr_1.1.4                      
 [3] foreach_1.5.2                     Matrix_1.6