In [1]:
library(ArchR)
library(Seurat)
library(Signac)
library(magrittr)
library(genomation)
library(GenomicRanges)
library(Matrix)
library(ggplot2)


                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__                     ||      \\\
       \               '        ##_______ _____ ,--,__,=##,__   ///
        ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
        -,____,---'       \\####\\________________,--\\_##,/
           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _  \     
         /  ^  \    |  |_) 

Specify file path

In [2]:
gene_gtf_path = "/maps/projects/ralab/data/genome/hg38/gencode.v43.chr_patch_hapl_scaff.annotation.gtf"
abc_genes_path = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/241203/scE2G/ENCODE_rE2G/ABC/reference/hg38/CollapsedGeneBounds.hg38.TSS500bp.bed"
path.seurat = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/BMMC/1.prepare_data/5.merge_peaks_from_5_super_groups.240524/obj.seurat.rds"
path.pairs.E2G = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/BMMC/1.prepare_data/5.merge_peaks_from_5_super_groups.240524/pairs.rds"
dir.output = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/BMMC/2.Genome_wide_prediction/ArchR_250226/BMMC5.merge.250226/"

In [3]:
n.cores = 16
maxDist = 1000000

In [4]:
addArchRThreads(threads = n.cores) 
addArchRGenome("hg38")

Setting default number of Parallel threads to 16.

Setting default genome to Hg38.



Import seurat object

In [5]:
obj.seurat = readRDS(path.seurat)

In [6]:
obj.seurat = subset(obj.seurat,
                    cells = colnames(obj.seurat)[!is.na(obj.seurat$cell_type.merged)])

Import candidate E-G pairs

In [7]:
pairs.E2G = readRDS(path.pairs.E2G)

Create an ArchRProject

In [8]:
str(Fragments(obj.seurat))

List of 13
 $ :Formal class 'Fragment' [package "Signac"] with 3 slots
  .. ..@ path : chr "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellra"| __truncated__
  .. ..@ hash : chr [1:2] "86350e844e0a3a1991e9c650ccac7f43" "555000121b665f095f13712669597ac8"
  .. ..@ cells: Named chr [1:5315] "TAGTTGTCACCCTCAC-1" "CTATGGCCATAACGGG-1" "CCGCACACAGGTTAAA-1" "TCATTTGGTAATGGAA-1" ...
  .. .. ..- attr(*, "names")= chr [1:5315] "s1d1_TAGTTGTCACCCTCAC-1" "s1d1_CTATGGCCATAACGGG-1" "s1d1_CCGCACACAGGTTAAA-1" "s1d1_TCATTTGGTAATGGAA-1" ...
 $ :Formal class 'Fragment' [package "Signac"] with 3 slots
  .. ..@ path : chr "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellra"| __truncated__
  .. ..@ hash : chr [1:2] "3ee058ae058031e3f233c68b0db9e6b9" "1039e761a9da92608ebabb952d11b3c8"
  .. ..@ cells: Named chr [1:5246] "GATGCAGCAGCAACAG-1" "TGAGTGAAGAGGAAGG-1" "CCGCTAAAGGACCGCT-1" "CTAACC

In [9]:
ArrowFiles <- createArrowFiles(
  inputFiles = c(Fragments(obj.seurat)[[1]]@path,
                 Fragments(obj.seurat)[[2]]@path,
                 Fragments(obj.seurat)[[3]]@path,
                 Fragments(obj.seurat)[[4]]@path,
                 Fragments(obj.seurat)[[5]]@path,
                 Fragments(obj.seurat)[[6]]@path,
                 Fragments(obj.seurat)[[7]]@path,
                 Fragments(obj.seurat)[[8]]@path,
                 Fragments(obj.seurat)[[9]]@path,
                 Fragments(obj.seurat)[[10]]@path,
                 Fragments(obj.seurat)[[11]]@path,
                 Fragments(obj.seurat)[[12]]@path,
                 Fragments(obj.seurat)[[13]]@path),
  sampleNames = c("s1d1",
                  "s1d2",
                  "s1d3",
                  "s2d1",
                  "s2d4",
                  "s2d5",
                  "s3d10",
                  "s3d3",
                  "s3d6",
                  "s3d7",
                  "s4d1",
                  "s4d8",
                  "s4d9"),
  validBarcodes = list(s1d1 = unname(Fragments(obj.seurat)[[1]]@cells),
                       s1d2 = unname(Fragments(obj.seurat)[[2]]@cells),
                       s1d3 = unname(Fragments(obj.seurat)[[3]]@cells),
                       s2d1 = unname(Fragments(obj.seurat)[[4]]@cells),
                       s2d4 = unname(Fragments(obj.seurat)[[5]]@cells),
                       s2d5 = unname(Fragments(obj.seurat)[[6]]@cells),
                       s3d10 = unname(Fragments(obj.seurat)[[7]]@cells),
                       s3d3 = unname(Fragments(obj.seurat)[[8]]@cells),
                       s3d6 = unname(Fragments(obj.seurat)[[9]]@cells),
                       s3d7 = unname(Fragments(obj.seurat)[[10]]@cells),
                       s4d1 = unname(Fragments(obj.seurat)[[11]]@cells),
                       s4d8 = unname(Fragments(obj.seurat)[[12]]@cells),
                       s4d9 = unname(Fragments(obj.seurat)[[13]]@cells)),
  addTileMat = T,
  addGeneScoreMat = F,
  force = T
)

Using GeneAnnotation set by addArchRGenome(Hg38)!

Using GeneAnnotation set by addArchRGenome(Hg38)!

ArchR logging to : ArchRLogs/ArchR-createArrows-cb5e9429f6bd2-Date-2025-02-26_Time-15-00-52.852547.log
If there is an issue, please report to github with logFile!

Cleaning Temporary Files

2025-02-26 15:00:53.561106 : Batch Execution w/ safelapply!, 0 mins elapsed.

ArchR logging successful to : ArchRLogs/ArchR-createArrows-cb5e9429f6bd2-Date-2025-02-26_Time-15-00-52.852547.log



In [10]:
ArrowFiles

In [11]:
dir.create(paste(dir.output,"ArchR",sep="/"),recursive = T)

In [12]:
obj.ArchR <- ArchRProject(
  ArrowFiles = ArrowFiles, 
  outputDirectory = paste(dir.output,"ArchR",sep="/"),
  copyArrows = TRUE 
)

Using GeneAnnotation set by addArchRGenome(Hg38)!

Using GeneAnnotation set by addArchRGenome(Hg38)!

Validating Arrows...

Getting SampleNames...



Copying ArrowFiles to Ouptut Directory! If you want to save disk space set copyArrows = FALSE

1 
2 
3 
4 
5 
6 
7 
8 
9 
10 
11 
12 
13 


Getting Cell Metadata...



Merging Cell Metadata...

Initializing ArchRProject...


                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__           

In [13]:
obj.ArchR


           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _  \     
         /  ^  \    |  |_)  |    |  ,----'|  |__|  | |  |_)  |    
        /  /_\  \   |      /     |  |     |   __   | |      /     
       /  _____  \  |  |\  \\___ |  `----.|  |  |  | |  |\  \\___.
      /__/     \__\ | _| `._____| \______||__|  |__| | _| `._____|
    



class: ArchRProject 
outputDirectory: /maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/BMMC/2.Genome_wide_prediction/ArchR_250226/BMMC5.merge.250226/ArchR 
samples(13): s3d10 s3d3 ... s4d9 s3d6
sampleColData names(1): ArrowFiles
cellColData names(13): Sample TSSEnrichment ... nDiFrags BlacklistRatio
numberOfCells(1): 55479
medianTSS(1): 16.031
medianFrags(1): 9711

In [14]:
obj.ArchR = addIterativeLSI(
    ArchRProj = obj.ArchR,
    useMatrix = "TileMatrix", 
    name = "IterativeLSI"
)

Checking Inputs...

ArchR logging to : ArchRLogs/ArchR-addIterativeLSI-cb5e94f40bcc7-Date-2025-02-26_Time-15-34-43.648311.log
If there is an issue, please report to github with logFile!

2025-02-26 15:34:45.28732 : Computing Total Across All Features, 0.01 mins elapsed.

2025-02-26 15:34:52.796609 : Computing Top Features, 0.135 mins elapsed.

###########
2025-02-26 15:34:55.10371 : Running LSI (1 of 2) on Top Features, 0.174 mins elapsed.
###########

2025-02-26 15:34:55.158746 : Sampling Cells (N = 10007) for Estimated LSI, 0.175 mins elapsed.

2025-02-26 15:34:55.169883 : Creating Sampled Partial Matrix, 0.175 mins elapsed.

2025-02-26 15:35:05.112386 : Computing Estimated LSI (projectAll = FALSE), 0.341 mins elapsed.

2025-02-26 15:36:09.731648 : Identifying Clusters, 1.418 mins elapsed.

“Data is of class matrix. Coercing to dgCMatrix.”
2025-02-26 15:36:21.320922 : Identified 6 Clusters, 1.611 mins elapsed.

2025-02-26 15:36:21.346459 : Saving LSI Iteration, 1.611 mins elapsed.

F


************************************************************
2025-02-26 15:36:31.40406 : ERROR Found in .saveIteration for  
LogFile = ArchRLogs/ArchR-addIterativeLSI-cb5e94f40bcc7-Date-2025-02-26_Time-15-34-43.648311.log

<simpleError in g$grobs[[legend]]: no such index at level 2
>

************************************************************



2025-02-26 15:36:31.40662 : Creating Cluster Matrix on the total Group Features, 1.779 mins elapsed.

2025-02-26 15:36:46.424224 : Computing Variable Features, 2.029 mins elapsed.

###########
2025-02-26 15:36:46.573563 : Running LSI (2 of 2) on Variable Features, 2.032 mins elapsed.
###########

2025-02-26 15:36:46.601951 : Creating Partial Matrix, 2.032 mins elapsed.

2025-02-26 15:37:07.781082 : Computing LSI, 2.385 mins elapsed.

2025-02-26 15:38:24.472785 : Finished Running IterativeLSI, 3.663 mins elapsed.



In [15]:
obj.ArchR = addPeakSet(obj.ArchR,
                       peakSet = reduce(pairs.E2G))

In [16]:
obj.ArchR = addPeakMatrix(obj.ArchR)

ArchR logging to : ArchRLogs/ArchR-addPeakMatrix-cb5e941737fff-Date-2025-02-26_Time-15-38-27.586949.log
If there is an issue, please report to github with logFile!

2025-02-26 15:38:27.967899 : Batch Execution w/ safelapply!, 0 mins elapsed.

ArchR logging successful to : ArchRLogs/ArchR-addPeakMatrix-cb5e941737fff-Date-2025-02-26_Time-15-38-27.586949.log



In [17]:
paths.h5 = c("/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site1_donor1/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site1_donor2/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site1_donor3/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site2_donor1/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site2_donor4/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site2_donor5/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site3_donor10/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site3_donor3/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site3_donor6/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site3_donor7/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site4_donor1/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site4_donor8/outs/raw_feature_bc_matrix.h5",
             "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/10x_multiome_240430/neurips2021_BMMC/cellranger_res/site4_donor9/outs/raw_feature_bc_matrix.h5")
names.sample = c("s1d1",
                 "s1d2",
                 "s1d3",
                 "s2d1",
                 "s2d4",
                 "s2d5",
                 "s3d10",
                 "s3d3",
                 "s3d6",
                 "s3d7",
                 "s4d1",
                 "s4d8",
                 "s4d9")
list.seRNA = lapply(1:length(names.sample),function(x){
    import10xFeatureMatrix(paths.h5[x],
                           names.sample[x])
})

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.

Importing Feature Matrix 1 of 1

Re-ordering RNA matricies for consistency.


In [18]:
id.intersect = rowData(list.seRNA[[1]])[,"id"]
length(id.intersect)
for(n.tmp in 2:13){
    id.intersect = intersect(id.intersect,rowData(list.seRNA[[n.tmp]])[,"id"])
}
length(id.intersect)

In [19]:
seRNA = list.seRNA[[1]][rowData(list.seRNA[[1]])[,"id"] %in% id.intersect]
for(n.tmp in 2:13){
    seRNA = cbind(seRNA,list.seRNA[[n.tmp]][rowData(list.seRNA[[n.tmp]])[,"id"] %in% id.intersect])
}
seRNA

class: RangedSummarizedExperiment 
dim: 61165 9222766 
metadata(0):
assays(1): counts
rownames(61165): DDX11L1 MIR6859-1 ... RNA5-8SN5 ENSG00000273937
rowData names(5): feature_type genome id interval name
colnames(9222766): s1d1#AAACAGCCAAACAACA-1 s1d1#AAACAGCCAAACATAG-1 ...
  s4d9#TTTGTTGGTTTGTGGA-1 s4d9#TTTGTTGGTTTGTTGC-1
colData names(0):

In [20]:
seRNA = seRNA[,rownames(obj.ArchR@cellColData)]

In [21]:
seRNA

class: RangedSummarizedExperiment 
dim: 61165 55479 
metadata(0):
assays(1): counts
rownames(61165): DDX11L1 MIR6859-1 ... RNA5-8SN5 ENSG00000273937
rowData names(5): feature_type genome id interval name
colnames(55479): s3d10#GTAGCTGTCAGCAAAG-1 s3d10#AACTAGTGTTACCTGT-1 ...
  s3d6#TCACCGGCAGCCTTGG-1 s3d6#GAACCTGTCGTTACAA-1
colData names(0):

Map gene names

In [22]:
extract_attributes <- function(gtf_attributes, att_of_interest){
  att <- unlist(strsplit(gtf_attributes, " "))
  if(att_of_interest %in% att){
    return(gsub("\"|;","", att[which(att %in% att_of_interest)+1]))
  } else {
    return(NA)}
}
map_gene_names <- function(rna_matrix, gene_gtf_path, abc_genes_path){
    library(dplyr)
    library(data.table)
    
	gene_ref <- fread(gene_gtf_path, header = FALSE, sep = "\t") %>%
		setNames(c("chr","source","type","start","end","score","strand","phase","attributes")) %>%
		dplyr::filter(type == "gene")
	gene_ref$gene_ref_name <- unlist(lapply(gene_ref$attributes, extract_attributes, "gene_name"))
	gene_ref$Ensembl_ID <- unlist(lapply(gene_ref$attributes, extract_attributes, "gene_id"))
	gene_ref <- dplyr::select(gene_ref, gene_ref_name, Ensembl_ID) %>%
		mutate(Ensembl_ID = sub("\\.\\d+$", "", Ensembl_ID)) %>% # remove decimal digits 
		distinct()
	
	abc_genes <- fread(abc_genes_path, col.names = c("chr", "start", "end", "name", "score", "strand", "Ensembl_ID", "gene_type")) %>%
		dplyr::select(name, Ensembl_ID) %>%
		rename(abc_name = name) %>%
		left_join(gene_ref, by = "Ensembl_ID") %>%
		group_by(Ensembl_ID) %>% # remove cases where multiple genes map to one ensembl ID
		filter(n() == 1) %>%
		ungroup()

	gene_key <- abc_genes$abc_name
	names(gene_key) <- abc_genes$gene_ref_name

	# remove genes not in our gene universe	
	row_sub <- intersect(rownames(rna_matrix), names(gene_key)) # gene ref names
	rna_matrix_filt <- rna_matrix[row_sub,] # still gene ref names
	rownames(rna_matrix_filt) <- gene_key[row_sub] # converted to abc names

	return(rna_matrix_filt)
}

In [23]:
seRNA.rename = map_gene_names(seRNA,gene_gtf_path, abc_genes_path)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:Biostrings’:

    collapse, intersect, setdiff, setequal, union


The following object is masked from ‘package:XVector’:

    slice


The following object is masked from ‘package:Biobase’:

    combine


The following objects are masked from ‘package:GenomicRanges’:

    intersect, setdiff, union


The following object is masked from ‘package:GenomeInfoDb’:

    intersect


The following objects are masked from ‘package:IRanges’:

    collapse, desc, intersect, setdiff, slice, union


The following objects are masked from ‘package:S4Vectors’:

    first, intersect, rename, setdiff, setequal, union


The following objects are masked from ‘package:BiocGenerics’:

    combine, intersect, setdiff, union


The following object is masked from ‘package:matrixStats’:

    count


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:plyr’

In [24]:
obj.ArchR = addGeneExpressionMatrix(obj.ArchR,
                                    seRNA = seRNA.rename,
                                    strictMatch = T)

ArchR logging to : ArchRLogs/ArchR-addGeneExpressionMatrix-cb5e97e6d45fb-Date-2025-02-26_Time-15-44-21.646315.log
If there is an issue, please report to github with logFile!

Overlap w/ scATAC = 1

2025-02-26 15:44:22.66762 : 

Overlap Per Sample w/ scATAC : s1d1=5255,s1d2=5205,s1d3=3342,s2d1=3446,s2d4=4835,s2d5=3660,s3d10=5404,s3d3=3500,s3d6=1421,s3d7=1319,s4d1=6035,s4d8=8730,s4d9=3327

2025-02-26 15:44:22.733185 : 

2025-02-26 15:44:25.516919 : Batch Execution w/ safelapply!, 0 mins elapsed.

ArchR logging successful to : ArchRLogs/ArchR-addGeneExpressionMatrix-cb5e97e6d45fb-Date-2025-02-26_Time-15-44-21.646315.log



Run ArchR prediction

In [25]:
start_time <- Sys.time()
obj.ArchR = addPeak2GeneLinks(
    ArchRProj = obj.ArchR,
    reducedDims = "IterativeLSI",
    useMatrix = "GeneExpressionMatrix",
    maxDist = maxDist,
    threads = n.cores
)
end_time <- Sys.time()
execution_time <- end_time - start_time

ArchR logging to : ArchRLogs/ArchR-addPeak2GeneLinks-cb5e96a960a1f-Date-2025-02-26_Time-15-47-02.520915.log
If there is an issue, please report to github with logFile!

2025-02-26 15:47:03.025618 : Getting Available Matrices, 0.008 mins elapsed.

2025-02-26 15:47:05.567197 : Filtered Low Prediction Score Cells (0 of 55479, 0), 0.019 mins elapsed.

2025-02-26 15:47:06.75913 : Computing KNN, 0.039 mins elapsed.

2025-02-26 15:47:07.006213 : Identifying Non-Overlapping KNN pairs, 0.043 mins elapsed.

2025-02-26 15:47:08.606753 : Identified 500 Groupings!, 0.069 mins elapsed.

2025-02-26 15:47:08.652627 : Getting Group RNA Matrix, 0.07 mins elapsed.

2025-02-26 15:47:39.774134 : Getting Group ATAC Matrix, 0.589 mins elapsed.

2025-02-26 15:48:30.138413 : Normalizing Group Matrices, 1.428 mins elapsed.

2025-02-26 15:48:38.513787 : Finding Peak Gene Pairings, 1.568 mins elapsed.

2025-02-26 15:48:39.951799 : Computing Correlations, 1.592 mins elapsed.

2025-02-26 15:49:03.984368 : Completed

In [26]:
execution_time

Time difference of 2.024864 mins

In [27]:
p2g = metadata(obj.ArchR@peakSet)$Peak2GeneLinks

In [28]:
metadata(p2g)[[1]]$PeakName = paste(seqnames(metadata(p2g)[[1]]),
                                    ranges(metadata(p2g)[[1]]),sep="-")

In [29]:
p2g$PairName = paste(metadata(p2g)[[1]][p2g$idxATAC]$PeakName,
                     metadata(p2g)[[2]][p2g$idxRNA]$name,
                     sep = "_")

In [30]:
rownames(p2g) = p2g$PairName
p2g

DataFrame with 7057138 rows and 7 columns
                                idxATAC    idxRNA Correlation        FDR
                              <integer> <integer>   <numeric>  <numeric>
chr1-10010-10512_FAM138A              1         1         NaN        NaN
chr1-115495-115916_FAM138A            2         1         NaN        NaN
chr1-180477-181904_FAM138A            3         1         NaN        NaN
chr1-191243-191960_FAM138A            4         1         NaN        NaN
chr1-267886-268118_FAM138A            5         1         NaN        NaN
...                                 ...       ...         ...        ...
chrX-156008487-156008683_IL9R    303478     20486  -0.0286262 0.67903951
chrX-156008873-156009316_IL9R    303479     20486   0.0977911 0.07583251
chrX-156019835-156020122_IL9R    303480     20486  -0.0272367 0.69545735
chrX-156029840-156030373_IL9R    303481     20486   0.1436141 0.00516523
chrX-156030498-156030725_IL9R    303482     20486   0.0305847 0.65587454
         

In [31]:
pairs.E2G.res = pairs.E2G[pairs.E2G$PairName %in% p2g$PairName]
mcols(pairs.E2G.res)[,c("Correlation","FDR","VarQATAC","VarQRNA")] = p2g[pairs.E2G.res$PairName,c("Correlation","FDR","VarQATAC","VarQRNA")]

In [32]:
pairs.E2G.res

GRanges object with 5682007 ranges and 7 metadata columns:
            seqnames              ranges strand |  TargetGene
               <Rle>           <IRanges>  <Rle> | <character>
        [1]     chr1 100015450-100016026      * |         AGL
        [2]     chr1 100015450-100016026      * |      CDC14A
        [3]     chr1 100015450-100016026      * |         DBT
        [4]     chr1 100015450-100016026      * |       EXTL2
        [5]     chr1 100015450-100016026      * |     MFSD14A
        ...      ...                 ...    ... .         ...
  [5682003]     chrX     9995615-9996434      * |        WWC3
  [5682004]     chrX     9997010-9998101      * |       CLCN4
  [5682005]     chrX     9997010-9998101      * |     SHROOM2
  [5682006]     chrX     9997010-9998101      * |       TBL1X
  [5682007]     chrX     9997010-9998101      * |        WWC3
                          PeakName               PairName Correlation
                       <character>            <character>   <nume

Save results

In [33]:
saveRDS(obj.ArchR,
        paste(dir.output,"obj.ArchR.rds",sep = "/"))
saveRDS(pairs.E2G.res,
        paste(dir.output,"pairs.E2G.res.rds",sep = "/"))
df.output = as.data.frame(pairs.E2G.res,row.names = NULL)
colnames(df.output)[1] = "chr"
df.output[,"CellType"] = "K562"
df.output = df.output[,c("chr",
                         "start",
                         "end",
                         "TargetGene",
                         "CellType",
                         "Correlation",
                         "FDR",
                         "VarQATAC",
                         "VarQRNA")]
data.table::fwrite(df.output,
                   file = paste(dir.output,"pairs.E2G.res.tsv.gz",sep = "/"),
                   row.names = F,
                   quote = F,
                   sep = "\t")
pairs.E2G.res
df.output

GRanges object with 5682007 ranges and 7 metadata columns:
            seqnames              ranges strand |  TargetGene
               <Rle>           <IRanges>  <Rle> | <character>
        [1]     chr1 100015450-100016026      * |         AGL
        [2]     chr1 100015450-100016026      * |      CDC14A
        [3]     chr1 100015450-100016026      * |         DBT
        [4]     chr1 100015450-100016026      * |       EXTL2
        [5]     chr1 100015450-100016026      * |     MFSD14A
        ...      ...                 ...    ... .         ...
  [5682003]     chrX     9995615-9996434      * |        WWC3
  [5682004]     chrX     9997010-9998101      * |       CLCN4
  [5682005]     chrX     9997010-9998101      * |     SHROOM2
  [5682006]     chrX     9997010-9998101      * |       TBL1X
  [5682007]     chrX     9997010-9998101      * |        WWC3
                          PeakName               PairName Correlation
                       <character>            <character>   <nume

chr,start,end,TargetGene,CellType,Correlation,FDR,VarQATAC,VarQRNA
<fct>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,100015450,100016026,AGL,K562,-0.06076611,3.182129e-01,0.1738093,0.7999121
chr1,100015450,100016026,CDC14A,K562,-0.01057877,8.873440e-01,0.1738093,0.9706629
chr1,100015450,100016026,DBT,K562,0.02538999,7.171450e-01,0.1738093,0.6209118
chr1,100015450,100016026,EXTL2,K562,0.05905580,3.349352e-01,0.1738093,0.4855511
chr1,100015450,100016026,MFSD14A,K562,0.02304631,7.445408e-01,0.1738093,0.3837743
chr1,100015450,100016026,RTCA,K562,0.02181795,7.588357e-01,0.1738093,0.6939373
chr1,100015450,100016026,RTCA-AS1,K562,-0.04240575,5.160080e-01,0.1738093,0.4477692
chr1,100015450,100016026,SASS6,K562,0.02975432,6.656602e-01,0.1738093,0.8313971
chr1,100015450,100016026,SLC30A7,K562,0.04653812,4.685653e-01,0.1738093,0.8036708
chr1,100015450,100016026,SLC35A3,K562,0.06122999,3.137617e-01,0.1738093,0.6069999


In [34]:
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux 8.10 (Ootpa)

Matrix products: default
BLAS/LAPACK: /maps/projects/ralab/people/lpm537/software/anaconda3/envs/Notebook_E2G_240505/lib/libopenblasp-r0.3.27.so;  LAPACK version 3.12.0

Random number generation:
 RNG:     L'Ecuyer-CMRG 
 Normal:  Inversion 
 Sample:  Rejection 
 
locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Copenhagen
tzcode source: system (glibc)

attached base packages:
 [1] parallel  stats4    grid      stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] dplyr_1.1.4           