In [1]:
library(FigR)
library(Seurat)
library(genomation)
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg38)
library(foreach)

Loading required package: Matrix

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats,

Specify input file paths and output directory

In [2]:
# Inputs: cell-by-peak ATAC counts and cell-by-gene RNA counts (gzipped CSV).
path.matrix.atac_count <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_Xu/K562/Kendall/atac_matrix.csv.gz"
path.matrix.rna_count <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_Xu/1.prepare_data/1.seurat_pipeline.240507/rna_count_matrix.csv.gz"
# Output: directory that receives the FigR peak-gene results.
dir.output <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_Xu/3.Genome_wide_prediction/FigR.default/FigR.240612/"

In [3]:
# Number of cores handed to runGenePeakcorr() via its nCores argument.
n.cores <- 16

Import ATAC matrix

In [4]:
# Read the ATAC count table (first column becomes the row names, column names
# are kept verbatim) and store it as a sparse Matrix.
matrix.atac <- Matrix(
  as.matrix(
    read.csv(path.matrix.atac_count, row.names = 1, check.names = FALSE)
  ),
  sparse = TRUE
)

Import RNA matrix

In [5]:
# Read the RNA count table and store it as a sparse Matrix.
matrix.rna_count <- Matrix(
  as.matrix(
    read.csv(path.matrix.rna_count, row.names = 1, check.names = FALSE)
  ),
  sparse = TRUE
)
# Put the RNA columns in the same order as the ATAC columns.
matrix.rna_count <- matrix.rna_count[, colnames(matrix.atac)]
# Keep only rows with a non-zero total, then pass them through
# NormalizeData() with its default arguments.
matrix.rna <- NormalizeData(matrix.rna_count[rowSums(matrix.rna_count) > 0, ])
# The raw counts are not needed beyond this point.
rm(matrix.rna_count)

Make ATAC SummarizedExperiment object

In [9]:
# Split ATAC row names of the form "<chr>-<start>-<end>" into BED-like columns.
# Only the last two hyphens act as separators, so a sequence name that itself
# contains "-" is kept whole. A plain strsplit() on "-" would shift start/end
# into the wrong columns for such names, with rbind() silently recycling the
# ragged pieces.
peak.name.pattern <- "^(.+)-([0-9]+)-([0-9]+)$"
peak.name.bad <- grep(peak.name.pattern, rownames(matrix.atac),
                      invert = TRUE, value = TRUE)
if (length(peak.name.bad) > 0) {
  stop("ATAC row names not of the form <chr>-<start>-<end>: ",
       paste(head(peak.name.bad, 5), collapse = ", "),
       call. = FALSE)
}
# Coordinates are stored as integers rather than character strings.
df.peaks <- data.frame(
  chr = sub(peak.name.pattern, "\\1", rownames(matrix.atac)),
  start = as.integer(sub(peak.name.pattern, "\\2", rownames(matrix.atac))),
  end = as.integer(sub(peak.name.pattern, "\\3", rownames(matrix.atac)))
)

In [10]:
# Convert the chr/start/end table into a GRanges object.
bed.peaks <- makeGRangesFromDataFrame(df = df.peaks)

In [11]:
# Bundle the ATAC counts with their peak ranges; the counts go into an assay
# named "counts" and the ranges become the row ranges.
ATAC.se <- SummarizedExperiment(
  assays = SimpleList(counts = matrix.atac),
  rowRanges = bed.peaks
)

Run FigR prediction

In [None]:
# Compute peak-gene correlations with FigR on the paired ATAC/RNA data.
# NOTE(review): p.cut = NULL presumably disables the p-value filter so that
# every tested pair is returned -- confirm against the FigR documentation.
cisCor <- runGenePeakcorr(
  ATAC.se = ATAC.se,
  RNAmat = matrix.rna,
  genome = "hg38",
  nCores = n.cores,
  p.cut = NULL,
  keepPosCorOnly = FALSE  # negative correlations are kept as well
)
cisCor

Assuming paired scATAC/scRNA-seq data ..



Matrix object input detectedCentering counts for cells sequentially in groups of size  1000  ..

Computing centered counts for cells:  1  to  1000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  1001  to  2000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  2001  to  3000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  3001  to  4000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  4001  to  5000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  5001  to  6000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  6001  to  7000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  

Loading required package: iterators

Loading required package: parallel



In [13]:
# Display the peak-gene correlation table returned by runGenePeakcorr().
cisCor

Peak,PeakRanges,Gene,rObs,pvalZ
<dbl>,<chr>,<chr>,<dbl>,<dbl>
1338,chr1:115493-115961,OR4F5,0.0157355382,0.111991475
17878,chr1:804348-805153,FAM87B,0.0254473489,0.050114786
17901,chr1:816701-817594,FAM87B,0.0266505827,0.028215881
17808,chr1:778120-779401,LINC01128,0.0249139102,0.020607569
17909,chr1:818917-819221,LINC01128,0.0096345124,0.337101803
17973,chr1:842493-843467,LINC01128,0.0418846845,0.003623764
18036,chr1:849559-850153,LINC01128,-0.0005044386,0.670700760
18079,chr1:856218-856698,LINC01128,0.0005674452,0.602986234
18092,chr1:860170-861174,LINC01128,0.0091232008,0.385078608
18146,chr1:869578-870262,LINC01128,0.0151104067,0.204736225


In [20]:
# Parse PeakRanges strings of the form "<chr>:<start>-<end>" into BED-like
# columns. The pattern is anchored on the final ":<start>-<end>" part, so a
# sequence name containing "-" or ":" is kept whole. Converting ":" to "-"
# and splitting on every "-" would shift start/end into the wrong columns
# for such names, with rbind() silently recycling the ragged pieces.
peak.range.pattern <- "^(.+):([0-9]+)-([0-9]+)$"
peak.range.bad <- grep(peak.range.pattern, cisCor$PeakRanges,
                       invert = TRUE, value = TRUE)
if (length(peak.range.bad) > 0) {
  stop("PeakRanges entries not of the form <chr>:<start>-<end>: ",
       paste(head(peak.range.bad, 5), collapse = ", "),
       call. = FALSE)
}
# Coordinates are stored as integers rather than character strings.
df.peak.res <- data.frame(
  chr = sub(peak.range.pattern, "\\1", cisCor$PeakRanges),
  start = as.integer(sub(peak.range.pattern, "\\2", cisCor$PeakRanges)),
  end = as.integer(sub(peak.range.pattern, "\\3", cisCor$PeakRanges))
)

In [21]:
# Build one range per peak-gene pair and attach the full FigR result table
# as metadata columns.
pairs.E2G.res <- makeGRangesFromDataFrame(df = df.peak.res)
mcols(pairs.E2G.res) <- cisCor

In [22]:
# Add identifier columns: the target gene, a peak name with ":" replaced by
# "-", and a pair name "<PeakName>_<TargetGene>".
pairs.E2G.res$TargetGene <- pairs.E2G.res$Gene
pairs.E2G.res$PeakName <- gsub(":", "-", pairs.E2G.res$PeakRanges, fixed = TRUE)
pairs.E2G.res$PairName <- paste(
  pairs.E2G.res$PeakName, pairs.E2G.res$TargetGene,
  sep = "_"
)
pairs.E2G.res

GRanges object with 111348 ranges and 8 metadata columns:
           seqnames              ranges strand |      Peak
              <Rle>           <IRanges>  <Rle> | <numeric>
       [1]     chr1       115493-115961      * |      1338
       [2]     chr1       804348-805153      * |     17878
       [3]     chr1       816701-817594      * |     17901
       [4]     chr1       778120-779401      * |     17808
       [5]     chr1       818917-819221      * |     17909
       ...      ...                 ...    ... .       ...
  [111344]     chrX 156000905-156002182      * |    155177
  [111345]     chrX 156002257-156003190      * |    155178
  [111346]     chrX 156016191-156016823      * |    155181
  [111347]     chrX 155997283-155998084      * |    155175
  [111348]     chrX 156009722-156010137      * |    155180
                       PeakRanges        Gene        rObs     pvalZ  TargetGene
                      <character> <character>   <numeric> <numeric> <character>
       [1]     

Save results

In [24]:
# Create the output directory (including missing parents) only when it is
# absent, so re-running the notebook does not raise an "already exists"
# warning.
if (!dir.exists(dir.output)) {
  dir.create(dir.output, recursive = TRUE)
}

In [25]:
# Serialise the annotated GRanges to the output directory, then display it.
saveRDS(
  object = pairs.E2G.res,
  file = paste(dir.output, "pairs.E2G.res.rds", sep = "/")
)
pairs.E2G.res

GRanges object with 111348 ranges and 8 metadata columns:
           seqnames              ranges strand |      Peak
              <Rle>           <IRanges>  <Rle> | <numeric>
       [1]     chr1       115493-115961      * |      1338
       [2]     chr1       804348-805153      * |     17878
       [3]     chr1       816701-817594      * |     17901
       [4]     chr1       778120-779401      * |     17808
       [5]     chr1       818917-819221      * |     17909
       ...      ...                 ...    ... .       ...
  [111344]     chrX 156000905-156002182      * |    155177
  [111345]     chrX 156002257-156003190      * |    155178
  [111346]     chrX 156016191-156016823      * |    155181
  [111347]     chrX 155997283-155998084      * |    155175
  [111348]     chrX 156009722-156010137      * |    155180
                       PeakRanges        Gene        rObs     pvalZ  TargetGene
                      <character> <character>   <numeric> <numeric> <character>
       [1]     

In [26]:
# Flatten the GRanges into a plain table, tag it with the cell type, keep the
# columns needed for the prediction file and write them as a tab-separated
# file without quoting.
df.output <- as.data.frame(pairs.E2G.res, row.names = NULL)
names(df.output)[1] <- "chr"  # rename the first column
df.output$CellType <- "K562"
df.output <- df.output[, c("chr", "start", "end",
                           "TargetGene", "CellType",
                           "rObs", "pvalZ")]
data.table::fwrite(
  df.output,
  file = paste(dir.output, "pairs.E2G.res.tsv.gz", sep = "/"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
df.output

chr,start,end,TargetGene,CellType,rObs,pvalZ
<fct>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>
chr1,115493,115961,OR4F5,K562,0.0157355382,0.111991475
chr1,804348,805153,FAM87B,K562,0.0254473489,0.050114786
chr1,816701,817594,FAM87B,K562,0.0266505827,0.028215881
chr1,778120,779401,LINC01128,K562,0.0249139102,0.020607569
chr1,818917,819221,LINC01128,K562,0.0096345124,0.337101803
chr1,842493,843467,LINC01128,K562,0.0418846845,0.003623764
chr1,849559,850153,LINC01128,K562,-0.0005044386,0.670700760
chr1,856218,856698,LINC01128,K562,0.0005674452,0.602986234
chr1,860170,861174,LINC01128,K562,0.0091232008,0.385078608
chr1,869578,870262,LINC01128,K562,0.0151104067,0.204736225


In [27]:
# Record the R version, platform and attached packages used for this run.
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux 8.9 (Ootpa)

Matrix products: default
BLAS/LAPACK: /maps/projects/ralab/people/lpm537/software/anaconda3/envs/Notebook_E2G_240505/lib/libopenblasp-r0.3.27.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Copenhagen
tzcode source: system (glibc)

attached base packages:
 [1] parallel  grid      stats4    stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] pbmcapply_1.5.1                   doParallel_1.0.17                
 [3] iterators_1.0.14                  foreach_1.5