In [1]:
library(FigR)
library(Seurat)
library(genomation)
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg38)
library(foreach)

Loading required package: Matrix

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats,

Specify file paths

In [2]:
# Input files: candidate E-G pairs, ATAC count matrix and RNA count matrix.
path.pairs.E2G <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_IGVF/K562/Kendall/Pairs.tsv.gz"
path.matrix.atac_count <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_IGVF/K562/Kendall/atac_matrix.csv.gz"
path.matrix.rna_count <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_IGVF/1.prepare_data/1.import_IGVF_processed_data.240508/rna_count_matrix.csv.gz"
# Directory the result files are written to.
dir.output <- "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_IGVF/3.Genome_wide_prediction/FigR/FigR.240517/"

Import candidate E-G pairs

In [3]:
# Read the candidate pairs table (with a header row), keeping every additional
# column as metadata on the resulting ranges object.
pairs.E2G <- readGeneric(
  path.pairs.E2G,
  header = TRUE,
  keep.all.metadata = TRUE
)

Import ATAC matrix

In [4]:
# Read the ATAC count table (first column supplies the row names, column names
# are taken verbatim) and store it as a sparse matrix.
matrix.atac_count <- Matrix(
  as.matrix(
    read.csv(path.matrix.atac_count, row.names = 1, check.names = FALSE)
  ),
  sparse = TRUE
)
# Centered counts; this is the ATAC matrix passed to the correlation step.
matrix.atac <- centerCounts(matrix.atac_count)

Matrix object input detectedCentering counts for cells sequentially in groups of size  1000  ..

Computing centered counts for cells:  1  to  1000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  1001  to  2000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  2001  to  3000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  3001  to  4000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  4001  to  5000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  5001  to  6000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  6001  to  6266 ..
Computing centered counts per cell using mean reads in features ..

Merging results..
Done!


Import RNA matrix

In [5]:
# Read the RNA count table (first column supplies the row names, column names
# are taken verbatim) and store it as a sparse matrix.
matrix.rna_count <- read.csv(path.matrix.rna_count,
                             row.names = 1,
                             check.names = FALSE)
matrix.rna_count <- Matrix(as.matrix(matrix.rna_count), sparse = TRUE)
# Select and order the cells so the columns line up with the ATAC matrix.
matrix.rna_count <- matrix.rna_count[, colnames(matrix.atac), drop = FALSE]
# Drop genes without any count in the retained cells, then normalise the
# FILTERED matrix. The previous code normalised `matrix.rna_count` instead,
# which overwrote the filtered matrix and silently kept all-zero genes.
matrix.rna <- matrix.rna_count[rowSums(matrix.rna_count) > 0, , drop = FALSE]
matrix.rna <- NormalizeData(matrix.rna)
rm(matrix.rna_count)

In [6]:
# Keep only the pairs whose gene is a row of the RNA matrix and whose peak is a
# row of the ATAC matrix.
pairs.E2G.filter <- pairs.E2G[
  (mcols(pairs.E2G)$TargetGene %in% rownames(matrix.rna)) &
    (mcols(pairs.E2G)$PeakName %in% rownames(matrix.atac))
]

Prepare FigR input data

In [7]:
# Derive one range per distinct peak from the candidate pairs: strip the
# per-pair metadata first so that unique() collapses on coordinates only.
bed.peak <- pairs.E2G
mcols(bed.peak) <- NULL
bed.peak <- unique(bed.peak)
# Name every peak "<chr>-<start>-<end>" and keep the same label as a metadata
# column.
names(bed.peak) <- paste(seqnames(bed.peak),
                         start(bed.peak),
                         end(bed.peak),
                         sep = "-")
bed.peak$PeakName <- names(bed.peak)
# Reorder/subset the peaks to follow the row order of the ATAC matrix.
bed.peak <- bed.peak[rownames(matrix.atac)]

In [8]:
# Bundle the raw ATAC counts with their peak ranges.
ATAC.se <- SummarizedExperiment(
  assays = SimpleList(counts = matrix.atac_count),
  rowRanges = bed.peak
)
# The counts are now held by ATAC.se; drop the standalone copy.
rm(matrix.atac_count)
# Add the per-peak GC bias annotation computed against the hg38 genome.
ATAC.se <- chromVAR::addGCBias(ATAC.se, genome = BSgenome.Hsapiens.UCSC.hg38)
ATAC.se

class: RangedSummarizedExperiment 
dim: 153522 6266 
metadata(0):
assays(1): counts
rownames(153522): chr1-10002583-10003163 chr1-100028213-100029660 ...
  chrX-9940388-9940697 chrX-9995829-9996183
rowData names(2): PeakName bias
colnames(6266): e10l1_AAACCAGGTCACAGGG-1 e10l1_AAACCGCCAAGGAATA-1 ...
  e10l2_TTTGTGAGTTAGCGTT-1 e10l2_TTTGTGAGTTCCGGAC-1
colData names(0):

In [9]:
# Background peaks are drawn by random sampling, so fix the RNG seed first:
# without it the background correlations and the p-values derived from them
# change on every run.
set.seed(123)
bg <- chromVAR::getBackgroundPeaks(ATAC.se, niterations = 100)

In [10]:
# Lookup tables mapping gene / peak names to their row position in the RNA and
# ATAC matrices. seq_len() stays correct even for a zero-row matrix.
index.gene <- data.frame(gene_name = rownames(matrix.rna),
                         index = seq_len(nrow(matrix.rna)),
                         row.names = rownames(matrix.rna))

index.peak <- data.frame(peak_name = rownames(matrix.atac),
                         index = seq_len(nrow(matrix.atac)),
                         row.names = rownames(matrix.atac))

# Translate every filtered pair into (gene row, peak row) indices.
# match() performs exact matching; character row-indexing of a data.frame
# would fall back to partial matching and return NA for unknown names.
genePeakOv <- data.frame(
  queryHits = index.gene$index[match(pairs.E2G.filter$TargetGene,
                                     index.gene$gene_name)],
  subjectHits = index.peak$index[match(pairs.E2G.filter$PeakName,
                                       index.peak$peak_name)]
)
# Every pair must resolve to a row in both matrices before it is correlated.
stopifnot(!anyNA(genePeakOv$queryHits), !anyNA(genePeakOv$subjectHits))
genePeakOv

queryHits,subjectHits
<int>,<int>
321,1
176,1
294,1
295,1
195,1
266,1
168,1
298,1
243,1
246,1


Run FigR prediction

In [11]:
# Compute peak-gene correlations for all listed pairs together with the
# correlations against the background peaks, and time the call.
start_time <- Sys.time()
ObsCor <- PeakGeneCor(
  ATAC = matrix.atac,
  RNA = matrix.rna,
  OV = genePeakOv,
  chunkSize = 500,
  ncores = 16,
  bg = bg
)
end_time <- Sys.time()
execution_time <- difftime(end_time, start_time)
execution_time
ObsCor

Running in parallel using  16 cores ..
Computing observed correlations ..
Finished!

Time Elapsed:  0.44210410118103 secs 

Computing background correlations ..


Loading required package: parallel




Time Elapsed:  22.5752475261688 secs 



Time difference of 32.04453 secs

Gene,Peak,rObs,rBg1,rBg2,rBg3,rBg4,rBg5,rBg6,rBg7,⋯,rBg91,rBg92,rBg93,rBg94,rBg95,rBg96,rBg97,rBg98,rBg99,rBg100
<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
321,1,0.017126846,0.039864734,0.016653874,0.015611746,-0.005931205,0.04128343,0.01725659,-0.006229939,⋯,0.01708859,0.01815164,0.04436050,0.01381457,-0.006855891,-0.006855891,0.01647375,0.01673224,0.038184734,-0.006445145
176,1,-0.004486999,0.009772838,-0.004007834,0.009266287,0.015374973,0.02493168,0.02488278,0.026770464,⋯,0.02893904,0.01293845,0.01149089,-0.02073390,0.007127084,-0.004973629,0.00138315,-0.01529702,-0.002818709,0.004019534
294,1,0.017126846,0.039864734,0.016653874,0.015611746,-0.005931205,0.04128343,0.01725659,-0.006229939,⋯,0.01708859,0.01815164,0.04436050,0.01381457,-0.006855891,-0.006855891,0.01647375,0.01673224,0.038184734,-0.006445145
295,1,-0.004486999,0.009772838,-0.004007834,0.009266287,0.015374973,0.02493168,0.02488278,0.026770464,⋯,0.02893904,0.01293845,0.01149089,-0.02073390,0.007127084,-0.004973629,0.00138315,-0.01529702,-0.002818709,0.004019534
195,1,0.017126846,0.039864734,0.016653874,0.015611746,-0.005931205,0.04128343,0.01725659,-0.006229939,⋯,0.01708859,0.01815164,0.04436050,0.01381457,-0.006855891,-0.006855891,0.01647375,0.01673224,0.038184734,-0.006445145
266,1,-0.004486999,0.009772838,-0.004007834,0.009266287,0.015374973,0.02493168,0.02488278,0.026770464,⋯,0.02893904,0.01293845,0.01149089,-0.02073390,0.007127084,-0.004973629,0.00138315,-0.01529702,-0.002818709,0.004019534
168,1,0.017126846,0.039864734,0.016653874,0.015611746,-0.005931205,0.04128343,0.01725659,-0.006229939,⋯,0.01708859,0.01815164,0.04436050,0.01381457,-0.006855891,-0.006855891,0.01647375,0.01673224,0.038184734,-0.006445145
298,1,-0.004486999,0.009772838,-0.004007834,0.009266287,0.015374973,0.02493168,0.02488278,0.026770464,⋯,0.02893904,0.01293845,0.01149089,-0.02073390,0.007127084,-0.004973629,0.00138315,-0.01529702,-0.002818709,0.004019534
243,1,0.017126846,0.039864734,0.016653874,0.015611746,-0.005931205,0.04128343,0.01725659,-0.006229939,⋯,0.01708859,0.01815164,0.04436050,0.01381457,-0.006855891,-0.006855891,0.01647375,0.01673224,0.038184734,-0.006445145
246,1,-0.004486999,0.009772838,-0.004007834,0.009266287,0.015374973,0.02493168,0.02488278,0.026770464,⋯,0.02893904,0.01293845,0.01149089,-0.02073390,0.007127084,-0.004973629,0.00138315,-0.01529702,-0.002818709,0.004019534


In [13]:
# Attach the observed correlation and a background-based significance estimate
# to every filtered pair.
pairs.E2G.res <- pairs.E2G.filter
# The correlation table is joined by position, so it must have one row per pair.
stopifnot(nrow(ObsCor) == length(pairs.E2G.res))

# Pick the background columns by name (rBg1, rBg2, ...) rather than by the
# fixed positions 4:103, so the summary stays correct if the number of
# background iterations changes.
bg_cols <- grep("^rBg", colnames(ObsCor))
stopifnot(length(bg_cols) > 1)
bg_cor <- as.matrix(ObsCor[, bg_cols, drop = FALSE])

pairs.E2G.res$rObs <- ObsCor[, "rObs"]
pairs.E2G.res$rBgSD <- matrixStats::rowSds(bg_cor)
pairs.E2G.res$rBgMean <- rowMeans(bg_cor)
# The background matrix is large; release it once it has been summarised.
rm(bg_cor)

# One-sided upper-tail p-value of the observed correlation under a normal
# distribution fitted to the background. lower.tail = FALSE computes the tail
# directly, avoiding the precision loss of `1 - pnorm(...)` for very small
# p-values.
pairs.E2G.res$pvalZ <- stats::pnorm(q = pairs.E2G.res$rObs,
                                    mean = pairs.E2G.res$rBgMean,
                                    sd = pairs.E2G.res$rBgSD,
                                    lower.tail = FALSE)
pairs.E2G.res

GRanges object with 10205326 ranges and 7 metadata columns:
             seqnames            ranges strand |  TargetGene
                <Rle>         <IRanges>  <Rle> | <character>
         [1]     chr1 10002583-10003163      * |     AADACL4
         [2]     chr1 10002583-10003163      * |       ACOT7
         [3]     chr1 10002583-10003163      * |      AGTRAP
         [4]     chr1 10002583-10003163      * |    C1orf167
         [5]     chr1 10002583-10003163      * |      CAMTA1
         ...      ...               ...    ... .         ...
  [10205322]     chrX   9995829-9996183      * |       TBL1X
  [10205323]     chrX   9995829-9996183      * |      TCEANC
  [10205324]     chrX   9995829-9996183      * |      TMSB4X
  [10205325]     chrX   9995829-9996183      * |     TRAPPC2
  [10205326]     chrX   9995829-9996183      * |        WWC3
                           PeakName               PairName       rObs     rBgSD
                        <character>            <character>  <numeri

Save results

In [14]:
# Create the output directory (including missing parents) and store the
# annotated pairs as an RDS file.
dir.create(dir.output, recursive = TRUE)
saveRDS(
  pairs.E2G.res,
  file.path(dir.output, "pairs.E2G.res.rds")
)
pairs.E2G.res

GRanges object with 10205326 ranges and 7 metadata columns:
             seqnames            ranges strand |  TargetGene
                <Rle>         <IRanges>  <Rle> | <character>
         [1]     chr1 10002583-10003163      * |     AADACL4
         [2]     chr1 10002583-10003163      * |       ACOT7
         [3]     chr1 10002583-10003163      * |      AGTRAP
         [4]     chr1 10002583-10003163      * |    C1orf167
         [5]     chr1 10002583-10003163      * |      CAMTA1
         ...      ...               ...    ... .         ...
  [10205322]     chrX   9995829-9996183      * |       TBL1X
  [10205323]     chrX   9995829-9996183      * |      TCEANC
  [10205324]     chrX   9995829-9996183      * |      TMSB4X
  [10205325]     chrX   9995829-9996183      * |     TRAPPC2
  [10205326]     chrX   9995829-9996183      * |        WWC3
                           PeakName               PairName       rObs     rBgSD
                        <character>            <character>  <numeri

In [15]:
# Flatten the annotated pairs into a data frame, rename the first column to
# "chr" and tag every row with the cell type.
df.output <- as.data.frame(pairs.E2G.res)
names(df.output)[1] <- "chr"
df.output$CellType <- "K562"
# Write the table as a tab-separated file without row names or quoting.
data.table::fwrite(
  df.output,
  file = file.path(dir.output, "pairs.E2G.res.tsv.gz"),
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)
df.output

chr,start,end,width,strand,TargetGene,PeakName,PairName,rObs,rBgSD,rBgMean,pvalZ,CellType
<fct>,<int>,<int>,<int>,<fct>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
chr1,10002583,10003163,581,*,AADACL4,chr1-10002583-10003163,chr1-10002583-10003163_AADACL4,0.017126846,0.01592321,0.005655146,0.2356274,K562
chr1,10002583,10003163,581,*,ACOT7,chr1-10002583-10003163,chr1-10002583-10003163_ACOT7,-0.004486999,0.01356252,0.008585765,0.8324493,K562
chr1,10002583,10003163,581,*,AGTRAP,chr1-10002583-10003163,chr1-10002583-10003163_AGTRAP,0.017126846,0.01592321,0.005655146,0.2356274,K562
chr1,10002583,10003163,581,*,C1orf167,chr1-10002583-10003163,chr1-10002583-10003163_C1orf167,-0.004486999,0.01356252,0.008585765,0.8324493,K562
chr1,10002583,10003163,581,*,CAMTA1,chr1-10002583-10003163,chr1-10002583-10003163_CAMTA1,0.017126846,0.01592321,0.005655146,0.2356274,K562
chr1,10002583,10003163,581,*,CASZ1,chr1-10002583-10003163,chr1-10002583-10003163_CASZ1,-0.004486999,0.01356252,0.008585765,0.8324493,K562
chr1,10002583,10003163,581,*,CHD5,chr1-10002583-10003163,chr1-10002583-10003163_CHD5,0.017126846,0.01592321,0.005655146,0.2356274,K562
chr1,10002583,10003163,581,*,CLCN6,chr1-10002583-10003163,chr1-10002583-10003163_CLCN6,-0.004486999,0.01356252,0.008585765,0.8324493,K562
chr1,10002583,10003163,581,*,CLSTN1,chr1-10002583-10003163,chr1-10002583-10003163_CLSTN1,0.017126846,0.01592321,0.005655146,0.2356274,K562
chr1,10002583,10003163,581,*,CTNNBIP1,chr1-10002583-10003163,chr1-10002583-10003163_CTNNBIP1,-0.004486999,0.01356252,0.008585765,0.8324493,K562


In [16]:
# Print the R version, platform and attached packages used for this run.
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux 8.9 (Ootpa)

Matrix products: default
BLAS/LAPACK: /maps/projects/ralab/people/lpm537/software/anaconda3/envs/Notebook_E2G_240505/lib/libopenblasp-r0.3.27.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Copenhagen
tzcode source: system (glibc)

attached base packages:
 [1] parallel  grid      stats4    stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] pbmcapply_1.5.1                   foreach_1.5.2                    
 [3] BSgenome.Hsapiens.UCSC.hg38_1.4.5 BSgenome_1.