In [1]:
library(FigR)
library(Seurat)
library(genomation)
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg38)
library(foreach)

Loading required package: Matrix

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats,

Specify file path

In [2]:
path.pairs.E2G = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_Wang/K562/Kendall/Pairs.tsv.gz"
path.matrix.atac_count = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_Wang/K562/Kendall/atac_matrix.csv.gz"
path.matrix.rna_count = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_Wang/1.prepare_data/1.seurat_pipeline.240507/rna_count_matrix.csv.gz"
dir.output = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_Wang/3.Genome_wide_prediction/FigR/FigR.240517/"

Import candidate E-G pairs

In [3]:
pairs.E2G = readGeneric(path.pairs.E2G,
                        header = T,
                        keep.all.metadata = T)

Import ATAC matrix

In [4]:
matrix.atac_count = read.csv(path.matrix.atac_count,
                             row.names = 1,
                             check.names = F)
matrix.atac_count = Matrix(as.matrix(matrix.atac_count), sparse = TRUE)
matrix.atac = centerCounts(matrix.atac_count)

Matrix object input detectedCentering counts for cells sequentially in groups of size  1000  ..

Computing centered counts for cells:  1  to  1000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  1001  to  2000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  2001  to  3000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  3001  to  4000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  4001  to  4574 ..
Computing centered counts per cell using mean reads in features ..

Merging results..
Done!


Import RNA matrix

In [5]:
matrix.rna_count = read.csv(path.matrix.rna_count,
                            row.names = 1,
                            check.names = F)
matrix.rna_count = Matrix(as.matrix(matrix.rna_count), sparse = TRUE)
matrix.rna_count = matrix.rna_count[,colnames(matrix.atac)]
matrix.rna = matrix.rna_count[rowSums(matrix.rna_count) > 0,]
matrix.rna = NormalizeData(matrix.rna_count)
rm(matrix.rna_count)

In [6]:
pairs.E2G.filter = pairs.E2G[pairs.E2G$TargetGene %in% rownames(matrix.rna) &
                             pairs.E2G$PeakName %in% rownames(matrix.atac)]

Prepare FigR input data

In [7]:
bed.peak = pairs.E2G
mcols(bed.peak) = NULL
bed.peak = unique(bed.peak)
bed.peak$PeakName = paste(seqnames(bed.peak),
                          start(bed.peak),
                          end(bed.peak),
                          sep = "-")
names(bed.peak) = bed.peak$PeakName
bed.peak = bed.peak[rownames(matrix.atac)]

In [8]:
ATAC.se = SummarizedExperiment(assays = SimpleList(counts = matrix.atac_count),
                               rowRanges = bed.peak)
rm(matrix.atac_count)
ATAC.se <- chromVAR::addGCBias(ATAC.se, genome = BSgenome.Hsapiens.UCSC.hg38)
ATAC.se

class: RangedSummarizedExperiment 
dim: 153715 4574 
metadata(0):
assays(1): counts
rownames(153715): chr1-100028738-100029317 chr1-100029410-100029652 ...
  chrX-9940445-9940756 chrX-9995833-9996097
rowData names(2): PeakName bias
colnames(4574): K562_Wang_AAACAGCCAATGCCCG-1
  K562_Wang_AAACAGCCACCCTCAC-1 ... K562_Wang_TCCATATTCAAACCTA-1
  K562_Wang_TGACTTCGTGGATTAT-1
colData names(0):

In [9]:
bg <- chromVAR::getBackgroundPeaks(ATAC.se, niterations = 100)

In [10]:
index.gene = data.frame(gene_name = rownames(matrix.rna),
                       index = 1:nrow(matrix.rna))
rownames(index.gene) = index.gene$gene_name

index.peak = data.frame(peak_name = rownames(matrix.atac),
                       index = 1:nrow(matrix.atac))
rownames(index.peak) = index.peak$peak_name

genePeakOv = data.frame(queryHits = index.gene[pairs.E2G.filter$TargetGene, "index"],
                        subjectHits = index.peak[pairs.E2G.filter$PeakName, "index"])
genePeakOv

queryHits,subjectHits
<int>,<int>
2483,1
2426,1
2506,1
2496,1
2519,1
2453,1
2515,1
2479,1
2495,1
2490,1


Run FigR prediction

In [11]:
start_time <- Sys.time()
ObsCor = PeakGeneCor(ATAC = matrix.atac, 
                     RNA = matrix.rna,
                     OV = genePeakOv, 
                     chunkSize = 500,
                     ncores = 16, 
                     bg = bg)
end_time <- Sys.time()
execution_time <- end_time - start_time
execution_time
ObsCor

Running in parallel using  16 cores ..
Computing observed correlations ..
Finished!

Time Elapsed:  0.331213474273682 secs 

Computing background correlations ..


Loading required package: parallel




Time Elapsed:  24.6826527118683 secs 



Time difference of 37.6762 secs

Gene,Peak,rObs,rBg1,rBg2,rBg3,rBg4,rBg5,rBg6,rBg7,⋯,rBg91,rBg92,rBg93,rBg94,rBg95,rBg96,rBg97,rBg98,rBg99,rBg100
<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2483,1,0.004720585,-0.009008794,0.02181716,0.02061151,-0.008880727,-0.009975507,-0.002138583,0.022733820,⋯,-0.0016244340,-0.014327464,0.001115147,0.02381226,0.015270250,-0.02707097,0.02621844,0.015320757,-0.004315474,0.003484317
2426,1,-0.012274704,0.001958824,0.01673719,0.02380841,0.004787301,0.037875927,0.003711133,-0.001758795,⋯,0.0008778608,0.003190378,0.014909170,0.02420112,-0.001300163,-0.02836688,-0.02010790,0.001543144,-0.009089388,-0.015230127
2506,1,0.004720585,-0.009008794,0.02181716,0.02061151,-0.008880727,-0.009975507,-0.002138583,0.022733820,⋯,-0.0016244340,-0.014327464,0.001115147,0.02381226,0.015270250,-0.02707097,0.02621844,0.015320757,-0.004315474,0.003484317
2496,1,-0.012274704,0.001958824,0.01673719,0.02380841,0.004787301,0.037875927,0.003711133,-0.001758795,⋯,0.0008778608,0.003190378,0.014909170,0.02420112,-0.001300163,-0.02836688,-0.02010790,0.001543144,-0.009089388,-0.015230127
2519,1,0.004720585,-0.009008794,0.02181716,0.02061151,-0.008880727,-0.009975507,-0.002138583,0.022733820,⋯,-0.0016244340,-0.014327464,0.001115147,0.02381226,0.015270250,-0.02707097,0.02621844,0.015320757,-0.004315474,0.003484317
2453,1,-0.012274704,0.001958824,0.01673719,0.02380841,0.004787301,0.037875927,0.003711133,-0.001758795,⋯,0.0008778608,0.003190378,0.014909170,0.02420112,-0.001300163,-0.02836688,-0.02010790,0.001543144,-0.009089388,-0.015230127
2515,1,0.004720585,-0.009008794,0.02181716,0.02061151,-0.008880727,-0.009975507,-0.002138583,0.022733820,⋯,-0.0016244340,-0.014327464,0.001115147,0.02381226,0.015270250,-0.02707097,0.02621844,0.015320757,-0.004315474,0.003484317
2479,1,-0.012274704,0.001958824,0.01673719,0.02380841,0.004787301,0.037875927,0.003711133,-0.001758795,⋯,0.0008778608,0.003190378,0.014909170,0.02420112,-0.001300163,-0.02836688,-0.02010790,0.001543144,-0.009089388,-0.015230127
2495,1,0.004720585,-0.009008794,0.02181716,0.02061151,-0.008880727,-0.009975507,-0.002138583,0.022733820,⋯,-0.0016244340,-0.014327464,0.001115147,0.02381226,0.015270250,-0.02707097,0.02621844,0.015320757,-0.004315474,0.003484317
2490,1,-0.012274704,0.001958824,0.01673719,0.02380841,0.004787301,0.037875927,0.003711133,-0.001758795,⋯,0.0008778608,0.003190378,0.014909170,0.02420112,-0.001300163,-0.02836688,-0.02010790,0.001543144,-0.009089388,-0.015230127


In [12]:
pairs.E2G.res = pairs.E2G.filter
pairs.E2G.res$rObs = ObsCor[,"rObs"]
pairs.E2G.res$rBgSD <- matrixStats::rowSds(as.matrix(ObsCor[, 4:103]))
pairs.E2G.res$rBgMean <- rowMeans(ObsCor[, 4:103])
pairs.E2G.res$pvalZ <- 1 - stats::pnorm(q = pairs.E2G.res$rObs, 
                                        mean = pairs.E2G.res$rBgMean,
                                        sd = pairs.E2G.res$rBgSD)
pairs.E2G.res

GRanges object with 10231864 ranges and 7 metadata columns:
             seqnames              ranges strand |  TargetGene
                <Rle>           <IRanges>  <Rle> | <character>
         [1]     chr1 100028738-100029317      * |         AGL
         [2]     chr1 100028738-100029317      * |       ALG14
         [3]     chr1 100028738-100029317      * |      CDC14A
         [4]     chr1 100028738-100029317      * |         DBT
         [5]     chr1 100028738-100029317      * |        DPH5
         ...      ...                 ...    ... .         ...
  [10231860]     chrX     9995833-9996097      * |       TBL1X
  [10231861]     chrX     9995833-9996097      * |      TCEANC
  [10231862]     chrX     9995833-9996097      * |      TMSB4X
  [10231863]     chrX     9995833-9996097      * |     TRAPPC2
  [10231864]     chrX     9995833-9996097      * |        WWC3
                           PeakName               PairName        rObs
                        <character>            <ch

Save results

In [13]:
dir.create(dir.output,recursive = T)
saveRDS(pairs.E2G.res,
        paste(dir.output,"pairs.E2G.res.rds",sep = "/"))
pairs.E2G.res

GRanges object with 10231864 ranges and 7 metadata columns:
             seqnames              ranges strand |  TargetGene
                <Rle>           <IRanges>  <Rle> | <character>
         [1]     chr1 100028738-100029317      * |         AGL
         [2]     chr1 100028738-100029317      * |       ALG14
         [3]     chr1 100028738-100029317      * |      CDC14A
         [4]     chr1 100028738-100029317      * |         DBT
         [5]     chr1 100028738-100029317      * |        DPH5
         ...      ...                 ...    ... .         ...
  [10231860]     chrX     9995833-9996097      * |       TBL1X
  [10231861]     chrX     9995833-9996097      * |      TCEANC
  [10231862]     chrX     9995833-9996097      * |      TMSB4X
  [10231863]     chrX     9995833-9996097      * |     TRAPPC2
  [10231864]     chrX     9995833-9996097      * |        WWC3
                           PeakName               PairName        rObs
                        <character>            <ch

In [14]:
df.output = as.data.frame(pairs.E2G.res)
colnames(df.output)[1] = "chr"
df.output[,"CellType"] = "K562"
data.table::fwrite(df.output,
                   file = paste(dir.output,"pairs.E2G.res.tsv.gz",sep = "/"),
                   row.names = F,
                   quote = F,
                   sep = "\t")
df.output

chr,start,end,width,strand,TargetGene,PeakName,PairName,rObs,rBgSD,rBgMean,pvalZ,CellType
<fct>,<int>,<int>,<int>,<fct>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
chr1,100028738,100029317,580,*,AGL,chr1-100028738-100029317,chr1-100028738-100029317_AGL,0.004720585,0.01527687,0.006546992,0.5475816,K562
chr1,100028738,100029317,580,*,ALG14,chr1-100028738-100029317,chr1-100028738-100029317_ALG14,-0.012274704,0.01419180,0.002619505,0.8530246,K562
chr1,100028738,100029317,580,*,CDC14A,chr1-100028738-100029317,chr1-100028738-100029317_CDC14A,0.004720585,0.01527687,0.006546992,0.5475816,K562
chr1,100028738,100029317,580,*,DBT,chr1-100028738-100029317,chr1-100028738-100029317_DBT,-0.012274704,0.01419180,0.002619505,0.8530246,K562
chr1,100028738,100029317,580,*,DPH5,chr1-100028738-100029317,chr1-100028738-100029317_DPH5,0.004720585,0.01527687,0.006546992,0.5475816,K562
chr1,100028738,100029317,580,*,DPYD,chr1-100028738-100029317,chr1-100028738-100029317_DPYD,-0.012274704,0.01419180,0.002619505,0.8530246,K562
chr1,100028738,100029317,580,*,EXTL2,chr1-100028738-100029317,chr1-100028738-100029317_EXTL2,0.004720585,0.01527687,0.006546992,0.5475816,K562
chr1,100028738,100029317,580,*,FRRS1,chr1-100028738-100029317,chr1-100028738-100029317_FRRS1,-0.012274704,0.01419180,0.002619505,0.8530246,K562
chr1,100028738,100029317,580,*,LRRC39,chr1-100028738-100029317,chr1-100028738-100029317_LRRC39,0.004720585,0.01527687,0.006546992,0.5475816,K562
chr1,100028738,100029317,580,*,MFSD14A,chr1-100028738-100029317,chr1-100028738-100029317_MFSD14A,-0.012274704,0.01419180,0.002619505,0.8530246,K562


In [15]:
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux 8.9 (Ootpa)

Matrix products: default
BLAS/LAPACK: /maps/projects/ralab/people/lpm537/software/anaconda3/envs/Notebook_E2G_240505/lib/libopenblasp-r0.3.27.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Copenhagen
tzcode source: system (glibc)

attached base packages:
 [1] parallel  grid      stats4    stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] pbmcapply_1.5.1                   foreach_1.5.2                    
 [3] BSgenome.Hsapiens.UCSC.hg38_1.4.5 BSgenome_1.