Combine the Xu et al. K562 and PBMC datasets to calculate Kendall correlations

In [1]:
library(Seurat)
library(Signac)
library(magrittr)
library(genomation)
library(GenomicRanges)
library(Matrix)
library(ggplot2)

Loading required package: SeuratObject

Loading required package: sp

‘SeuratObject’ was built under R 4.3.2 but the current version is
4.3.3; it is recomended that you reinstall ‘SeuratObject’ as the ABI
for R may have changed

‘SeuratObject’ was built with package ‘Matrix’ 1.6.3 but the current
version is 1.6.5; it is recomended that you reinstall ‘SeuratObject’ as
the ABI for ‘Matrix’ may have changed


Attaching package: ‘SeuratObject’


The following object is masked from ‘package:base’:

    intersect


Loading required package: grid

“replacing previous import ‘Biostrings::pattern’ by ‘grid::pattern’ when loading ‘genomation’”
Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following object is masked from ‘package:SeuratObject’:

    intersect


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, ap

In [2]:
# Load the PBMC ATAC matrix saved as an RDS file by the
# "2.1.use_K562_Xu_peaks.240715" preparation step.
atac.matrix.PBMC = readRDS("/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/PBMC/1.prepare_data/2.1.use_K562_Xu_peaks.240715/atac.matrix.PBMC.rds")

In [3]:
# Load the PBMC RNA matrix saved as an RDS file by the same preparation step
# as the PBMC ATAC matrix.
rna.matrix.PBMC = readRDS("/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/PBMC/1.prepare_data/2.1.use_K562_Xu_peaks.240715/matrix.rna.PBMC.rds")

In [4]:
# Read the K562 ATAC table from the gzipped CSV (first column becomes the row
# names, column names are kept exactly as written in the file) and store it as
# a sparse matrix.
atac.matrix.K562 = Matrix(
  as.matrix(
    read.csv("/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_Xu/K562/Kendall/atac_matrix.csv.gz",
             row.names = 1,
             check.names = FALSE)
  ),
  sparse = TRUE
)

In [5]:
# Read the K562 RNA count table from the gzipped CSV (first column becomes the
# row names, column names are kept exactly as written in the file) and store it
# as a sparse matrix.
rna.matrix.K562 = Matrix(
  as.matrix(
    read.csv("/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_Xu/1.prepare_data/1.seurat_pipeline.240507/rna_count_matrix.csv.gz",
             row.names = 1,
             check.names = FALSE)
  ),
  sparse = TRUE
)
# Keep only the columns named in the K562 ATAC matrix, in the ATAC column order.
rna.matrix.K562 = rna.matrix.K562[, colnames(atac.matrix.K562)]

In [6]:
# Inspect the dimensions of the PBMC and K562 RNA matrices.
dim(rna.matrix.PBMC)
dim(rna.matrix.K562)

In [7]:
# Tabulate, position by position, whether the PBMC and K562 RNA matrices have
# the same row name (i.e. whether their rows are in the same order).
table(rownames(rna.matrix.PBMC) == rownames(rna.matrix.K562))


 TRUE 
62757 

In [8]:
# Inspect the dimensions of the PBMC and K562 ATAC matrices.
dim(atac.matrix.PBMC)
dim(atac.matrix.K562)

In [9]:
# Tabulate, position by position, whether the PBMC and K562 ATAC matrices have
# the same row name (i.e. whether their rows are in the same order).
table(rownames(atac.matrix.PBMC) == rownames(atac.matrix.K562))


 FALSE   TRUE 
157599      1 

In [10]:
# Show the first ten row names of the PBMC ATAC matrix.
rownames(atac.matrix.PBMC)[1:10]

In [11]:
# Show the first ten row names of the K562 ATAC matrix.
rownames(atac.matrix.K562)[1:10]

In [12]:
# Tabulate whether each PBMC ATAC row name occurs anywhere among the K562 ATAC
# row names (set membership, independent of row order).
table(rownames(atac.matrix.PBMC) %in% rownames(atac.matrix.K562))


  TRUE 
157600 

In [13]:
# Column-bind the K562 and PBMC RNA matrices (K562 columns first).
# cbind() pairs rows by position, not by name, so stop with an error if the two
# matrices do not list exactly the same row names in the same order.
stopifnot(identical(rownames(rna.matrix.PBMC), rownames(rna.matrix.K562)))
rna.matrix = cbind(rna.matrix.K562,rna.matrix.PBMC)

In [14]:
# Column-bind the K562 and PBMC ATAC matrices (K562 columns first). The PBMC
# rows are selected by K562 row name first, so both matrices are in the K562
# row order before cbind() pairs the rows by position.
atac.matrix = cbind(atac.matrix.K562,atac.matrix.PBMC[rownames(atac.matrix.K562),])

In [15]:
# Tabulate, position by position, whether the combined RNA and ATAC matrices
# have the same column name (i.e. whether their columns are in the same order).
table(colnames(rna.matrix) == colnames(atac.matrix))


 TRUE 
26004 

In [16]:
# Normalize the combined RNA matrix with NormalizeData(), using its default
# arguments.
rna.matrix.norm = NormalizeData(rna.matrix)

In [17]:
# Binarize the combined ATAC matrix with BinarizeCounts(); the result is the
# accessibility input for the Kendall computation below.
atac.matrix.bi = BinarizeCounts(atac.matrix)

In [18]:
# Read the candidate pairs table written by the K562 run of the pipeline. The
# file has a header line, and every extra column is kept as metadata.
pairs.E2G = readGeneric(
  "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_Xu/K562/Kendall/Pairs.tsv.gz",
  header = TRUE,
  keep.all.metadata = TRUE
)

In [20]:
# Load required packages
suppressPackageStartupMessages({
  library(GenomicRanges)
  library(genomation)
  library(foreach)
  library(Signac)
  library(Seurat)
  library(Rcpp)
  library(data.table)
  library(Matrix)
#  library(anndata)
  library(tools)
})

## Define functions --------------------------------------------------------------------------------

# Calculate, for every column of a logical matrix, the number of concordant
# minus the number of discordant row pairs.
#
# The C++ source in the string below is compiled at run time by cppFunction()
# and exposed to R as count_diff(y_matrix_sorted).
#
# Argument:
#   y_matrix_sorted  Logical matrix. The caller kendall_one_gene() passes it with
#                    rows ordered by decreasing value of the other variable.
#
# Returns:
#   Numeric vector with one entry per column. Rows are scanned top to bottom:
#   a FALSE row adds the number of TRUE rows above it to the concordant count,
#   a TRUE row adds the number of FALSE rows above it to the discordant count
#   (called "disconcordant" in the code). The entry is concordant - discordant.
cppFunction('
NumericVector count_diff(LogicalMatrix y_matrix_sorted) {
    int n = y_matrix_sorted.nrow();
    int m = y_matrix_sorted.ncol();
    NumericVector result(m);
    for (int j = 0; j < m; j++) {
        long long concordant = 0;
        long long disconcordant = 0;
        long long cumsum = 0;
        for (int i = 0; i < n; i++) {
            bool tmp = y_matrix_sorted(i, j);
            cumsum += tmp;
            if (tmp) {
                disconcordant += (i + 1 - cumsum);
            } else {
                concordant += cumsum;
            }
        }
        result[j] = static_cast<double>(concordant - disconcordant);
    }
    return result;
}
')

# Compute Kendall's tau-b between a single gene and multiple enhancers.
#
# Arguments:
#   x         Numeric vector with one value per cell (the gene's expression).
#             Assumed to contain no NA values.
#   y.matrix  Matrix-like object (dense or sparse) with one row per element of x
#             and one column per enhancer. Entries are interpreted as logical:
#             any non-zero value counts as TRUE.
#
# Returns:
#   Numeric vector with one tau-b value per column of y.matrix. A value is NaN
#   when x is constant or the corresponding column is constant, because the
#   denominator is then zero.
kendall_one_gene = function(x, y.matrix){
  
  # Sort x in decreasing order and build y once, in the same row order, as a
  # dense logical matrix. Every count_diff() call below reuses this matrix, so
  # the sparse input is subset and densified only a single time.
  ord = order(x, decreasing = TRUE)
  x.sorted = x[ord]
  y.sorted = as.matrix(y.matrix[ord, , drop = FALSE]) != 0
  
  # Concordant minus discordant pairs, ignoring ties in x for now
  n.diff = count_diff(y.sorted)
  
  # Pairs tied in x are neither concordant nor discordant. Because x.sorted is
  # sorted, equal values form contiguous runs, so remove the contribution of
  # each run of length > 1 by rescoring just those rows.
  runs = rle(x.sorted)
  run.end = cumsum(runs$lengths)
  run.start = run.end - runs$lengths + 1
  for (k in which(runs$lengths > 1)) {
    n.diff = 
      n.diff - 
      count_diff(y.sorted[run.start[k]:run.end[k], , drop = FALSE])
  }
  
  # Calculate Kendall's tau-b coefficient
  l = length(x)
  # Number of TRUE entries per column, using the same non-zero-is-TRUE
  # interpretation that count_diff() applies
  s = colSums(y.sorted)
  tx = table(x)
  
  n0 = choose(l, 2)                          # all pairs of cells
  n1 = sum(choose(tx, 2))                    # pairs tied in x
  n2 = (s*(s-1) + (l-s)*(l-s-1))/2           # pairs tied in y (both TRUE or both FALSE)
  
  tau_b = n.diff / sqrt((n0 - n1) * (n0 - n2))
  
  return(tau_b)
}


# Compute Kendall correlation between multiple genes and multiple enhancers.
#
# Arguments:
#   bed.E2G                GRanges with one range per gene-enhancer pair; its
#                          metadata columns hold the gene and enhancer names.
#   data.RNA               Matrix-like object whose row names are gene names.
#   data.ATAC              Matrix-like object whose row names are enhancer names.
#                          Its columns are paired by position with the columns
#                          of data.RNA.
#   colname.gene_name      Metadata column of bed.E2G holding the gene name.
#   colname.enhancer_name  Metadata column of bed.E2G holding the enhancer name.
#   colname.output         Metadata column in which the correlation is stored.
#
# Returns:
#   GRanges containing the pairs whose gene is a row of data.RNA and whose
#   enhancer is a row of data.ATAC, grouped by gene in order of first
#   appearance, with the correlation stored in colname.output. If no pair
#   passes the filter, an empty GRanges with an empty colname.output column is
#   returned.
kendall_mutliple_genes = function(bed.E2G,
                                  data.RNA,
                                  data.ATAC,
                                  colname.gene_name = "gene_name",
                                  colname.enhancer_name = "peak_name",
                                  colname.output = "Kendall") {
  
  # Filter E2G pairs based on presence in RNA and ATAC data
  keep = 
    mcols(bed.E2G)[,colname.gene_name] %in% rownames(data.RNA) &
    mcols(bed.E2G)[,colname.enhancer_name] %in% rownames(data.ATAC)
  bed.E2G.filter = bed.E2G[keep]
  
  # Nothing to correlate: return the empty set with the output column present
  # instead of the NULL that foreach() yields for zero iterations
  if (length(bed.E2G.filter) == 0) {
    mcols(bed.E2G.filter)[[colname.output]] = numeric(0)
    return(bed.E2G.filter)
  }
  
  # Group the row indices by gene once, rather than rescanning the whole gene
  # column for every gene. Levels follow first appearance so the output order
  # is the same as iterating over unique() of the gene column.
  gene.col = as.character(mcols(bed.E2G.filter)[,colname.gene_name])
  idx.by.gene = split(seq_along(gene.col),
                      factor(gene.col, levels = unique(gene.col)))
  
  # Compute Kendall correlation for each gene
  bed.E2G.output <- foreach(gene.name = names(idx.by.gene),
                            idx = idx.by.gene,
                            .combine = 'c') %do% {
                              
                              bed.E2G.tmp <- bed.E2G.filter[idx]
                              
                              # Cells become rows of the enhancer matrix so that
                              # they line up with the gene's expression vector
                              mcols(bed.E2G.tmp)[, colname.output] = 
                                kendall_one_gene(as.numeric(data.RNA[gene.name, ]),
                                                 t(data.ATAC[mcols(bed.E2G.tmp)[,colname.enhancer_name], , drop = FALSE]))
                              bed.E2G.tmp
                            }
  return(bed.E2G.output)
}
## -------------------------------------------------------------------------------------------------

In [21]:
# Annotate the candidate pairs with the Kendall correlation computed on the
# combined, normalized RNA matrix and the combined, binarized ATAC matrix.
# pairs.E2G is overwritten with the result.
pairs.E2G = kendall_mutliple_genes(
  bed.E2G = pairs.E2G,
  data.RNA = rna.matrix.norm,
  data.ATAC = atac.matrix.bi,
  colname.gene_name = "TargetGene",
  colname.enhancer_name = "PeakName",
  colname.output = "Kendall"
)

In [22]:
# Print the annotated pairs to check the new "Kendall" metadata column.
pairs.E2G

GRanges object with 11201948 ranges and 4 metadata columns:
             seqnames            ranges strand |  TargetGene
                <Rle>         <IRanges>  <Rle> | <character>
         [1]     chr1 10001196-10001745      * |       ACOT7
         [2]     chr1 10003541-10003870      * |       ACOT7
         [3]     chr1 10003956-10004703      * |       ACOT7
         [4]     chr1 10006252-10006807      * |       ACOT7
         [5]     chr1 10015764-10016393      * |       ACOT7
         ...      ...               ...    ... .         ...
  [11201944]     chrX 90471599-90471978      * |       KLHL4
  [11201945]     chrX 90472102-90472252      * |       KLHL4
  [11201946]     chrX 90472324-90472773      * |       KLHL4
  [11201947]     chrX 90473654-90474926      * |       KLHL4
  [11201948]     chrX 90575772-90576567      * |       KLHL4
                           PeakName               PairName   Kendall
                        <character>            <character> <numeric>
         

In [23]:
# Create the output directory (including missing parents) and save the
# annotated pairs as an RDS file.
dir.output = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/PBMC/1.prepare_data/2.2.Kendall_PBMC_K562.240715/"
dir.create(dir.output, recursive = TRUE)
saveRDS(pairs.E2G, file.path(dir.output, "pairs.E2G.rds"))

# Also export the pairs as a gzipped, tab-separated table: the first column is
# renamed to "chr" and a constant CellType column is appended.
# NOTE(review): CellType is hard-coded to "K562" although the matrices combine
# K562 and PBMC cells - confirm this label is what downstream steps expect.
df.output = as.data.frame(pairs.E2G)
names(df.output)[1] = "chr"
df.output$CellType = "K562"
data.table::fwrite(df.output,
                   file = file.path(dir.output, "pairs.E2G.res.tsv.gz"),
                   sep = "\t",
                   quote = FALSE,
                   row.names = FALSE)

In [25]:
# Save the combined (un-normalized) RNA matrix next to the pairs output.
saveRDS(rna.matrix, file.path(dir.output, "rna.matrix.rds"))

In [26]:
# Save the combined (un-binarized) ATAC matrix next to the pairs output.
saveRDS(atac.matrix, file.path(dir.output, "atac.matrix.rds"))

In [27]:
# Record the R version, platform and attached package versions used for this run.
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux 8.10 (Ootpa)

Matrix products: default
BLAS/LAPACK: /maps/projects/ralab/people/lpm537/software/anaconda3/envs/Notebook_E2G_240505/lib/libopenblasp-r0.3.27.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Copenhagen
tzcode source: system (glibc)

attached base packages:
 [1] tools     stats4    grid      stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] data.table_1.15.2    Rcpp_1.0.12          foreach_1.5.2       
 [4] ggplot2_3.5.1        Matrix_1.6-5         Genomic