In [1]:
library(DIRECTNET)
library(Seurat)
library(Signac)
library(magrittr)
library(genomation)
library(GenomicRanges)
library(Matrix)
library(ggplot2)
library(EnsDb.Hsapiens.v86)
library(patchwork)
library(dplyr)



Loading required package: SeuratObject

Loading required package: sp

'SeuratObject' was built under R 4.3.2 but the current version is
4.3.3; it is recomended that you reinstall 'SeuratObject' as the ABI
for R may have changed

'SeuratObject' was built with package 'Matrix' 1.6.3 but the current
version is 1.6.5; it is recomended that you reinstall 'SeuratObject' as
the ABI for 'Matrix' may have changed


Attaching package: 'SeuratObject'


The following object is masked from 'package:base':

    intersect


Loading required package: grid

"replacing previous import 'Biostrings::pattern' by 'grid::pattern' when loading 'genomation'"
Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: 'BiocGenerics'


The following object is masked from 'package:SeuratObject':

    intersect


The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs


The following objects are masked from 'package:base':

    anyDuplicated, aperm, 

Specify file path

In [2]:
# ---- Input/output locations and run label (absolute, site-specific paths) ----
# Gene annotation GTF; parsed by map_gene_names() to pair gene names with Ensembl IDs.
gene_gtf_path = "/maps/projects/ralab/data/genome/hg38/gencode.v43.chr_patch_hapl_scaff.annotation.gtf"
# 8-column gene table; map_gene_names() uses its name and Ensembl_ID columns as the target naming.
abc_genes_path = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/241203/scE2G/ENCODE_rE2G/ABC/reference/hg38/CollapsedGeneBounds.hg38.TSS500bp.bed"
# path.seurat = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/GM12878/1.prepare_data/1.seurat_pipeline.240615/obj.seurat.qc.rds"
# ATAC count matrix (RDS); its column names define the cells kept throughout the notebook.
path.matrix.atac_count = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/250319/scE2G/test/results/Jurkat/Jurkat/Kendall/atac_matrix.rds"
# ATAC fragment file handed to CreateFragmentObject().
path.matrix.atac_frag = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/scE2G_input_Maya/Jurkat/atac_fragments.tsv.gz"
# RNA count table (CSV, first column = row names).
path.matrix.rna_count = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/processed/scE2G_input_Maya/Jurkat/rna_count_matrix.csv.gz"
# Candidate enhancer-gene pairs; the TargetGene and PairName columns are used later.
path.pairs.E2G = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/250319/scE2G/test/results/Jurkat/Jurkat/Kendall/Pairs.tsv.gz"
# Gene bounds table from which the TSS table (df.tss) is derived.
# NOTE(review): this file and abc_genes_path come from different pipeline snapshot
# directories (240508 vs 241203) -- confirm that both describe the same gene set.
path.CollapsedGeneBounds = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/ENCODE_rE2G/ABC/reference/hg38/CollapsedGeneBounds.hg38.bed"
# Output directory for the RDS/TSV results written at the end of the notebook.
dir.output = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/Jurkat/1.Genome_wide_prediction/DIRECTNET/DIRECTNET.250321/"
# Label used both as the single cell identity and as the CellType column of the output table.
celltype = "Jurkat"

In [3]:
n.cores = 8
dist.max = 1000000

Import ATAC matrix

In [4]:
matrix.atac_count = readRDS(path.matrix.atac_count)

Import RNA matrix

In [5]:
# Load the RNA count table (first CSV column holds the row names). check.names = FALSE
# keeps the column names exactly as written in the file so they can be matched to the
# ATAC matrix columns below.
matrix.rna = read.csv(path.matrix.rna_count,
                      row.names = 1,
                      check.names = FALSE)
matrix.rna = Matrix(as.matrix(matrix.rna), sparse = TRUE)
# Fail with a readable message instead of "subscript out of bounds" when the two
# modalities do not share all ATAC barcodes.
if (!all(colnames(matrix.atac_count) %in% colnames(matrix.rna))) {
  stop("Some ATAC barcodes are missing from the RNA count matrix.")
}
# Keep the ATAC cells, in ATAC column order, then discard rows with no counts.
# drop = FALSE keeps a matrix even if a single row or column remains.
matrix.rna = matrix.rna[, colnames(matrix.atac_count), drop = FALSE]
matrix.rna = matrix.rna[rowSums(matrix.rna) > 0, , drop = FALSE]

Map gene names

In [6]:
# Pull the value of one key out of a GTF attribute string.
#
# gtf_attributes:  a single attribute string such as 'gene_id "X"; gene_name "Y";'
# att_of_interest: the key to look up (e.g. "gene_name")
#
# The string is split on single spaces; the token following each occurrence of the
# key is returned with double quotes and semicolons removed. Returns NA when the key
# does not occur.
extract_attributes <- function(gtf_attributes, att_of_interest) {
  tokens <- unlist(strsplit(gtf_attributes, " "))
  if (!(att_of_interest %in% tokens)) {
    return(NA)
  }
  value_pos <- which(tokens %in% att_of_interest) + 1
  raw_values <- tokens[value_pos]
  gsub("\"|;", "", raw_values)
}
# Rename the rows of an RNA matrix from GTF gene names to the names used in the ABC gene table.
#
# rna_matrix:     matrix-like object whose rownames are gene names as they appear in the
#                 GTF "gene_name" attribute.
# gene_gtf_path:  path to a headerless, tab-separated GTF (9 columns); only rows of
#                 type "gene" are used.
# abc_genes_path: path to the ABC gene table with 8 columns
#                 (chr, start, end, name, score, strand, Ensembl_ID, gene_type).
#
# Genes are matched through the Ensembl ID (version suffix removed on the GTF side).
# Ensembl IDs that end up on more than one row after the join are discarded.
# Returns rna_matrix restricted to the rows that could be mapped, with rownames
# replaced by the ABC gene names. The result is always two-dimensional.
map_gene_names <- function(rna_matrix, gene_gtf_path, abc_genes_path){
    # Kept so that calling the function attaches the same packages as before;
    # the calls below are namespace-qualified and do not rely on attach order.
    library(dplyr)
    library(data.table)

    gene_ref <- data.table::fread(gene_gtf_path, header = FALSE, sep = "\t") %>%
        setNames(c("chr","source","type","start","end","score","strand","phase","attributes")) %>%
        dplyr::filter(type == "gene")
    gene_ref$gene_ref_name <- unlist(lapply(gene_ref$attributes, extract_attributes, "gene_name"))
    gene_ref$Ensembl_ID <- unlist(lapply(gene_ref$attributes, extract_attributes, "gene_id"))
    gene_ref <- dplyr::select(gene_ref, gene_ref_name, Ensembl_ID) %>%
        dplyr::mutate(Ensembl_ID = sub("\\.\\d+$", "", Ensembl_ID)) %>% # remove the version suffix
        dplyr::distinct()

    abc_genes <- data.table::fread(abc_genes_path, col.names = c("chr", "start", "end", "name", "score", "strand", "Ensembl_ID", "gene_type")) %>%
        dplyr::select(name, Ensembl_ID) %>%
        dplyr::rename(abc_name = name) %>%
        dplyr::left_join(gene_ref, by = "Ensembl_ID") %>%
        dplyr::group_by(Ensembl_ID) %>% # remove cases where multiple genes map to one Ensembl ID
        dplyr::filter(dplyr::n() == 1) %>%
        dplyr::ungroup()

    # Lookup vector: names are GTF gene names, values are ABC gene names.
    gene_key <- abc_genes$abc_name
    names(gene_key) <- abc_genes$gene_ref_name

    # Remove genes not in our gene universe.
    row_sub <- intersect(rownames(rna_matrix), names(gene_key)) # GTF gene names
    # drop = FALSE: a single matching gene must still yield a one-row matrix,
    # otherwise the rownames assignment below fails.
    rna_matrix_filt <- rna_matrix[row_sub, , drop = FALSE] # still GTF gene names
    rownames(rna_matrix_filt) <- unname(gene_key[row_sub]) # converted to ABC names

    return(rna_matrix_filt)
}

In [7]:
dim(matrix.rna)
matrix.rna[1:10,1:10]

  [[ suppressing 10 column names 'e10l1_AAACCAGGTACTTATG-1', 'e10l1_AAACCAGGTCACACTG-1', 'e10l1_AAACCAGGTTATGTTG-1' ... ]]



10 x 10 sparse Matrix of class "dgCMatrix"
                                   
WASH7P          . . . . . . . . . .
MIR1302-2HG     . . . . . . . . . .
OR4F5           . . . . . . . . . .
ENSG00000238009 . . . . . . . . . .
CICP27          . . . . . . . . . .
ENSG00000268903 . . . . . . . . . .
ENSG00000241860 . . . . . . . . . .
DDX11L17        . . . . . . . . . .
WASH9P          . . . . . . . . . .
ENSG00000228463 . . . . . . . . . .

In [8]:
matrix.rna.rename = map_gene_names(matrix.rna,gene_gtf_path, abc_genes_path)


Attaching package: 'data.table'


The following objects are masked from 'package:dplyr':

    between, first, last


The following object is masked from 'package:GenomicRanges':

    shift


The following object is masked from 'package:IRanges':

    shift


The following objects are masked from 'package:S4Vectors':

    first, second




In [9]:
dim(matrix.rna.rename)
matrix.rna.rename[1:10,1:10]

  [[ suppressing 10 column names 'e10l1_AAACCAGGTACTTATG-1', 'e10l1_AAACCAGGTCACACTG-1', 'e10l1_AAACCAGGTTATGTTG-1' ... ]]



10 x 10 sparse Matrix of class "dgCMatrix"
                             
OR4F5     . . . . . . . . . .
FAM87B    . . . . . . . . . .
LINC01128 1 . . . . . . . . .
FAM41C    . . . . . . . . . .
SAMD11    . . . . . . . . . .
NOC2L     . 1 . . . . . . . .
KLHL17    . . . 1 . . . . . .
PLEKHN1   . . . . . . . . . .
PERM1     . . . . . . . . . .
HES4      1 1 3 . 2 . . 1 . .

Import candidate E-G pairs

In [10]:
# Read the candidate enhancer-gene pair table. The first line supplies the column
# names and all extra columns are kept as metadata; later cells rely on the
# TargetGene and PairName columns.
pairs.E2G = readGeneric(path.pairs.E2G,
                        header = TRUE,
                        keep.all.metadata = TRUE)

Import TSS information

In [11]:
df.CollapsedGeneBounds = read.table(path.CollapsedGeneBounds)
df.CollapsedGeneBounds

V1,V2,V3,V4,V5,V6,V7,V8
<chr>,<int>,<int>,<chr>,<int>,<chr>,<chr>,<chr>
chr1,34610,36081,FAM138A,0,-,ENSG00000237613,lincRNA
chr1,34610,36081,FAM138F,0,-,ENSG00000282591,lincRNA
chr1,69090,70008,OR4F5,0,+,ENSG00000186092,protein_coding
chr1,817370,819834,FAM87B,0,+,ENSG00000177757,lincRNA
chr1,826205,827522,LINC00115,0,-,ENSG00000225880,lincRNA
chr1,827590,859446,LINC01128,0,+,ENSG00000228794,processed_transcript
chr1,868070,876802,FAM41C,0,-,ENSG00000230368,lincRNA
chr1,925740,944581,SAMD11,0,+,ENSG00000187634,protein_coding
chr1,944202,959299,NOC2L,0,-,ENSG00000188976,protein_coding
chr1,960586,965719,KLHL17,0,+,ENSG00000187961,protein_coding


In [12]:
# Reduce every gene interval to a single TSS position: the interval start for genes
# on the "+" strand (column 6) and the interval end for genes on the "-" strand.
df.tss = setNames(df.CollapsedGeneBounds[, 1:4],
                  c("Chrom", "Starts", "Ends", "genes"))
df.tss$Ends[df.CollapsedGeneBounds[, 6] == "+"] =
  df.tss$Starts[df.CollapsedGeneBounds[, 6] == "+"]
df.tss$Starts[df.CollapsedGeneBounds[, 6] == "-"] =
  df.tss$Ends[df.CollapsedGeneBounds[, 6] == "-"]
df.tss

Chrom,Starts,Ends,genes
<chr>,<int>,<int>,<chr>
chr1,36081,36081,FAM138A
chr1,36081,36081,FAM138F
chr1,69090,69090,OR4F5
chr1,817370,817370,FAM87B
chr1,827522,827522,LINC00115
chr1,827590,827590,LINC01128
chr1,876802,876802,FAM41C
chr1,925740,925740,SAMD11
chr1,959299,959299,NOC2L
chr1,960586,960586,KLHL17


Create seurat object

In [13]:
obj.seurat <- CreateSeuratObject(
  counts = matrix.rna.rename,
  assay = "RNA"
)

In [14]:
DefaultAssay(obj.seurat) <- "RNA"
obj.seurat <- obj.seurat %>%
  NormalizeData() %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA() %>%
  RunUMAP(dims = 1:30) %>%
  FindNeighbors(dims = 1:30) %>%
  FindClusters()

Normalizing layer: counts

Finding variable features for layer counts

Centering and scaling data matrix

PC_ 1 
Positive:  ARHGAP15, MALAT1, LINC01572, TALAM1, KIF14, DIAPH3, TOX, LSAMP, SCLT1, ECT2 
	   KIF18A, CIT, CENPE, ASPM, KIF4A, SSBP2, NEIL3, ITPR2, WDPCP, CDCA2 
	   DEPDC1B, G2E3, CEP128, ANLN, TOP2A, MYEF2, MIS18BP1, ELMO1, BUB1B, KIF18B 
Negative:  PTMA, HSP90AB1, HSP90AA1, PRDX1, HSPD1, GAPDH, FTH1, NPM1, ENO1, NCL 
	   RANBP1, ACTG1, EIF5A, HSPA8, PEBP1, DDX21, UBB, MARCKSL1, SOX4, ODC1 
	   UNG, MYC, RPL22L1, COTL1, HMGN2, ZFP36L2, PMAIP1, BEX1, HES4, FNDC7 
PC_ 2 
Positive:  ASPM, TOP2A, HMGB2, CENPE, DLGAP5, CCNB1, TPX2, CENPF, KIF14, PLK1 
	   UBE2S, AURKA, KPNA2, AURKB, GTSE1, CDC20, CDCA8, BUB1, MKI67, UBE2C 
	   CDCA2, CDK1, HMMR, CENPA, KIF18A, KIF2C, KNL1, NUF2, KNSTRN, CCNA2 
Negative:  POLA1, IMMP2L, ANO5, DPP10, RAD51B, GALNT18, LINC01036, KSR2, RAPGEF5, ATRNL1 
	   FBXL17, TBC1D32, AHI1, MIR924HG, ZNF521, DNAH14, EML5, OPCML, KCNQ3, CHD7 
	   UNG, RNF150, FAM

Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 5277
Number of edges: 185739

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.7119
Number of communities: 8
Elapsed time: 0 seconds


Add ATAC matrix to seurat object

In [15]:
list.fragments = list()
cells.tmp = colnames(matrix.atac_count)
names(cells.tmp) = colnames(matrix.atac_count)
list.fragments[[1]] =
  CreateFragmentObject(path = path.matrix.atac_frag,
                       cells = cells.tmp)

Computing hash



In [16]:
obj.seurat[["ATAC"]] <- CreateChromatinAssay(
  counts = matrix.atac_count,
  fragments = list.fragments
)
rm(matrix.atac_count)
DefaultAssay(obj.seurat) <- "ATAC"

In [17]:
obj.seurat <- obj.seurat %>%
  RunTFIDF() %>%
  FindTopFeatures(min.cutoff = 'q0') %>%
  RunSVD() %>%
  RunUMAP(reduction = 'lsi', dims = 2:30)

Performing TF-IDF normalization

Running SVD

Scaling cell embeddings

20:46:20 UMAP embedding parameters a = 0.9922 b = 1.112

Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'

Also defined by 'spam'

20:46:20 Read 5277 rows and found 29 numeric columns

20:46:20 Using Annoy for neighbor search, n_neighbors = 30

Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'

Also defined by 'spam'

20:46:20 Building Annoy index with metric = cosine, n_trees = 50

0%   10   20   30   40   50   60   70   80   90   100%

[----|----|----|----|----|----|----|----|----|----|

*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
|

20:46:21 Writing NN index file to temp file /tmp/RtmpCbcfaW/file18624cfa218fe

20:46:21 Searching Annoy index using 1 thread, search_k = 3000

20:46:23 Annoy recall = 100%

20:46:29 Commencing smooth kNN distance calibration using 1 thread
 with targ

In [18]:
# Expose the current "umap" reduction under the name "wnn.umap": Aggregate_data.modified()
# reads object@reductions$wnn.umap when an RNA assay is present and no reduction.name is given.
# NOTE(review): "umap" is presumably the ATAC LSI embedding from the preceding cell
# (assuming that RunUMAP call reuses the default reduction name) -- confirm.
obj.seurat@reductions$wnn.umap = obj.seurat@reductions$umap

In [19]:
# Give every cell the same identity (the cell-type label). Aggregate_data.modified()
# aggregates per identity, so the whole dataset is processed as one group.
Idents(obj.seurat) = celltype

Aggregate data

In [20]:
# Newer versions of Seurat expose counts as object@assays$RNA$counts rather than object@assays$RNA@counts,
# which makes DIRECT-NET's original aggregation functions fail; the copy below uses the new accessor.
# Build aggregated (pseudo-bulk) RNA/ATAC profiles from groups of neighbouring cells.
#
# Copy of DIRECT-NET's generate_aggregated_data() that reads counts through
# `assay$counts` instead of the `assay@counts` slot, in every branch.
#
# Arguments:
#   object       Object with an `assays` slot containing an "ATAC" entry and, optionally,
#                an "RNA" entry; each entry must expose its count matrix as `$counts`.
#   cell_coord   Matrix of cell coordinates, one row per cell
#                (assumed to follow the column order of the count matrices -- TODO confirm).
#   k_neigh      Cells per aggregate: the seed cell plus its k_neigh - 1 nearest neighbours.
#   atacbinary   If TRUE, ATAC counts are binarised (> 0) before being summed.
#   max_overlap  A candidate aggregate is accepted only if it shares fewer than
#                max_overlap * k_neigh cells with every previously accepted aggregate.
#   seed         Value passed to set.seed() before sampling seed cells.
#   verbose      If TRUE, print a progress message.
#
# Returns a list with `rna` (only when an RNA assay exists), `atac` and `cell_sample`
# (one row per aggregate holding the indices of its member cells). When there are at
# most k_neigh cells, all cells are summed into a single aggregate.
generate_aggregated_data.modified = function (object, cell_coord, k_neigh = 50, atacbinary = TRUE, 
    max_overlap = 0.8, seed = 123, verbose = TRUE) 
{
    has_rna <- "RNA" %in% names(object@assays)
    if (nrow(cell_coord) > k_neigh) {
        # Neighbour table: k_neigh - 1 neighbour indices plus the cell's own index.
        nn_map <- as.data.frame(FNN::knn.index(cell_coord, k = (k_neigh - 
            1)))
        row.names(nn_map) <- row.names(cell_coord)
        nn_map$agg_cell <- 1:nrow(nn_map)
        good_choices <- 1:nrow(nn_map)
        if (verbose) {
            message("Sample cells randomly.")
        }
        set.seed(seed)
        choice <- sample(1:length(good_choices), size = 1, replace = FALSE)
        chosen <- good_choices[choice]
        good_choices <- good_choices[good_choices != good_choices[choice]]
        # Defined up front so the function still works if the loop below never runs;
        # it is overwritten on the first iteration otherwise.
        cell_sample <- nn_map[chosen, ]
        max_iter <- nrow(cell_coord)/((1 - max_overlap) * k_neigh)
        it <- 0
        while (length(good_choices) > 0 && it < max_iter) {
            it <- it + 1
            choice <- sample(1:length(good_choices), size = 1, 
                replace = FALSE)
            new_chosen <- c(chosen, good_choices[choice])
            good_choices <- good_choices[good_choices != good_choices[choice]]
            cell_sample <- nn_map[new_chosen, ]
            # Overlap of the newest candidate (last row) with each earlier aggregate.
            combs <- data.frame(1:(nrow(cell_sample) - 1), nrow(cell_sample))
            shared <- apply(combs, 1, function(x) {
                (k_neigh * 2) - length(unique(as.vector(as.matrix(cell_sample[x, 
                  ]))))
            })
            if (max(shared) < max_overlap * k_neigh) {
                chosen <- new_chosen
            }
        }
        # NOTE(review): cell_sample is left as set in the last iteration, so it can
        # include a final candidate that failed the overlap test. Kept unchanged so the
        # aggregates stay identical to the previous behaviour -- confirm this is intended.
        if (has_rna) {
            rna_old <- as.matrix(object@assays$RNA$counts)
            # Logical membership mask, one column per aggregate.
            rna_mask <- sapply(seq_len(nrow(cell_sample)), function(x) seq_len(ncol(rna_old)) %in% 
                cell_sample[x, , drop = FALSE])
            rna_mask <- Matrix::Matrix(rna_mask)
            rna_new <- rna_old %*% rna_mask
            rna_new <- as.matrix(rna_new)
        }
        atac_old <- object@assays$ATAC$counts
        if (atacbinary) {
            atac_old <- atac_old > 0
        }
        atac_mask <- sapply(seq_len(nrow(cell_sample)), function(x) seq_len(ncol(atac_old)) %in% 
            cell_sample[x, , drop = FALSE])
        atac_mask <- Matrix::Matrix(atac_mask)
        atac_new <- atac_old %*% atac_mask
        atac_new <- as.matrix(atac_new)
    }
    else {
        # Too few cells to sample neighbourhoods: sum every cell into one aggregate.
        # Uses `$counts` like the branch above (this branch previously still read `@counts`).
        if (has_rna) {
            rna_old <- as.matrix(object@assays$RNA$counts)
            rna_new <- rowSums(rna_old)
            rna_new <- as.matrix(rna_new)
        }
        atac_old <- object@assays$ATAC$counts
        if (atacbinary) {
            atac_old <- atac_old > 0
        }
        atac_new <- rowSums(atac_old)
        atac_new <- as.matrix(atac_new)
        cell_sample <- as.data.frame(t(matrix(seq(from = 1, to = nrow(cell_coord)))))
    }
    new_data <- list()
    if (has_rna) {
        new_data$rna <- rna_new
    }
    new_data$atac <- atac_new
    new_data$cell_sample <- cell_sample
    return(new_data)
}

In [21]:
# Aggregate cells into pseudo-bulk samples, one identity group at a time.
#
# Copy of DIRECT-NET's Aggregate_data() that reads counts through `assay$counts` and
# delegates to generate_aggregated_data.modified().
#
# Arguments:
#   object                 Object with `assays` ("ATAC", optionally "RNA") and `reductions` slots.
#   k_neigh, atacbinary,
#   max_overlap, seed,
#   verbose                Passed on to generate_aggregated_data.modified().
#   reduction.name         Name of the reduction whose cell embeddings give the cell
#                          coordinates. If NULL, "wnn.umap" is used when an RNA assay
#                          exists and "umap" otherwise.
#   size_factor_normalize  If TRUE, log-transform the aggregates and divide each column by
#                          its size factor.
#
# Returns a list with `rna` (only when an RNA assay exists), `atac` and `cell_sample`;
# these stay two-dimensional even when only one aggregate is produced.
Aggregate_data.modified = function (object, k_neigh = 50, atacbinary = TRUE, max_overlap = 0.8, 
    reduction.name = NULL, size_factor_normalize = TRUE, seed = 123, 
    verbose = TRUE) 
{
    has_rna <- "RNA" %in% names(object@assays)
    if (!is.null(reduction.name)) {
        if (!reduction.name %in% names(object@reductions)) {
            stop("Reduction '", reduction.name, "' not found in object.", 
                call. = FALSE)
        }
        # Use the embedding matrix, as the other branches do, rather than the
        # reduction object itself.
        cell_coord <- object@reductions[[reduction.name]]@cell.embeddings
    }
    else if (has_rna) {
        cell_coord <- object@reductions$wnn.umap@cell.embeddings
    }
    else {
        cell_coord <- object@reductions$umap@cell.embeddings
    }
    group <- as.character(Idents(object))
    uniqgroup <- unique(group)
    # Each accumulator starts with a placeholder column/row that is removed after the loop.
    if (has_rna) {
        rna_new <- matrix(0, nrow = nrow(object@assays$RNA$counts), 
            ncol = 1)
    }
    atac_new <- matrix(0, nrow = nrow(object@assays$ATAC$counts), 
        ncol = 1)
    cell_sample <- matrix(0, nrow = 1, ncol = k_neigh)
    for (i in seq_along(uniqgroup)) {
        if (verbose) {
            message(paste0("Aggregating cluster ", uniqgroup[i]))
        }
        subobject <- subset(object, idents = uniqgroup[i])
        sub_index <- which(group %in% uniqgroup[i])
        # drop = FALSE keeps a matrix for groups containing a single cell.
        cell_coord_i <- cell_coord[sub_index, , drop = FALSE]
        sub_aggregated_data <- generate_aggregated_data.modified(subobject, 
            cell_coord_i, k_neigh, atacbinary, max_overlap, seed, 
            verbose)
        sub_cell_sample <- sub_aggregated_data$cell_sample
        if (has_rna) {
            rna_new <- cbind(rna_new, sub_aggregated_data$rna)
        }
        atac_new <- cbind(atac_new, sub_aggregated_data$atac)
        if (ncol(sub_cell_sample) < k_neigh) {
            # Small group (single aggregate): pad the index row with zeros to k_neigh columns.
            sub_cell_sample_new <- as.matrix(sub_cell_sample)
            sub_cell_sample_new <- cbind(sub_cell_sample_new, 
                matrix(0, nrow = 1, ncol = k_neigh - ncol(sub_cell_sample_new)))
        }
        else {
            # Translate within-group indices into indices over all cells.
            sub_cell_sample_new <- apply(sub_cell_sample, 2, 
                function(x) {
                  sub_index[x]
                })
            sub_cell_sample_new <- as.data.frame(sub_cell_sample_new)
            sub_cell_sample_new <- as.matrix(sub_cell_sample_new)
        }
        cell_sample <- rbind(cell_sample, sub_cell_sample_new)
    }
    # Remove the placeholders; drop = FALSE keeps matrices when one aggregate remains.
    if (has_rna) {
        rna_new <- rna_new[, -1, drop = FALSE]
    }
    atac_new <- atac_new[, -1, drop = FALSE]
    cell_sample <- cell_sample[-1, , drop = FALSE]
    if (size_factor_normalize) {
        # NOTE(review): estimateSizeFactorsForMatrix is not defined in this file; it must
        # be available on the search path when size_factor_normalize = TRUE -- confirm.
        if (has_rna) {
            rna_new <- t(t(log(rna_new + 1))/estimateSizeFactorsForMatrix(rna_new))
        }
        atac_new <- t(t(log(atac_new + 1))/estimateSizeFactorsForMatrix(atac_new))
    }
    new_data <- list()
    if (has_rna) {
        new_data$rna <- rna_new
    }
    new_data$atac <- atac_new
    new_data$cell_sample <- cell_sample
    return(new_data)
}

In [22]:
Misc(obj.seurat, slot = "aggregated.data") <- Aggregate_data.modified(obj.seurat,
                                                                      k_neigh = 50, 
                                                                      atacbinary = TRUE,
                                                                      max_overlap = 0.5,
                                                                      reduction.name = NULL,
                                                                      size_factor_normalize = FALSE)

Aggregating cluster Jurkat

Sample cells randomly.



Run DIRECT-NET Prediction

In [23]:
# Allow setting the max TSS distance 
# Infer enhancer-gene links per gene with gradient-boosted trees (DIRECT-NET), with a
# configurable search window around the TSS.
#
# Copy of DIRECT-NET's Run_DIRECT_NET() in which the window half-width is the
# `dist.max` argument instead of a fixed 250000 bp.
#
# Arguments:
#   object         Seurat object with an "ATAC" assay and, optionally, aggregated data
#                  stored in Misc(object, "aggregated.data").
#   peakcalling, macs2.path, fragments
#                  If peakcalling is TRUE, peaks are called per identity with CallPeaks()
#                  and the ATAC counts are rebuilt with FeatureMatrix().
#   k_neigh, atacbinary, max_overlap, size_factor_normalize
#                  Used only when aggregated data has to be computed here.
#   reduction.name Accepted for interface compatibility.
#                  NOTE(review): it is not forwarded to the aggregation step (NULL is
#                  passed, as before) -- confirm whether it should be.
#   genome.info    Data frame with columns Chrom, Starts, Ends and genes (TSS per gene).
#   focus_markers  Gene names to model; anything after the first "." is ignored.
#   params         xgboost parameter list; a default set is used when NULL.
#   nthread        Threads passed to xgboost.
#   early_stop     If TRUE, choose the number of boosting rounds from a train/validation split.
#   HC_cutoff, LC_cutoff
#                  Importance thresholds for the HC / MC / LC labels; derived from the
#                  importance quantiles when NULL.
#   rescued        If TRUE, peaks correlated (> 0.25) with an HC peak are relabelled HC.
#   seed, verbose  Random seed for the early-stop split / progress messages.
#   dist.max       Half-width (bp) of the window around the TSS searched for peaks.
#
# Returns `object` with Misc(object, "direct.net") set to a data frame with columns
# gene, Chr, Starts, Ends, Peak1, Peak2, Importance and function_type.
# Stops with an error if no link could be inferred for any gene.
Run_DIRECT_NET.modify = function (object, peakcalling = FALSE, macs2.path = NULL, fragments = NULL, 
    k_neigh = 50, atacbinary = TRUE, max_overlap = 0.8, reduction.name = NULL, 
    size_factor_normalize = FALSE, genome.info, focus_markers, 
    params = NULL, nthread = 2, early_stop = FALSE, HC_cutoff = NULL, 
    LC_cutoff = NULL, rescued = FALSE, seed = 123, verbose = TRUE, dist.max = 250000) 
{
    library(xgboost)
    if (peakcalling) {
        if (verbose) {
            message("Calling Peak")
        }
        object$cluster <- Idents(object)
        if (is.null(macs2.path)) {
            message("Please give the path to macs2!")
        }
        peaks <- CallPeaks(object = object, group.by = "cluster", 
            macs2.path = macs2.path)
        if (is.null(fragments)) {
            message("Please input fragments!")
        }
        new_atac_data <- FeatureMatrix(fragments = fragments, 
            features = peaks)
        # NOTE(review): this still writes the `@counts` slot, unlike the `$counts`
        # accessor used by the aggregation helpers in this file -- confirm it works
        # with the installed Seurat/Signac versions before using peakcalling = TRUE.
        object@assays$ATAC@counts <- new_atac_data
        if (verbose) {
            message("Peak calling finished")
        }
    }
    if (verbose) {
        message("Generating aggregated data")
    }
    if ("aggregated.data" %in% names(Misc(object))) {
        agg.data <- Misc(object, slot = "aggregated.data")
    }
    else {
        # Use the patched aggregation function defined in this file; the packaged
        # Aggregate_data() is the one this notebook had to replace.
        agg.data <- Aggregate_data.modified(object, k_neigh = k_neigh, 
            atacbinary = atacbinary, max_overlap = max_overlap, 
            reduction.name = NULL, size_factor_normalize = size_factor_normalize, 
            seed = seed, verbose = verbose)
        Misc(object, slot = "aggregated.data") <- agg.data
    }
    options(stringsAsFactors = FALSE)
    if (is.null(params)) {
        params <- list(eta = 0.3, max_depth = 6, min_child_weight = 1, 
            subsample = 1, colsample_bytree = 1, lambda = 1)
    }
    if ("rna" %in% names(agg.data)) {
        data_rna <- as.matrix(agg.data$rna)
        # Strip anything after the first "." in gene names and keep the first occurrence.
        rna <- rownames(data_rna)
        rna <- lapply(rna, function(x) strsplit(x, "[.]")[[1]][1])
        rna <- unlist(rna)
        rownames(data_rna) <- rna
        unik <- !duplicated(rna)
        data_rna <- data_rna[unik, ]
    }
    data_atac <- as.matrix(agg.data$atac)
    rownames(data_atac) <- gsub("-", "_", rownames(data_atac))
    peaks <- rownames(data_atac)
    genes <- lapply(genome.info$genes, function(x) strsplit(x, 
        "[|]")[[1]][1])
    genes <- lapply(genes, function(x) strsplit(x, "[.]")[[1]][1])
    genes <- unlist(genes)
    genome.info$genes <- genes
    unik <- !duplicated(genes)
    genome.info <- genome.info[unik, ]
    focus_markers <- lapply(focus_markers, function(x) strsplit(x, 
        "[.]")[[1]][1])
    focus_markers <- unique(unlist(focus_markers))
    # Re-order the markers to follow genome.info so that they line up with Chr/Starts/Ends.
    focus_markers <- genome.info$genes[which(genome.info$genes %in% 
        focus_markers)]
    genome.info.used <- genome.info[which(genome.info$genes %in% 
        focus_markers), ]
    Chr <- genome.info.used$Chrom
    Starts <- genome.info.used$Starts
    Ends <- genome.info.used$Ends
    DIRECT_NET_Result <- list()
    TXs <- list()
    TYs <- list()
    for (i in seq_along(focus_markers)) {
        if (verbose) {
            message(paste0("Inferring links for ", focus_markers[i]))
        }
        # Promoter window: 500 bp ending at the TSS; candidate window: TSS +/- dist.max.
        p1 <- paste(Chr[i], ":", Starts[i] - 500, "-", Starts[i], 
            sep = "")
        p2 <- paste(Chr[i], ":", Starts[i] - dist.max, "-", Starts[i] + 
            dist.max, sep = "")
        promoters <- cicero::find_overlapping_coordinates(peaks, p1)
        enhancers <- cicero::find_overlapping_coordinates(peaks, p2)
        enhancers <- setdiff(enhancers, promoters)
        if ("rna" %in% names(agg.data)) {
            idx <- which(rownames(data_rna) == focus_markers[i])
        }
        else {
            idx <- 1
        }
        if ((length(promoters) > 0 && length(enhancers) > 1) && 
            length(idx) != 0) {
            id1 <- match(promoters, peaks)
            id1 <- id1[!is.na(id1)]
            id2 <- match(enhancers, peaks)
            id2 <- id2[!is.na(id2)]
            id2_new <- setdiff(id2, id1)
            X <- data_atac[id2_new, ]
            TXs[[i]] <- X
            Y <- data_atac[id1, ]
            if (length(id1) > 1) {
                Y <- colSums(Y)
            }
            Y <- t(as.matrix(Y))
            rownames(Y) <- peaks[id1[1]]
            TYs[[i]] <- Y
            if ("rna" %in% names(agg.data)) {
                Z <- data_rna[idx, ]
                Z <- t(as.matrix(Z))
                rownames(Z) <- focus_markers[i]
            }
            else {
                Z <- Y
            }
            flag <- 1
        }
        else {
            flag <- 0
            # Window size follows dist.max (the default 250000 still prints "500 kb").
            message(paste0("There are less than two peaks detected within ", 
                2 * dist.max/1000, " kb for ", focus_markers[i]))
        }
        if (flag == 1) {
            X <- as.matrix(X)
            if (ncol(X) == 1) {
                X <- t(X)
            }
            Y <- as.matrix(Y)
            Z <- as.matrix(Z)
            rownames(Z) <- rownames(Y)
            if (early_stop) {
                # Number of aggregated samples (columns of X); replaces the previously
                # undefined `size_factor`.
                n_samples <- ncol(X)
                if (n_samples/5 < 100) {
                  message("The number of cells is too small to split!")
                }
                set.seed(seed)
                cv_idx <- sample(1:5, size = n_samples, replace = TRUE)
                test_idx <- which(cv_idx == 1)
                validation_idx <- which(cv_idx == 2)
                x_train <- as.matrix(X[, -c(test_idx, validation_idx)])
                x_test <- as.matrix(X[, test_idx])
                x_validation <- as.matrix(X[, validation_idx])
                # NOTE(review): the early-stop split is fitted on Y (promoter signal)
                # while the final model is fitted on Z -- kept as is, confirm intent.
                y_train <- Y[-c(test_idx, validation_idx)]
                y_test <- Y[test_idx]
                y_validation <- Y[validation_idx]
                dtrain = xgb.DMatrix(data = t(x_train), label = as.numeric(y_train))
                dtest = xgb.DMatrix(data = t(x_test), label = as.numeric(y_test))
                dvalidation = xgb.DMatrix(data = t(x_validation), 
                  label = as.numeric(y_validation))
                watchlist1 = list(train = dtrain, test = dvalidation)
                watchlist2 = list(train = dtrain, test = dtest)
                # "reg:squarederror" replaces the deprecated alias "reg:linear".
                xgb_v <- xgb.train(params = params, data = dtrain, 
                  watchlist = watchlist1, nrounds = 100, nthread = nthread, 
                  objective = "reg:squarederror", verbose = 0)
                cv1 <- xgb_v$evaluation_log
                rmse_d <- cv1$test_rmse - cv1$train_rmse
                rmse_dd <- abs(rmse_d[2:100] - rmse_d[1:99])/rmse_d[1]
                stop_index <- which(rmse_dd == min(rmse_dd))
                xgb.fit.final <- xgboost::xgboost(params = params, data = t(X), 
                  label = as.numeric(Z), nrounds = stop_index[1], 
                  nthread = nthread, objective = "reg:squarederror", 
                  verbose = 0)
            }
            else {
                xgb.fit.final <- xgboost::xgboost(params = params, data = t(X), 
                  label = as.numeric(Z), nrounds = 100, nthread = nthread, 
                  objective = "reg:squarederror", verbose = 0)
            }
            # Build the link table for this gene. On failure the result is NULL, so a
            # table left over from a previous gene can never be stored under this gene.
            conns_h <- tryCatch({
                importance_matrix <- xgb.importance(model = xgb.fit.final)
                Imp_peak_h <- as.vector(importance_matrix$Feature)
                Imp_value_h <- importance_matrix$Gain
                link_table <- list()
                link_table$Peak1 <- as.character(rownames(Y))
                link_table$Peak2 <- as.character(Imp_peak_h)
                link_table$Importance <- Imp_value_h
                link_table <- as.data.frame(link_table)
                colnames(link_table) <- c("Peak1", "Peak2", "Importance")
                link_table
            }, error = function(e) {
                if (verbose) {
                  message(paste0("No feature importance available for ", 
                    focus_markers[i], ": ", conditionMessage(e)))
                }
                NULL
            })
            if (!is.null(conns_h)) {
                DIRECT_NET_Result[[i]] <- conns_h
            }
        }
    }
    conns <- do.call(rbind, DIRECT_NET_Result)
    if (is.null(conns)) {
        stop("No enhancer-gene links could be inferred for any of the focus markers.", 
            call. = FALSE)
    }
    if (is.null(HC_cutoff)) {
        HC_cutoff = max(stats::quantile(conns$Importance, 0.5), 
            0.001)
    }
    if (is.null(LC_cutoff)) {
        LC_cutoff = min(0.001, stats::quantile(conns$Importance, 
            0.25))
    }
    for (i in seq_along(DIRECT_NET_Result)) {
        if (!is.null(DIRECT_NET_Result[[i]])) {
            conns_h <- DIRECT_NET_Result[[i]]
            Imp_value <- conns_h$Importance
            # HC: above HC_cutoff; MC: between the cutoffs; LC: at or below LC_cutoff.
            index1 <- which(Imp_value > HC_cutoff)
            index2 <- intersect(which(Imp_value > LC_cutoff), 
                which(Imp_value <= HC_cutoff))
            index3 <- which(Imp_value <= LC_cutoff)
            function_type <- rep(NA, length(Imp_value))
            function_type[index1] <- "HC"
            function_type[index2] <- "MC"
            function_type[index3] <- "LC"
            if (rescued) {
                if (i <= length(TXs)) {
                  X <- TXs[[i]]
                  Y <- TYs[[i]]
                  CPi <- abs(cor(t(X)))
                  for (p in 1:nrow(CPi)) {
                    CPi[p, p] <- 0
                  }
                  hic_index <- which(rownames(X) %in% conns_h$Peak2[index1])
                  other_index <- which(rownames(X) %in% conns_h$Peak2[-index1])
                  CPi_sub <- CPi[hic_index, other_index, drop = FALSE]
                  flag_matrix <- matrix(0, nrow = nrow(CPi_sub), 
                    ncol = ncol(CPi_sub))
                  flag_matrix[which(CPi_sub > 0.25)] <- 1
                  correlated_index <- which(colSums(flag_matrix) > 
                    0)
                  if (!is.null(correlated_index)) {
                    function_type[conns_h$Peak2 %in% rownames(X)[other_index[correlated_index]]] <- "HC"
                  }
                }
            }
            DIRECT_NET_Result[[i]] <- cbind(data.frame(gene = focus_markers[i], 
                Chr = Chr[i], Starts = Starts[i], Ends = Ends[i]), 
                cbind(conns_h, function_type = function_type))
        }
    }
    DIRECT_NET_Result_all <- do.call(rbind, DIRECT_NET_Result)
    DIRECT_NET_Result_all$Starts <- as.numeric(DIRECT_NET_Result_all$Starts)
    DIRECT_NET_Result_all$Ends <- as.numeric(DIRECT_NET_Result_all$Ends)
    DIRECT_NET_Result_all$Importance <- as.numeric(DIRECT_NET_Result_all$Importance)
    Misc(object, slot = "direct.net") <- DIRECT_NET_Result_all
    return(object)
}

In [24]:
# Target genes to model: genes named in the candidate pairs that are present in the
# RNA counts and have at least one count.
gene.target = unique(pairs.E2G$TargetGene)
gene.target = gene.target[gene.target %in% rownames(obj.seurat@assays$RNA$counts)]
# drop = FALSE keeps a matrix when only one gene is left, so rowSums() still works.
gene.target = gene.target[rowSums(obj.seurat@assays$RNA$counts[gene.target, , drop = FALSE]) > 0]
length(gene.target)

In [None]:
start_time <- Sys.time()
obj.seurat <- Run_DIRECT_NET.modify(obj.seurat,
                                    peakcalling = FALSE,
                                    k_neigh = 50,
                                    atacbinary = TRUE, 
                                    max_overlap=0.5,
                                    size_factor_normalize = FALSE,
                                    nthread = n.cores,
                                    genome.info = df.tss,
                                    focus_markers = gene.target,
                                    dist.max = dist.max)
end_time <- Sys.time()
execution_time <- end_time - start_time


Attaching package: 'xgboost'


The following object is masked from 'package:dplyr':

    slice


The following object is masked from 'package:IRanges':

    slice


Generating aggregated data

Inferring links for LINC01128

"The 2 combined objects have no sequence levels in common. (Use
There are less than two peaks detected within 500 kb for LINC01128

Inferring links for FAM41C

"The 2 combined objects have no sequence levels in common. (Use
There are less than two peaks detected within 500 kb for FAM41C

Inferring links for SAMD11

"The 2 combined objects have no sequence levels in common. (Use
There are less than two peaks detected within 500 kb for SAMD11

Inferring links for NOC2L

"The 2 combined objects have no sequence levels in common. (Use
There are less than two peaks detected within 500 kb for NOC2L

Inferring links for KLHL17

"The 2 combined objects have no sequence levels in common. (Use
There are less than two peaks detected within 500 kb for KLHL17

Inferring links f

In [None]:
execution_time

In [None]:
# Fetch the link table stored by Run_DIRECT_NET.modify().
direct.net_result <- Misc(obj.seurat, slot = 'direct.net')
# NOTE(review): binding the columns with cbind() goes through a matrix, so all columns
# presumably come back with one common type; Importance is converted back with
# as.numeric() in the next cell -- confirm the round trip is intended.
direct.net_result <- as.data.frame(do.call(cbind,direct.net_result))
# Row names are "<Peak2 with '_' replaced by '-'>_<gene>"; the next cell looks them up
# using the PairName column of the candidate pairs.
rownames(direct.net_result) = paste(gsub("_","-",direct.net_result$Peak2),direct.net_result$gene,sep = "_")
direct.net_result

In [None]:
pairs.E2G.res = pairs.E2G[pairs.E2G$PairName %in% rownames(direct.net_result)]
pairs.E2G.res$Importance = as.numeric(direct.net_result[pairs.E2G.res$PairName,"Importance"])
pairs.E2G.res$function_type = direct.net_result[pairs.E2G.res$PairName,"function_type"]

Save results

In [None]:
# Create the output directory (including parents); showWarnings = FALSE avoids the
# "already exists" warning when the notebook is re-run.
dir.create(dir.output, recursive = TRUE, showWarnings = FALSE)

# Save the full DIRECT-NET link table and the candidate pairs annotated with it.
saveRDS(direct.net_result,
        file.path(dir.output, "direct.net_result.rds"))
saveRDS(pairs.E2G.res,
        file.path(dir.output, "pairs.E2G.res.rds"))
pairs.E2G.res

In [None]:
# Flatten the annotated pairs into a plain table and write it as a gzipped TSV.
df.output = as.data.frame(pairs.E2G.res, row.names = NULL)
# The first column is renamed to "chr" for the output table.
colnames(df.output)[1] = "chr"
df.output[, "CellType"] = celltype
df.output = df.output[, c("chr",
                          "start",
                          "end",
                          "TargetGene",
                          "CellType",
                          "Importance",
                          "function_type")]
data.table::fwrite(df.output,
                   file = file.path(dir.output, "pairs.E2G.res.tsv.gz"),
                   row.names = FALSE,
                   quote = FALSE,
                   sep = "\t")
df.output

In [None]:
sessionInfo()