In [1]:
library(SCENT)
library(Signac)
library(genomation)
library(GenomicRanges)
library(Matrix)
library(parallel)

“replacing previous import ‘Hmisc::capitalize’ by ‘R.utils::capitalize’ when loading ‘SCENT’”
Loading required package: grid

“replacing previous import ‘Biostrings::pattern’ by ‘grid::pattern’ when loading ‘genomation’”
Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following object is masked from ‘package:utils’:



Specify file path

In [2]:
path.pairs.E2G = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_Xu/K562/Kendall/Pairs.tsv.gz"
path.pairs.ABC = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_Xu/K562/Predictions/EnhancerPredictionsAllPutative.tsv.gz"
path.matrix.atac_count = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/software/scE2G_pipeline/240508/sc-E2G/test/results_K562_Xu/K562/Kendall/atac_matrix.csv.gz"
path.matrix.rna_count = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_Xu/1.prepare_data/1.seurat_pipeline.240507/rna_count_matrix.csv.gz"
dir.output = "/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_Xu/3.Genome_wide_prediction/SCENT.default/SCENT.240611/"

In [3]:
n.cores = 48

Import candidate E-G pairs

In [4]:
pairs.E2G = readGeneric(path.pairs.E2G,
                        header = T,
                        keep.all.metadata = T)

Import ABC results

In [5]:
pairs.ABC = readGeneric(path.pairs.ABC,
                        header = T,
                        keep.all.metadata = T)

Filter ABC results which distance < 500kb

In [6]:
pairs.ABC.500kb = pairs.ABC[pairs.ABC$distance < 500000]

Filter E-G pairs overlaping with pairs.ABC.500kb

In [7]:
df.pairs.E2G.chr_rename = as.data.frame(pairs.E2G)[,1:3]
df.pairs.E2G.chr_rename[,"seqnames"] = paste(seqnames(pairs.E2G),
                                             mcols(pairs.E2G)[,"TargetGene"],
                                             sep = "_")
pairs.E2G.chr_rename = GRanges(df.pairs.E2G.chr_rename)
rm(df.pairs.E2G.chr_rename)

df.pairs.ABC.500kb.chr_rename = as.data.frame(pairs.ABC.500kb)[,1:3]
df.pairs.ABC.500kb.chr_rename[,"seqnames"] = paste(seqnames(pairs.ABC.500kb),
                                                mcols(pairs.ABC.500kb)[,"TargetGene"],
                                                sep = "_")
pairs.ABC.500kb.chr_rename = GRanges(df.pairs.ABC.500kb.chr_rename)
rm(df.pairs.ABC.500kb.chr_rename)

pairs.E2G.filter = pairs.E2G[countOverlaps(pairs.E2G.chr_rename,
                                           pairs.ABC.500kb.chr_rename) > 0]
rm(pairs.E2G.chr_rename)
rm(pairs.ABC.500kb.chr_rename)
pairs.E2G.filter

GRanges object with 1734842 ranges and 3 metadata columns:
            seqnames            ranges strand |  TargetGene
               <Rle>         <IRanges>  <Rle> | <character>
        [1]     chr1 10001196-10001745      * | APITD1-CORT
        [2]     chr1 10001196-10001745      * |       CENPS
        [3]     chr1 10001196-10001745      * |      CLSTN1
        [4]     chr1 10001196-10001745      * |        CORT
        [5]     chr1 10001196-10001745      * |    CTNNBIP1
        ...      ...               ...    ... .         ...
  [1734838]     chrX     989754-991253      * |       IL3RA
  [1734839]     chrX     989754-991253      * |   LINC00106
  [1734840]     chrX     989754-991253      * |     SLC25A6
  [1734841]     chrX   9995827-9996208      * |       CLCN4
  [1734842]     chrX   9995827-9996208      * |        WWC3
                          PeakName               PairName
                       <character>            <character>
        [1] chr1-10001196-10001745 chr1-10001

Import ATAC matrix

In [8]:
matrix.atac = read.csv(path.matrix.atac_count,
                       row.names = 1,
                       check.names = F)
matrix.atac = Matrix(as.matrix(matrix.atac), sparse = TRUE)
matrix.atac = BinarizeCounts(matrix.atac)

Import RNA matrix

In [9]:
matrix.rna = read.csv(path.matrix.rna_count,
                      row.names = 1,
                      check.names = F)
matrix.rna = Matrix(as.matrix(matrix.rna), sparse = TRUE)
matrix.rna = matrix.rna[,colnames(matrix.atac)]
matrix.rna = matrix.rna[rowSums(matrix.rna) > 0,]

In [10]:
pairs.E2G.filter2 = pairs.E2G.filter[pairs.E2G.filter$TargetGene %in% rownames(matrix.rna) &
                                     pairs.E2G.filter$PeakName %in% rownames(matrix.atac)]

Prepare SCENT meta data

In [11]:
meta.data = data.frame(cell = colnames(matrix.rna),
                       nUMI = colSums(matrix.rna),
                       nMito = colSums(matrix.rna[grep("MT-",rownames(matrix.rna)),]),
                       celltype = "K562")
meta.data[,"log.nUMI"] = log(meta.data[,"nUMI"])
meta.data[,"percent.mito"] = meta.data[,"nMito"] / meta.data[,"nUMI"]
meta.data

Unnamed: 0_level_0,cell,nUMI,nMito,celltype,log.nUMI,percent.mito
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
K562_Xu_AAACAGCCAAGCGATG-1,K562_Xu_AAACAGCCAAGCGATG-1,11087,1001,K562,9.313529,0.09028592
K562_Xu_AAACAGCCACATGCTA-1,K562_Xu_AAACAGCCACATGCTA-1,17288,2295,K562,9.757768,0.13275104
K562_Xu_AAACAGCCACGGTACT-1,K562_Xu_AAACAGCCACGGTACT-1,18989,1411,K562,9.851615,0.07430618
K562_Xu_AAACAGCCATAAAGCA-1,K562_Xu_AAACAGCCATAAAGCA-1,10270,1109,K562,9.236982,0.10798442
K562_Xu_AAACATGCAACTCGCG-1,K562_Xu_AAACATGCAACTCGCG-1,10962,1785,K562,9.302190,0.16283525
K562_Xu_AAACATGCAATCCTAG-1,K562_Xu_AAACATGCAATCCTAG-1,22686,2486,K562,10.029503,0.10958300
K562_Xu_AAACATGCACATAACT-1,K562_Xu_AAACATGCACATAACT-1,10649,512,K562,9.273221,0.04807963
K562_Xu_AAACATGCACCTATAG-1,K562_Xu_AAACATGCACCTATAG-1,15716,2278,K562,9.662435,0.14494782
K562_Xu_AAACATGCAGGGAGCT-1,K562_Xu_AAACATGCAGGGAGCT-1,12360,1570,K562,9.422221,0.12702265
K562_Xu_AAACATGCAGTTTACG-1,K562_Xu_AAACATGCAGTTTACG-1,24704,3638,K562,10.114720,0.14726360


RUN SCENT prediction

In [12]:
dir.create(paste(dir.output,"chr",sep = "/"),recursive = T)

“'/maps/projects/ralab_nnfc-AUDIT/people/lpm537/project/E2G/analysis/E2G_240503/data/K562_Xu/3.Genome_wide_prediction/SCENT.default/SCENT.240611//chr' already exists”


In [13]:
chr.done = dir(paste(dir.output,"chr",sep = "/"))
chr.run = as.character(unique(seqnames(pairs.E2G.filter2)))
chr.run = chr.run[!chr.run %in% chr.done]

In [None]:
for(chr.tmp in chr.run){
  print(chr.tmp)
  pairs.E2G.chr.res = pairs.E2G.filter2[seqnames(pairs.E2G.filter2) == chr.tmp]
  gene_peak.chr = as.data.frame(mcols(pairs.E2G.chr.res)[,c("TargetGene","PeakName")])
  
  SCENT_obj.chr <- CreateSCENTObj(rna = matrix.rna[rownames(matrix.rna) %in% gene_peak.chr$TargetGene,], 
                                  atac = matrix.atac[rownames(matrix.atac) %in% gene_peak.chr$PeakName,], 
                                  meta.data = meta.data,
                                  peak.info = gene_peak.chr,
                                  covariates = c("log.nUMI","percent.mito"), 
                                  celltypes = "celltype")
  SCENT_obj.chr <- SCENT_algorithm(object = SCENT_obj.chr, 
                                   celltype = "K562", 
                                   ncores = n.cores, 
                                   regr = 'poisson', bin = TRUE)
  
  saveRDS(SCENT_obj.chr@SCENT.result,
          paste(dir.output,"chr",chr.tmp,sep = "/"))
  rm (SCENT_obj.chr)
}

Save results

In [None]:
saveRDS(pairs.E2G.res,
        paste(dir.output,"pairs.E2G.res.rds",sep = "/"))
pairs.E2G.res

In [None]:
df.output = as.data.frame(pairs.E2G.res,row.names = NULL)
colnames(df.output)[1] = "chr"
df.output[,"CellType"] = "K562"
df.output = df.output[,c("chr",
                         "start",
                         "end",
                         "TargetGene",
                         "CellType",
                         "beta","se","z","p","boot_basic_p",
                         "boot_basic_p.log10",
                         "sign",
                         "boot_basic_p.log10.signed")]
data.table::fwrite(df.output,
                   file = paste(dir.output,"pairs.E2G.res.tsv.gz",sep = "/"),
                   row.names = F,
                   quote = F,
                   sep = "\t")
df.output

In [None]:
sessionInfo()