# NEATseq Analysis and Enrichment in R

In [1]:
# read in the raw data from NEATseq
library(Matrix)

In [2]:
# Root folder of the raw NEAT-seq downloads. The trailing slash matters:
# later cells build file paths by pasting onto this string with sep=''.
directory <- '/projects/zhanglab/users/ana/multiome/raw/neatseq/'

In [3]:
# Load the per-cell ATAC QC table for the CD4 T cells.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas TRUE is a reserved constant.
cells <- read.csv(
  file = paste0(directory, 'cd4t/GSM5396332_CD4cells.csv'),
  header = TRUE,
  sep = ','
)
# Preview the first rows to check that the columns were parsed as expected
head(cells)

Unnamed: 0_level_0,Sample,TSSEnrichment,ReadsInTSS,ReadsInPromoter,ReadsInBlacklist,PromoterRatio,PassQC,NucleosomeRatio,nMultiFrags,nMonoFrags,nFrags,nDiFrags,BlacklistRatio,Clusters,ReadsInPeaks,FRIP
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<int>,<dbl>
lane1#AAACAGCCAACTGGCT-1,lane1,24.227,1615,1568,26,0.4045408,1,0.382311,18,1402,1938,518,0.006707946,C2,2662,0.6867905
lane1#AAACATGCACAATTAC-1,lane1,16.511,2368,2160,44,0.3408015,1,0.451672,20,2183,3169,966,0.006942253,C2,4293,0.6775568
lane1#AAACATGCAGCACGTT-1,lane1,18.53,1151,1110,14,0.3675497,1,0.3006029,4,1161,1510,345,0.004635762,C3,2216,0.7337748
lane1#AAACATGCATATTGAC-1,lane1,22.239,1752,1597,34,0.3659487,1,0.3740554,12,1588,2182,582,0.007791017,C6,3195,0.7321265
lane1#AAACCAACAATAGCAA-1,lane1,19.625,1883,1630,32,0.3364988,1,0.468769,17,1649,2422,756,0.006606111,C4,3202,0.6610239
lane1#AAACCAACAGGCTTCG-1,lane1,19.88,3564,3337,40,0.3834751,1,0.3273337,20,3278,4351,1053,0.004596644,C1,6327,0.7270742


In [4]:
# Sanity check: TRUE only when every row's PassQC flag is set
all(cells[["PassQC"]] == TRUE) # they all pass QC

In [5]:
# Load the serialized CD4 RNA count matrix from the raw-data folder
rna <- readRDS(paste0(directory, 'cd4t/GSM5396333_CD4_RNA_counts.rds'))

In [6]:
# Load the serialized CD4 ATAC peak matrix from the raw-data folder
atac <- readRDS(paste0(directory, 'cd4t/GSM5396336_CD4_Peak_matrix.rds'))

In [7]:
# Transpose the peak matrix in place.
# NOTE(review): presumably the file stores cells x peaks and the downstream
# tools expect peaks x cells - confirm. Re-running this cell flips it back.
atac <- t(atac)

# Signac analysis

In [1]:
library(GenomeInfoDb)
library(Seurat)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomicRanges)
library(Signac)

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

Loading required package: stats4


Attaching package: ‘S4Vectors’


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

Loading required package: SeuratObject

Loading required package: sp


Attaching pac

In [40]:
# get gene annotations for hg38
# Gene ranges are pulled from the Ensembl v86 database object.
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
# Prefix every sequence name with "chr" so the names look like UCSC names.
# NOTE(review): this turns Ensembl "MT" into "chrMT", while UCSC hg38 calls the
# mitochondrial contig "chrM" - confirm whether mitochondrial annotations
# matter for this analysis.
seqlevels(annotation) <- paste0('chr', seqlevels(annotation))

"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence

In [58]:
# create a Seurat object containing the RNA data
# (the count matrix `rna` is stored as the assay named "RNA")
neat <- CreateSeuratObject(
  counts = rna,
  assay = "RNA"
)

In [61]:
# Normalize the RNA data: make "RNA" the active assay first so that
# NormalizeData() operates on it.
DefaultAssay(neat) <- "RNA"
neat <- NormalizeData(neat)

Normalizing layer: counts



The following commands were run in a terminal to re-compress each fragment file with bgzip and generate its tabix (`.tbi`) index.
```
gzip -d <fragment.tsv.gz>
bgzip <fragment.tsv>
tabix -p bed <fragment.tsv.gz>
```

In [63]:
# Build one Signac Fragment object per sequencing lane; both fragment files
# live directly under the raw-data folder and are used by the chromatin assay.
lane1_frags <- CreateFragmentObject(
  path = paste0(directory, 'GSM5396332_lane1_atac_fragments.tsv.gz')
)
lane2_frags <- CreateFragmentObject(
  path = paste0(directory, 'GSM5396336_lane2_atac_fragments.tsv.gz')
)

Computing hash

Computing hash



In [64]:
# create ATAC assay and add it to the object
# - counts: the transposed peak matrix
# - sep: the two separators used to split peak names into chromosome, start
#   and end (NOTE(review): implies names like "chr1:100-200" - confirm)
# - fragments: the per-lane Fragment objects
# - annotation: the gene ranges built from EnsDb
neat[['ATAC']] <- CreateChromatinAssay(
  counts = atac,
  sep = c(":", "-"),
  fragments = list(lane1_frags,lane2_frags),
  annotation = annotation
)

In [65]:
# Switch the active assay to ATAC so the peak-level steps below act on it
DefaultAssay(neat) <- "ATAC"
# first compute the GC content for each peak
# (sequence statistics are taken from the hg38 BSgenome object)
neat <- RegionStats(neat, genome = BSgenome.Hsapiens.UCSC.hg38)

In [None]:
# Link peaks to genes with Signac's LinkPeaks.
# Both cutoffs are set to their loosest values (p-value up to 1, correlation
# down to 0) so the thresholds filter out as little as possible.
# takes about 8hrs
start.time <- Sys.time()
neat <- LinkPeaks(
  object = neat,
  peak.assay = "ATAC",
  expression.assay = "RNA",
  gene.id = TRUE, # since this dataset uses ensemblIDs
  pvalue_cutoff = 1, # max pvalue
  score_cutoff = 0 # min pearson corr
)
# Report the wall-clock duration of the run measured from start.time
message("LinkPeaks finished in ", format(Sys.time() - start.time))

Testing 16329 genes and 80566 peaks

Found gene coordinates for 14141 genes



In [None]:
# Pull the peak-gene links out of the ATAC assay (where LinkPeaks stored them)
# and write them to CSV. `Links()` is the accessor; the Seurat object itself is
# not callable. Naming the assay makes this independent of the active assay.
l <- Links(neat[["ATAC"]])
# Convert the links to a plain data frame explicitly before writing
write.csv(as.data.frame(l), file = "signac_neatseq_links.csv")

# SCENT analysis

In [154]:
library(SCENT)

In [10]:
library(Seurat)

Loading required package: SeuratObject

Loading required package: sp

‘SeuratObject’ was built under R 4.3.2 but the current version is
4.3.3; it is recomended that you reinstall ‘SeuratObject’ as the ABI
for R may have changed

‘SeuratObject’ was built with package ‘Matrix’ 1.6.3 but the current
version is 1.6.5; it is recomended that you reinstall ‘SeuratObject’ as
the ABI for ‘Matrix’ may have changed


Attaching package: ‘SeuratObject’


The following object is masked from ‘package:base’:

    intersect




Adding Metadata for covariates.

In [159]:
# Tag mitochondrial genes with an 'MT-' prefix on a copy of the RNA matrix so
# they can be selected by name pattern later; `rna` itself is left unchanged.
rna_copy <- rna
# List of mitochondrial genes
mito_genes <- c('ENSG00000198888', 'ENSG00000198763', 'ENSG00000198804',
                'ENSG00000198712', 'ENSG00000228253', 'ENSG00000198899',
                'ENSG00000198938', 'ENSG00000198840', 'ENSG00000212907',
                'ENSG00000198886', 'ENSG00000198786', 'ENSG00000198695',
                'ENSG00000198727')

# Add 'MT-' prefix to mitochondrial genes.
# The prefix is applied to each matching row's own name, so every gene keeps
# its own ID whatever order the matrix rows are in, and the rename still works
# when some of the listed genes are missing from the matrix.
new_row_names <- rownames(rna_copy)
is_mito <- new_row_names %in% mito_genes
new_row_names[is_mito] <- paste0('MT-', new_row_names[is_mito])

# Update row names in the dgCMatrix for the genes in mito_genes
rownames(rna_copy) <- new_row_names

In [160]:
# Wrap the renamed matrix in its own Seurat object, used only to compute the
# mitochondrial percentage below
mito_rna <- CreateSeuratObject(rna_copy)

In [162]:
# Per-cell percentage of counts from features whose name starts with "MT-",
# stored as the metadata column percent.mt
mito_rna[["percent.mt"]] <- PercentageFeatureSet(object = mito_rna, pattern = "^MT-")

In [163]:
# Extract the percent.mt metadata column as a bare vector
mito_data <- mito_rna@meta.data[["percent.mt"]]

In [9]:
# first we need to get the meta.data

In [164]:
# Natural log of the per-cell fragment count, later passed to SCENT as a covariate
cells[["log_nFrags"]] <- log(cells[["nFrags"]])

In [165]:
# Every cell gets the same cell-type label, 'CD4'
cells$celltype <- 'CD4'

In [166]:
# Move the cell barcodes out of the row names into a leading `cell` column,
# then reset the row names to plain 1..n indices.
# seq_len() is used instead of 1:nrow() so an empty table gives an empty index
# rather than c(1, 0).
# NOTE: not idempotent - re-running this cell adds a second `cell` column.
cells <- cbind(cell = rownames(cells), cells)
rownames(cells) <- seq_len(nrow(cells))

In [167]:
# Attach the mitochondrial percentage to the cell metadata.
# NOTE(review): the assignment is positional, so it assumes the rows of `cells`
# are in the same order as the columns of `rna` - TODO confirm, or match on
# barcode instead.
cells$mito <- mito_data

In [236]:
# Assemble the SCENT object from the RNA and ATAC matrices plus the subset of
# metadata columns it needs: cell ID, cell type, and the three covariates.
SCENT_obj <- CreateSCENTObj(
  rna = rna,
  atac = atac,
  meta.data = cells[, c('cell', 'celltype', 'Sample', 'log_nFrags', 'mito'),
                    drop = FALSE],
  covariates = c('Sample', 'log_nFrags', 'mito'),
  celltypes = 'CD4'
)

In [190]:
# create peak-gene links using windowed +/-500kb around genes
# NOTE: this reuses the name `directory`, overwriting the raw-data folder set
# at the top of the notebook. It now holds the path to a BED file, so the
# earlier data-loading cells fail if re-run after this one.
directory <- '/projects/zhanglab/users/ana/bedtools2/ana_bedfiles/neatseq_beds/scent_hg38/scentpaper_genes_windowed.bed'

In [237]:
# scent's approach for adding peak gene links. must generate gene windows bed yourself.
# The temporary and intersected BED files are written to the home directory.
SCENT_obj <- SCENT::CreatePeakToGeneList(
  SCENT_obj,
  genebed = directory,
  nbatch = 1000,
  tmpfile = "~/temporary_atac_peak.bed",
  intersectedfile = "~/temporary_atac_peak_intersected.bed.gz"
)

In [228]:
# Print the slot layout of the SCENT object, limited to two nesting levels
str(SCENT_obj, max.level = 2)

Formal class 'SCENT' [package "SCENT"] with 8 slots
  ..@ rna           :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  ..@ atac          :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  ..@ meta.data     :'data.frame':	8472 obs. of  4 variables:
  ..@ peak.info     :'data.frame':	0 obs. of  0 variables
Formal class 'data.frame' [package "methods"] with 4 slots
  ..@ peak.info.list:List of 10
  ..@ covariates    : chr [1:3] "Sample" "log_nFrags" "mito"
  ..@ celltypes     : chr "CD4"
  ..@ SCENT.result  :'data.frame':	0 obs. of  0 variables
Formal class 'data.frame' [package "methods"] with 4 slots


In [250]:
# Serialize the SCENT object to the working directory so it can be processed
# outside the notebook
saveRDS(SCENT_obj, file = 'SCENT_neatseq.rds')

Run `parallelizedSCENT.sh` on this object in `/projects/zhanglab/users/ana/multiome/results/scent` for faster results. With a batch size of ~1000, this takes about 3 days.