In [None]:
# ---- User-configurable parameters ----
# The three input paths below are relative and might need to be changed.

# Input files
rnaCountMatrix <- "../../dorcs_data_example/out.gene.bc.matrices.h5"
atacFragFile <- "../../dorcs_data_example/shareseq-project.atac.GRCh38.cleaned.filtered.bedpe"
peakFile <- "../../dorcs_data_example/GM_nonoverlap.bed"

# General run settings
genome <- "hg38"
nCores <- 4
savePlotsToDir <- TRUE

# Seurat QC thresholds for the RNA data
minFeature_RNA <- 200  # min number of features
maxFeature_RNA <- 2500 # max number of features
percentMT_RNA <- 5     # max % of mt
minCells_RNA <- 3      # min number of cells

# DORC calling
dorcGeneCutOff <- 10   # No. sig peaks needed to be called a DORC
fripCutOff <- 0.3      # QC threshold for fRIP score
corrPVal <- 0.05       # p-value cutoff for the correlation statistical test
topNGene <- 20         # Label top N genes in the j-plot
windowPadSize <- 50000 # Padding around TSS to define the regulatory region
# Number of bootstraps.
# NOTE(review): no visible cell reads `bootstrap` by this exact name; confirm it is still needed.
bootstrap <- 100

# Background / parallelization settings for the gene-peak correlation
numNearestNeighbor <- 100 # Number of nearest neighbors
numBackgroundPairs <- 1e5 # Number of background gene-peak pairs to generate
chunkSize <- 50000        # Chunk size (number of pairs) to parallelize centering ATAC counts

In [None]:
# Install (only when missing) and attach every package this notebook uses.
packages <- c(
  "dplyr", "Seurat", "hdf5r", "patchwork", "GenomicRanges", "ggplot2",
  "ggrepel", "reshape2", "ggrastr", "BuenColors", "ComplexHeatmap",
  "circlize", "networkD3", "GGally", "network", "motifmatchr", "foreach",
  "iterators", "parallel", "Biostrings", "rtracklayer"
)

# Add the BSgenome package that matches `genome`. Any other value of `genome`
# adds nothing (switch() falls through to NULL).
packages <- c(
  packages,
  switch(genome,
    hg38 = "BSgenome.Hsapiens.UCSC.hg38",
    mm10 = "BSgenome.Mmusculus.UCSC.mm10",
    NULL
  )
)

# Install only what is not already present. The genome package goes through
# the same check as everything else, so re-running this cell does not issue a
# fresh BiocManager::install() call for it each time.
new.packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
  BiocManager::install(new.packages, update = TRUE, ask = FALSE)
}

# Attach everything. invisible() keeps the list returned by lapply() out of
# the cell output; suppressMessages() silences start-up messages.
invisible(suppressMessages(lapply(packages, library, character.only = TRUE)))

In [None]:
# Bring in the DORC helper functions and the objects saved in TSSRanges.RData.
# Both paths are absolute and might need to be changed.
# TODO: download the helper script from GitHub (gh) and source it instead?
source(file = "/home/R/DORCS_helper_functions.R")
load(file = "/home/R/TSSRanges.RData")

In [None]:
# Create and preprocess the RNA count matrix using Seurat functions.

# Read the HDF5 count matrix and wrap it in a Seurat object, applying the
# min-cells / min-features thresholds at construction time.
rnaCounts <- Read10X_h5(rnaCountMatrix)
rnaCounts <- CreateSeuratObject(
  counts = rnaCounts,
  project = "shareseq",
  min.cells = minCells_RNA,
  min.features = minFeature_RNA
)

# Percentage of counts coming from mitochondrial genes. The pattern accepts
# both the upper-case "MT-" prefix and the lower-case "mt-" prefix, so the
# metric is not silently 0 for every cell when `genome` is "mm10".
rnaCounts[["percent.mt"]] <- PercentageFeatureSet(rnaCounts, pattern = "^[Mm][Tt]-")

# Keep the violin plot object around: it is printed here and reused when the
# plots are saved to disk.
RNAVlnPlot <- VlnPlot(
  rnaCounts,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
  ncol = 3
)
print(RNAVlnPlot)

# QC filtering and normalization: keep cells strictly inside the feature-count
# window and strictly below the mitochondrial percentage cutoff.
rnaCounts <- subset(
  rnaCounts,
  subset = nFeature_RNA > minFeature_RNA &
    nFeature_RNA < maxFeature_RNA &
    percent.mt < percentMT_RNA
)
rnaCounts <- NormalizeData(rnaCounts)

In [None]:
# Create and preprocess scATAC-seq: get counts from the fragment file and the
# peak file through the helper getCountsFromFrags().
#
# Alternative kept for reference (read both files into GRanges first):
# myPeaks = read.table(peakFile, sep="\t", header=F)
# myFrags = read.table(atacFragFile, sep="\t", header=F)
#
# peakRanges = makeGRangesFromDataFrame(myPeaks,seqnames.field = "V1",start.field = "V2",end.field = "V3",starts.in.df.are.0based = TRUE)
# fragRanges = makeGRangesFromDataFrame(myFrags, seqnames.field = "V1", start.field = "V2", end.field = "V3", keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)

peaksSE <- getCountsFromFrags(
  fragFile = atacFragFile,
  peakFile = peakFile
)


In [None]:
# Filter ATAC cells by FRIP, extract the RNA matrix, harmonize barcode names
# and find the cells present in both modalities.
#
# Alternative way to get counts, kept for reference:
# peaksSE = getCountsFromFrags(fragRanges=fragRanges, peaks=peakRanges )

# Keep only the columns whose FRIP is above the QC cutoff.
SE.filt <- peaksSE[, peaksSE$FRIP > fripCutOff]

# Extract the RNA "data" matrix through the accessor instead of reading the
# `@data` slot directly; the slot does not exist on Seurat v5 assay objects.
rnaMat <- GetAssayData(rnaCounts, assay = "RNA", slot = "data")

# Clean up the large objects that are no longer needed.
rm(peaksSE, rnaCounts)

# Cleaning - change barcode names:
#   1. drop the first ",P1.<digits>" occurrence,
#   2. turn every remaining comma into a dot (column names only).
# NOTE(review): SE.filt$sample only gets step 1, as before; confirm whether it
# should also get step 2.
SE.filt$sample <- sub(",P1\\.[0-9]+", "", SE.filt$sample)
colnames(SE.filt) <- sub(",P1\\.[0-9]+", "", colnames(SE.filt))
colnames(SE.filt) <- gsub(",", "\\.", colnames(SE.filt))
colnames(rnaMat) <- sub("\\,P1\\.[0-9]+", "", colnames(rnaMat))
colnames(rnaMat) <- gsub(",", "\\.", colnames(rnaMat))

# Get the intersect of cells in RNA and ATAC.
cells <- intersect(colnames(SE.filt), colnames(rnaMat))

# Fail early with a clear message instead of passing zero-column matrices on.
if (length(cells) == 0) {
  stop(
    "No barcodes are shared between the ATAC and RNA data after barcode clean-up.",
    call. = FALSE
  )
}


In [None]:
# Fixed seed so the run is repeatable.
set.seed(123)

# Run fast gene peak correlation on the cells shared by both modalities.
# `n_bg` is passed exactly once: supplying a formal argument twice is an error
# in R, and the extra occurrence referred to a variable (`bootstraps`) that is
# not defined anywhere visible in this notebook.
# NOTE(review): n_bg is given numNearestNeighbor here; confirm against the
# definition of fastGenePeakcorr that this, rather than `bootstrap`, is the
# intended value (both are set to 100 in the parameters cell).
cisCor <- fastGenePeakcorr(
  SE.filt[, cells],
  rnaMat[, cells],
  genome = genome, # This will be one of "hg19","hg38" or "mm10"
  windowPadSize = windowPadSize,
  normalizeATACmat = TRUE,
  nCores = nCores,
  p.cut = NULL,
  n_bg = numNearestNeighbor,
  n_BgPairs = numBackgroundPairs,
  chunkSize = chunkSize
)

# Keep the gene-peak pairs that pass the p-value cutoff. filter() is
# namespace-qualified so a same-named function from a package attached after
# dplyr cannot be picked up by mistake.
cisCor.filt <- cisCor %>% dplyr::filter(pvalZ <= corrPVal)

In [None]:
# DORC genes and j-plot, computed from the filtered gene-peak correlations.
#   cutoff         - No. sig peaks needed to be called a DORC
#   labelTop       - label the top N genes in the plot
#   returnGeneList - TRUE here; set this to FALSE for just the plot
dorcGenes <- dorcJPlot(
  dorcTab = cisCor.filt,
  cutoff = dorcGeneCutOff,
  labelTop = topNGene,
  returnGeneList = TRUE,
  force = 2
)

In [None]:
# Optionally write the plots (PDF and PNG) and the summary tables to disk,
# then zip whatever is in the "plots" directory.
if (savePlotsToDir) {
  # showWarnings = FALSE: re-running the cell with an existing directory is fine.
  dir.create("plots", showWarnings = FALSE)

  # Save `plotObject` as plots/<filename>.pdf and plots/<filename>.png.
  #   filename   - file name without directory or extension
  #   plotObject - any object whose print() method draws the plot
  # Returns NULL invisibly.
  savePlots <- function(filename, plotObject) {
    # Open one graphics device, draw, and always close it again - even when
    # print() fails - so no device is left open.
    writePlot <- function(openDevice, extension) {
      openDevice(paste0("plots/", filename, extension))
      on.exit(dev.off(), add = TRUE)
      print(plotObject)
    }
    writePlot(pdf, ".pdf")
    writePlot(png, ".png")
    invisible(NULL)
  }

  savePlots("RNAViolinPlot", RNAVlnPlot)
  # dorcJPlot is called again with returnGeneList = FALSE to obtain the plot
  # rather than the gene list.
  savePlots(
    "JPlot",
    dorcJPlot(
      dorcTab = cisCor.filt,
      cutoff = dorcGeneCutOff,
      labelTop = topNGene,
      returnGeneList = FALSE,
      force = 2
    )
  )

  # Summary tables; note the regions table is written from the unfiltered
  # `cisCor`, not from `cisCor.filt`.
  write.table(dorcGenes, file = "dorc_genes_summary.csv", row.names = TRUE, quote = FALSE, sep = ",")
  write.table(cisCor, file = "dorc_regions_summary.csv", row.names = TRUE, quote = FALSE, sep = ",")
}

# Zip the contents of "plots". Skip the call when there is nothing to archive
# (e.g. savePlotsToDir is FALSE and no earlier run left files behind).
files2zip <- dir("plots", full.names = TRUE)
if (length(files2zip) > 0) {
  zip(zipfile = "plots.zip", files = files2zip)
}