In [2]:
args <- commandArgs(T) 

In [1]:
library(here)
library(rjson)
library(Matrix)
library(tidyverse)
library(dplyr)
library(DropletUtils) 

library(Seurat)
library(Signac)
library(SeuratDisk)

here() starts at /NFS_home/NFS_home_2/wsg/BM/pipeline/scripts/data_manipulation/BMMC

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.1     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mtidyr[39m::[32mexpand()[39m masks [34mMatrix[39m::expand()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[31m✖[39m [34mtidyr[39m::[32mpack()[39m   masks [34mMatrix[39m::pack()
[31m✖[39m [34mtidyr[39m::[32munpack()[39m masks 

# Data Manipulation: Load and Subset BMMC

In [4]:
input_path <- "/Data/wangsg/BM/pipeline/results/BMMC/data_preprocess"
output_path <- "/Data/wangsg/BM/pipeline/results/BMMC/data_preprocess/pair_10p"
dataset <- unlist(fromJSON(file = "/Data/wangsg/BM/pipeline/results/BMMC/data_preprocess/BMMC.json"))

“cannot open file '/Data/wangsg/BM/pipeline/results/BMMC/data_preprocess/BMMC.json': No such file or directory”


ERROR: Error in file(con, "r"): cannot open the connection


In [5]:
# Load RNA
BMMC_RNA_Dir <- "/Data/wangsg/BM/pipeline/results/BMMC/data_preprocess/BMMC-raw-pair-RNA-counts.mtx"
BMMC_RNA_counts <- Read10X(data.dir = BMMC_RNA_Dir, gene.column = 1)

In [6]:
metadata <- read.csv(paste0(BMMC_RNA_Dir, "/metadata.csv"), row.names = 1)
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

set.seed(1234)
# random sample 500 cells of each donor
# bmmc_rna_500_meta <- metadata %>% group_by(batch) %>% slice_sample(n=500)

# random sample 10% cells of each donor
bmmc_rna_10p_meta <- metadata %>% group_by(batch) %>% sample_frac(.1)
table(bmmc_rna_10p_meta$batch)
bmmc_rna_10p_meta <- as.data.frame(bmmc_rna_10p_meta)
rownames(bmmc_rna_10p_meta) <- bmmc_rna_10p_meta$barcode


 s1d1  s1d2  s1d3  s2d1  s2d4  s2d5 s3d10  s3d3  s3d6  s3d7  s4d1  s4d8  s4d9 
  622   674   428   422   611   490   678   432   168   177   802   988   432 

In [6]:
BMMC_RNA_counts_10p <- BMMC_RNA_counts[ , colnames(BMMC_RNA_counts) %in% bmmc_rna_10p_meta$barcode]

In [7]:
# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       "raw",
                       dataset["task_type"], 
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = BMMC_RNA_counts_10p, path = data_path, version = "3")
write_csv(bmmc_rna_10p_meta, here(data_path, "metadata.csv"))

In [8]:
# save raw rna to rds
saveRDS(BMMC_RNA_counts_10p, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          "raw",
                          dataset["task_type"], 
                          "RNA", "counts.rds", sep = "-")))

In [10]:
bmmc_rna_10p <- CreateSeuratObject(counts = BMMC_RNA_counts_10p, meta.data = bmmc_rna_10p_meta)

In [11]:
# save raw rna to h5Seurat
SaveH5Seurat(bmmc_rna_10p, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   "raw", 
                                   dataset["task_type"], 
                                   "RNA", "counts.h5Seurat", sep = "-")))

"Overwriting previous file /Data/wangsg/BM/pipeline/results/BMMC/data_preprocess/pair_10p/BMMC-raw-pair-RNA-counts.h5Seurat"
Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA



In [12]:
# save raw rna to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"],"raw", dataset["task_type"], "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [3]:
# Load ATAC
BMMC_ATAC_Dir <- "/Data/wangsg/BM/pipeline/results/BMMC/data_preprocess/BMMC-raw-pair-ATAC-peaks.mtx/"
BMMC_ATAC_counts <- Read10X(data.dir = BMMC_ATAC_Dir, gene.column = 1)

as(<dgTMatrix>, "dgCMatrix") is deprecated since Matrix 1.5-0; do as(., "CsparseMatrix") instead



In [7]:
metadata <- read.csv(paste0(BMMC_ATAC_Dir, "/metadata.csv"), row.names = 1)
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

# set.seed(1234)
# random sample 500 cells of each donor
# bmmc_rna_500_meta <- metadata %>% group_by(batch) %>% slice_sample(n=500)

# random sample 10% cells of each donor
# bmmc_rna_10p_meta <- metadata %>% group_by(batch) %>% sample_frac(.1)

table(metadata$batch)
bmmc_atac_10p_meta <- metadata[rownames(metadata) %in% rownames(bmmc_rna_10p_meta), ]
table(bmmc_atac_10p_meta$batch)


 s1d1  s1d2  s1d3  s2d1  s2d4  s2d5 s3d10  s3d3  s3d6  s3d7  s4d1  s4d8  s4d9 
 6224  6740  4279  4220  6111  4895  6781  4325  1679  1771  8023  9876  4325 


 s1d1  s1d2  s1d3  s2d1  s2d4  s2d5 s3d10  s3d3  s3d6  s3d7  s4d1  s4d8  s4d9 
  622   674   428   422   611   490   678   432   168   177   802   988   432 

In [8]:
BMMC_ATAC_counts_10p <- BMMC_ATAC_counts[ , colnames(BMMC_ATAC_counts) %in% bmmc_atac_10p_meta$barcode]

In [9]:
# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       "raw",
                       dataset["task_type"], 
                       "ATAC", "peaks.mtx", sep = "-"))
write10xCounts(x = BMMC_ATAC_counts_10p, path = data_path, version = "3")
write_csv(bmmc_atac_10p_meta, here(data_path, "metadata.csv"))

In [10]:
# save raw atac to rds
saveRDS(BMMC_ATAC_counts_10p, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          "raw",
                          dataset["task_type"], 
                          "ATAC", "peaks.rds", sep = "-")))

In [11]:
chrom_assay <- CreateChromatinAssay(
    counts = BMMC_ATAC_counts_10p,
    sep = c("-", "-")
)
bmmc_atac_10p <- CreateSeuratObject(counts = chrom_assay, assay = "ATAC", meta.data = bmmc_atac_10p_meta)

"Keys should be one or more alphanumeric characters followed by an underscore, setting key from atac to atac_"


In [12]:
# save raw rna to h5Seurat
SaveH5Seurat(bmmc_atac_10p, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   "raw", 
                                   dataset["task_type"], 
                                   "ATAC", "peaks.h5Seurat", sep = "-")))

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Writing out ranges for ATAC

Writing out motifs for ATAC

Writing out fragments for ATAC

Writing out seqinfo for ATAC

Writing out annotation for ATAC

Writing out bias for ATAC

Writing out positionEnrichment for ATAC

Writing out links for ATAC



In [14]:
# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"],"raw", dataset["task_type"], "ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



## subset cells from single sample

In [11]:
input_path <- "/Data/wangsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/RawData"
output_path <- "/Data/wangsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/s3d10"
dataset <- unlist(fromJSON(file = "/Data/wangsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/dataset.json"))

In [21]:
# 设置提取样本
# s1d1  s1d2  s1d3  s2d1  s2d4  s2d5 s3d10  s3d3  s3d6  s3d7  s4d1  s4d8  s4d9 
# 6224  6740  4279  4220  6111  4895  6781  4325  1679  1771  8023  9876  4325 
sample = "s3d10"

In [13]:
paste0(input_path, "/BMMC-multiome-raw-RNA-counts.mtx")

In [14]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-multiome-raw-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(RNA_Dir, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)

set.seed(1234)

In [15]:
RNA_counts

   [[ suppressing 32 column names 'TAGTTGTCACCCTCAC-1-s1d1', 'CTATGGCCATAACGGG-1-s1d1', 'CCGCACACAGGTTAAA-1-s1d1' ... ]]

   [[ suppressing 32 column names 'TAGTTGTCACCCTCAC-1-s1d1', 'CTATGGCCATAACGGG-1-s1d1', 'CCGCACACAGGTTAAA-1-s1d1' ... ]]

   [[ suppressing 32 column names 'TAGTTGTCACCCTCAC-1-s1d1', 'CTATGGCCATAACGGG-1-s1d1', 'CCGCACACAGGTTAAA-1-s1d1' ... ]]



13431 x 69249 sparse Matrix of class "dgCMatrix"
                                                                                           
AL627309.5   . .  1 . .  . . .  . .  .  .  .  . . . .  .  . .  . .  .  . . . .  . .  . .  .
LINC01409    . .  . . .  . . .  . .  .  .  .  . . . .  .  . .  . .  .  . . . .  . .  . .  .
LINC01128    . .  . . .  . . .  . .  .  .  .  . . . .  .  . .  . .  .  . . . .  . 1  . .  .
NOC2L        . .  . . .  . . .  1 .  .  .  .  . . . .  1  . .  . .  .  . . . 1  . .  . 1  1
KLHL17       . .  . . .  1 . .  . .  .  .  .  . . . .  .  . .  . .  .  . . . .  . .  . .  .
ISG15        . .  . . .  2 . .  . .  .  .  .  . . . 1  .  1 .  . .  .  . . . .  . 1  . .  1
C1orf159     . .  . . .  . . .  . .  1  .  .  . . . 1  .  . .  . .  .  . . . .  . .  . .  .
SDF4         . .  . . .  . . .  . .  .  .  .  . . 1 .  .  . .  . .  .  . . . .  . .  . 1  .
B3GALT6      . .  . . .  . . .  . .  .  .  .  . . . .  .  . .  . .  .  . . . .  . .  . .  .
UBE2J2       . 1  . . .  . . 1 

In [16]:
rna_subset_meta <- metadata[metadata$batch==sample, ]
table(rna_subset_meta$batch)
table(rna_subset_meta$cell_type)

rna_subset_meta <- as.data.frame(rna_subset_meta)
rownames(rna_subset_meta) <- rna_subset_meta$barcode


s3d10 
 6781 


            B1 B       CD14+ Mono       CD16+ Mono CD4+ T activated 
             127             1603              253              207 
    CD4+ T naive           CD8+ T     CD8+ T naive             cDC2 
             106              432              366               83 
    Erythroblast         G/M prog              HSC              ILC 
             391              432              291              129 
      Lymph prog        MK/E prog    Naive CD20+ B               NK 
             231              125              421              545 
      Normoblast              pDC      Plasma cell  Proerythroblast 
             129              427               13              111 
  Transitional B 
             359 

In [17]:
# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"],
                       dataset["task_type"],
                       sample,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          sample,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   sample,
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], sample,"RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [18]:
# Load binarized ATAC
ATAC_Dir <- paste0(input_path, "/BMMC-multiome-binarized-ATAC-peaks.mtx")
ATAC_counts <- Read10X(data.dir = ATAC_Dir, gene.column = 1)

metadata <- read.csv(paste0(ATAC_Dir, "/metadata.csv"), row.names = 1)
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

atac_subset_meta <- metadata[metadata$batch==sample, ]
table(atac_subset_meta$batch)
table(atac_subset_meta$cell_type)

atac_subset_meta <- as.data.frame(atac_subset_meta)
rownames(atac_subset_meta) <- atac_subset_meta$barcode

In [32]:
# subset atac counts matrix
ATAC_subset_counts <- ATAC_counts[ , colnames(ATAC_counts) %in% atac_subset_meta$barcode]

# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       sample,
                       "ATAC", "binarized_peaks.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
write_csv(atac_subset_meta, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ATAC_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          sample,
                          "ATAC", "binarized_peaks.rds", sep = "-")))

ATAC_subset <- CreateSeuratObject(counts = ATAC_subset_counts, assay = "ATAC", meta.data = atac_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ATAC_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   sample, 
                                   "ATAC", "binarized_peaks.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], sample,"ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



In [20]:
# Load ATAC
ATAC_Dir <- paste0(input_path, "/BMMC-multiome-raw-ATAC-peaks.mtx")
ATAC_counts <- Read10X(data.dir = ATAC_Dir, gene.column = 1)

metadata <- read.csv(paste0(ATAC_Dir, "/metadata.csv"), row.names = 1)
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

atac_subset_meta <- metadata[metadata$batch==sample, ]
table(atac_subset_meta$batch)
table(atac_subset_meta$cell_type)

atac_subset_meta <- as.data.frame(atac_subset_meta)
rownames(atac_subset_meta) <- atac_subset_meta$barcode


s3d10 
 6781 


            B1 B       CD14+ Mono       CD16+ Mono CD4+ T activated 
             127             1603              253              207 
    CD4+ T naive           CD8+ T     CD8+ T naive             cDC2 
             106              432              366               83 
    Erythroblast         G/M prog              HSC              ILC 
             391              432              291              129 
      Lymph prog        MK/E prog    Naive CD20+ B               NK 
             231              125              421              545 
      Normoblast              pDC      Plasma cell  Proerythroblast 
             129              427               13              111 
  Transitional B 
             359 

In [22]:
# subset atac counts matrix
ATAC_subset_counts <- ATAC_counts[ , colnames(ATAC_counts) %in% atac_subset_meta$barcode]

# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       sample,
                       "ATAC", "peaks.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
write_csv(atac_subset_meta, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ATAC_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          sample,
                          "ATAC", "peaks.rds", sep = "-")))

ATAC_subset <- CreateSeuratObject(counts = ATAC_subset_counts, assay = "ATAC", meta.data = atac_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ATAC_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   sample, 
                                   "ATAC", "peaks.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], sample,"ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



## sample N cells

In [44]:
input_path <- "/Data/wangsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/RawData"
output_path <- "/Data/wangsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/c50k"
dataset <- unlist(fromJSON(file = "/Data/wangsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/dataset.json"))

# 设置提取数量
Ncell = 50000
process = "c50k"

In [10]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-multiome-raw-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)

In [57]:
# random sample 500 cells of each donor
# bmmc_rna_500_meta <- metadata %>% group_by(batch) %>% slice_sample(n=500)

# random sample 10% cells of each donor
# rna_subset_meta <- metadata %>% group_by(batch) %>% sample_frac(percent)

# random sample Ncells of data
rna_subset_meta <- metadata %>% slice_sample(n=Ncell)

table(rna_subset_meta$batch)
table(rna_subset_meta$cell_type)




 s1d1  s1d2  s1d3  s2d1  s2d4  s2d5 s3d10  s3d3  s3d6  s3d7  s4d1  s4d8  s4d9 
 9006  9791  6144  6039  8807  7141  9777  6200  2406  2528 11609 14340  6212 


               B1 B          CD14+ Mono          CD16+ Mono    CD4+ T activated 
               2767               15625                2769                8059 
       CD4+ T naive              CD8+ T        CD8+ T naive                cDC2 
               6261               16900                1426                1230 
       Erythroblast            G/M prog                 HSC ID2-hi myeloid prog 
               7167                1718                1531                 163 
                ILC          Lymph prog           MK/E prog       Naive CD20+ B 
               1183                2536                1259                7262 
                 NK          Normoblast                 pDC         Plasma cell 
              10018                2499                1714                 556 
    Proerythroblast      Transitional B 
               3318                4039 

In [41]:
rna_subset_meta <- as.data.frame(rna_subset_meta)
rownames(rna_subset_meta) <- rna_subset_meta$barcode

# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [16]:
# Load binarized ATAC
ATAC_Dir <- paste0(input_path, "/BMMC-multiome-raw-ATAC-peaks.mtx")
ATAC_counts <- Read10X(data.dir = ATAC_Dir, gene.column = 1)

metadata <- read.csv(paste0(ATAC_Dir, "/metadata.csv"), row.names = 1)
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

In [42]:
atac_subset_meta <- metadata[rownames(metadata) %in% rownames(rna_subset_meta), ]
table(atac_subset_meta$batch)
table(atac_subset_meta$cell_type)

atac_subset_meta <- as.data.frame(atac_subset_meta)
rownames(atac_subset_meta) <- atac_subset_meta$barcode


 s1d1  s1d2  s1d3  s2d1  s2d4  s2d5 s3d10  s3d3  s3d6  s3d7  s4d1  s4d8  s4d9 
 4464  4848  3058  3050  4415  3508  4929  3163  1191  1305  5827  7104  3138 


               B1 B          CD14+ Mono          CD16+ Mono    CD4+ T activated 
               1372                7823                1390                3919 
       CD4+ T naive              CD8+ T        CD8+ T naive                cDC2 
               3168                8354                 725                 642 
       Erythroblast            G/M prog                 HSC ID2-hi myeloid prog 
               3518                 854                 768                  71 
                ILC          Lymph prog           MK/E prog       Naive CD20+ B 
                601                1277                 639                3705 
                 NK          Normoblast                 pDC         Plasma cell 
               5030                1273                 858                 285 
    Proerythroblast      Transitional B 
               1669                2059 

In [43]:
# subset atac counts matrix
ATAC_subset_counts <- ATAC_counts[ , colnames(ATAC_counts) %in% atac_subset_meta$barcode]

# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ATAC", "peaks.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
write_csv(atac_subset_meta, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ATAC_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ATAC", "peaks.rds", sep = "-")))

ATAC_subset <- CreateSeuratObject(counts = ATAC_subset_counts, assay = "ATAC", meta.data = atac_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ATAC_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ATAC", "peaks.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



## sample N + N cells

In [3]:
input_path <- "/home/wsg/BM/data/BMMC/RNA+ATAC/RawData"
output_path <- "/home/wsg/BM/data/BMMC/RNA+ATAC/c20k_c20k"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/data/BMMC/RNA+ATAC/RawData/dataset.json"))

In [4]:
# 设置提取样本
# s1d1  s1d2  s1d3  s2d1  s2d4  s2d5 s3d10  s3d3  s3d6  s3d7  s4d1  s4d8  s4d9 
# 6224  6740  4279  4220  6111  4895  6781  4325  1679  1771  8023  9876  4325 

process = "c20k_c20k"

In [5]:
# # Load RNA
# RNA_Dir <- paste0(input_path, "/BMMC-multiome-raw-RNA-counts.mtx")
# RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

In [6]:
# 添加barcode到metadata
metadata <- read.csv(paste0(RNA_Dir, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)

In [8]:
set.seed(1234)
# random sample Ncells of data
# rna_subset_meta <- metadata %>% slice_sample(n=20000)
meta_qry <- metadata %>% slice_sample(n=20000)

In [9]:
metadata_others <- metadata[!(metadata$barcode %in% meta_qry$barcode), ]
meta_ref <- metadata_others %>% slice_sample(n=20000)

meta_qry$data_size <- "c20k_1"
meta_ref$data_size <- "c20k_2"

rna_subset_meta <- rbind(meta_qry, meta_ref)

table(rna_subset_meta$data_size)
dim(rna_subset_meta)


c20k_1 c20k_2 
 20000  20000 

In [10]:
max(table(rna_subset_meta$barcode))

In [11]:
# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [12]:
# Load ATAC
ATAC_Dir <- paste0(input_path, "/BMMC-multiome-raw-ATAC-peaks.mtx")
ATAC_counts <- Read10X(data.dir = ATAC_Dir, gene.column = 1)

In [13]:
atac_subset_meta <- rna_subset_meta

table(atac_subset_meta$batch)
table(atac_subset_meta$cell_type)

# subset atac counts matrix
ATAC_subset_counts <- ATAC_counts[ , colnames(ATAC_counts) %in% atac_subset_meta$barcode]

# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ATAC", "peaks.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
# write_csv(atac_subset_meta, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ATAC_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ATAC", "peaks.rds", sep = "-")))

ATAC_subset <- CreateSeuratObject(counts = ATAC_subset_counts, assay = "ATAC", meta.data = atac_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ATAC_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ATAC", "peaks.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")


 s1d1  s1d2  s1d3  s2d1  s2d4  s2d5 s3d10  s3d3  s3d6  s3d7  s4d1  s4d8  s4d9 
 3578  3915  2546  2424  3520  2820  3938  2454   966  1043  4641  5685  2470 


               B1 B          CD14+ Mono          CD16+ Mono    CD4+ T activated 
               1117                6239                1096                3196 
       CD4+ T naive              CD8+ T        CD8+ T naive                cDC2 
               2530                6764                 578                 481 
       Erythroblast            G/M prog                 HSC ID2-hi myeloid prog 
               2863                 682                 616                  64 
                ILC          Lymph prog           MK/E prog       Naive CD20+ B 
                459                1023                 518                2912 
                 NK          Normoblast                 pDC         Plasma cell 
               4061                1017                 692                 218 
    Proerythroblast      Transitional B 
               1315                1559 

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



## sample (downsample N) + N cells

In [33]:
library(scuttle)
input_path <- "/home/wsg/BM/data/BMMC/RNA+ATAC/s1d1_s3d10"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/data/BMMC/RNA+ATAC/s1d1_s3d10/s1d1_s3d10.json"))

In [74]:
# 设置提取比例
proportion = 0.10
process = "s1d1_R10_A10_s3d10"

output_path <- paste0("/home/wsg/BM/data/BMMC/RNA+ATAC/s1d1_s3d10_robust/", process)

In [75]:
# # Load RNA
# RNA_Dir <- paste0(input_path, "/BMMC-multiome-s1d1_s3d10-RNA-counts.mtx/")
# RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# # 添加barcode到metadata
# metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
# metadata['barcode'] <- rownames(metadata)

In [76]:
table(metadata$batch)


 s1d1 s3d10 
 6224  6781 

In [77]:
metadata_data_1 <- metadata[metadata$batch == "s1d1", ]
metadata_data_2 <- metadata[metadata$batch == "s3d10", ]

table(metadata_data_1$batch)
table(metadata_data_2$batch)

RNA_counts_data_1  <- RNA_counts[, metadata_data_1$barcode]
RNA_counts_data_2  <- RNA_counts[, metadata_data_2$barcode]


s1d1 
6224 


s3d10 
 6781 

In [78]:
RNA_ds = RNA_counts_data_1
RNA_raw = RNA_counts_data_2

In [79]:
# RNA_counts
dim(RNA_ds)
nnzero(RNA_ds)
sum(RNA_ds)

In [80]:
RNA_ds_subset <- downsampleMatrix(RNA_ds, prop = proportion, bycol = T)

In [81]:
# RNA_subset_counts
dim(RNA_ds_subset)
nnzero(RNA_ds_subset)
sum(RNA_ds_subset)

In [82]:
if(!identical(rownames(RNA_ds_subset), rownames(RNA_raw))) {
  stop("The row names of the two matrices do not match.")
}

RNA_subset_counts <- cbind(RNA_ds_subset, RNA_raw)

if(!identical(colnames(RNA_subset_counts), metadata$barcode)) {
  stop("The row names of the two matrices do not match.")
}

In [83]:
sum(RNA_counts)
sum(RNA_raw)
sum(RNA_ds)

sum(RNA_subset_counts)
sum(RNA_raw)
sum(RNA_ds_subset)

In [84]:
# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = metadata)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [85]:
# # Load binarized ATAC
# ATAC_Dir <- paste0(input_path, "/BMMC-multiome-s1d1_s3d10-ATAC-peaks.mtx/")
# ATAC_counts <- Read10X(data.dir = ATAC_Dir, gene.column = 1)

# metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
# # 添加barcode到metadata
# metadata['barcode'] <- rownames(metadata)

In [86]:
metadata_data_1 <- metadata[metadata$batch == "s1d1", ]
metadata_data_2 <- metadata[metadata$batch == "s3d10", ]

table(metadata_data_1$batch)
table(metadata_data_2$batch)

ATAC_counts_data_1  <- ATAC_counts[, metadata_data_1$barcode]
ATAC_counts_data_2  <- ATAC_counts[, metadata_data_2$barcode]


s1d1 
6224 


s3d10 
 6781 

In [87]:
ATAC_ds = ATAC_counts_data_1
ATAC_raw = ATAC_counts_data_2

In [88]:
# ATAC_counts
dim(ATAC_ds)
nnzero(ATAC_ds)
sum(ATAC_ds)

In [89]:
ATAC_ds_subset <- downsampleMatrix(ATAC_ds, prop = proportion, bycol = T)

In [90]:
dim(ATAC_ds_subset)
nnzero(ATAC_ds_subset)
sum(ATAC_ds_subset)

In [91]:
if(!identical(rownames(ATAC_ds_subset), rownames(ATAC_raw))) {
  stop("The row names of the two matrices do not match.")
}

ATAC_subset_counts <- cbind(ATAC_ds_subset, ATAC_raw)

if(!identical(colnames(ATAC_subset_counts), metadata$barcode)) {
  stop("The row names of the two matrices do not match.")
}

In [92]:
sum(ATAC_counts)
sum(ATAC_raw)
sum(ATAC_ds)

sum(ATAC_subset_counts)
sum(ATAC_raw)
sum(ATAC_ds_subset)

In [93]:
# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ATAC", "peaks.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ATAC_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ATAC", "peaks.rds", sep = "-")))

ATAC_subset <- CreateSeuratObject(counts = ATAC_subset_counts, assay = "ATAC", meta.data = metadata)

# save raw rna to h5Seurat
SaveH5Seurat(ATAC_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ATAC", "peaks.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



## sample N cells (N > rowdata)

In [23]:
input_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/RawData"
output_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/c500k"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/dataset.json"))

# 设置提取数量
Ncell = 500000
process = "c100k"

In [9]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-multiome-raw-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)

In [24]:
# random sample Ncells more than raw data
set.seed(1)
rna_subset_meta_1 <- metadata %>% slice_sample(n=50000)
set.seed(2)
rna_subset_meta_2 <- metadata %>% slice_sample(n=50000)
set.seed(3)
rna_subset_meta_3 <- metadata %>% slice_sample(n=50000)
set.seed(4)
rna_subset_meta_4 <- metadata %>% slice_sample(n=50000)
set.seed(5)
rna_subset_meta_5 <- metadata %>% slice_sample(n=50000)
set.seed(6)
rna_subset_meta_6 <- metadata %>% slice_sample(n=50000)
set.seed(7)
rna_subset_meta_7 <- metadata %>% slice_sample(n=50000)
set.seed(8)
rna_subset_meta_8 <- metadata %>% slice_sample(n=50000)
set.seed(9)
rna_subset_meta_9 <- metadata %>% slice_sample(n=50000)
set.seed(10)
rna_subset_meta_10 <- metadata %>% slice_sample(n=50000)

# rna_subset_meta <- rbind(rna_subset_meta_1, rna_subset_meta_2)
rna_subset_meta <- rbind(rna_subset_meta_1, rna_subset_meta_2, rna_subset_meta_3, 
                         rna_subset_meta_4, rna_subset_meta_5, rna_subset_meta_6, 
                         rna_subset_meta_7, rna_subset_meta_8, rna_subset_meta_9, 
                         rna_subset_meta_10)
dim(rna_subset_meta)

In [25]:
rna_subset_meta <- as.data.frame(rna_subset_meta)

# subset counts matrix
RNA_subset_counts <- RNA_counts[, rna_subset_meta$barcode]
colnames(RNA_subset_counts) <- make.unique(colnames(RNA_subset_counts))

rna_subset_meta$barcode <- make.unique(rna_subset_meta$barcode)
rownames(rna_subset_meta) <- rna_subset_meta$barcode

In [26]:
dim(RNA_subset_counts)

In [27]:
# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [18]:
# Load binarized ATAC
ATAC_Dir <- paste0(input_path, "/BMMC-multiome-raw-ATAC-peaks.mtx")
ATAC_counts <- Read10X(data.dir = ATAC_Dir, gene.column = 1)

metadata <- read.csv(paste0(ATAC_Dir, "/metadata.csv"), row.names = 1)
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

In [28]:
# atac_subset_meta <- rbind(rna_subset_meta_1, rna_subset_meta_2)
atac_subset_meta <- rbind(rna_subset_meta_1, rna_subset_meta_2, rna_subset_meta_3, 
                         rna_subset_meta_4, rna_subset_meta_5, rna_subset_meta_6, 
                         rna_subset_meta_7, rna_subset_meta_8, rna_subset_meta_9, 
                         rna_subset_meta_10)
dim(atac_subset_meta)

table(atac_subset_meta$batch)
table(atac_subset_meta$cell_type)


 s1d1  s1d2  s1d3  s2d1  s2d4  s2d5 s3d10  s3d3  s3d6  s3d7  s4d1  s4d8  s4d9 
44874 48653 30883 30565 44212 35296 48919 31296 12095 12767 57736 71465 31239 


               B1 B          CD14+ Mono          CD16+ Mono    CD4+ T activated 
              13584               78325               13702               39986 
       CD4+ T naive              CD8+ T        CD8+ T naive                cDC2 
              31679               83616                7284                6232 
       Erythroblast            G/M prog                 HSC ID2-hi myeloid prog 
              35572                8680                7694                 787 
                ILC          Lymph prog           MK/E prog       Naive CD20+ B 
               5962               12897                6393               36618 
                 NK          Normoblast                 pDC         Plasma cell 
              49889               12801                8570                2732 
    Proerythroblast      Transitional B 
              16635               20362 

In [29]:
atac_subset_meta <- as.data.frame(atac_subset_meta)

# subset counts matrix
ATAC_subset_counts <- ATAC_counts[, atac_subset_meta$barcode]
colnames(ATAC_subset_counts) <- make.unique(colnames(ATAC_subset_counts))

atac_subset_meta$barcode <- make.unique(atac_subset_meta$barcode)
rownames(atac_subset_meta) <- atac_subset_meta$barcode

In [30]:
dim(ATAC_subset_counts)

In [31]:
# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ATAC", "peaks.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
write_csv(atac_subset_meta, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ATAC_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ATAC", "peaks.rds", sep = "-")))

ATAC_subset <- CreateSeuratObject(counts = ATAC_subset_counts, assay = "ATAC", meta.data = atac_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ATAC_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ATAC", "peaks.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



## split sites and donors

In [285]:
input_path <- "/home/wsg/BM/data/BMMC/RNA+ATAC/RawData"
output_path <- "/home/wsg/BM/data/BMMC/RNA+ATAC/s1d2_s3d10"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/data/BMMC/RNA+ATAC/RawData/dataset.json"))

In [286]:
# 设置提取样本
# s1d1  s1d2  s1d3  s2d1  s2d4  s2d5 s3d10  s3d3  s3d6  s3d7  s4d1  s4d8  s4d9 
# 6224  6740  4279  4220  6111  4895  6781  4325  1679  1771  8023  9876  4325 

# sample = "s4d9"
sample = "s1d2_s3d10"

In [262]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-multiome-raw-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

In [287]:
# 添加barcode到metadata
metadata <- read.csv(paste0(RNA_Dir, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)

set.seed(1234)

In [288]:
# rna_subset_meta <- metadata[metadata$batch==sample, ]

samples = strsplit(sample, split = "_")[[1]]
rna_subset_meta <- metadata[metadata$batch==samples[1] | metadata$batch==samples[2], ]

table(rna_subset_meta$batch)
table(rna_subset_meta$cell_type)

rna_subset_meta <- as.data.frame(rna_subset_meta)
rownames(rna_subset_meta) <- rna_subset_meta$barcode


 s1d2 s3d10 
 6740  6781 


               B1 B          CD14+ Mono          CD16+ Mono    CD4+ T activated 
                254                2307                 432                 916 
       CD4+ T naive              CD8+ T        CD8+ T naive                cDC2 
               1395                2064                 366                 172 
       Erythroblast            G/M prog                 HSC ID2-hi myeloid prog 
                527                 459                 324                  41 
                ILC          Lymph prog           MK/E prog       Naive CD20+ B 
                496                 259                 196                 542 
                 NK          Normoblast                 pDC         Plasma cell 
               1527                 198                 480                  26 
    Proerythroblast      Transitional B 
                141                 399 

In [289]:
# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"],
                       dataset["task_type"],
                       sample,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          sample,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   sample,
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
SeuratDisk::Convert(paste(dataset["data_name"], dataset["task_type"], sample, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [275]:
# Load ATAC
ATAC_Dir <- paste0(input_path, "/BMMC-multiome-raw-ATAC-peaks.mtx")
ATAC_counts <- Read10X(data.dir = ATAC_Dir, gene.column = 1)

In [290]:
# subset atac counts matrix
ATAC_subset_counts <- ATAC_counts[ , colnames(ATAC_counts) %in% rna_subset_meta$barcode]

# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       sample,
                       "ATAC", "peaks.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ATAC_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          sample,
                          "ATAC", "peaks.rds", sep = "-")))

ATAC_subset <- CreateSeuratObject(counts = ATAC_subset_counts, assay = "ATAC", meta.data = rna_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ATAC_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   sample, 
                                   "ATAC", "peaks.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], sample,"ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



## dowmsample from single sample

In [3]:
input_path <- "/home/wsg/BM/data/BMMC/RNA+ATAC/site3/donor10"
output_path <- "/home/wsg/BM/data/BMMC/RNA+ATAC/site3/donor10_c5k"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/data/BMMC/RNA+ATAC/site3/donor10/s3d10.json"))

# 设置提取数量
Ncell = 5000
process = "s3d10c5k"

In [4]:
# Load RNA
RNA_Dir <- list.files(input_path, pattern = "\\RNA-counts.mtx$", full.names = TRUE)
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# add metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
metadata['barcode'] <- rownames(metadata)

## random sample 500 cells of each donor
# bmmc_rna_500_meta <- metadata %>% group_by(batch) %>% slice_sample(n=500)

## random sample 10% cells of each donor
# rna_subset_meta <- metadata %>% group_by(batch) %>% sample_frac(percent)

# random sample Ncells of data
rna_subset_meta <- metadata %>% slice_sample(n=Ncell)

table(rna_subset_meta$batch)
table(rna_subset_meta$cell_type)

# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))

write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")


s3d10 
 5000 


            B1 B       CD14+ Mono       CD16+ Mono CD4+ T activated 
              89             1185              181              141 
    CD4+ T naive           CD8+ T     CD8+ T naive             cDC2 
              79              305              274               66 
    Erythroblast         G/M prog              HSC              ILC 
             286              325              211              103 
      Lymph prog        MK/E prog    Naive CD20+ B               NK 
             177               98              298              399 
      Normoblast              pDC      Plasma cell  Proerythroblast 
             104              310                9               87 
  Transitional B 
             273 

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [5]:
# Load binarized ATAC
ATAC_Dir <- list.files(input_path, pattern = "\\ATAC-peaks.mtx$", full.names = TRUE)
ATAC_counts <- Read10X(data.dir = ATAC_Dir, gene.column = 1)

# add metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
metadata['barcode'] <- rownames(metadata)

atac_subset_meta <- metadata[rownames(metadata) %in% rownames(rna_subset_meta), ]

table(atac_subset_meta$batch)
table(atac_subset_meta$cell_type)

# subset atac counts matrix
ATAC_subset_counts <- ATAC_counts[ , colnames(ATAC_counts) %in% atac_subset_meta$barcode]

# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ATAC", "peaks.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
write_csv(atac_subset_meta, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ATAC_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ATAC", "peaks.rds", sep = "-")))

ATAC_subset <- CreateSeuratObject(counts = ATAC_subset_counts, assay = "ATAC", meta.data = atac_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ATAC_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ATAC", "peaks.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")


s3d10 
 5000 


            B1 B       CD14+ Mono       CD16+ Mono CD4+ T activated 
              89             1185              181              141 
    CD4+ T naive           CD8+ T     CD8+ T naive             cDC2 
              79              305              274               66 
    Erythroblast         G/M prog              HSC              ILC 
             286              325              211              103 
      Lymph prog        MK/E prog    Naive CD20+ B               NK 
             177               98              298              399 
      Normoblast              pDC      Plasma cell  Proerythroblast 
             104              310                9               87 
  Transitional B 
             273 

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



# Data Manipulation: BMMC CITE-seq

In [5]:
input_path <- ""
output_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/dataset.json"))

In [5]:
# Load RNA
BMMC_RNA_Dir <- paste0(input_path, "/BMMC-raw-pair-RNA-counts.mtx")
BMMC_RNA_counts <- Read10X(data.dir = BMMC_RNA_Dir, gene.column = 1)

as(<dgTMatrix>, "dgCMatrix") is deprecated since Matrix 1.5-0; do as(., "CsparseMatrix") instead

   [[ suppressing 32 column names 'GCATTAGCATAAGCGG-1-s1d1', 'TACAGGTGTTAGAGTA-1-s1d1', 'AGGATCTAGGTCTACT-1-s1d1' ... ]]

   [[ suppressing 32 column names 'GCATTAGCATAAGCGG-1-s1d1', 'TACAGGTGTTAGAGTA-1-s1d1', 'AGGATCTAGGTCTACT-1-s1d1' ... ]]

   [[ suppressing 32 column names 'GCATTAGCATAAGCGG-1-s1d1', 'TACAGGTGTTAGAGTA-1-s1d1', 'AGGATCTAGGTCTACT-1-s1d1' ... ]]



13953 x 90261 sparse Matrix of class "dgCMatrix"
                                                                                         
AL627309.5    .  .  .  . .  . .   1   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
LINC01409     .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
LINC01128     .  .  .  . .  2 .   .   1  1  4  1  .  .  .  1  .  .  .  .   .   .  .  .  .
LINC00115     .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
FAM41C        .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
NOC2L         .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  1  .   .   2  .  .  .
KLHL17        .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
HES4          .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
ISG15         .  .  .  . .  . .   1   .  .  .  .  1  .  1  .  .  .  .  .   1   .  .  .  .
AGRN          .  .  .  . .  . .   .   .  .  .  .  .

In [6]:
# save raw rna to rds
saveRDS(BMMC_RNA_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          "raw",
                          dataset["task_type"], 
                          "RNA", "counts.rds", sep = "-")))

In [7]:
# Load ADT
BMMC_ADT_Dir <- paste0(input_path, "/BMMC-raw-pair-ADT-counts.mtx")
BMMC_ADT_counts <- Read10X(data.dir = BMMC_ADT_Dir, gene.column = 1)
BMMC_ADT_counts

   [[ suppressing 33 column names 'GCATTAGCATAAGCGG-1-s1d1', 'TACAGGTGTTAGAGTA-1-s1d1', 'AGGATCTAGGTCTACT-1-s1d1' ... ]]



134 x 90261 sparse Matrix of class "dgCMatrix"
                                                                              
CD86         1  55   .   .     1    .   .  51   1   .    .   .   4   .   2   1
CD274        1   7   2   1     7    6   1   5   2   1    3   2   8   1   .   2
CD270       16  38  23  13    12    6  10  25  10   2    6   6  40  11  18  36
CD155        .  29   .   4     4    2   1  16   .   1    .   .   .   6   .   1
CD112        3  23   2  85     9    1   .  10   .   3    7   1   4   4   6   5
CD47       103 280  72  79   129   82 183 273 181  92   70  65 217 146  38 143
CD48        36 402  59  81     8    6  74 367 181   2    4  76 122  64  33  88
CD40        32   9  48   .     3    7   .  16   .   2    3   .  40   2  56  72
CD154       13  33  11   7    21   29   4  33   5   7   37   3  14   9  10  17
CD52        46  43  17  16     5    2  13  91  73   2    3  33 104   5  41 117
CD3          .   7   .   2     2    6   2  11 245   .    4  50   .   3   1   3
CD8  

In [8]:
# save raw adt to rds
saveRDS(BMMC_ADT_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          "raw",
                          dataset["task_type"], 
                          "ADT", "counts.rds", sep = "-")))

## subset 5 percents cells of BMMC CITE-seq

In [4]:
input_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/RawData"
output_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/p10"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/RawData/dataset.json"))

In [6]:
# 设置提取比例
percent = 0.1
process = "p10"

In [7]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)


set.seed(1234)

In [8]:
RNA_counts

  [[ suppressing 32 column names ‘GCATTAGCATAAGCGG-1-s1d1’, ‘TACAGGTGTTAGAGTA-1-s1d1’, ‘AGGATCTAGGTCTACT-1-s1d1’ ... ]]

  [[ suppressing 32 column names ‘GCATTAGCATAAGCGG-1-s1d1’, ‘TACAGGTGTTAGAGTA-1-s1d1’, ‘AGGATCTAGGTCTACT-1-s1d1’ ... ]]

  [[ suppressing 32 column names ‘GCATTAGCATAAGCGG-1-s1d1’, ‘TACAGGTGTTAGAGTA-1-s1d1’, ‘AGGATCTAGGTCTACT-1-s1d1’ ... ]]



13953 x 90261 sparse Matrix of class "dgCMatrix"
                                                                                         
AL627309.5    .  .  .  . .  . .   1   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
LINC01409     .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
LINC01128     .  .  .  . .  2 .   .   1  1  4  1  .  .  .  1  .  .  .  .   .   .  .  .  .
LINC00115     .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
FAM41C        .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
NOC2L         .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  1  .   .   2  .  .  .
KLHL17        .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
HES4          .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
ISG15         .  .  .  . .  . .   1   .  .  .  .  1  .  1  .  .  .  .  .   1   .  .  .  .
AGRN          .  .  .  . .  . .   .   .  .  .  .  .

In [11]:
# random sample 500 cells of each donor
# bmmc_rna_500_meta <- metadata %>% group_by(batch) %>% slice_sample(n=500)

# random sample 10% cells of each donor
rna_subset_meta <- metadata %>% group_by(batch) %>% sample_frac(percent)
table(rna_subset_meta$batch)
table(rna_subset_meta$cell_type)

rna_subset_meta <- as.data.frame(rna_subset_meta)
rownames(rna_subset_meta) <- rna_subset_meta$barcode


s1d1 s1d2 s1d3 s2d1 s2d4 s2d5 s3d1 s3d6 s3d7 s4d1 s4d8 s4d9 
 523  498  611 1046  558  912  952 1104 1147  546  393  736 


                      B1 B IGKC-                       B1 B IGKC+ 
                              55                               83 
                      CD14+ Mono                       CD16+ Mono 
                            2187                              244 
                CD4+ T activated     CD4+ T activated integrinB7+ 
                             725                              117 
           CD4+ T CD314+ CD45RA+                     CD4+ T naive 
                              14                              553 
                   CD8+ T CD49f+             CD8+ T CD57+ CD45RA+ 
                              98                              126 
            CD8+ T CD57+ CD45RO+             CD8+ T CD69+ CD45RA+ 
                             167                               82 
            CD8+ T CD69+ CD45RO+                     CD8+ T naive 
                             100                              299 
CD8+ T naive CD127+ CD26- CD101-            CD8+ T TIGIT+ CD4

In [12]:
# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [13]:
# Load ADT
ADT_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-ADT-counts.mtx")
ADT_counts <- Read10X(data.dir = ADT_Dir, gene.column = 1)

metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

adt_subset_meta <- metadata[rownames(metadata) %in% rownames(rna_subset_meta), ]
table(adt_subset_meta$batch)
table(adt_subset_meta$cell_type)

adt_subset_meta <- as.data.frame(adt_subset_meta)
rownames(adt_subset_meta) <- adt_subset_meta$barcode


s1d1 s1d2 s1d3 s2d1 s2d4 s2d5 s3d1 s3d6 s3d7 s4d1 s4d8 s4d9 
 523  498  611 1046  558  912  952 1104 1147  546  393  736 


                      B1 B IGKC-                       B1 B IGKC+ 
                              55                               83 
                      CD14+ Mono                       CD16+ Mono 
                            2187                              244 
                CD4+ T activated     CD4+ T activated integrinB7+ 
                             725                              117 
           CD4+ T CD314+ CD45RA+                     CD4+ T naive 
                              14                              553 
                   CD8+ T CD49f+             CD8+ T CD57+ CD45RA+ 
                              98                              126 
            CD8+ T CD57+ CD45RO+             CD8+ T CD69+ CD45RA+ 
                             167                               82 
            CD8+ T CD69+ CD45RO+                     CD8+ T naive 
                             100                              299 
CD8+ T naive CD127+ CD26- CD101-            CD8+ T TIGIT+ CD4

In [14]:
# set.seed(1234)
# random sample 500 cells of each donor
# bmmc_rna_500_meta <- metadata %>% group_by(batch) %>% slice_sample(n=500)

# random sample 10% cells of each donor
# bmmc_rna_10p_meta <- metadata %>% group_by(batch) %>% sample_frac(.1)



# subset adt counts matrix
ADT_subset_counts <- ADT_counts[ , colnames(ADT_counts) %in% adt_subset_meta$barcode]

# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ADT", "counts.mtx", sep = "-"))
write10xCounts(x = ADT_subset_counts, path = data_path, version = "3")
write_csv(adt_subset_meta, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ADT_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ADT", "counts.rds", sep = "-")))

ADT_subset <- CreateSeuratObject(counts = ADT_subset_counts, assay = "ADT", meta.data = adt_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ADT_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ADT", "counts.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ADT", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ADT

Adding data for ADT

No variable features found for ADT

No feature-level metadata found for ADT

Validating h5Seurat file

Adding data from ADT as X

Adding counts from ADT as raw

Transfering meta.data to obs



## subset cells from single sample

In [17]:
input_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/RawData"
output_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/s2d1"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/dataset.json"))

In [13]:
# 设置提取样本
#  s1d1  s1d2  s1d3  s2d1  s2d4  s2d5  s3d1  s3d6  s3d7  s4d1  s4d8  s4d9 
#  5227  4978  6106 10465  5584  9122  9521 11035 11473  5456  3929  7365 
sample = "s2d1"

In [16]:
output_path

In [5]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)

set.seed(1234)

In [None]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-multiome-raw-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(RNA_Dir, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)

set.seed(1234)

In [14]:
RNA_counts

  [[ suppressing 32 column names ‘GCATTAGCATAAGCGG-1-s1d1’, ‘TACAGGTGTTAGAGTA-1-s1d1’, ‘AGGATCTAGGTCTACT-1-s1d1’ ... ]]

  [[ suppressing 32 column names ‘GCATTAGCATAAGCGG-1-s1d1’, ‘TACAGGTGTTAGAGTA-1-s1d1’, ‘AGGATCTAGGTCTACT-1-s1d1’ ... ]]

  [[ suppressing 32 column names ‘GCATTAGCATAAGCGG-1-s1d1’, ‘TACAGGTGTTAGAGTA-1-s1d1’, ‘AGGATCTAGGTCTACT-1-s1d1’ ... ]]



13953 x 90261 sparse Matrix of class "dgCMatrix"
                                                                                         
AL627309.5    .  .  .  . .  . .   1   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
LINC01409     .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
LINC01128     .  .  .  . .  2 .   .   1  1  4  1  .  .  .  1  .  .  .  .   .   .  .  .  .
LINC00115     .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
FAM41C        .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
NOC2L         .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  1  .   .   2  .  .  .
KLHL17        .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
HES4          .  .  .  . .  . .   .   .  .  .  .  .  .  .  .  .  .  .  .   .   .  .  .  .
ISG15         .  .  .  . .  . .   1   .  .  .  .  1  .  1  .  .  .  .  .   1   .  .  .  .
AGRN          .  .  .  . .  . .   .   .  .  .  .  .

In [15]:
rna_subset_meta <- metadata[metadata$batch==sample, ]
table(rna_subset_meta$batch)
table(rna_subset_meta$cell_type)

rna_subset_meta <- as.data.frame(rna_subset_meta)
rownames(rna_subset_meta) <- rna_subset_meta$barcode


 s2d1 
10465 


                  B1 B IGKC-                   B1 B IGKC+ 
                          49                           86 
                  CD14+ Mono                   CD16+ Mono 
                        2958                          271 
            CD4+ T activated CD4+ T activated integrinB7+ 
                         690                          105 
                CD4+ T naive                CD8+ T CD49f+ 
                         883                           64 
        CD8+ T CD57+ CD45RA+         CD8+ T CD57+ CD45RO+ 
                          30                           48 
        CD8+ T CD69+ CD45RA+         CD8+ T CD69+ CD45RO+ 
                          92                           88 
                CD8+ T naive        CD8+ T TIGIT+ CD45RO+ 
                         312                          118 
                        cDC2                 Erythroblast 
                         166                         1202 
                    G/M prog                  gdT TCRVD

In [19]:
# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"],
                       dataset["task_type"],
                       sample,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          sample,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   sample,
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], sample,"RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [20]:
# Load ADT
ADT_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-ADT-counts.mtx")
ADT_counts <- Read10X(data.dir = ADT_Dir, gene.column = 1)

metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

In [21]:
adt_subset_meta <- metadata[rownames(metadata) %in% rownames(rna_subset_meta), ]
table(adt_subset_meta$batch)
table(adt_subset_meta$cell_type)

adt_subset_meta <- as.data.frame(adt_subset_meta)
rownames(adt_subset_meta) <- adt_subset_meta$barcode


 s2d1 
10465 


                  B1 B IGKC-                   B1 B IGKC+ 
                          49                           86 
                  CD14+ Mono                   CD16+ Mono 
                        2958                          271 
            CD4+ T activated CD4+ T activated integrinB7+ 
                         690                          105 
                CD4+ T naive                CD8+ T CD49f+ 
                         883                           64 
        CD8+ T CD57+ CD45RA+         CD8+ T CD57+ CD45RO+ 
                          30                           48 
        CD8+ T CD69+ CD45RA+         CD8+ T CD69+ CD45RO+ 
                          92                           88 
                CD8+ T naive        CD8+ T TIGIT+ CD45RO+ 
                         312                          118 
                        cDC2                 Erythroblast 
                         166                         1202 
                    G/M prog                  gdT TCRVD

In [23]:
# subset adt counts matrix
ADT_subset_counts <- ADT_counts[ , colnames(ADT_counts) %in% adt_subset_meta$barcode]

# save raw adt to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       sample,
                       "ADT", "peaks.mtx", sep = "-"))
write10xCounts(x = ADT_subset_counts, path = data_path, version = "3")
write_csv(adt_subset_meta, here(output_path, "metadata.csv"))

# save raw adt to rds
saveRDS(ADT_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          sample,
                          "ADT", "peaks.rds", sep = "-")))

ADT_subset <- CreateSeuratObject(counts = ADT_subset_counts, assay = "ADT", meta.data = adt_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ADT_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   sample, 
                                   "ADT", "peaks.h5Seurat", sep = "-")))

# save raw adt to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], sample, "ADT", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ADT

Adding data for ADT

No variable features found for ADT

No feature-level metadata found for ADT

Validating h5Seurat file

Adding data from ADT as X

Adding counts from ADT as raw

Transfering meta.data to obs



## subset N cells

In [26]:
input_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/RawData"
output_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/c50k"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/dataset.json"))

# 设置提取数量
Ncell = 50000
process = "c50k"

In [5]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)


set.seed(1234)

In [31]:
dim(RNA_counts)

In [27]:
# random sample Ncells of data
rna_subset_meta <- metadata %>% slice_sample(n=Ncell)

table(rna_subset_meta$batch)
table(rna_subset_meta$cell_type)


s1d1 s1d2 s1d3 s2d1 s2d4 s2d5 s3d1 s3d6 s3d7 s4d1 s4d8 s4d9 
2918 2808 3398 5773 3149 4991 5255 6043 6328 3087 2217 4033 


                      B1 B IGKC-                       B1 B IGKC+ 
                             354                              427 
                      CD14+ Mono                       CD16+ Mono 
                           12019                             1441 
                CD4+ T activated     CD4+ T activated integrinB7+ 
                            3896                              594 
           CD4+ T CD314+ CD45RA+                     CD4+ T naive 
                              43                             3280 
                   CD8+ T CD49f+             CD8+ T CD57+ CD45RA+ 
                             486                              723 
            CD8+ T CD57+ CD45RO+             CD8+ T CD69+ CD45RA+ 
                             812                              429 
            CD8+ T CD69+ CD45RO+                     CD8+ T naive 
                             497                             1730 
CD8+ T naive CD127+ CD26- CD101-            CD8+ T TIGIT+ CD4

In [28]:
rna_subset_meta <- as.data.frame(rna_subset_meta)
rownames(rna_subset_meta) <- rna_subset_meta$barcode

# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [8]:
# Load ADT
ADT_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-ADT-counts.mtx")
ADT_counts <- Read10X(data.dir = ADT_Dir, gene.column = 1)

metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

In [29]:
adt_subset_meta <- metadata[rownames(metadata) %in% rownames(rna_subset_meta), ]
table(adt_subset_meta$batch)
table(adt_subset_meta$cell_type)

adt_subset_meta <- as.data.frame(adt_subset_meta)
rownames(adt_subset_meta) <- adt_subset_meta$barcode


s1d1 s1d2 s1d3 s2d1 s2d4 s2d5 s3d1 s3d6 s3d7 s4d1 s4d8 s4d9 
2918 2808 3398 5773 3149 4991 5255 6043 6328 3087 2217 4033 


                      B1 B IGKC-                       B1 B IGKC+ 
                             354                              427 
                      CD14+ Mono                       CD16+ Mono 
                           12019                             1441 
                CD4+ T activated     CD4+ T activated integrinB7+ 
                            3896                              594 
           CD4+ T CD314+ CD45RA+                     CD4+ T naive 
                              43                             3280 
                   CD8+ T CD49f+             CD8+ T CD57+ CD45RA+ 
                             486                              723 
            CD8+ T CD57+ CD45RO+             CD8+ T CD69+ CD45RA+ 
                             812                              429 
            CD8+ T CD69+ CD45RO+                     CD8+ T naive 
                             497                             1730 
CD8+ T naive CD127+ CD26- CD101-            CD8+ T TIGIT+ CD4

In [30]:
# subset adt counts matrix
ADT_subset_counts <- ADT_counts[ , colnames(ADT_counts) %in% adt_subset_meta$barcode]

# save raw adt to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ADT", "peaks.mtx", sep = "-"))
write10xCounts(x = ADT_subset_counts, path = data_path, version = "3")
write_csv(adt_subset_meta, here(output_path, "metadata.csv"))

# save raw adt to rds
saveRDS(ADT_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ADT", "peaks.rds", sep = "-")))

ADT_subset <- CreateSeuratObject(counts = ADT_subset_counts, assay = "ADT", meta.data = adt_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ADT_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ADT", "peaks.h5Seurat", sep = "-")))

# save raw adt to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ADT", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ADT

Adding data for ADT

No variable features found for ADT

No feature-level metadata found for ADT

Validating h5Seurat file

Adding data from ADT as X

Adding counts from ADT as raw

Transfering meta.data to obs



## sample N + N cells

In [672]:
input_path <- "/home/wsg/BM/data/BMMC/RNA+ADT/RawData"
output_path <- "/home/wsg/BM/data/BMMC/RNA+ADT/c20k_c20k"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/data/BMMC/RNA+ADT/RawData/dataset.json"))

In [673]:
# 设置提取样本
#  s1d1  s1d2  s1d3  s2d1  s2d4  s2d5  s3d1  s3d6  s3d7  s4d1  s4d8  s4d9 
#  5227  4978  6106 10465  5584  9122  9521 11035 11473  5456  3929  7365 

process = "c20k_c20k"

In [674]:
# # Load RNA
# RNA_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-RNA-counts.mtx")
# RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

In [675]:
# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)

set.seed(1234)

In [676]:
set.seed(1234)
# random sample Ncells of data
# rna_subset_meta <- metadata %>% slice_sample(n=Ncell)
meta_qry <- metadata %>% slice_sample(n=20000)

In [677]:
metadata_others <- metadata[!(metadata$barcode %in% meta_qry$barcode), ]
meta_ref <- metadata_others %>% slice_sample(n=20000)

meta_qry$data_size <- "c20k_1"
meta_ref$data_size <- "c20k_2"

rna_subset_meta <- rbind(meta_qry, meta_ref)

table(rna_subset_meta$data_size)
dim(rna_subset_meta)


c20k_1 c20k_2 
 20000  20000 

In [678]:
max(table(rna_subset_meta$barcode))

In [679]:
# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [680]:
# # Load ADT
# ADT_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-ADT-counts.mtx")
# ADT_counts <- Read10X(data.dir = ADT_Dir, gene.column = 1)

In [681]:
adt_subset_meta <- rna_subset_meta

# subset adt counts matrix
ADT_subset_counts <- ADT_counts[ , colnames(ADT_counts) %in% adt_subset_meta$barcode]

# save raw adt to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ADT", "peaks.mtx", sep = "-"))
write10xCounts(x = ADT_subset_counts, path = data_path, version = "3")
write_csv(adt_subset_meta, here(output_path, "metadata.csv"))

# save raw adt to rds
saveRDS(ADT_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ADT", "peaks.rds", sep = "-")))

ADT_subset <- CreateSeuratObject(counts = ADT_subset_counts, assay = "ADT", meta.data = adt_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ADT_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ADT", "peaks.h5Seurat", sep = "-")))

# save raw adt to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ADT", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ADT

Adding data for ADT

No variable features found for ADT

No feature-level metadata found for ADT

Validating h5Seurat file

Adding data from ADT as X

Adding counts from ADT as raw

Transfering meta.data to obs



## sample (downsample N) + N cells

In [94]:
library(scuttle)
input_path <- "/home/wsg/BM/data/BMMC/RNA+ADT/s2d1_s3d6"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/data/BMMC/RNA+ADT/s2d1_s3d6/s2d1_s3d6.json"))

In [239]:
# 设置提取比例
proportion = 0.10
process = "s2d1_s3d6_R10_A10"

output_path <- paste0("/home/wsg/BM/data/BMMC/RNA+ADT/s2d1_s3d6_robust/", process)

In [240]:
# # Load RNA
# RNA_Dir <- paste0(input_path, "/BMMC-CITE_seq-s2d1_s3d6-RNA-counts.mtx/")
# RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# # 添加barcode到metadata
# metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
# metadata['barcode'] <- rownames(metadata)

In [241]:
table(metadata$batch)


 s2d1  s3d6 
10465 11035 

In [242]:
metadata_data_1 <- metadata[metadata$batch == "s2d1", ]
metadata_data_2 <- metadata[metadata$batch == "s3d6", ]

table(metadata_data_1$batch)
table(metadata_data_2$batch)

RNA_counts_data_1  <- RNA_counts[, metadata_data_1$barcode]
RNA_counts_data_2  <- RNA_counts[, metadata_data_2$barcode]


 s2d1 
10465 


 s3d6 
11035 

In [243]:
RNA_raw = RNA_counts_data_1
RNA_ds = RNA_counts_data_2

In [244]:
# RNA_counts
dim(RNA_ds)
nnzero(RNA_ds)
sum(RNA_ds)

In [245]:
RNA_ds_subset <- downsampleMatrix(RNA_ds, prop = proportion, bycol = T)

In [246]:
# RNA_subset_counts
dim(RNA_ds_subset)
nnzero(RNA_ds_subset)
sum(RNA_ds_subset)

In [247]:
if(!identical(rownames(RNA_ds_subset), rownames(RNA_raw))) {
  stop("The row names of the two matrices do not match.")
}

RNA_subset_counts <- cbind(RNA_raw, RNA_ds_subset)

if(!identical(colnames(RNA_subset_counts), metadata$barcode)) {
  stop("The row names of the two matrices do not match.")
}

In [248]:
sum(RNA_counts)
sum(RNA_raw)
sum(RNA_ds)

sum(RNA_subset_counts)
sum(RNA_raw)
sum(RNA_ds_subset)

In [249]:
# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = metadata)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [250]:
# # Load binarized ADT
# ADT_Dir <- paste0(input_path, "/BMMC-CITE_seq-s2d1_s3d6-ADT-counts.mtx/")
# ADT_counts <- Read10X(data.dir = ADT_Dir, gene.column = 1)

# metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
# # 添加barcode到metadata
# metadata['barcode'] <- rownames(metadata)

In [251]:
metadata_data_1 <- metadata[metadata$batch == "s2d1", ]
metadata_data_2 <- metadata[metadata$batch == "s3d6", ]

table(metadata_data_1$batch)
table(metadata_data_2$batch)

ADT_counts_data_1  <- ADT_counts[, metadata_data_1$barcode]
ADT_counts_data_2  <- ADT_counts[, metadata_data_2$barcode]


 s2d1 
10465 


 s3d6 
11035 

In [252]:
ADT_raw = ADT_counts_data_1
ADT_ds = ADT_counts_data_2

In [253]:
# ATAC_counts
dim(ADT_ds)
nnzero(ADT_ds)
sum(ADT_ds)

In [254]:
ADT_ds_subset <- downsampleMatrix(ADT_ds, prop = proportion, bycol = T)

In [255]:
dim(ADT_ds_subset)
nnzero(ADT_ds_subset)
sum(ADT_ds_subset)

In [256]:
if(!identical(rownames(ADT_ds_subset), rownames(ADT_raw))) {
  stop("The row names of the two matrices do not match.")
}

ADT_subset_counts <- cbind(ADT_raw, ADT_ds_subset)

if(!identical(colnames(ADT_subset_counts), metadata$barcode)) {
  stop("The row names of the two matrices do not match.")
}

In [257]:
sum(ADT_counts)
sum(ADT_raw)
sum(ADT_ds)

sum(ADT_subset_counts)
sum(ADT_raw)
sum(ADT_ds_subset)

In [258]:
# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ADT", "counts.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ADT_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ADT", "counts.rds", sep = "-")))

ADT_subset <- CreateSeuratObject(counts = ADT_subset_counts, assay = "ADT", meta.data = metadata)

# save raw rna to h5Seurat
SaveH5Seurat(ADT_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ADT", "counts.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ADT", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ADT

Adding data for ADT

No variable features found for ADT

No feature-level metadata found for ADT

Validating h5Seurat file

Adding data from ADT as X

Adding counts from ADT as raw

Transfering meta.data to obs



## seubset N cells (N > rowdata)

In [15]:
input_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/RawData"
output_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/c500k"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/dataset.json"))

# 设置提取数量
Ncell = 500000
process = "c500k"

In [4]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)

In [16]:
# random sample Ncells more than raw data
set.seed(1)
rna_subset_meta_1 <- metadata %>% slice_sample(n=50000)
set.seed(2)
rna_subset_meta_2 <- metadata %>% slice_sample(n=50000)
set.seed(3)
rna_subset_meta_3 <- metadata %>% slice_sample(n=50000)
set.seed(4)
rna_subset_meta_4 <- metadata %>% slice_sample(n=50000)
set.seed(5)
rna_subset_meta_5 <- metadata %>% slice_sample(n=50000)
set.seed(6)
rna_subset_meta_6 <- metadata %>% slice_sample(n=50000)
set.seed(7)
rna_subset_meta_7 <- metadata %>% slice_sample(n=50000)
set.seed(8)
rna_subset_meta_8 <- metadata %>% slice_sample(n=50000)
set.seed(9)
rna_subset_meta_9 <- metadata %>% slice_sample(n=50000)
set.seed(10)
rna_subset_meta_10 <- metadata %>% slice_sample(n=50000)

# rna_subset_meta <- rbind(rna_subset_meta_1, rna_subset_meta_2)
rna_subset_meta <- rbind(rna_subset_meta_1, rna_subset_meta_2, rna_subset_meta_3, 
                         rna_subset_meta_4, rna_subset_meta_5, rna_subset_meta_6, 
                         rna_subset_meta_7, rna_subset_meta_8, rna_subset_meta_9, 
                         rna_subset_meta_10)
dim(rna_subset_meta)

In [17]:
table(rna_subset_meta$batch)
table(rna_subset_meta$cell_type)


 s1d1  s1d2  s1d3  s2d1  s2d4  s2d5  s3d1  s3d6  s3d7  s4d1  s4d8  s4d9 
29016 27531 33951 57858 30792 50399 52772 61228 63328 30263 21841 41021 


                      B1 B IGKC-                       B1 B IGKC+ 
                            3427                             4530 
                      CD14+ Mono                       CD16+ Mono 
                          119866                            14616 
                CD4+ T activated     CD4+ T activated integrinB7+ 
                           38738                             5858 
           CD4+ T CD314+ CD45RA+                     CD4+ T naive 
                             541                            32643 
                   CD8+ T CD49f+             CD8+ T CD57+ CD45RA+ 
                            5055                             7277 
            CD8+ T CD57+ CD45RO+             CD8+ T CD69+ CD45RA+ 
                            8173                             4134 
            CD8+ T CD69+ CD45RO+                     CD8+ T naive 
                            4968                            17153 
CD8+ T naive CD127+ CD26- CD101-            CD8+ T TIGIT+ CD4

In [18]:
rna_subset_meta <- as.data.frame(rna_subset_meta)

# subset counts matrix
RNA_subset_counts <- RNA_counts[, rna_subset_meta$barcode]
colnames(RNA_subset_counts) <- make.unique(colnames(RNA_subset_counts))

rna_subset_meta$barcode <- make.unique(rna_subset_meta$barcode)
rownames(rna_subset_meta) <- rna_subset_meta$barcode

In [19]:
dim(RNA_subset_counts)

In [20]:
rna_subset_meta <- as.data.frame(rna_subset_meta)
rownames(rna_subset_meta) <- rna_subset_meta$barcode

# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

“Some cells in meta.data not present in provided counts matrix”
Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [9]:
# Load ADT
ADT_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-ADT-counts.mtx")
ADT_counts <- Read10X(data.dir = ADT_Dir, gene.column = 1)

metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

In [21]:
# adt_subset_meta <- rbind(rna_subset_meta_1, rna_subset_meta_2)
adt_subset_meta <- rbind(rna_subset_meta_1, rna_subset_meta_2, rna_subset_meta_3, 
                         rna_subset_meta_4, rna_subset_meta_5, rna_subset_meta_6, 
                         rna_subset_meta_7, rna_subset_meta_8, rna_subset_meta_9, 
                         rna_subset_meta_10)
dim(adt_subset_meta)

table(adt_subset_meta$batch)
table(adt_subset_meta$cell_type)


 s1d1  s1d2  s1d3  s2d1  s2d4  s2d5  s3d1  s3d6  s3d7  s4d1  s4d8  s4d9 
29016 27531 33951 57858 30792 50399 52772 61228 63328 30263 21841 41021 


                      B1 B IGKC-                       B1 B IGKC+ 
                            3427                             4530 
                      CD14+ Mono                       CD16+ Mono 
                          119866                            14616 
                CD4+ T activated     CD4+ T activated integrinB7+ 
                           38738                             5858 
           CD4+ T CD314+ CD45RA+                     CD4+ T naive 
                             541                            32643 
                   CD8+ T CD49f+             CD8+ T CD57+ CD45RA+ 
                            5055                             7277 
            CD8+ T CD57+ CD45RO+             CD8+ T CD69+ CD45RA+ 
                            8173                             4134 
            CD8+ T CD69+ CD45RO+                     CD8+ T naive 
                            4968                            17153 
CD8+ T naive CD127+ CD26- CD101-            CD8+ T TIGIT+ CD4

In [22]:
adt_subset_meta <- as.data.frame(adt_subset_meta)

# subset counts matrix
ADT_subset_counts <- ADT_counts[, adt_subset_meta$barcode]
colnames(ADT_subset_counts) <- make.unique(colnames(ADT_subset_counts))

adt_subset_meta$barcode <- make.unique(adt_subset_meta$barcode)
rownames(adt_subset_meta) <- adt_subset_meta$barcode

In [23]:
dim(ADT_subset_counts)

In [24]:
# subset adt counts matrix
ADT_subset_counts <- ADT_counts[ , colnames(ADT_counts) %in% adt_subset_meta$barcode]

# save raw adt to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ADT", "peaks.mtx", sep = "-"))
write10xCounts(x = ADT_subset_counts, path = data_path, version = "3")
write_csv(adt_subset_meta, here(output_path, "metadata.csv"))

# save raw adt to rds
saveRDS(ADT_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ADT", "peaks.rds", sep = "-")))

ADT_subset <- CreateSeuratObject(counts = ADT_subset_counts, assay = "ADT", meta.data = adt_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ADT_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ADT", "peaks.h5Seurat", sep = "-")))

# save raw adt to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ADT", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

“Some cells in meta.data not present in provided counts matrix”
Creating h5Seurat file for version 3.1.5.9900

Adding counts for ADT

Adding data for ADT

No variable features found for ADT

No feature-level metadata found for ADT

Validating h5Seurat file

Adding data from ADT as X

Adding counts from ADT as raw

Transfering meta.data to obs



## split sites and donors

In [559]:
input_path <- "/home/wsg/BM/data/BMMC/RNA+ADT/RawData"
output_path <- "/home/wsg/BM/data/BMMC/RNA+ADT/s2d1_s3d6"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/data/BMMC/RNA+ADT/RawData/dataset.json"))

In [560]:
# 设置提取样本
#  s1d1  s1d2  s1d3  s2d1  s2d4  s2d5  s3d1  s3d6  s3d7  s4d1  s4d8  s4d9 
#  5227  4978  6106 10465  5584  9122  9521 11035 11473  5456  3929  7365 

# sample = "s4d9"
sample = "s2d1_s3d6"

In [551]:
# # Load RNA
# RNA_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-RNA-counts.mtx")
# RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

In [561]:
# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = 1)
metadata['barcode'] <- rownames(metadata)

set.seed(1234)

In [562]:
# rna_subset_meta <- metadata[metadata$batch==sample, ]

samples = strsplit(sample, split = "_")[[1]]
rna_subset_meta <- metadata[metadata$batch==samples[1] | metadata$batch==samples[2], ]

table(rna_subset_meta$batch)
table(rna_subset_meta$cell_type)


 s2d1  s3d6 
10465 11035 


                  B1 B IGKC-                   B1 B IGKC+ 
                         116                          175 
                  CD14+ Mono                   CD16+ Mono 
                        5885                          519 
            CD4+ T activated CD4+ T activated integrinB7+ 
                        1234                          233 
                CD4+ T naive                CD8+ T CD49f+ 
                        1232                          290 
        CD8+ T CD57+ CD45RA+         CD8+ T CD57+ CD45RO+ 
                         487                          158 
        CD8+ T CD69+ CD45RA+         CD8+ T CD69+ CD45RO+ 
                         332                          373 
                CD8+ T naive        CD8+ T TIGIT+ CD45RA+ 
                         469                          244 
       CD8+ T TIGIT+ CD45RO+                         cDC2 
                         323                          562 
                Erythroblast                     G/M pr

In [563]:
# subset counts matrix
RNA_subset_counts <- RNA_counts[, colnames(RNA_counts) %in% rna_subset_meta$barcode]

# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"],
                       dataset["task_type"],
                       sample,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(rna_subset_meta, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          sample,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = rna_subset_meta)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   sample,
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], sample,"RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [564]:
# Load ADT
ADT_Dir <- paste0(input_path, "/BMMC-CITE_seq-raw-ADT-counts.mtx")
ADT_counts <- Read10X(data.dir = ADT_Dir, gene.column = 1)

In [565]:
ADT_counts

  [[ suppressing 33 column names ‘GCATTAGCATAAGCGG-1-s1d1’, ‘TACAGGTGTTAGAGTA-1-s1d1’, ‘AGGATCTAGGTCTACT-1-s1d1’ ... ]]



134 x 90261 sparse Matrix of class "dgCMatrix"
                                                                              
CD86         1  55   .   .     1    .   .  51   1   .    .   .   4   .   2   1
CD274        1   7   2   1     7    6   1   5   2   1    3   2   8   1   .   2
CD270       16  38  23  13    12    6  10  25  10   2    6   6  40  11  18  36
CD155        .  29   .   4     4    2   1  16   .   1    .   .   .   6   .   1
CD112        3  23   2  85     9    1   .  10   .   3    7   1   4   4   6   5
CD47       103 280  72  79   129   82 183 273 181  92   70  65 217 146  38 143
CD48        36 402  59  81     8    6  74 367 181   2    4  76 122  64  33  88
CD40        32   9  48   .     3    7   .  16   .   2    3   .  40   2  56  72
CD154       13  33  11   7    21   29   4  33   5   7   37   3  14   9  10  17
CD52        46  43  17  16     5    2  13  91  73   2    3  33 104   5  41 117
CD3          .   7   .   2     2    6   2  11 245   .    4  50   .   3   1   3
CD8  

In [566]:
adt_subset_meta = rna_subset_meta
ADT_subset_counts <- ADT_counts[ , colnames(ADT_counts) %in% adt_subset_meta$barcode]

In [567]:
ADT_subset_counts

  [[ suppressing 33 column names ‘TGTTGAGGTTTACGTG-1-s2d1’, ‘CTACATTTCGCAGATT-1-s2d1’, ‘ACGTACAAGTAGAATC-1-s2d1’ ... ]]



134 x 21500 sparse Matrix of class "dgCMatrix"
                                                                               
CD86        .  .  3 10  .  .  .    2   1  .  .    . 10  .  .  3   1  1   .    .
CD274       6 11  7 14 29  3  3   12   8  6 14   13 15 11  8 10   . 22   6   17
CD270       2  7  2  5  3  4  5    7   6  5 13    6  9  2  4 10   3  6   7   10
CD155       1  1  1  2  1  .  4    2   4  .  .    2  3  .  1  8   1  2   1    1
CD112       3  3  3  1  .  .  3    1   2  2  1    6  7  6  3  3   2  1   4    6
CD47       13 16  9 33  9 13 12    8  17 11 27   16 43 11 30 23   6 15   8   54
CD48       17  9 21 38 12 15 21    .  25  3 31    1 80 15 48 39   . 10   .    1
CD40        2  5  2  . 11  2  1    .   1  1  4    1  2  1  2  3   1  9   6    4
CD154       3  4  8  7  4  7  7    3   1  .  7   10 10  3  4 10   8  5  11   16
CD52        6  4  3 12  2  6  6    .   4  3 21    . 26  8 12  5   1 12   2    6
CD3        16  .  2  1  1 16  .    1   1  . 11    .  2  9 16  2   2  2   

In [568]:
adt_subset_meta = rna_subset_meta

# subset adt counts matrix
ADT_subset_counts <- ADT_counts[ , colnames(ADT_counts) %in% adt_subset_meta$barcode]

# save raw adt to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       sample,
                       "ADT", "counts.mtx", sep = "-"))
write10xCounts(x = ADT_subset_counts, path = data_path, version = "3")
write_csv(adt_subset_meta, here(output_path, "metadata.csv"))

# save raw adt to rds
saveRDS(ADT_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          sample,
                          "ADT", "counts.rds", sep = "-")))

ADT_subset <- CreateSeuratObject(counts = ADT_subset_counts, assay = "ADT", meta.data = adt_subset_meta)

# save raw rna to h5Seurat
SaveH5Seurat(ADT_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   sample, 
                                   "ADT", "counts.h5Seurat", sep = "-")))

# save raw adt to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], sample, "ADT", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ADT

Adding data for ADT

No variable features found for ADT

No feature-level metadata found for ADT

Validating h5Seurat file

Adding data from ADT as X

Adding counts from ADT as raw

Transfering meta.data to obs



# Data Manipulation: Load and Downsample BMMC

In [5]:
library(scuttle)

## Downsample n percents counts of BMMC multiome p10 data

In [3]:
library(scuttle)
input_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/p10"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/p10.json"))

In [55]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-multiome-p10-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
metadata['barcode'] <- rownames(metadata)

In [88]:
# 设置提取比例
proportion = 0.75
process = "p10_ds75"

output_path <- paste0("/home/wsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/", process)

In [89]:
# RNA_counts
dim(RNA_counts)
nnzero(RNA_counts)
sum(RNA_counts)

In [90]:
RNA_subset_counts <- downsampleMatrix(RNA_counts, prop = proportion, bycol = T)

In [1]:
??downsampleMatrix

In [91]:
# RNA_subset_counts
dim(RNA_subset_counts)
nnzero(RNA_subset_counts)
sum(RNA_subset_counts)

In [92]:
# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = metadata)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [62]:
# Load binarized ATAC
ATAC_Dir <- paste0(input_path, "/BMMC-multiome-p10-ATAC-peaks.mtx")
ATAC_counts <- Read10X(data.dir = ATAC_Dir, gene.column = 1)

metadata <- read.csv(paste0(ATAC_Dir, "/metadata.csv"), row.names = "barcode")
# 添加barcode到metadata
metadata['barcode'] <- rownames(metadata)

In [93]:
# ATAC_counts
dim(ATAC_counts)
nnzero(ATAC_counts)
sum(ATAC_counts)

In [94]:
ATAC_subset_counts <- downsampleMatrix(ATAC_counts, prop = proportion, bycol = T)

In [95]:
# ATAC_subset_counts
dim(ATAC_subset_counts)
nnzero(ATAC_subset_counts)
sum(ATAC_subset_counts)

In [96]:
# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ATAC", "peaks.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ATAC_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ATAC", "peaks.rds", sep = "-")))

ATAC_subset <- CreateSeuratObject(counts = ATAC_subset_counts, assay = "ATAC", meta.data = metadata)

# save raw rna to h5Seurat
SaveH5Seurat(ATAC_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ATAC", "peaks.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



## Downsample n percents counts of BMMC multiome single sample

In [326]:
library(scuttle)
input_path <- "/home/wsg/BM/data/BMMC/RNA+ATAC/s1d1_s3d10"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/data/BMMC/RNA+ATAC/s1d1_s3d10/s1d1_s3d10.json"))

In [327]:
# 设置提取比例
proportion = 0.75
process = "s1d1_R75_A75_s3d10_R75_A75"

output_path <- paste0("/home/wsg/BM/data/BMMC/RNA+ATAC/s1d1_s3d10_robust/", process)

In [296]:
# Load RNA
RNA_Dir <- list.files(input_path, pattern = "\\RNA-counts.mtx$", full.names = TRUE)
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# add metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
metadata['barcode'] <- rownames(metadata)

In [328]:
# RNA_counts
dim(RNA_counts)
nnzero(RNA_counts)
sum(RNA_counts)

In [329]:
RNA_subset_counts <- downsampleMatrix(RNA_counts, prop = proportion, bycol = T)

In [330]:
# RNA_subset_counts
dim(RNA_subset_counts)
nnzero(RNA_subset_counts)
sum(RNA_subset_counts)

In [331]:
# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = metadata)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [301]:
# Load ATAC
ATAC_Dir <- list.files(input_path, pattern = "\\ATAC-peaks.mtx$", full.names = TRUE)
ATAC_counts <- Read10X(data.dir = ATAC_Dir, gene.column = 1)

# add metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
metadata['barcode'] <- rownames(metadata)

In [332]:
# ATAC_counts
dim(ATAC_counts)
nnzero(ATAC_counts)
sum(ATAC_counts)

In [333]:
ATAC_subset_counts <- downsampleMatrix(ATAC_counts, prop = proportion, bycol = T)

In [334]:
# ATAC_subset_counts
dim(ATAC_subset_counts)
nnzero(ATAC_subset_counts)
sum(ATAC_subset_counts)

In [335]:
# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ATAC", "peaks.mtx", sep = "-"))
write10xCounts(x = ATAC_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ATAC_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ATAC", "peaks.rds", sep = "-")))

ATAC_subset <- CreateSeuratObject(counts = ATAC_subset_counts, assay = "ATAC", meta.data = metadata)

# save raw rna to h5Seurat
SaveH5Seurat(ATAC_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ATAC", "peaks.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ATAC", "peaks.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



## Downsample n percents counts of BMMC CITE-seq p10 data

In [4]:
input_path <- "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/p10"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/p10.json"))

In [6]:
# Load RNA
RNA_Dir <- paste0(input_path, "/BMMC-CITE_seq-p10-RNA-counts.mtx")
RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
metadata['barcode'] <- rownames(metadata)

In [8]:
# 设置提取比例
proportion = 0.5
process = "ds50"

output_path <- paste0("/home/wsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq/p10/downsample/", process)

In [47]:
# RNA_counts
dim(RNA_counts)
nnzero(RNA_counts)
sum(RNA_counts)

In [48]:
RNA_subset_counts <- downsampleMatrix(RNA_counts, prop = proportion, bycol = T)

In [49]:
# RNA_subset_counts
dim(RNA_subset_counts)
nnzero(RNA_subset_counts)
sum(RNA_subset_counts)

In [50]:
# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = metadata)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [6]:
# Load ADT
ADT_Dir <- paste0(input_path, "/BMMC-CITE_seq-p10-ADT-counts.mtx")
ADT_counts <- Read10X(data.dir = ADT_Dir, gene.column = 1)

# 添加barcode到metadata
metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
metadata['barcode'] <- rownames(metadata)

In [7]:
ADT_counts

  [[ suppressing 33 column names ‘TTGGATGGTTGCCGAC-1-s1d1’, ‘TACGCTCTCTTTGCTA-1-s1d1’, ‘TGTTGGAAGGTCTGGA-1-s1d1’ ... ]]



134 x 9026 sparse Matrix of class "dgCMatrix"
                                                                             
CD86          .   4   .    .   3   1  42   2  72   .   2    3  10   .  55  67
CD274         3   8   4    .   3   3   7   3   9   5   2    7   2   5   5   2
CD270         6  40  34   10  49  31  37  33  40  13  31   21  26  30  22  14
CD155         .   .   1    7   1   .  32   .  20   3   1   10   2   2  19   2
CD112         7   4  12    3   2   6  11  10  22   6   7   12   5   9  22   7
CD47         70 217 389  218 172  97 189 184 262 239 140   83 126 231 244 175
CD48          4 122 338   31 217  82 429 119 536 191  85    2 158 108 389 183
CD40          3  40   2    1 172  61  25  87  13   .  43    6   2  98   7  14
CD154        37  14  16    8  10   7  23  10  32  12   9   27  15  17  18   4
CD52          3 104 183    4 220  60 125 151  22   3  70    7 108 142  45  38
CD3           4   . 193    3   2   5   5   2   2   3   1    3 252   1   7   4
CD8           1   

In [9]:
downsampleMatrix(ADT_counts, prop = proportion, bycol = T)

  [[ suppressing 33 column names ‘TTGGATGGTTGCCGAC-1-s1d1’, ‘TACGCTCTCTTTGCTA-1-s1d1’, ‘TGTTGGAAGGTCTGGA-1-s1d1’ ... ]]



134 x 9026 sparse Matrix of class "dgCMatrix"
                                                                            
CD86          .   3   .   .   2   1  23   1  31   .   1    2   4   .  26  29
CD274         1   4   2   .   1   .   5   1   2   1   1    2   1   3   .   2
CD270         3  19  15   4  25  24  23  15  23   9  13   11  15  14  15   4
CD155         .   .   .   6   .   .  13   .  14   .   1    6   2   2  12   1
CD112         4   3   7   1   2   2   7   4  13   4   2    5   .   6   8   4
CD47         35  98 194  99  86  55 103 102 134 131  74   41  62 107 122  87
CD48          4  59 163  15 100  32 222  61 265  92  42    2  88  56 201  84
CD40          .  18   .   .  76  28  16  39   8   .  21    5   1  48   4   8
CD154        14   6   5   5   4   2   8   3  14   5   4   12   9  10  10   1
CD52          1  53  82   2 112  30  68  70  12   .  42    4  47  77  25  19
CD3           2   .  98   .   1   3   3   .   1   3   .    2 134   .   5   2
CD8           1   1   1   .   

In [51]:
# ADT_counts
dim(ADT_counts)
nnzero(ADT_counts)
sum(ADT_counts)

In [52]:
ADT_subset_counts <- downsampleMatrix(ADT_counts, prop = proportion, bycol = T)

In [53]:
# ADT_subset_counts
dim(ADT_subset_counts)
nnzero(ADT_subset_counts)
sum(ADT_subset_counts)

In [54]:
# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ADT", "counts.mtx", sep = "-"))
write10xCounts(x = ADT_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ADT_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ADT", "counts.rds", sep = "-")))

ADT_subset <- CreateSeuratObject(counts = ADT_subset_counts, assay = "ADT", meta.data = metadata)

# save raw rna to h5Seurat
SaveH5Seurat(ADT_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ADT", "counts.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ADT", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ADT

Adding data for ADT

No variable features found for ADT

No feature-level metadata found for ADT

Validating h5Seurat file

Adding data from ADT as X

Adding counts from ADT as raw

Transfering meta.data to obs



 ## Downsample n percents counts of BMMC CITE-seq single sample

In [582]:
library(scuttle)
input_path <- "/home/wsg/BM/data/BMMC/RNA+ADT/s2d1_s3d6"
dataset <- unlist(fromJSON(file = "/home/wsg/BM/data/BMMC/RNA+ADT/s2d1_s3d6/s2d1_s3d6.json"))

In [603]:
# 设置提取比例
proportion = 0.10
process = "s2d1_R10_A10_s3d6_R10_A10"

output_path <- paste0("/home/wsg/BM/data/BMMC/RNA+ADT/s2d1_s3d6_robust/", process)

In [604]:
# # Load RNA
# RNA_Dir <- list.files(input_path, pattern = "\\RNA-counts.mtx$", full.names = TRUE)
# RNA_counts <- Read10X(data.dir = RNA_Dir, gene.column = 1)

# # add metadata
# metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
# metadata['barcode'] <- rownames(metadata)

In [605]:
# RNA_counts
dim(RNA_counts)
nnzero(RNA_counts)
sum(RNA_counts)

In [606]:
RNA_subset_counts <- downsampleMatrix(RNA_counts, prop = proportion, bycol = T)

In [607]:
# RNA_subset_counts
dim(RNA_subset_counts)
nnzero(RNA_subset_counts)
sum(RNA_subset_counts)

In [608]:
# Make Dir
if (!dir.exists(output_path)){
    dir.create(output_path)
}

# save raw rna to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "RNA", "counts.mtx", sep = "-"))
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "RNA", "counts.rds", sep = "-")))
# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = metadata)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "RNA", "counts.h5Seurat", sep = "-")))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "RNA", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [609]:
# # Load ADT
# ADT_Dir <- list.files(input_path, pattern = "\\ADT-counts.mtx$", full.names = TRUE)
# ADT_counts <- Read10X(data.dir = ADT_Dir, gene.column = 1)

# # add metadata
# metadata <- read.csv(paste0(input_path, "/metadata.csv"), row.names = "barcode")
# metadata['barcode'] <- rownames(metadata)

In [610]:
# ADT_counts
dim(ADT_counts)
nnzero(ADT_counts)
sum(ADT_counts)

In [611]:
ADT_subset_counts <- downsampleMatrix(ADT_counts, prop = proportion, bycol = T)

In [612]:
# ADT_subset_counts
dim(ADT_subset_counts)
nnzero(ADT_subset_counts)
sum(ADT_subset_counts)

In [613]:
# save raw atac to mtx
data_path <- here(output_path,
                 paste(dataset["data_name"], 
                       dataset["task_type"], 
                       process,
                       "ADT", "counts.mtx", sep = "-"))
write10xCounts(x = ADT_subset_counts, path = data_path, version = "3")
write_csv(metadata, here(output_path, "metadata.csv"))

# save raw atac to rds
saveRDS(ADT_subset_counts, 
        file = here(output_path, 
                    paste(dataset["data_name"], 
                          dataset["task_type"], 
                          process,
                          "ADT", "counts.rds", sep = "-")))

ADT_subset <- CreateSeuratObject(counts = ADT_subset_counts, assay = "ADT", meta.data = metadata)

# save raw rna to h5Seurat
SaveH5Seurat(ADT_subset, overwrite = TRUE, 
             filename = here(output_path, 
                             paste(dataset["data_name"], 
                                   dataset["task_type"], 
                                   process, 
                                   "ADT", "counts.h5Seurat", sep = "-")))

# save raw atac to h5ad
setwd(output_path)
Convert(paste(dataset["data_name"], dataset["task_type"], process, "ADT", "counts.h5Seurat", sep = "-"), 
        dest = "h5ad")

Creating h5Seurat file for version 3.1.5.9900

Adding counts for ADT

Adding data for ADT

No variable features found for ADT

No feature-level metadata found for ADT

Validating h5Seurat file

Adding data from ADT as X

Adding counts from ADT as raw

Transfering meta.data to obs



# Function: convert mtx to rds and h5


In [None]:
mtx_to_rds_h5 <- function(input_path,
                          output_path,
                          dataset){
    # check the species of data
    if (dataset['species'] == "human") { 
        genome = "GRCh38"
    } else if (dataset['species'] == "mouse") {
        genome = "mm10"
    } else {
        stop(paste0("species should be human or mouse, not ", dataset['species']))
    }

    # load the rna data
    rna <- readMM(here(input_path, dataset["gene_expression"]))
    rna_gene <- read.table(here(input_path, dataset["gene_names"]))
    cells_label <- read.table(here(input_path, dataset["gene_barcodes"]))

    rownames(rna) <- rna_gene[, 1]
    colnames(rna) <- cells_label[, 1]
    
    # load the atac data
    atac <- readMM(here(input_path, dataset["atac_expression"]))
    atac_peak <- read.table(here(input_path, dataset["atac_names"]))
    cells_label <- read.table(here(input_path, dataset["atac_barcodes"]))

    rownames(atac) <- atac_peak[, 1]
    colnames(atac) <- cells_label[, 1]
    
    # save raw rna to rds
    saveRDS(rna, 
            file = here(output_path, 
                        paste(dataset["data_name"], 
                              "raw",
                              dataset["task_type"], 
                              "RNA", "count.rds", sep = "-"))
           )
    # save raw atac to rds
    saveRDS(atac, 
            file = here(output_path, 
                        paste(dataset["data_name"], 
                              "raw",
                              dataset["task_type"], 
                              "ATAC", "peaks.rds", sep = "-"))
           )
    
    # save raw rna to h5Seurat
    rna_seurat <- CreateSeuratObject(counts = rna, project = "snare_p0_rna")
    SaveH5Seurat(rna_seurat, overwrite = TRUE, 
                 filename = here(output_path, 

                                 paste(dataset["data_name"], 
                                       "raw", 
                                       dataset["task_type"], 
                                       "RNA", "counts.h5Seurat", sep = "-")))
    # save raw atac to h5Seurat
    chrom_assay <- CreateChromatinAssay(
       counts = atac,
       sep = c("-", "-")
    )
    atac_seurat <- CreateSeuratObject(counts = chrom_assay, assay = "ATAC", project = "snare_p0_atac")
    SaveH5Seurat(atac_seurat, overwrite = TRUE, 
                 filename = here(output_path, 
                                 paste(dataset["data_name"], 
                                       "raw", 
                                       dataset["task_type"], 
                                       "ATAC", "peaks.h5Seurat", sep = "-")))
    
    # save raw rna to h5ad
    setwd(output_path)
    Convert(paste(dataset["data_name"],"raw", dataset["task_type"], "RNA", "counts.h5Seurat", sep = "-"), 
            dest = "h5ad")
    # save raw atac to h5ad
    Convert(paste(dataset["data_name"],"raw", dataset["task_type"], "ATAC", "peaks.h5Seurat", sep = "-"), 
            dest = "h5ad")

}

In [52]:
input_path <- "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess/RawData" 
output_path <- "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess"
dataset <- unlist(fromJSON(file = "/Data/wangsg/BM/pipeline/example/snare.json"))

In [53]:
mtx_to_rds_h5(input_path, output_path, dataset)

Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

"Keys should be one or more alphanumeric characters followed by an underscore, setting key from atac to atac_"
Creating h5Seurat file for version 3.1.5.9900

Adding counts for ATAC

Adding data for ATAC

No variable features found for ATAC

No feature-level metadata found for ATAC

Writing out ranges for ATAC

Writing out motifs for ATAC

Writing out fragments for ATAC

Writing out seqinfo for ATAC

Writing out annotation for ATAC

Writing out bias for ATAC

Writing out positionEnrichment for ATAC

Writing out links for ATAC

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs

Validating h5Seurat file

Adding data from ATAC as X

Adding counts from ATAC as raw

Transfering meta.data to obs



# Function: create rds data of MOFA2


In [73]:
MOFA2_rds_data <- function(input_path,
                           output_path,
                           dataset){
    library(MOFA2)
    library(Seurat)
    library(Signac)
    library(tidyverse)
    
    # check the species of data
    if (dataset['species'] == "human") { 
        genome = "hg38"
        library(EnsDb.Hsapiens.v86)
        lib <- EnsDb.Hsapiens.v86
    } else if (dataset['species'] == "mouse") {
        genome = "mm10"
        library(EnsDb.Mmusculus.v79)
        lib <- EnsDb.Mmusculus.v79
    } else {
        stop(paste0("species should be human or mouse, not ", dataset['species']))
    }
    
    # Load Data
    rna <- readRDS(here(output_path, 
                    paste(dataset["data_name"], 
                          "raw", 
                          dataset["task_type"],
                          "RNA", "counts.rds", sep = "-")
                   )
              )
    
    atac <- readRDS(here(output_path, 
                    paste(dataset["data_name"], 
                          "raw", 
                          dataset["task_type"],
                          "ATAC", "peaks.rds", sep = "-")
                   )
              )
    
    # SCTransform on RNA
    mofa_data <- CreateSeuratObject(counts = rna)
    DefaultAssay(mofa_data) <- "RNA"
    mofa_data <- SCTransform(mofa_data, verbose = FALSE) %>% RunPCA() %>% RunUMAP(dims = 1:50, reduction.name = 'umap.rna', reduction.key = 'rnaUMAP_')
    mofa_data <- FindVariableFeatures(mofa_data, selection.method = "vst", nfeatures = 3000) 
    
    # Annotate ATAC
    chrom_assay <- CreateChromatinAssay(counts = atac, sep = c("-", "-"), )
    mofa_data[["ATAC"]] <- chrom_assay    
    
    DefaultAssay(mofa_data) <- "ATAC"
    annotations <- GetGRangesFromEnsDb(ensdb = lib)
    seqlevelsStyle(annotations) <- 'Ensembl'
    genome(annotations) <- genome
    Annotation(mofa_data) <- annotations
    
    # Filter Data
    mofa_data <- subset(x = mofa_data, 
                        subset = nCount_ATAC < 70000 & nCount_ATAC > 10 & 
                        nCount_RNA < 25000 & nCount_RNA > 10)
    
    # RunTFIDF on ATAC
    DefaultAssay(mofa_data) <- "ATAC"
    mofa_data <- RunTFIDF(mofa_data)
    mofa_data <- FindTopFeatures(mofa_data, min.cutoff = 'q98')
    
    # Merge Data
    mofa <- create_mofa(mofa_data, assays = c("SCT","ATAC"))
    print(mofa)
    plot_data_overview(mofa)
    
    # Save Data
    saveRDS(mofa, 
            file = here(output_path, 
                        paste(dataset["data_name"], 
                              "MOFA2",
                              dataset["task_type"], 
                              "multi", "filtered.rds", sep = "-"))
           )   
    
}

In [74]:
input_path <- "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess" 
output_path <- "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess"
dataset <- unlist(fromJSON(file = "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess/BMMC.json"))

In [75]:
MOFA2_rds_data(input_path, output_path, dataset)

PC_ 1 
Positive:  HBA2, HBA1, HBD, SLC4A1, SOX6, SLC25A37, HBM, TSPAN5, ANK1, RGS6 
	   SPTA1, SNCA, CA1, AHSP, TMCC2, HEMGN, XPO7, AC079804.3, TRAK2, HBB 
	   ALAS2, MARCH3, CR1L, HECTD4, TFRC, CA2, CPEB4, BLVRB, SPECC1, RAPGEF2 
Negative:  BACH2, AFF3, B2M, ARHGAP15, PTPRC, BANK1, HLA-B, UTRN, DPYD, AOAH 
	   GNLY, RIPOR2, CD74, PCDH9, PRKCB, EBF1, VCAN, PRKCH, LYN, NAMPT 
	   HLA-C, NKG7, ZEB2, HLA-A, SKAP1, FYN, PARP8, JUN, CD69, CCL5 
PC_ 2 
Positive:  GNLY, B2M, NKG7, CCL5, CCL4, HLA-C, PRKCH, CD247, TMSB4X, IL32 
	   AOAH, HLA-A, PRF1, GZMA, RPL13, HLA-B, GZMH, TGFBR3, FYN, S100A4 
	   RORA, STAT4, ID2, TNFAIP3, NCALD, SRGN, RPL13A, GZMB, IL7R, BCL11B 
Negative:  AFF3, BACH2, PCDH9, EBF1, ACSM3, BANK1, TCF4, LINC01374, PAX5, MME 
	   RALGPS2, IGHM, AL589693.1, FCRL1, ROR1, COL19A1, ARPP21, RAPGEF5, RUBCNL, KLHL14 
	   KHDRBS2, ADAM23, NIBAN3, PDE4D, PLEKHG1, CD74, SSBP2, LIX1-AS1, OSBPL10, MS4A1 
PC_ 3 
Positive:  VCAN, NAMPT, PLXDC2, LRMDA, SLC8A1, DPYD, NEAT1, PID1, ARHGAP26, 

"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
"The 2 combined objects have no sequence levels in common. (Use
Performing TF-IDF normalization

Creating MOFA object from a Seurat object...

No features specified, using variable features from the Seurat object...



Untrained MOFA model with the following characteristics: 
 Number of views: 2 
 Views names: SCT ATAC 
 Number of features (per view): 3000 2330 
 Number of groups: 1 
 Groups names: group1 
 Number of samples (per group): 69240 



"sparse->dense coercion: allocating vector of size 1.5 GiB"
"sparse->dense coercion: allocating vector of size 1.2 GiB"
"`guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead."


# Function: create mtx data of scDEC


In [77]:
scDEC_mtx_data <- function(input_path,
                           output_path,
                           dataset){
    # Make Dir
    system(paste0("mkdir -p ", output_path))   
    data_dir=paste(dataset["data_name"], "scDEC", 
                   dataset["task_type"], "multi", "raw.mtx", sep = "-")
    data_path=here(output_path, data_dir)
    system(paste0("mkdir -p ", data_path))
    
    # Load Data
    rna <- as.matrix(readMM(here(input_path, dataset["gene_expression"])))
    rna_gene <- read.table(here(input_path, dataset["gene_names"]))

    atac <- as.matrix(readMM(here(input_path, dataset["atac_expression"])))
    atac_peak <- read.table(here(input_path, dataset["atac_names"]))

    # Merge Data
    merge_mat <- rbind(rna, atac)
    out_tab <- as(as.matrix(merge_mat), "dgCMatrix")

    feat_tab <- data.frame(a = 0, 
                           name = c(rna_gene[,1], 
                                    atac_peak[,1]),
                           group = c(rep("Gene Expression", nrow(rna_gene)),
                                     rep("Peaks", nrow(atac_peak))))

    # Save Data
    writeMM(out_tab, here(data_path, "matrix.mtx"))
    write.table(feat_tab, file = paste0(data_path, "/features.tsv"), 
                row.names = F, col.names = F, sep='\t', quote=F)
    system(paste("cp", 
                 here(input_path, dataset["gene_barcodes"]), 
                 here(data_path, "barcodes.tsv.gz")))
    
}

In [78]:
input_path <- "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess" 
output_path <- "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess"
dataset <- unlist(fromJSON(file = "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess/BMMC.json"))

In [79]:
scDEC_mtx_data(input_path, output_path, dataset)

"sparse->dense coercion: allocating vector of size 6.9 GiB"
"sparse->dense coercion: allocating vector of size 60.1 GiB"


# Function: create rds data of SeuratV4

In [84]:
SeuratV4_rds_data <- function(input_path, 
                              output_path,
                              dataset){
    library(Seurat)
    library(Signac)
    library(tidyverse)
    
    # check the species of data
    if (dataset['species'] == "human") { 
        genome = "hg38"
        library(EnsDb.Hsapiens.v86)
        lib <- EnsDb.Hsapiens.v86
    } else if (dataset['species'] == "mouse") {
        genome = "mm10"
        library(EnsDb.Mmusculus.v79)
        lib <- EnsDb.Mmusculus.v79
    } else {
        stop(paste0("species should be human or mouse, not ", dataset['species']))
    }
    
    # Load Data
    rna <- readRDS(here(output_path, 
                    paste(dataset["data_name"], 
                          "raw", 
                          dataset["task_type"],
                          "RNA", "counts.rds", sep = "-")
                   )
              )
    
    atac <- readRDS(here(output_path, 
                    paste(dataset["data_name"], 
                          "raw", 
                          dataset["task_type"],
                          "ATAC", "peaks.rds", sep = "-")
                   )
              )
    
    # Merge Data
    wnn_data <- CreateSeuratObject(counts = rna)
    
    # Annotate ATAC
    chrom_assay <- CreateChromatinAssay(counts = atac, sep = c("-", "-"), )
    wnn_data[["ATAC"]] <- chrom_assay
    DefaultAssay(wnn_data) <- "ATAC"
    
    annotations <- GetGRangesFromEnsDb(ensdb = lib)
    seqlevelsStyle(annotations) <- 'Ensembl'
    genome(annotations) <- genome
    Annotation(mofa_data) <- annotations
    
    # Filter Data
    wnn_data <- subset(x = wnn_data,
        subset = nCount_ATAC < 7e4 & nCount_ATAC > 10 &
        nCount_RNA < 25000 & nCount_RNA > 10 
    )
    
    # Save Data
    saveRDS(wnn_data, 
            file = here(output_path, 
                        paste(dataset["data_name"], 
                              "SeuratV4",
                              dataset["task_type"], 
                              "multi", "filtered.rds", sep = "-"))
           )   
    
}

In [85]:
input_path <- "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess" 
output_path <- "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess"
dataset <- unlist(fromJSON(file = "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess/BMMC.json"))

In [86]:
SeuratV4_rds_data(input_path, output_path, dataset)

Fetching data...
OK

Parsing exons...
OK
Defining introns...
OK
Defining UTRs...
OK
Defining CDS...
OK

aggregating...

Done

Fetching data...
OK

Parsing exons...
OK
Defining introns...
OK
Defining UTRs...
OK
Defining CDS...
OK

aggregating...

Done

Fetching data...
OK

Parsing exons...
OK
Defining introns...
OK
Defining UTRs...
OK
Defining CDS...
OK

aggregating...

Done

Fetching data...
OK

Parsing exons...
OK
Defining introns...
OK
Defining UTRs...
OK
Defining CDS...
OK

aggregating...

Done

Fetching data...
OK

Parsing exons...
OK
Defining introns...
OK
Defining UTRs...
OK
Defining CDS...
OK

aggregating...

Done

Fetching data...
OK

Parsing exons...
OK
Defining introns...
OK
Defining UTRs...
OK
Defining CDS...
OK

aggregating...

Done

Fetching data...
OK

Parsing exons...
OK
Defining introns...
OK
Defining UTRs...
OK
Defining CDS...
OK

aggregating...

Done

Fetching data...
OK

Parsing exons...
OK
Defining introns...
OK
Defining UTRs...
OK
Defining CDS...
OK

aggregating...