## generate h5 peak x cell matrix

#### reference: https://bioconductor.org/packages/release/bioc/manuals/DropletUtils/man/DropletUtils.pdf

In [2]:
library(DropletUtils)

library(tidyverse)
library(Matrix)

In [3]:
# Read the per-cell ENCODE coverage table from CSV into a data frame.
scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix <- read.csv(
  file = "../../../data/Buenrostro2018_FACS_BM_scATAC/scATACseq/02_ENCODE_coverage_by_cell_matrix/scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_complete.csv"
)

In [4]:
# Move the "X" column into the row names of the coverage table.
scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix <- column_to_rownames(
  scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix,
  var = "X"
)

In [5]:
# Preview the first six rows of the coverage table.
head(scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix, n = 6L)

Unnamed: 0_level_0,CLP_0,CLP_1,CMP_0,CMP_1,CMP_2,CMP_3,CMP_4,CMP_5,CMP_6,CMP_7,⋯,LMPP_87,LMPP_88,LMPP_89,LMPP_90,LMPP_91,LMPP_92,LMPP_93,LMPP_94,LMPP_95,LMPP_96
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1_181251_181601,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1_190865_191071,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1_778562_778912,0,4,0,0,0,0,0,0,0,0,⋯,0,4,6,0,0,2,4,6,0,0
chr1_779086_779355,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1_779727_780060,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1_790397_790626,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


### Some cells have no reads in any of the given regions. Exclude these cells from the analysis: they have no counts in the fragment file, which causes an error.

In [6]:
## remove all zero cells:
# Keep only the columns (cells) that contain at least one non-zero count.
# `drop = FALSE` keeps the result a data frame, with its column name intact,
# even when a single cell survives the filter; without it the subset would
# collapse to a bare vector and the cell name would be lost.
scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero <- scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix[
  ,
  colSums(scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix != 0) > 0,
  drop = FALSE
]

In [7]:
# Report the (rows, columns) of the filtered table.
scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero %>%
  dim()

In [8]:
# Convert the filtered data frame to a dense matrix, then coerce that matrix
# to a column-compressed sparse matrix.
scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero_Sparse <- as(
  object = as.matrix(scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero),
  Class = "CsparseMatrix"
)

In [9]:
# Column names of the sparse matrix, passed later as the barcodes.
cell.ids <- colnames(
  scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero_Sparse
)


In [10]:
# Row names of the sparse matrix, passed later as the feature ids.
gene.ids <- rownames(
  scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero_Sparse
)
# Number of rows (features) in the sparse matrix.
# NOTE(review): `ngenes` is not referenced by any later visible cell.
ngenes <- nrow(
  scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero_Sparse
)


In [11]:
# Feature symbols: the same matrix row names that are used for the ids.
gene.symb <- rownames(
  x = scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero_Sparse
)

In [12]:
# Creating a version 3 HDF5 file:
#tmph5 <- tempfile(tmpdir ="/project/scATAC_analysis/Corces2018_BM_FACS_scATAC/analysis/annotate_with_SeuratV4/outs/scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix" ,fileext=".h5")
# Destination of the written count matrix.
out_path <- "../../../results/Fig1_Fig2_Fig3_SFig1-FACS_BM_scATAC/Fig3-apply_seurat_label_transfer/01_prepare_input_files/scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero.h5"
# Write the sparse matrix with its barcodes and feature ids/symbols.
# All other arguments (type, overwrite, gene.type, ...) keep their defaults.
write10xCounts(
  path = out_path,
  x = scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero_Sparse,
  barcodes = cell.ids,
  gene.id = gene.ids,
  gene.symbol = gene.symb,
  version = "3"
)

You created a large dataset with compression and chunking.
The chunk size is equal to the dataset dimensions.
If you want to read subsets of the dataset, you should testsmaller chunk sizes to improve read times.

You created a large dataset with compression and chunking.
The chunk size is equal to the dataset dimensions.
If you want to read subsets of the dataset, you should testsmaller chunk sizes to improve read times.



> write10xCounts(
path,
x,
barcodes = colnames(x),
gene.id = rownames(x),
gene.symbol = gene.id,
gene.type = "Gene Expression",
overwrite = FALSE,
type = c("auto", "sparse", "HDF5"),
genome = "unknown",
version = c("2", "3"),
chemistry = "Single Cell 3' v3",
original.gem.groups = 1L,
library.ids = "custom"
)

## generate fragment file

./01b_make_fragments_file.sh



## generate singlecell.csv file

The singlecell.csv file contains per-barcode information, such as "is_cell_barcode" and the number of fragments. I will aggregate this information from the fragments file.

In [28]:
# create single cell file

In [13]:
# Read the tab-separated, header-less sum.bed table; with no header, the
# columns get the default names V1, V2, ...
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
singleCellFile <- read.table(
  file = "../../../results/Fig1_Fig2_Fig3_SFig1-FACS_BM_scATAC/Fig3-apply_seurat_label_transfer/01_prepare_input_files/sum.bed",
  sep = "\t",
  header = FALSE
)

In [14]:
# List the cells that are in the filtered matrix but absent from the first
# column of sum.bed.
setdiff(
  colnames(scATAC_BM_Corces2018_ENCODE_coverage_per_cell_matrix_nonZero),
  singleCellFile[["V1"]]
)


In [15]:
# Show a vector of ones with one entry per row of singleCellFile.
rep(1, times = nrow(singleCellFile))

In [16]:
# Assemble the singlecell table: the barcode comes from V1, every row is
# flagged with is_cell = 1, and the fragment count comes from V2.
singleCellFile_df <- data.frame(
  barcode = singleCellFile[["V1"]],
  is_cell = rep(1, times = nrow(singleCellFile)),
  atac_fragments = singleCellFile[["V2"]]
)

In [17]:
# Preview the first six rows of the assembled table.
head(singleCellFile_df, n = 6L)

Unnamed: 0_level_0,barcode,is_cell,atac_fragments
Unnamed: 0_level_1,<chr>,<dbl>,<int>
1,CLP_0,1,743
2,CLP_1,1,2822
3,CLP_10,1,648
4,CLP_100,1,3072
5,CLP_101,1,4386
6,CLP_102,1,4088


In [18]:
# Report the (rows, columns) of the assembled table.
singleCellFile_df %>%
  dim()

In [19]:
# Save the table as singlecell.csv without a row-name column.
write.csv(
  x = singleCellFile_df,
  file = "../../../results/Fig1_Fig2_Fig3_SFig1-FACS_BM_scATAC/Fig3-apply_seurat_label_transfer/01_prepare_input_files/singlecell.csv",
  row.names = FALSE
)