In [2]:
args <- commandArgs(T) 

In [5]:
library(here)
library(rjson)
library(Matrix)
library(tidyverse)
library(dplyr)
library(DropletUtils) 

library(Seurat)
library(Signac)
library(SeuratDisk)

library(rhdf5)
library(anndata)

# convert H5Seurat

## Multiome

In [6]:
input_path <- "/home/wsg/BM/data/10x_NSCLC/RawData"
output_path <- "/home/wsg/BM/data/10x_NSCLC/RNA+ADT/RawData"

In [7]:
NSCLC_multi <- Read10X_h5(file = here(input_path, "20k_NSCLC_DTC_3p_nextgem_intron_Multiplex_count_raw_feature_bc_matrix.h5"))

Genome matrix has multiple modalities, returning a list of matrices for this genome



In [8]:
NSCLC_table <- read.csv(file = here(input_path, "20k_NSCLC_DTC_3p_nextgem_intron_Multiplex_multiplexing_analysis_assignment_confidence_table.csv"))
table(NSCLC_table$Assignment)


    Blanks     CMO301     CMO302     CMO303     CMO304     CMO306     CMO307 
       541       1901       2360       1687       2078       2617       3317 
    CMO308  Multiplet Unassigned 
      1658       3245       1067 

In [9]:
metadata <- NSCLC_table[,c("Barcodes", "Assignment", "Assignment_Probability")]
colnames(metadata) <- c("barcode", "donor", "donor_probability")
metadata <- metadata[-which(metadata$donor %in% c("Blanks", "Multiplet", "Unassigned")),]

In [7]:
# metadata
metadata
write_csv(metadata, here(output_path, "metadata.csv"))

Unnamed: 0_level_0,barcode,donor,donor_probability
Unnamed: 0_level_1,<chr>,<chr>,<dbl>
1,AAACCCAAGAATTTGG-1,CMO303,0.9999541
3,AAACCCAAGCATGGGT-1,CMO307,0.9999984
4,AAACCCAAGCCTCCAG-1,CMO302,0.9941245
5,AAACCCAAGGGCTTCC-1,CMO307,0.9999991
6,AAACCCACAGACATCT-1,CMO302,0.9980274
7,AAACCCACAGAGGCTA-1,CMO307,0.9999990
8,AAACCCACAGCCGGTT-1,CMO306,0.9997826
11,AAACCCAGTACGATGG-1,CMO308,0.9999986
13,AAACCCAGTCTTCGAA-1,CMO301,0.9999974
16,AAACCCATCACATACG-1,CMO301,0.9999394


In [10]:
RNA_counts <- NSCLC_multi$`Gene Expression`

In [11]:
RNA_counts

  [[ suppressing 32 column names ‘AAACCCAAGAAACACT-1’, ‘AAACCCAAGAAACCAT-1’, ‘AAACCCAAGAAACCCA-1’ ... ]]

  [[ suppressing 32 column names ‘AAACCCAAGAAACACT-1’, ‘AAACCCAAGAAACCAT-1’, ‘AAACCCAAGAAACCCA-1’ ... ]]

  [[ suppressing 32 column names ‘AAACCCAAGAAACACT-1’, ‘AAACCCAAGAAACCAT-1’, ‘AAACCCAAGAAACCCA-1’ ... ]]



36601 x 3862363 sparse Matrix of class "dgCMatrix"
                                                                                   
MIR1302-2HG  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
FAM138A      . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
OR4F5        . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AL627309.1   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AL627309.3   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AL627309.2   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AL627309.5   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AL627309.4   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AP006222.2   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AL732372.1   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
OR4F29       . . . . . . 

In [12]:
sum(metadata$barcode %in% colnames(RNA_counts))

In [13]:
RNA_subset_counts <- RNA_counts[, metadata$barcode]

In [63]:
process = "raw"

# save raw rna to mtx
data_path <- here(output_path, "NSCLC-CITE_seq-raw-RNA-counts.mtx")
write10xCounts(x = RNA_subset_counts, path = data_path, version = "3")

# save raw rna to rds
saveRDS(RNA_subset_counts, 
        file = here(output_path, "NSCLC-CITE_seq-raw-RNA-counts.rds"))

# Create Seurat Object
RNA_subset <- CreateSeuratObject(counts = RNA_subset_counts, meta.data = metadata)

# save Seurat to h5Seurat
SaveH5Seurat(RNA_subset, overwrite = TRUE, 
             filename = here(output_path, "NSCLC-CITE_seq-raw-RNA-counts.h5Seurat"))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(here(output_path, "NSCLC-CITE_seq-raw-RNA-counts.h5Seurat"), dest = "h5ad")

“Some cells in meta.data not present in provided counts matrix”
Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs



In [None]:
# ADT

In [14]:
ADT_counts <- NSCLC_multi$`Antibody Capture`

In [15]:
ADT_counts

  [[ suppressing 34 column names ‘AAACCCAAGAAACACT-1’, ‘AAACCCAAGAAACCAT-1’, ‘AAACCCAAGAAACCCA-1’ ... ]]



9 x 3862363 sparse Matrix of class "dgCMatrix"
                                                                          
CD3    . . . . . . . . . . . . . . . . . . . . . . . . . 1 . . . . . 1 . .
CD4.1  . . . . . . . . . . . . . . . . . 1 1 . . . . . . . . . . . . . . .
CD8    . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
CD11c  . . . . . . . . . . . . . 1 . . . . . . . . . . . . . . 1 . . . . .
CD14.1 . . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . . . . . .
CD16   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
CD19.1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
CD56   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
CD45   . . . . . 1 . . . . . . . . . . . . . . . . . . . . . . . . . . . .
             
CD3    ......
CD4.1  ......
CD8    ......
CD11c  ......
CD14.1 ......
CD16   ......
CD19.1 ......
CD56   ......
CD45   ......

 .....suppressing 3862329 columns in show(); maybe adjust 'opt

In [16]:
ADT_subset_counts <- ADT_counts[, metadata$barcode]

In [17]:
ADT_subset_counts

  [[ suppressing 34 column names ‘AAACCCAAGAATTTGG-1’, ‘AAACCCAAGCATGGGT-1’, ‘AAACCCAAGCCTCCAG-1’ ... ]]



9 x 15618 sparse Matrix of class "dgCMatrix"
                                                                            
CD3     1  1 1  1 214   12 303   13  3  .   5   5   65   14  1   23    7 237
CD4.1   4 59 3  6 410  195 411  225  3 12   7  10 1036  303  6  690   16 409
CD8     2  . 1  .   1    2   2    .  2  .   .   2   10    2  .    6    1   .
CD11c   8  6 8  6   8  988  10 2505 11 10   3 237 1138 1098  2 2155   12  14
CD14.1  4  9 7 11   6 1394  11  554  4 12   5  15  674 2453  1  746   12   4
CD16    1  3 3  .   .   26   1    5  2  4   . 108   72   41  .   12    6   1
CD19.1  3  4 3  3   2   44   1    6  6 10 180   7   65   15  1   15 1253   .
CD56   12  1 4  2   1   26   1    5  2 19   3   8  102   17 40   19    7   .
CD45   44  1 .  1  84  148  66  152  5  .  22  22  143  150  1  334  157  47
                                                                             
CD3      4 334  5 297   18   44  8 5   6   24  5  3   14 332    6   10 ......
CD4.1    5   6  3   9  801  6

In [42]:
gene_adt_tab <- read.table("/home/wsg/BM/pipeline/config/gene_adt_tab.tsv", header = 1)

In [48]:
adt_raw <- rownames(ADT_subset_counts)
adt_raw

In [49]:
adt_new <- sapply(strsplit(adt_raw, "\\."), '[', 1)
adt_new

In [50]:
gene_adt_tab_sub <- gene_adt_tab[which(gene_adt_tab$ADT %in% adt_new), ]
gene_adt_tab_sub

Unnamed: 0_level_0,Symbol,ADT,ADT_Type
Unnamed: 0_level_1,<chr>,<chr>,<chr>
2,CD4,CD4,Approved.symbol
6,CD14,CD14,Approved.symbol
7,CD19,CD19,Approved.symbol
34,CD8A,CD8,Previous.symbols
40,PTPRC,CD45,Previous.symbols
44,ITGAX,CD11c,Aliases
45,FCGR3A,CD16,Aliases
46,FCGR3B,CD16,Aliases
61,NCAM1,CD56,Aliases


In [53]:
adt_new

In [56]:
adt_gene <- c('CD3', 'CD4', 'CD8A', 'ITGAX', 'CD14', 'FCGR3A', 'CD19', 'NCAM1', 'PTPRC')
adt_gene

In [57]:
rownames(ADT_subset_counts) <- adt_gene
ADT_subset_counts

  [[ suppressing 34 column names ‘AAACCCAAGAATTTGG-1’, ‘AAACCCAAGCATGGGT-1’, ‘AAACCCAAGCCTCCAG-1’ ... ]]



9 x 15618 sparse Matrix of class "dgCMatrix"
                                                                            
CD3     1  1 1  1 214   12 303   13  3  .   5   5   65   14  1   23    7 237
CD4     4 59 3  6 410  195 411  225  3 12   7  10 1036  303  6  690   16 409
CD8A    2  . 1  .   1    2   2    .  2  .   .   2   10    2  .    6    1   .
ITGAX   8  6 8  6   8  988  10 2505 11 10   3 237 1138 1098  2 2155   12  14
CD14    4  9 7 11   6 1394  11  554  4 12   5  15  674 2453  1  746   12   4
FCGR3A  1  3 3  .   .   26   1    5  2  4   . 108   72   41  .   12    6   1
CD19    3  4 3  3   2   44   1    6  6 10 180   7   65   15  1   15 1253   .
NCAM1  12  1 4  2   1   26   1    5  2 19   3   8  102   17 40   19    7   .
PTPRC  44  1 .  1  84  148  66  152  5  .  22  22  143  150  1  334  157  47
                                                                             
CD3      4 334  5 297   18   44  8 5   6   24  5  3   14 332    6   10 ......
CD4      5   6  3   9  801  6

In [58]:
# save raw ADT to mtx
data_path <- here(output_path, "NSCLC-CITE_seq-raw-ADT-counts.mtx")
write10xCounts(x = ADT_subset_counts, path = data_path, version = "3")

# save raw ADT to rds
saveRDS(ADT_subset_counts, 
        file = here(output_path, "NSCLC-CITE_seq-raw-ADT-counts.rds"))

# Create Seurat Object
ADT_subset <- CreateSeuratObject(counts = ADT_subset_counts, meta.data = metadata)

# save Seurat to h5Seurat
SaveH5Seurat(ADT_subset, overwrite = TRUE, 
             filename = here(output_path, "NSCLC-CITE_seq-raw-ADT-counts.h5Seurat"))

# Convert h5Seurat to h5ad
setwd(output_path)
Convert(here(output_path, "NSCLC-CITE_seq-raw-ADT-counts.h5Seurat"), dest = "h5ad")

“Some cells in meta.data not present in provided counts matrix”
Creating h5Seurat file for version 3.1.5.9900

Adding counts for RNA

Adding data for RNA

No variable features found for RNA

No feature-level metadata found for RNA

Validating h5Seurat file

Adding data from RNA as X

Adding counts from RNA as raw

Transfering meta.data to obs

