In [5]:
import os
import sys 
import json
import h5py
import pickle

import torch
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

from scipy.io import mmread

# BMMC

## RNA+ATAC

In [3]:
BMMC_multi_h5ad = "/home/wsg/BM/pipeline/data/BMMC/RawData/GSE194122_openproblems_neurips2021_multiome_BMMC_processed.h5ad"

In [4]:
import scanpy as sc
from scipy import io
BMMC_multi = sc.read_h5ad(BMMC_multi_h5ad)

BMMC_rna = BMMC_multi[:, BMMC_multi.var["feature_types"] == "GEX"]
BMMC_atac = BMMC_multi[:, BMMC_multi.var["feature_types"] == "ATAC"]

In [5]:
BMMC_rna.shape

(69249, 13431)

In [11]:
BMMC_rna.obs.cell_type

TAGTTGTCACCCTCAC-1-s1d1     Naive CD20+ B
CTATGGCCATAACGGG-1-s1d1        CD14+ Mono
CCGCACACAGGTTAAA-1-s1d1            CD8+ T
TCATTTGGTAATGGAA-1-s1d1            CD8+ T
ACCACATAGGTGTCCA-1-s1d1        CD16+ Mono
                                ...      
AAACCGCGTTTGAGGC-12-s4d9     CD8+ T naive
TGACTTAAGTTCCCGT-12-s4d9       Lymph prog
GCTGTACCACCGTTCC-12-s4d9           CD8+ T
ACACTTGCAACTAGAA-12-s4d9             cDC2
CACTTAAAGTCTGGGC-12-s4d9    Naive CD20+ B
Name: cell_type, Length: 69249, dtype: category
Categories (22, object): ['B1 B', 'CD4+ T activated', 'CD4+ T naive', 'CD8+ T', ..., 'Proerythroblast', 'Transitional B', 'cDC2', 'pDC']

In [6]:
BMMC_atac.shape

(69249, 116490)

In [13]:
BMMC_atac.obs.cell_type

TAGTTGTCACCCTCAC-1-s1d1     Naive CD20+ B
CTATGGCCATAACGGG-1-s1d1        CD14+ Mono
CCGCACACAGGTTAAA-1-s1d1            CD8+ T
TCATTTGGTAATGGAA-1-s1d1            CD8+ T
ACCACATAGGTGTCCA-1-s1d1        CD16+ Mono
                                ...      
AAACCGCGTTTGAGGC-12-s4d9     CD8+ T naive
TGACTTAAGTTCCCGT-12-s4d9       Lymph prog
GCTGTACCACCGTTCC-12-s4d9           CD8+ T
ACACTTGCAACTAGAA-12-s4d9             cDC2
CACTTAAAGTCTGGGC-12-s4d9    Naive CD20+ B
Name: cell_type, Length: 69249, dtype: category
Categories (22, object): ['B1 B', 'CD4+ T activated', 'CD4+ T naive', 'CD8+ T', ..., 'Proerythroblast', 'Transitional B', 'cDC2', 'pDC']

## RNA+ADT

In [5]:
BMMC_cite_h5ad = "/home/wsg/BM/pipeline/data/BMMC/RawData/GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad"

In [6]:
import scanpy as sc
from scipy import io
BMMC_cite = sc.read_h5ad(BMMC_cite_h5ad)

BMMC_rna = BMMC_cite[:, BMMC_cite.var["feature_types"] == "GEX"]
BMMC_adt = BMMC_cite[:, BMMC_cite.var["feature_types"] == "ADT"]

  utils.warn_names_duplicates("var")


In [20]:
BMMC_rna.shape

(90261, 13953)

In [7]:
BMMC_rna.obs.cell_type

GCATTAGCATAAGCGG-1-s1d1    Naive CD20+ B IGKC+
TACAGGTGTTAGAGTA-1-s1d1             CD14+ Mono
AGGATCTAGGTCTACT-1-s1d1    Naive CD20+ B IGKC+
GTAGAAAGTGACACAG-1-s1d1                    HSC
TCCGAAAAGGATCATA-1-s1d1           Reticulocyte
                                  ...         
GAATCACCACGGAAGT-1-s4d9             Lymph prog
GCTGGGTGTACGGATG-1-s4d9           CD8+ T naive
TCGAAGTGTGACAGGT-1-s4d9                  T reg
GCAGGCTGTTGCATAC-1-s4d9           CD4+ T naive
ACGTAACAGGTCTACT-1-s4d9           CD8+ T naive
Name: cell_type, Length: 90261, dtype: category
Categories (45, object): ['B1 B IGKC+', 'B1 B IGKC-', 'CD4+ T CD314+ CD45RA+', 'CD4+ T activated', ..., 'dnT', 'gdT CD158b+', 'gdT TCRVD2+', 'pDC']

In [21]:
BMMC_adt.shape

(90261, 134)

In [8]:
BMMC_adt.obs.cell_type

GCATTAGCATAAGCGG-1-s1d1    Naive CD20+ B IGKC+
TACAGGTGTTAGAGTA-1-s1d1             CD14+ Mono
AGGATCTAGGTCTACT-1-s1d1    Naive CD20+ B IGKC+
GTAGAAAGTGACACAG-1-s1d1                    HSC
TCCGAAAAGGATCATA-1-s1d1           Reticulocyte
                                  ...         
GAATCACCACGGAAGT-1-s4d9             Lymph prog
GCTGGGTGTACGGATG-1-s4d9           CD8+ T naive
TCGAAGTGTGACAGGT-1-s4d9                  T reg
GCAGGCTGTTGCATAC-1-s4d9           CD4+ T naive
ACGTAACAGGTCTACT-1-s4d9           CD8+ T naive
Name: cell_type, Length: 90261, dtype: category
Categories (45, object): ['B1 B IGKC+', 'B1 B IGKC-', 'CD4+ T CD314+ CD45RA+', 'CD4+ T activated', ..., 'dnT', 'gdT CD158b+', 'gdT TCRVD2+', 'pDC']

# HSPC

In [2]:
FP_CELL_METADATA = '/home/wsg/BM/pipeline/data/HSPC/RawData/metadata.csv'

df_cell = pd.read_csv(FP_CELL_METADATA)
df_cell_multi = df_cell[df_cell.technology=="multiome"]
df_cell_cite = df_cell[df_cell.technology=="citeseq"]
df_cell_cite.shape, df_cell_multi.shape

((119651, 5), (161877, 5))

In [8]:
df_cell_cite.cell_type.value_counts()

HSC     42874
EryP    24344
NeuP    21418
MasP    18090
MkP     10800
MoP      1822
BP        303
Name: cell_type, dtype: int64

In [21]:
119651+161877

281528

## RNA+ATAC

In [16]:
# ATAC-seq peak counts transformed
train_multi_inputs ='/home/wsg/BM/pipeline/data/HSPC/RawData/train_multi_inputs.h5' # 训练集的ATAC模态
## test_multi_inputs ='/home/wsg/BM/pipeline/data/HSPC/RawData/test_multi_inputs.h5' # 测试集的ATAC模态

# RNA gene expression levels as library-size normalized and log1p transformed counts for the same cells
train_multi_targets ='/home/wsg/BM/pipeline/data/HSPC/RawData/train_multi_targets.h5' # 训练集的RNA模态
# 由于测试集的RNA模态并未提供，因此在benchmark任务中我们只使用HSPC数据集的训练集

In [17]:
train_multi_targets = pd.read_hdf(train_multi_targets)
train_multi_targets.shape

(105942, 23418)

In [20]:
train_multi_targets

gene_id,ENSG00000121410,ENSG00000268895,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000184389,ENSG00000128274,ENSG00000094914,ENSG00000081760,...,ENSG00000086827,ENSG00000174442,ENSG00000122952,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56390cf1b95e,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,4.893861,0.0,0.0,0.000000,0.0,5.583255,0.000000,4.893861
fc0c60183c33,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000
9b4a87e22ad0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,5.107832,0.0,0.0,0.000000,0.0,0.000000,0.000000,5.107832
81cccad8cd81,0.0,4.507936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,5.195558,4.507936,0.0,0.0,0.000000,0.0,0.000000,0.000000,5.195558
15cb3d85c232,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,5.531572,0.0,0.000000,4.842377,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
063cead1a4ea,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,5.829234,0.0,0.000000,0.000000,5.139023
553bca99ba78,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.091473,0.000000,5.091473,0.0,0.0,5.091473,0.0,0.000000,5.091473,0.000000
00783f28b463,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,4.500950,0.0,0.000000,0.000000,0.000000
e7abb1a0f251,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,4.871117,0.000000,0.000000


In [22]:
train_multi_inputs = pd.read_hdf(train_multi_inputs)
train_multi_inputs.shape

(105942, 228942)

In [23]:
train_multi_inputs

gene_id,GL000194.1:114519-115365,GL000194.1:55758-56597,GL000194.1:58217-58957,GL000194.1:59535-60431,GL000195.1:119766-120427,GL000195.1:120736-121603,GL000195.1:137437-138345,GL000195.1:15901-16653,GL000195.1:22357-23209,GL000195.1:23751-24619,...,chrY:7722278-7723128,chrY:7723971-7724880,chrY:7729854-7730772,chrY:7731785-7732664,chrY:7810142-7811040,chrY:7814107-7815018,chrY:7818751-7819626,chrY:7836768-7837671,chrY:7869454-7870371,chrY:7873814-7874709
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56390cf1b95e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,4.428336,0.0,0.0,0.0,0.0
fc0c60183c33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9b4a87e22ad0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
81cccad8cd81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
15cb3d85c232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
063cead1a4ea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.573818,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
553bca99ba78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
00783f28b463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
e7abb1a0f251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [18]:
test_multi_inputs = pd.read_hdf(test_multi_inputs)
test_multi_inputs.shape

(55935, 228942)

In [19]:
test_multi_inputs

gene_id,GL000194.1:114519-115365,GL000194.1:55758-56597,GL000194.1:58217-58957,GL000194.1:59535-60431,GL000195.1:119766-120427,GL000195.1:120736-121603,GL000195.1:137437-138345,GL000195.1:15901-16653,GL000195.1:22357-23209,GL000195.1:23751-24619,...,chrY:7722278-7723128,chrY:7723971-7724880,chrY:7729854-7730772,chrY:7731785-7732664,chrY:7810142-7811040,chrY:7814107-7815018,chrY:7818751-7819626,chrY:7836768-7837671,chrY:7869454-7870371,chrY:7873814-7874709
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
458c2ae2c9b1,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01a0659b0710,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
028a8bc3f2ba,0.000000,0.0,0.0,0.0,0.0,0.0,2.951019,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7ec0ca8bb863,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
caa0b0022cdc,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96a60b026659,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d493e546991e,1.796489,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
05666c99aa48,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
121f946642b5,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
train_multi_targets.index.intersection(train_multi_inputs.index)

Index(['56390cf1b95e', 'fc0c60183c33', '9b4a87e22ad0', '81cccad8cd81',
       '15cb3d85c232', 'a7791bcf1152', '072790e768b1', '404459b1005b',
       '627a5071cbd7', '00f283126092',
       ...
       'b7a0a9c04626', 'ba7d40e15f3d', '69451694ec4c', 'd3a6d3c17704',
       'eb1fc4a08eb2', '063cead1a4ea', '553bca99ba78', '00783f28b463',
       'e7abb1a0f251', '193992d571a5'],
      dtype='object', name='cell_id', length=105942)

In [27]:
train_multi_targets.index.intersection(test_multi_inputs.index)

Index([], dtype='object', name='cell_id')

## RNA+ADT

In [19]:
#  RNA library-size normalized and log1p transformed counts (gene expression levels)
train_cite_inputs ='/home/wsg/BM/pipeline/data/HSPC/RawData/train_cite_inputs.h5' # 训练集的RNA模态
## test_cite_inputs_day_2_donor_27678 ='/home/wsg/BM/pipeline/data/HSPC/RawData/test_cite_inputs_day_2_donor_27678.h5' # 测试集的RNA模态
## test_cite_inputs ='/home/wsg/BM/pipeline/data/HSPC/RawData/test_cite_inputs.h5' # 测试集的RNA模态

# Surface protein levels for the same cells that have been dsb normalized
train_cite_targets ='/home/wsg/BM/pipeline/data/HSPC/RawData/train_cite_targets.h5' # 训练集的ADT模态
# 由于测试集的ADT模态并未提供，因此在benchmark任务中我们只使用HSPC数据集的训练集

In [20]:
train_cite_inputs = pd.read_hdf(train_cite_inputs)
train_cite_inputs.shape

(70988, 22050)

In [21]:
train_cite_inputs

gene_id,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,4.090185,0.000000
d02759a80ba2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,4.039545,0.0,0.0,0.000000,0.000000,0.000000,0.000000
c016c6b0efa5,0.0,0.0,0.0,0.0,0.0,3.847321,0.000000,3.847321,3.847321,0.000000,...,0.000000,0.000000,3.847321,4.529743,0.0,0.0,0.000000,3.847321,3.847321,0.000000
ba7f733a4f75,0.0,0.0,0.0,0.0,0.0,0.000000,3.436846,3.436846,0.000000,0.000000,...,3.436846,0.000000,4.113780,5.020215,0.0,0.0,0.000000,3.436846,4.113780,0.000000
fbcf2443ffb2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.196826,0.000000,0.000000,...,0.000000,4.196826,4.196826,4.196826,0.0,0.0,3.518610,4.196826,3.518610,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650ee456f0f3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,4.397535,4.397535,5.084510,0.0,0.0,0.000000,0.000000,4.397535,4.397535
cc506e7707f5,0.0,0.0,0.0,0.0,0.0,0.000000,3.981467,4.665241,0.000000,0.000000,...,3.981467,0.000000,4.665241,3.981467,0.0,0.0,0.000000,0.000000,3.981467,0.000000
a91f1b55a520,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.497696,0.000000,4.497696,...,0.000000,0.000000,0.000000,4.497696,0.0,0.0,3.815622,4.497696,0.000000,0.000000
3a9882c98205,0.0,0.0,0.0,0.0,0.0,0.000000,3.900907,0.000000,0.000000,4.583891,...,0.000000,0.000000,4.583891,4.985945,0.0,0.0,0.000000,0.000000,0.000000,3.900907


In [22]:
train_cite_targets = pd.read_hdf(train_cite_targets)
train_cite_targets.shape

(70988, 140)

In [23]:
train_cite_targets

gene_id,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,1.167804,0.622530,0.106959,0.324989,3.331674,6.426002,1.480766,-0.728392,-0.468851,-0.073285,...,-0.448390,3.220174,-0.533004,0.674956,-0.006187,0.682148,1.398105,0.414292,1.780314,0.548070
d02759a80ba2,0.818970,0.506009,1.078682,6.848758,3.524885,5.279456,4.930438,2.069372,0.333652,-0.468088,...,0.323613,8.407108,0.131301,0.047607,-0.243628,0.547864,1.832587,0.982308,2.736507,2.184063
c016c6b0efa5,-0.356703,-0.422261,-0.824493,1.137495,0.518924,7.221962,-0.375034,1.738071,0.142919,-0.971460,...,1.348692,4.888579,-0.279483,-0.131097,-0.177604,-0.689188,9.013709,-1.182975,3.958148,2.868600
ba7f733a4f75,-1.201507,0.149115,2.022468,6.021595,7.258670,2.792436,21.708519,-0.137913,1.649969,-0.754680,...,1.504426,12.391979,0.511394,0.587863,-0.752638,1.714851,3.893782,1.799661,1.537249,4.407671
fbcf2443ffb2,-0.100404,0.697461,0.625836,-0.298404,1.369898,3.254521,-1.659380,0.643531,0.902710,1.291877,...,0.777023,6.496499,0.279898,-0.841950,-0.869419,0.675092,5.259685,-0.835379,9.631781,1.765445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650ee456f0f3,0.905420,0.386141,0.961590,5.090580,2.854346,6.093729,-0.586178,0.452389,0.040806,0.191407,...,1.261118,3.092832,0.003275,0.278930,-0.272002,0.249477,3.789460,0.138330,1.466193,4.278504
cc506e7707f5,2.101247,2.117462,0.112699,2.065512,2.176803,3.900090,-0.586001,-0.175479,1.363232,0.109905,...,0.714624,5.029233,0.909861,0.057322,2.633387,1.340077,11.456146,-1.431453,5.275882,2.510530
a91f1b55a520,1.221313,0.476566,1.437551,5.135631,2.926102,1.615081,-0.586910,1.760421,1.944711,-0.095096,...,-0.176027,5.027534,-0.703609,1.139491,-0.078092,1.592960,9.358179,0.981883,6.911032,3.415310
3a9882c98205,-0.151433,-0.850024,0.461556,3.546561,1.996473,5.702821,0.883038,1.309014,1.029737,-0.072851,...,-0.484493,12.883892,1.579381,-0.382835,-0.065286,-0.021458,7.372662,1.010247,1.864805,3.449289


In [24]:
test_cite_inputs = pd.read_hdf(test_cite_inputs)
test_cite_inputs.shape

(48663, 22050)

In [25]:
test_cite_inputs

gene_id,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c2150f55becb,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,4.090185,0.000000
65b7edf8a4da,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,4.039545,0.000000,0.0,0.00000,0.000000,0.000000,0.000000
c1b26cb1057b,0.0,0.0,0.0,0.0,0.0,3.847321,0.000000,3.847321,3.847321,0.000000,...,0.000000,0.000000,3.847321,4.529743,0.000000,0.0,0.00000,3.847321,3.847321,0.000000
917168fa6f83,0.0,0.0,0.0,0.0,0.0,0.000000,3.436846,3.436846,0.000000,0.000000,...,3.436846,0.000000,4.113780,5.020215,0.000000,0.0,0.00000,3.436846,4.113780,0.000000
2b29feeca86d,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.196826,0.000000,0.000000,...,0.000000,4.196826,4.196826,4.196826,0.000000,0.0,3.51861,4.196826,3.518610,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
a9b4d99f1f50,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3.719836,...,0.000000,0.000000,0.000000,3.719836,0.000000,0.0,0.00000,0.000000,3.719836,0.000000
0e2c1d0782af,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,4.026206,0.000000,...,4.026206,0.000000,0.000000,4.026206,0.000000,0.0,0.00000,0.000000,4.710393,0.000000
a3cbc5aa0ec3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,4.306634,0.000000,0.000000,0.0,0.00000,4.306634,6.933096,4.993019
75b350243add,0.0,0.0,0.0,0.0,0.0,0.000000,3.624848,3.624848,0.000000,0.000000,...,0.000000,0.000000,0.000000,3.624848,3.624848,0.0,0.00000,3.624848,0.000000,0.000000


In [26]:
test_cite_inputs_day_2_donor_27678 = pd.read_hdf(test_cite_inputs_day_2_donor_27678)
test_cite_inputs_day_2_donor_27678.shape

(7016, 22085)

In [27]:
test_cite_inputs_day_2_donor_27678

gene_ids,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
83d6659a6a32,0.0,0.000000,0.0,3.131642,0.0,0.0,4.200725,3.802724,3.131642,0.000000,...,0.000000,3.802724,4.484654,5.039423,0.0,0.0,0.000000,0.000000,4.200725,0.000000
d98594f13d2e,0.0,0.000000,0.0,0.000000,0.0,0.0,3.320383,3.320383,0.000000,3.995295,...,3.320383,3.995295,3.995295,4.394607,0.0,0.0,0.000000,3.320383,4.679199,3.320383
5f93d8ffc72f,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,4.141674,0.000000,3.464298,...,0.000000,3.464298,0.000000,4.826841,0.0,0.0,0.000000,0.000000,4.141674,3.464298
7dfa2699d351,0.0,0.000000,0.0,0.000000,0.0,0.0,3.110534,3.781140,0.000000,3.110534,...,3.110534,3.110534,0.000000,4.683659,0.0,0.0,0.000000,0.000000,4.178977,3.110534
6d2533edd0e0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,3.943663,3.943663,0.000000,0.000000,0.0,0.0,0.000000,0.000000,4.627074,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
be92120b3a00,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,4.370335,0.000000,4.370335,0.000000,0.0,0.0,4.370335,4.370335,0.000000,0.000000
396d0c31d41c,0.0,3.603896,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,3.603896,...,3.603896,0.000000,0.000000,5.659234,0.0,0.0,0.000000,0.000000,3.603896,0.000000
ef6bf272cdcf,0.0,0.000000,0.0,0.000000,0.0,0.0,4.739065,0.000000,0.000000,3.657795,...,0.000000,3.657795,0.000000,0.000000,0.0,0.0,3.657795,3.657795,0.000000,0.000000
6339da0de3a0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000


In [31]:
train_cite_targets.index.intersection(train_cite_inputs.index)

Index(['45006fe3e4c8', 'd02759a80ba2', 'c016c6b0efa5', 'ba7f733a4f75',
       'fbcf2443ffb2', 'd80d84ca8e89', '1ac2049b4c98', '33fb0c29e2e4',
       'b329261bd0ee', '703762287e88',
       ...
       'f901120ab887', 'e3780c598532', '663da056425f', '10b466d6898b',
       '4d22a430b2b8', '650ee456f0f3', 'cc506e7707f5', 'a91f1b55a520',
       '3a9882c98205', 'c91b6b2ccd3d'],
      dtype='object', name='cell_id', length=70988)

In [30]:
train_cite_targets.index.intersection(test_cite_inputs.index)

Index([], dtype='object', name='cell_id')

In [33]:
train_cite_targets.index.intersection(test_cite_inputs_day_2_donor_27678.index)

Index([], dtype='object')

In [34]:
test_cite_inputs.index.intersection(test_cite_inputs_day_2_donor_27678.index)

Index([], dtype='object')

# 10x PBMC dataset

## RNA+ATAC

In [2]:
# 10x官方没有做细胞类型注释，因此不使用
filtered_10k = '/home/wsg/BM/pipeline/data/10x_PBMC/10x/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5'
# raw_10k = '/home/wsg/BM/pipeline/data/10x_PBMC/10x/pbmc_granulocyte_sorted_10k_raw_feature_bc_matrix.h5'
gex_10k ='/home/wsg/BM/pipeline/data/10x_PBMC/10x/pbmc_granulocyte_sorted_10k_gex_molecule_info.h5'

# GB作者使用Seurat进行了细胞类型注释，使用该版本
rna = '/home/wsg/BM/pipeline/data/10x_PBMC/GB/pbmc_10x_rna_public.h5ad'
atac = '/home/wsg/BM/pipeline/data/10x_PBMC/GB/pbmc_10x_atac_public.h5ad'

In [9]:
# gex_10k = sc.read_10x_h5(gex_10k,  genome='features')
# gex_10k
??read10xMolInfo()

Object `read10xMolInfo()` not found.


In [44]:
rna = sc.read_h5ad(rna)
rna.shape

(10085, 36601)

In [9]:
# rna.obs
# sum(rna.obs['wsnn_res.0.8'] == rna.obs['seurat_clusters'])
# rna.obs['seurat_clusters'].value_counts()

rna.obs['ct3'].value_counts()

In [60]:
atac = sc.read_h5ad(atac)
atac.shape

(10085, 106056)

# SHARE-seq

## RNA+ATAC

In [12]:
rna = '/home/wsg/BM/pipeline/data/SHARE-seq/GB/shareseq_mouse_skin_rna.h5ad'
atac = '/home/wsg/BM/pipeline/data/SHARE-seq/GB/shareseq_mouse_skin_atac.h5ad'

rna_10k = '/home/wsg/BM/pipeline/data/SHARE-seq/GB/mouse_skin_shareseq_rna_10k.h5ad'
atac_10k = '/home/wsg/BM/pipeline/data/SHARE-seq/GB/mouse_skin_shareseq_atac_10k.h5ad'

In [13]:
rna = sc.read_h5ad(rna)
rna.shape

(42948, 23296)

In [15]:
rna.obs

Unnamed: 0,rna.bc
0,"R1.01,R2.01,R3.06,P1.55"
1,"R1.01,R2.01,R3.36,P1.53"
2,"R1.01,R2.01,R3.42,P1.55"
3,"R1.01,R2.01,R3.43,P1.56"
4,"R1.01,R2.01,R3.64,P1.53"
...,...
42943,"R1.96,R2.96,R3.01,P1.55"
42944,"R1.96,R2.96,R3.05,P1.55"
42945,"R1.96,R2.96,R3.12,P1.55"
42946,"R1.96,R2.96,R3.23,P1.53"


In [19]:
atac = sc.read_h5ad(atac)
atac.shape

(34774, 344592)

In [21]:
atac.obs

Unnamed: 0_level_0,rna.bc
rna.bc,Unnamed: 1_level_1
R1.01.R2.01.R3.06.P1.55,R1.01.R2.01.R3.06.P1.55
R1.01.R2.03.R3.68.P1.55,R1.01.R2.03.R3.68.P1.55
R1.01.R2.05.R3.15.P1.53,R1.01.R2.05.R3.15.P1.53
R1.01.R2.05.R3.40.P1.55,R1.01.R2.05.R3.40.P1.55
R1.01.R2.05.R3.49.P1.55,R1.01.R2.05.R3.49.P1.55
...,...
R1.92.R2.79.R3.05.P1.56,R1.92.R2.79.R3.05.P1.56
R1.93.R2.20.R3.18.P1.53,R1.93.R2.20.R3.18.P1.53
R1.93.R2.80.R3.62.P1.55,R1.93.R2.80.R3.62.P1.55
R1.93.R2.91.R3.82.P1.56,R1.93.R2.91.R3.82.P1.56


In [64]:
# intersection = series1[atac.obs['rna.bc'].isin(rna.obs['rna.bc'])]
rna_index = rna.obs['rna.bc'].values
rna_index = rna_index.astype(str)
rna_index = np.char.replace(rna_index, ',', '.')
rna_index = rna_index.astype(object)

atac_index = atac.obs['rna.bc'].values

intersection = np.intersect1d(rna_index, atac_index)
len(intersection)

34774

In [16]:
rna_10k = sc.read_h5ad(rna_10k)
rna_10k.shape

(10000, 21080)

In [18]:
rna_10k.obs.celltype

R1-80-R2-72-R3-02-P1-56    ahighCD34+ bulge
R1-37-R2-77-R3-12-P1-54               Basal
R1-21-R2-47-R3-39-P1-55               TAC-1
R1-16-R2-48-R3-04-P1-55               Basal
R1-08-R2-50-R3-28-P1-54                 ORS
                                 ...       
R1-46-R2-27-R3-20-P1-54                Endo
R1-15-R2-58-R3-89-P1-56                Endo
R1-92-R2-49-R3-88-P1-54                Endo
R1-09-R2-74-R3-05-P1-55                Endo
R1-30-R2-05-R3-77-P1-54                Endo
Name: celltype, Length: 10000, dtype: category
Categories (12, object): ['Basal', 'Dermal Fibroblast', 'Endo', 'HS', ..., 'TAC-1', 'TAC-2', 'ahighCD34+ bulge', 'alowCD34+ bulge']

# HPAP

## RNA+ATAC

In [81]:
paired_RNA_mtx = '/home/wsg/BM/pipeline/data/HPAP/GB/paired_RNA/RNA_counts.mtx'
paired_RNA_genes = '/home/wsg/BM/pipeline/data/HPAP/GB/paired_RNA/gene.tsv'
paired_RNA_barcodes = '/home/wsg/BM/pipeline/data/HPAP/GB/paired_RNA/barcodes.tsv'
paired_RNA_batch = '/home/wsg/BM/pipeline/data/HPAP/GB/paired_RNA/batch.tsv'

In [76]:
# 读取数据
adata = sc.read_mtx(paired_RNA_mtx)  # 替换为你的文件路径

# 读取基因和细胞条形码作为索引和列名
adata.var_names = pd.read_csv(paired_RNA_barcodes, header=None)[0]
adata.obs_names = pd.read_csv(paired_RNA_genes, header=None)[0]

# 查看AnnData对象
print(adata)


AnnData object with n_obs × n_vars = 30557 × 13109


In [74]:
pd.read_csv(paired_RNA_barcodes, header=None)

Unnamed: 0,0
0,multi-HPAP097-GTCATCACATTAGCGC-1
1,multi-HPAP104-CGCATTTGTGCTTAGA-1
2,multi-HPAP097-TACCTCATCAGGCTAT-1
3,multi-HPAP097-TGCTTAAAGTAACTCA-1
4,multi-HPAP096-AGCGGATAGTTACCGG-1
...,...
13104,multi-HPAP104-GGCCTTAAGCGCCTTT-1
13105,multi-HPAP104-CCAACCAAGTTAGTGC-1
13106,multi-HPAP104-AGGCGGATCACGCATG-1
13107,multi-HPAP097-GTTAGGCGTAACAGGG-1


# SNARE-seq

## RNA+ATAC

In [84]:
## AdBrainCortex
paired_RNA_mtx = '/home/wsg/BM/pipeline/data/SNARE-seq/AdBrainCortex/RNA/GSE126074_AdBrainCortex_SNAREseq_cDNA.counts.mtx'
paired_RNA_genes = '/home/wsg/BM/pipeline/data/SNARE-seq/AdBrainCortex/RNA/GSE126074_AdBrainCortex_SNAREseq_cDNA.genes.tsv'
paired_RNA_barcodes = '/home/wsg/BM/pipeline/data/SNARE-seq/AdBrainCortex/RNA/GSE126074_AdBrainCortex_SNAREseq_cDNA.barcodes.tsv'

In [85]:
# 读取数据
adata = sc.read_mtx(paired_RNA_mtx)  # 替换为你的文件路径
adata = adata.T

# 读取基因和细胞条形码作为索引和列名
adata.var_names = pd.read_csv(paired_RNA_genes, header=None)[0]
adata.obs_names = pd.read_csv(paired_RNA_barcodes, header=None)[0]

# 查看AnnData对象
print(adata)


AnnData object with n_obs × n_vars = 10309 × 33160


In [86]:
## AdBrainCortex
paired_ATAC_mtx = '/home/wsg/BM/pipeline/data/SNARE-seq/AdBrainCortex/ATAC/GSE126074_AdBrainCortex_SNAREseq_chromatin.counts.mtx'
paired_ATAC_peaks = '/home/wsg/BM/pipeline/data/SNARE-seq/AdBrainCortex/ATAC/GSE126074_AdBrainCortex_SNAREseq_chromatin.peaks.tsv'
paired_ATAC_barcodes = '/home/wsg/BM/pipeline/data/SNARE-seq/AdBrainCortex/ATAC/GSE126074_AdBrainCortex_SNAREseq_chromatin.barcodes.tsv'

In [87]:
# 读取数据
adata = sc.read_mtx(paired_ATAC_mtx)  # 替换为你的文件路径
adata = adata.T

# 读取基因和细胞条形码作为索引和列名
adata.var_names = pd.read_csv(paired_ATAC_peaks, header=None)[0]
adata.obs_names = pd.read_csv(paired_ATAC_barcodes, header=None)[0]

# 查看AnnData对象
print(adata)

AnnData object with n_obs × n_vars = 10309 × 244544


In [88]:
## P0_BrainCortex
paired_RNA_mtx = '/home/wsg/BM/pipeline/data/SNARE-seq/P0_BrainCortex/RNA/GSE126074_P0_BrainCortex_SNAREseq_cDNA.counts.mtx'
paired_RNA_genes = '/home/wsg/BM/pipeline/data/SNARE-seq/P0_BrainCortex/RNA/GSE126074_P0_BrainCortex_SNAREseq_cDNA.genes.tsv'
paired_RNA_barcodes = '/home/wsg/BM/pipeline/data/SNARE-seq/P0_BrainCortex/RNA/GSE126074_P0_BrainCortex_SNAREseq_cDNA.barcodes.tsv'

In [89]:
# 读取数据
adata = sc.read_mtx(paired_RNA_mtx)  # 替换为你的文件路径
adata = adata.T

# 读取基因和细胞条形码作为索引和列名
adata.var_names = pd.read_csv(paired_RNA_genes, header=None)[0]
adata.obs_names = pd.read_csv(paired_RNA_barcodes, header=None)[0]

# 查看AnnData对象
print(adata)


AnnData object with n_obs × n_vars = 5081 × 19322


In [92]:
## P0_BrainCortex
paired_ATAC_mtx = '/home/wsg/BM/pipeline/data/SNARE-seq/P0_BrainCortex/ATAC/GSE126074_P0_BrainCortex_SNAREseq_chromatin.counts.mtx'
paired_ATAC_peaks = '/home/wsg/BM/pipeline/data/SNARE-seq/P0_BrainCortex/ATAC/GSE126074_P0_BrainCortex_SNAREseq_chromatin.peaks.tsv'
paired_ATAC_barcodes = '/home/wsg/BM/pipeline/data/SNARE-seq/P0_BrainCortex/ATAC/GSE126074_P0_BrainCortex_SNAREseq_chromatin.barcodes.tsv'

In [94]:
# 读取数据
adata = sc.read_mtx(paired_ATAC_mtx)  # 替换为你的文件路径
adata = adata.T

# 读取基因和细胞条形码作为索引和列名
adata.var_names = pd.read_csv(paired_ATAC_peaks, header=None)[0]
adata.obs_names = pd.read_csv(paired_ATAC_barcodes, header=None)[0]

# 查看AnnData对象
print(adata)


AnnData object with n_obs × n_vars = 5081 × 229429


In [95]:
## CellLineMixture
paired_RNA = '/home/wsg/BM/pipeline/data/SNARE-seq/CellLineMixture/GSE126074_CellLineMixture_SNAREseq_cDNA_counts.tsv'
paired_ATAC = '/home/wsg/BM/pipeline/data/SNARE-seq/CellLineMixture/GSE126074_CellLineMixture_SNAREseq_chromatin_counts.tsv'

adata_rna = sc.read_csv(paired_RNA, delimiter='\t')
adata_atac = sc.read_csv(paired_ATAC, delimiter='\t')

print(adata_rna)
print(adata_atac)

AnnData object with n_obs × n_vars = 18666 × 1047
AnnData object with n_obs × n_vars = 136771 × 1047


# SPOTS

## Spatial RNA + ADT

In [9]:
adata_RNA_path = '/home/wsg/BM/data/SPOTS/Mouse_Spleen/adata_RNA.h5ad'
adata_Pro_path = '/home/wsg/BM/data/SPOTS/Mouse_Spleen/adata_Pro.h5ad'

In [10]:
adata_RNA = sc.read_h5ad(adata_RNA_path)
adata_RNA

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 2653 × 32285
    var: 'gene_ids', 'feature_types', 'genome'
    obsm: 'spatial'

In [11]:
adata_RNA.obsm['spatial']

array([[59, 19],
       [14, 94],
       [43,  9],
       ...,
       [58, 42],
       [60, 30],
       [45, 27]])

In [16]:
adata_Pro = sc.read_h5ad(adata_Pro_path)
adata_Pro.X

array([[103.,  72., 326., ...,  24.,  35.,  62.],
       [ 38.,  37., 128., ...,   9.,  12.,  17.],
       [124.,  75., 373., ...,  29.,  27.,  72.],
       ...,
       [ 28.,  38.,  37., ...,  13.,  12.,  22.],
       [ 96.,  68., 152., ...,   9.,  15.,  50.],
       [ 57.,  57.,  64., ...,  13.,   4.,  41.]], dtype=float32)

In [14]:
adata_Pro.obsm['spatial']

array([[59, 19],
       [14, 94],
       [43,  9],
       ...,
       [58, 42],
       [60, 30],
       [45, 27]])

# Stereo-CITE-seq

## Spatial RNA + ADT

In [115]:
adata_RNA_path = '/home/wsg/BM/pipeline/data/Spatial/Stereo-CITE-seq/Mouse_Thymus/adata_RNA.h5ad'
adata_ADT_path = '/home/wsg/BM/pipeline/data/Spatial/Stereo-CITE-seq/Mouse_Thymus/adata_ADT.h5ad'

In [116]:
adata_RNA = sc.read_h5ad(adata_RNA_path)
adata_RNA

AnnData object with n_obs × n_vars = 4697 × 23622
    obs: 'orig.ident', 'x', 'y'
    obsm: 'spatial'

In [117]:
adata_RNA.obs

Unnamed: 0,orig.ident,x,y
0_22,sample,8919,13270
0_33,sample,8919,14370
0_40,sample,8919,15070
0_41,sample,8919,15170
0_42,sample,8919,15270
...,...,...,...
9_73,sample,9819,18370
9_74,sample,9819,18470
9_75,sample,9819,18570
9_8,sample,9819,11870


In [120]:
adata_RNA.obsm['spatial']

array([[15319, 17370],
       [13319, 14170],
       [10419, 12170],
       ...,
       [13319, 13870],
       [17019, 15870],
       [13119, 16070]])

In [118]:
adata_ADT = sc.read_h5ad(adata_ADT_path)
adata_ADT

AnnData object with n_obs × n_vars = 4697 × 51
    obs: 'orig.ident', 'x', 'y'
    obsm: 'spatial'

In [119]:
adata_ADT.obsm['spatial']

array([[15317, 17384],
       [13317, 14184],
       [10417, 12184],
       ...,
       [13317, 13884],
       [17017, 15884],
       [13117, 16084]])

# spatial-ATAC-RNA-seq

## Spatial RNA + ATAC

In [121]:
adata_RNA_path = '/home/wsg/BM/pipeline/data/Spatial/spatial-ATAC-RNA-seq/Mouse_Brain/adata_RNA.h5ad'
adata_ATAC_path = '/home/wsg/BM/pipeline/data/Spatial/spatial-ATAC-RNA-seq/Mouse_Brain/adata_peaks_normalized.h5ad'

In [122]:
adata_RNA = sc.read_h5ad(adata_RNA_path)
adata_RNA

AnnData object with n_obs × n_vars = 9215 × 22914
    obs: 'nCount_Spatial', 'nFeature_Spatial', 'nCount_SCT', 'nFeature_SCT', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_peaks', 'nFeature_peaks', 'RNA_clusters', 'ATAC_clusters'
    var: 'name'
    obsm: 'X_pca', 'X_umap', 'spatial'

In [123]:
adata_ATAC = sc.read_h5ad(adata_ATAC_path)
adata_ATAC

AnnData object with n_obs × n_vars = 9215 × 121068
    obs: 'nCount_Spatial', 'nFeature_Spatial', 'nCount_SCT', 'nFeature_SCT', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_peaks', 'nFeature_peaks', 'RNA_clusters', 'ATAC_clusters'
    var: 'count', 'percentile'
    uns: 'ATAC', 'ATAC_clusters_colors', 'umap'
    obsm: 'X_lsi', 'X_pca', 'X_umap', 'spatial'
    obsp: 'ATAC_connectivities', 'ATAC_distances'