In [11]:
import scanpy as sc
import h5py
import numpy as np

In [13]:
data_mat = h5py.File('./datasets/Spatial_CITE_seq_HumanTonsil_RNA_Protein.h5', 'r')

In [16]:
data_mat

<HDF5 file "Spatial_CITE_seq_HumanTonsil_RNA_Protein.h5" (mode r)>

In [17]:
df_data_RNA = np.array(data_mat['X_gene']).astype('float64')

In [18]:
import scanpy as sc
import h5py
import numpy as np

def load_tonsil_data():
    """
    Load D2: Human Tonsil Spatial CITE-seq dataset (RNA + Protein).
    Formatted to match the Mouse Brain data structure.
    """
    print("\nLoading D2: Human Tonsil (RNA + Protein)...")
    
    # Use the Tonsil file path
    file_path = "./datasets/Spatial_CITE_seq_HumanTonsil_RNA_Protein.h5"
    
    # Open file with context manager for safety
    with h5py.File(file_path, 'r') as data_mat:
        # 1. Load Matrices
        # Mapping 'X_gene' to the first slot (RNA)
        df_data_RNA = np.array(data_mat['X_gene']).astype('float64')
        
        # Mapping 'X_protein' to the second slot (previously ATAC)
        df_data_Protein = np.array(data_mat['X_protein']).astype('float64')
        
        # 2. Load Metadata and Positions
        loc = np.array(data_mat['pos']).astype('float64')
        
        # The Tonsil dataset might use 'cell' (barcodes) if 'LayerName' isn't present.
        # We load 'cell' to serve as the label/ID vector.
        if 'cell' in data_mat:
            LayerName = [item.decode("utf-8") for item in list(data_mat['cell'])]
        else:
            # Fallback if no cell IDs are found, create generic IDs
            LayerName = [f"Cell_{i}" for i in range(df_data_RNA.shape[0])]

        # 3. Create AnnData Object 1 (RNA)
        adata1 = sc.AnnData(df_data_RNA, dtype="float64")
        adata1.obsm['spatial'] = np.array(loc)
        adata1.obs['LayerName'] = LayerName
        adata1.obs['x_pos'] = np.array(loc)[:, 0]
        adata1.obs['y_pos'] = np.array(loc)[:, 1]
        
        # 4. Create AnnData Object 2 (Protein)
        adata2 = sc.AnnData(df_data_Protein, dtype="float64")
        adata2.obsm['spatial'] = np.array(loc)
        adata2.obs['LayerName'] = LayerName
        adata2.obs['x_pos'] = np.array(loc)[:, 0]
        adata2.obs['y_pos'] = np.array(loc)[:, 1]
        
        # 5. Print Statistics
        print(f"  - Cells: {adata1.shape[0]}")
        print(f"  - RNA features: {adata1.shape[1]}")
        print(f"  - Protein features: {adata2.shape[1]}")
        print(f"  - Unique IDs (LayerName): {len(np.unique(LayerName))}")
        
        # Return exact same tuple structure as MouseBrain function
        # adata1, adata2, Label_List, Loc_Array, Dataset_Name
        return adata1, adata2, np.array(LayerName), loc, 'HumanTonsil'

In [19]:
load_tonsil_data()


Loading D2: Human Tonsil (RNA + Protein)...
  - Cells: 2492
  - RNA features: 984
  - Protein features: 283
  - Unique IDs (LayerName): 2492




(AnnData object with n_obs × n_vars = 2492 × 984
     obs: 'LayerName', 'x_pos', 'y_pos'
     obsm: 'spatial',
 AnnData object with n_obs × n_vars = 2492 × 283
     obs: 'LayerName', 'x_pos', 'y_pos'
     obsm: 'spatial',
 array(['29x42', '44x36', '34x6', ..., '23x48', '8x44', '5x44'],
       shape=(2492,), dtype='<U5'),
 array([[29., 42.],
        [44., 36.],
        [34.,  6.],
        ...,
        [23., 48.],
        [ 8., 44.],
        [ 5., 44.]], shape=(2492, 2)),
 'HumanTonsil')

In [None]:


df_data_ATAC = np.array(data_mat['X_ATAC']).astype('float64')
loc = np.array(data_mat['Pos']).astype('float64')
LayerName = [item.decode("utf-8") for item in list(data_mat['LayerName'])]

adata1 = sc.AnnData(df_data_RNA, dtype="float64")
adata1.obsm['spatial'] = np.array(loc)
adata1.obs['LayerName'] = LayerName
adata1.obs['x_pos'] = np.array(loc)[:, 0]
adata1.obs['y_pos'] = np.array(loc)[:, 1]

adata2 = sc.AnnData(df_data_ATAC, dtype="float64")
adata2.obsm['spatial'] = np.array(loc)
adata2.obs['LayerName'] = LayerName
adata2.obs['x_pos'] = np.array(loc)[:, 0]
adata2.obs['y_pos'] = np.array(loc)[:, 1]

KeyError: "Unable to synchronously open object (object 'X_RNA' doesn't exist)"

In [2]:
adt_ad = sc.read_h5ad("spatialGlue_sim_adata_ADT.h5ad")
rna_ad = sc.read_h5ad("spatialGlue_sim_adata_RNA.h5ad")

In [3]:
adt_ad

AnnData object with n_obs × n_vars = 1296 × 100
    uns: 'log1p'
    obsm: 'nsfac', 'spatial', 'spfac'
    varm: 'nsload', 'spload'
    layers: 'counts'

In [4]:
rna_ad

AnnData object with n_obs × n_vars = 1296 × 1000
    uns: 'log1p'
    obsm: 'nsfac', 'spatial', 'spfac'
    varm: 'nsload', 'spload'
    layers: 'counts'