In [7]:
 %cd /sci/labs/yotamd/lab_share/avishai.wizel/eRNA/

/sci/labs/yotamd/lab_share/avishai.wizel/eRNA


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [15]:
import h5py
import anndata as ad
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix, csr_matrix # Import csc_matrix

In [16]:
file_path = "./10X_PBMC/01_raw_data/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5"

In [19]:
def split_multiome_h5_to_anndata(h5_file_path: str) -> tuple[ad.AnnData, ad.AnnData]:
    """
    Splits a 10x Genomics Multiome H5 file (containing both Gene Expression and ATAC data)
    into two separate AnnData objects. This version correctly handles 10x's CSC matrix
    storage format.

    Args:
        h5_file_path (str): The full path to the combined Multiome H5 file.

    Returns:
        tuple[ad.AnnData, ad.AnnData]: A tuple containing:
            - adata_rna (ad.AnnData): AnnData object for Gene Expression (cells x genes).
            - adata_atac (ad.AnnData): AnnData object for ATAC (cells x peaks).
            Both AnnData objects will have cleaned cell barcodes (without '-1' suffix).
    """
    print(f"Loading and splitting Multiome data from: {h5_file_path}")

    with h5py.File(h5_file_path, 'r') as f:
        # Load sparse matrix components
        data = f['matrix']['data'][:]
        indices = f['matrix']['indices'][:]  # Row indices for CSC
        indptr = f['matrix']['indptr'][:]    # Column pointers for CSC
        original_shape_h5 = f['matrix']['shape'][:] # (n_features, n_cells) as stored in H5

        # Load barcodes (cells)
        barcodes_raw = f['matrix']['barcodes'][:].astype(str)

        # Load features (genes/peaks) and their types
        feature_names_raw = f['matrix']['features']['name'][:].astype(str)
        feature_types = f['matrix']['features']['feature_type'][:].astype(str)
        
        # Load other feature metadata (common in 10x Multiome H5)
        feature_ids_raw = f['matrix']['features']['id'][:].astype(str)
        feature_genomes_raw = f['matrix']['features']['genome'][:].astype(str)
        # Add more if other fields like 'interval' are consistently useful
        # e.g., if 'interval' in f['matrix']['features']:
        #    intervals_raw = f['matrix']['features']['interval'][:].astype(str)

    print(f"Original shape from H5 (features x cells): {original_shape_h5}")
    print(f"Length of loaded data: {len(data)}")
    print(f"Length of loaded indices: {len(indices)}")
    print(f"Length of loaded indptr: {len(indptr)} (should be n_cells + 1 if CSC)")
    print(f"Number of loaded barcodes (cells): {len(barcodes_raw)}")
    print(f"Number of loaded features (genes+peaks): {len(feature_names_raw)}")

    # Crucial step: Construct as CSC matrix first
    # The (data, indices, indptr) from 10x H5 are structured for CSC (features x cells)
    print(f"Constructing initial CSC matrix with shape {original_shape_h5} (features x cells)...")
    full_sparse_matrix_features_cells_csc = csc_matrix(
        (data, indices, indptr), 
        shape=original_shape_h5
    )
    

    print(f"Successfully constructed CSC matrix: {full_sparse_matrix_features_cells_csc.shape}")

    # 1. Clean cell barcodes (remove '-1' suffix)
    cleaned_barcodes = pd.Index(barcodes_raw).str.replace(r'-\d+$', '', regex=True)
    print(f"First 5 raw barcodes: {barcodes_raw[:5].tolist()}")
    print(f"First 5 cleaned barcodes: {cleaned_barcodes[:5].tolist()}")

    # 2. Filter features by 'feature_type'
    rna_mask = (feature_types == 'Gene Expression')
    atac_mask = (feature_types == 'Peaks')

    rna_feature_names = pd.Index(feature_names_raw[rna_mask])
    atac_peak_names = pd.Index(feature_names_raw[atac_mask])
    
    print(f"Found {len(rna_feature_names)} Gene Expression features (genes).")
    print(f"Found {len(atac_peak_names)} ATAC features (peaks).")

    # Prepare feature metadata DataFrames for .var
    rna_var_df = pd.DataFrame({
        'feature_id': feature_ids_raw[rna_mask],
        'feature_type': feature_types[rna_mask],
        'genome': feature_genomes_raw[rna_mask]
    }, index=rna_feature_names)

    atac_var_df = pd.DataFrame({
        'feature_id': feature_ids_raw[atac_mask],
        'feature_type': feature_types[atac_mask],
        'genome': feature_genomes_raw[atac_mask]
        # Example for adding 'interval' if it's there and useful:
        # 'interval': intervals_raw[atac_mask] if 'interval' in locals() else None
    }, index=atac_peak_names)


    # 3. Create sparse matrices for each modality
    # Slice the full CSC matrix (features x cells) by feature type.
    # Then transpose the result to get (cells x features) and convert to CSR for AnnData.X.
    
    # RNA data
    # Select rows (features) corresponding to RNA genes, keep all columns (cells)
    rna_sparse_matrix_features_cells_csc = full_sparse_matrix_features_cells_csc[rna_mask, :]
    adata_rna = ad.AnnData(
        X=rna_sparse_matrix_features_cells_csc.T.tocsr(), # Transpose to (cells x genes) and convert to CSR
        obs=pd.DataFrame(index=cleaned_barcodes), # Cells are observations
        var=rna_var_df # Genes are variables/features
    )
    print(f"\nCreated adata_rna: {adata_rna.shape} (cells x genes)")
    print(f"adata_rna .X is of type: {type(adata_rna.X)}")

    # ATAC data
    # Select rows (features) corresponding to ATAC peaks, keep all columns (cells)
    atac_sparse_matrix_features_cells_csc = full_sparse_matrix_features_cells_csc[atac_mask, :]
    adata_atac = ad.AnnData(
        X=atac_sparse_matrix_features_cells_csc.T.tocsr(), # Transpose to (cells x peaks) and convert to CSR
        obs=pd.DataFrame(index=cleaned_barcodes), # Cells are observations
        var=atac_var_df # Peaks are variables/features
    )
    print(f"Created adata_atac: {adata_atac.shape} (cells x peaks)")
    print(f"adata_atac .X is of type: {type(adata_atac.X)}")

    return adata_rna, adata_atac


In [20]:
adata_rna, adata_atac = split_multiome_h5_to_anndata(file_path)


Loading and splitting Multiome data from: ./10X_PBMC/01_raw_data/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5
Original shape from H5 (features x cells): [144978  11909]
Length of loaded data: 109104829
Length of loaded indices: 109104829
Length of loaded indptr: 11910 (should be n_cells + 1 if CSC)
Number of loaded barcodes (cells): 11909
Number of loaded features (genes+peaks): 144978
Constructing initial CSC matrix with shape [144978  11909] (features x cells)...
Successfully constructed CSC matrix: (144978, 11909)
First 5 raw barcodes: ['AAACAGCCAAGGAATC-1', 'AAACAGCCAATCCCTT-1', 'AAACAGCCAATGCGCT-1', 'AAACAGCCACACTAAT-1', 'AAACAGCCACCAACCG-1']
First 5 cleaned barcodes: ['AAACAGCCAAGGAATC', 'AAACAGCCAATCCCTT', 'AAACAGCCAATGCGCT', 'AAACAGCCACACTAAT', 'AAACAGCCACCAACCG']
Found 36601 Gene Expression features (genes).
Found 108377 ATAC features (peaks).


  utils.warn_names_duplicates("var")



Created adata_rna: (11909, 36601) (cells x genes)
adata_rna .X is of type: <class 'scipy.sparse._csr.csr_matrix'>
Created adata_atac: (11909, 108377) (cells x peaks)
adata_atac .X is of type: <class 'scipy.sparse._csr.csr_matrix'>


In [21]:
print("\n--- Summary of RNA AnnData ---")
print(adata_rna)
print(f"RNA matrix shape: {adata_rna.shape}")
print(f"RNA obs_names (first 5): {adata_rna.obs_names[:5].tolist()}")
print(f"RNA var_names (first 5): {adata_rna.var_names[:5].tolist()}")
print("RNA var columns:", adata_rna.var.columns.tolist())

print("\n--- Summary of ATAC AnnData ---")
print(adata_atac)
print(f"ATAC matrix shape: {adata_atac.shape}")
print(f"ATAC obs_names (first 5): {adata_atac.obs_names[:5].tolist()}")
print(f"ATAC var_names (first 5): {adata_atac.var_names[:5].tolist()}")
print("ATAC var columns:", adata_atac.var.columns.tolist())

# Check for common cells
common_cells_multiome = adata_rna.obs_names.intersection(adata_atac.obs_names)
print(f"\nNumber of common cells between RNA and ATAC: {len(common_cells_multiome)}")
if len(common_cells_multiome) == adata_rna.shape[0] and len(common_cells_multiome) == adata_atac.shape[0]:
    print("All cells are common between modalities, as expected for Multiome.")



--- Summary of RNA AnnData ---
AnnData object with n_obs × n_vars = 11909 × 36601
    var: 'feature_id', 'feature_type', 'genome'
RNA matrix shape: (11909, 36601)
RNA obs_names (first 5): ['AAACAGCCAAGGAATC', 'AAACAGCCAATCCCTT', 'AAACAGCCAATGCGCT', 'AAACAGCCACACTAAT', 'AAACAGCCACCAACCG']
RNA var_names (first 5): ['MIR1302-2HG', 'FAM138A', 'OR4F5', 'AL627309.1', 'AL627309.3']
RNA var columns: ['feature_id', 'feature_type', 'genome']

--- Summary of ATAC AnnData ---
AnnData object with n_obs × n_vars = 11909 × 108377
    var: 'feature_id', 'feature_type', 'genome'
ATAC matrix shape: (11909, 108377)
ATAC obs_names (first 5): ['AAACAGCCAAGGAATC', 'AAACAGCCAATCCCTT', 'AAACAGCCAATGCGCT', 'AAACAGCCACACTAAT', 'AAACAGCCACCAACCG']
ATAC var_names (first 5): ['chr1:10109-10357', 'chr1:180730-181630', 'chr1:191491-191736', 'chr1:267816-268196', 'chr1:586028-586373']
ATAC var columns: ['feature_id', 'feature_type', 'genome']

Number of common cells between RNA and ATAC: 11909
All cells are common b

In [24]:
adata_rna.write_h5ad("./10X_PBMC/02_counts_data/adata_rna.h5ad")
adata_atac.write_h5ad("./10X_PBMC/02_counts_data/adata_atac.h5ad")