In [7]:
 # Switch the kernel's working directory to the project root; the relative
 # './10X_PBMC/...' paths used further down are resolved against this directory.
 # NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
 %cd /sci/labs/yotamd/lab_share/avishai.wizel/eRNA/

/sci/labs/yotamd/lab_share/avishai.wizel/eRNA


In [8]:
import h5py
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
import os 

'/sci/labs/yotamd/lab_share/avishai.wizel/eRNA'

In [10]:
# Relative path (resolved against the current working directory) of the HDF5
# file that the following cells open with h5py to inspect and load the matrix.
file_path = './10X_PBMC/01_raw_data/atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5'

# Read file content

In [11]:
# Open the HDF5 file read-only and report its layout: root keys, the contents
# of the 'matrix' group, a peek at 'data' and 'shape', then a recursive listing.
with h5py.File(file_path, "r") as f:
    # Names stored directly under the file root.
    print(f"Keys at root level: {list(f.keys())}")

    # Handle to the 'matrix' group, followed by the names stored inside it.
    matrix_group = f['matrix']
    print(f"Keys inside 'matrix' group: {list(matrix_group.keys())}")

    # Sparse matrices stored in HDF5 (e.g. by 10x Genomics tools) commonly use
    # the datasets 'data', 'indices', 'indptr' and 'shape'.
    if 'data' not in matrix_group:
        print("\n'data' dataset not found within 'matrix' group.")
    else:
        # Read the whole 'data' dataset into memory and show a small sample.
        data_values = matrix_group['data'][:]
        print(f"\nFirst 10 values from 'matrix/data': {data_values[:10]}")
        print(f"Shape of 'matrix/data' dataset: {data_values.shape}")

    # Dimensions of the stored matrix, when the file provides them.
    if 'shape' in matrix_group:
        matrix_shape = matrix_group['shape'][:]
        print(f"Shape of the sparse matrix: {matrix_shape}")

    def _describe_item(name, obj):
        """Print one visited item, labelled as a Group or a Dataset."""
        print(f"{name} ({'Group' if isinstance(obj, h5py.Group) else 'Dataset'})")

    # Recursive listing of every group and dataset in the file.
    print("\nAll items in the HDF5 file (recursive):")
    f.visititems(_describe_item)

Keys at root level: ['matrix']
Keys inside 'matrix' group: ['barcodes', 'data', 'features', 'indices', 'indptr', 'shape']

First 10 values from 'matrix/data': [2 2 1 2 2 4 2 2 2 1]
Shape of 'matrix/data' dataset: (51265105,)
Shape of the sparse matrix: [80234  8633]

All items in the HDF5 file (recursive):
matrix (Group)
matrix/barcodes (Dataset)
matrix/data (Dataset)
matrix/features (Group)
matrix/features/_all_tag_keys (Dataset)
matrix/features/derivation (Dataset)
matrix/features/feature_type (Dataset)
matrix/features/genome (Dataset)
matrix/features/id (Dataset)
matrix/features/name (Dataset)
matrix/indices (Dataset)
matrix/indptr (Dataset)
matrix/shape (Dataset)


# Build sparse matrix

In [12]:
import h5py
# csc_matrix is the class this cell actually uses; csr_matrix stays imported
# because the original cell imported it.
from scipy.sparse import csc_matrix, csr_matrix
import numpy as np


with h5py.File(file_path, "r") as f:
    # Load the three arrays that encode the sparse matrix, plus its dimensions.
    data = f['matrix/data'][:]
    indices = f['matrix/indices'][:]
    indptr = f['matrix/indptr'][:]
    shape = f['matrix/shape'][:]  # array of the form [n_rows, n_cols]

    # Assemble the matrix in CSC (compressed sparse column) layout: with
    # csc_matrix, `indptr` delimits the columns and `indices` holds row numbers.
    # The original note says Cell Ranger often writes its matrices in CSC form.
    sparse_matrix = csc_matrix((data, indices, indptr), shape=shape)

    print(f"\nSparse Matrix created with shape: {sparse_matrix.shape}")
    print(f"Non-zero elements: {sparse_matrix.nnz}")
    # A dense copy can be obtained with sparse_matrix.toarray(); use with
    # caution, since a large matrix may not fit in memory.

    # Column labels: the 'barcodes' dataset is read as bytes and decoded to str.
    barcodes_values_bytes  = f['matrix/barcodes'][:]
    barcodes_values = [x.decode('utf-8') for x in barcodes_values_bytes]
    print(f"Data from 'matrix/barcodes' (first 5 values): {barcodes_values[:5]}")
    print(f"Length of 'matrix/barcodes': {len(barcodes_values)}\n")

    # Row labels: datasets inside nested groups are addressed by their full path.
    features_id_values_bytes = f['matrix/features/id'][:]
    features_id_values = [x.decode('utf-8') for x in features_id_values_bytes]
    print(f"Data from 'matrix/features/id' (first 5 values): {features_id_values[:5]}")
    print(f"Length of 'matrix/features/id': {len(features_id_values)}\n")

    # Fail early if the labels do not line up with the matrix axes; they are
    # used later as the index (rows) and columns of a DataFrame.
    assert len(features_id_values) == sparse_matrix.shape[0], (
        "number of feature ids does not match the number of matrix rows"
    )
    assert len(barcodes_values) == sparse_matrix.shape[1], (
        "number of barcodes does not match the number of matrix columns"
    )





Sparse Matrix created with shape: (80234, 8633)
Non-zero elements: 51265105
Data from 'matrix/barcodes' (first 5 values): ['AAACGAAAGAGCGAAA-1', 'AAACGAAAGAGTTTGA-1', 'AAACGAAAGCGAGCTA-1', 'AAACGAAAGGCTTCGC-1', 'AAACGAAAGTGCTGAG-1']
Length of 'matrix/barcodes': 8633

Data from 'matrix/features/id' (first 5 values): ['chr1:565113-565543', 'chr1:569179-569635', 'chr1:713534-714806', 'chr1:752436-753020', 'chr1:762144-763353']
Length of 'matrix/features/id': 80234



In [13]:

# Percentage of matrix entries that are not stored explicitly (i.e. zeros):
# nnz is the number of stored entries, shape[0] * shape[1] the total entry count.
percentage_zeros = (1 - sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1])) * 100

# The value is the share of zeros, so the label says so (it previously read
# "Non-zero elements fraction", which contradicted the number being printed).
print(f"Zero elements percentage: {percentage_zeros:.2f}%")

Non-zero elements fraction: 92.60%


In [14]:
# Expand the sparse counts into a dense table labelled by feature id (rows)
# and barcode (columns). toarray() materialises every entry, zeros included,
# so this step is memory-hungry for a large matrix.
sc_atac_data = pd.DataFrame(
    sparse_matrix.toarray(),
    columns=barcodes_values,
    index=features_id_values,
)
# Bare last expression: the notebook renders a truncated preview of the table.
sc_atac_data

Unnamed: 0,AAACGAAAGAGCGAAA-1,AAACGAAAGAGTTTGA-1,AAACGAAAGCGAGCTA-1,AAACGAAAGGCTTCGC-1,AAACGAAAGTGCTGAG-1,AAACGAAGTCAGGCTC-1,AAACGAAGTGCCCGAT-1,AAACGAAGTTGTATCG-1,AAACGAATCAGTTGAC-1,AAACGAATCCTTACGC-1,...,TTTGTGTGTAACCCAT-1,TTTGTGTGTCGGCTGT-1,TTTGTGTGTGATGCTT-1,TTTGTGTGTTTAAGGA-1,TTTGTGTTCATCGCCT-1,TTTGTGTTCATGCTTT-1,TTTGTGTTCCGAGAGA-1,TTTGTGTTCGAAGCCC-1,TTTGTGTTCTACTTTG-1,TTTGTGTTCTTGTGCC-1
chr1:565113-565543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1:569179-569635,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,2,0
chr1:713534-714806,0,0,2,8,0,2,2,0,0,2,...,0,0,0,2,0,0,2,2,0,4
chr1:752436-753020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,2
chr1:762144-763353,0,0,4,2,0,0,0,2,0,0,...,0,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrY:23418918-23419001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrY:23422186-23422618,0,0,0,2,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
chrY:23584049-23584422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrY:28816422-28818023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Persist the dense counts table as a pickle. The destination folder is created
# first (no-op when it already exists) so the write cannot fail on a missing
# directory after the expensive dense conversion.
counts_pickle_path = './10X_PBMC/02_counts_data/sc_atac_data.pkl'
os.makedirs(os.path.dirname(counts_pickle_path), exist_ok=True)
sc_atac_data.to_pickle(counts_pickle_path)
