In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import h5py
from scipy import sparse
from pathlib import Path
import scanpy as sc
# import rnanorm

## Subsetting AnnData object:
`row_filtered = adata[adata.obs.my_rowname == 'value']`

`col_filtered = adata[:, adata.var.my_colname == 'value']`

![image.png](attachment:image.png)

In [3]:
# Read the two annotation tables, using the first CSV column as the index of each.
# They are later passed to AnnData as `obs` (col_metadata) and `var` (row_metadata).
col_metadata, row_metadata = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (r'../data/column_metadata.csv', r'../data/row_metadata.csv')
)

In [4]:
# Explore the layout of one raw feature-barcode HDF5 file:
# print the name of every group/dataset, then the keys of the features group.
with h5py.File(r'../data/151507_raw_feature_bc_matrix.h5', "r") as h5_file:
    h5_file.visit(print)
    features_group = h5_file['matrix']['features']
    print(features_group.keys())

matrix
matrix/barcodes
matrix/data
matrix/features
matrix/features/_all_tag_keys
matrix/features/feature_type
matrix/features/genome
matrix/features/id
matrix/features/name
matrix/indices
matrix/indptr
matrix/shape
<KeysViewHDF5 ['_all_tag_keys', 'feature_type', 'genome', 'id', 'name']>


In [5]:
def load_sparse_mat(raw_filename, filtered_filename, sample_number):
    """Load the filtered feature-barcode matrix of one sample as a dense array.

    Only the ``matrix`` group of ``filtered_filename`` is read. The stored
    ``(data, indices, indptr)`` triplet is interpreted as a CSR matrix whose
    shape is the reverse of the ``shape`` dataset, so the result is the
    transpose of the on-disk orientation. Per the annotations in the previous
    version of this function, the file holds features as rows and barcodes as
    columns, so the returned array is presumably (barcodes x features) --
    confirm against the 10x HDF5 format documentation.

    Parameters
    ----------
    raw_filename : path-like
        Kept for backward compatibility and no longer opened. The previous
        implementation read the barcodes from this file and discarded them.
    filtered_filename : path-like
        HDF5 file containing ``matrix/data``, ``matrix/indices``,
        ``matrix/indptr`` and ``matrix/shape``.
    sample_number : object
        Kept for backward compatibility; it does not influence the result.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array of shape ``(shape[1], shape[0])``.

    Notes
    -----
    Barcode and feature labels are intentionally not read here: the earlier
    code cast them with ``astype('U13')``, which truncated them to 13
    characters, and never used them in the returned value.
    """
    with h5py.File(filtered_filename, 'r') as f:
        matrix_group = f['matrix']
        # Materialise everything while the file is still open.
        data = np.array(matrix_group['data'])
        indices = np.array(matrix_group['indices'])
        indptr = np.array(matrix_group['indptr'])
        shape = np.array(matrix_group['shape'])

    # Swapping the two dimensions and reading the triplet as CSR yields the
    # transpose of the stored matrix without moving any data.
    transposed_shape = (int(shape[1]), int(shape[0]))
    m = sparse.csr_matrix((data, indices, indptr), shape=transposed_shape)
    return m.toarray()

In [11]:
# Pick every 4th raw / filtered matrix file from the data directory.
# - A forward-slash path works on both Windows and POSIX; the previous
#   backslash form only resolved to the data directory on Windows.
# - glob() yields entries in arbitrary order, so sort before slicing: this
#   makes the [::4] selection deterministic and keeps the raw and filtered
#   lists aligned for the zip() in the loading cell.
rawfiles = sorted(Path('../data').glob(pattern='*raw*.h5'))[::4]
filtfiles = sorted(Path('../data').glob('*filt*.h5'))[::4]
display(filtfiles)

[WindowsPath('../data/151507_filtered_feature_bc_matrix.h5'),
 WindowsPath('../data/151669_filtered_feature_bc_matrix.h5'),
 WindowsPath('../data/151673_filtered_feature_bc_matrix.h5')]

In [13]:
# Load every selected sample as a dense array, stack them row-wise and wrap the
# result in an AnnData object together with the metadata tables.
dfs = []
sample_numbers = []  # integer sample ids, in the order their matrices are stacked

for raw_file, filtered_file in zip(rawfiles, filtfiles):
    # The sample id is the leading '_'-separated token of the file stem.
    sample_number = raw_file.stem.split('_')[0]
    # Guard against the two file lists drifting out of step.
    assert filtered_file.stem.split('_')[0] == sample_number, (
        f'raw/filtered file mismatch: {raw_file.name} vs {filtered_file.name}'
    )
    df = load_sparse_mat(raw_filename=raw_file,
                         filtered_filename=filtered_file,
                         sample_number=sample_number)
    dfs.append(df)
    sample_numbers.append(int(sample_number))

cdf = np.concatenate(dfs)

# Select the metadata rows of exactly the samples that were loaded, instead of
# repeating the sample ids by hand (sample_name is compared against integers).
# NOTE(review): the row order of `obs` comes from col_metadata while the row
# order of X comes from the files; confirm that the two orders agree.
adata = sc.AnnData(X=cdf,
                   obs=col_metadata.loc[col_metadata.sample_name.isin(sample_numbers)],
                   var=row_metadata)  # obs = rows      var = cols

# Release the large dense intermediates; `adata` now holds the data.
del cdf
del dfs

  


In [14]:
# Display the summary of the assembled AnnData object (dimensions and obs columns).
adata

AnnData object with n_obs × n_vars = 11526 × 33538
    obs: 'barcode', 'sample_name', 'tissue', 'row', 'col', 'imagerow', 'imagecol', 'Cluster', 'height', 'width', 'sum_umi', 'sum_gene', 'subject', 'position', 'replicate', 'subject_position', 'discard', 'key', 'cell_count', 'SNN_k50_k4', 'SNN_k50_k5', 'SNN_k50_k6', 'SNN_k50_k7', 'SNN_k50_k8', 'SNN_k50_k9', 'SNN_k50_k10', 'SNN_k50_k11', 'SNN_k50_k12', 'SNN_k50_k13', 'SNN_k50_k14', 'SNN_k50_k15', 'SNN_k50_k16', 'SNN_k50_k17', 'SNN_k50_k18', 'SNN_k50_k19', 'SNN_k50_k20', 'SNN_k50_k21', 'SNN_k50_k22', 'SNN_k50_k23', 'SNN_k50_k24', 'SNN_k50_k25', 'SNN_k50_k26', 'SNN_k50_k27', 'SNN_k50_k28', 'GraphBased', 'Maynard', 'Martinowich', 'layer_guess', 'layer_guess_reordered', 'layer_guess_reordered_short', 'expr_chrM', 'expr_chrM_ratio', 'SpatialDE_PCA', 'SpatialDE_pool_PCA', 'HVG_PCA', 'pseudobulk_PCA', 'markers_PCA', 'SpatialDE_UMAP', 'SpatialDE_pool_UMAP', 'HVG_UMAP', 'pseudobulk_UMAP', 'markers_UMAP', 'SpatialDE_PCA_spatial', 'SpatialDE_pool_P

In [2]:
# Check which class `sc.AnnData` refers to (scratch inspection cell).
sc.AnnData

anndata._core.anndata.AnnData

In [None]:
# NOTE(review): called without any argument; scanpy's `read` presumably needs
# the path of the file to load -- TODO confirm and fill in the filename.
# Running this cell rebinds `adata`, replacing the object built in memory above.
adata = sc.read()