### **Curating HCAHeartST8795938(OCT).h5ad**

Article: Spatially resolved multiomics of human cardiac niches

DOI: https://doi.org/10.1038/s41586-023-06311-1 

Data Source : https://www.heartcellatlas.org

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [None]:
#Import all packages required for curation

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [None]:
# Load the AnnData object

In [None]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/data/HCAHeartST8795938.h5ad')

In [None]:
# View the AnnData object

In [None]:
adata

##### **X- expression matrix**

In [None]:
# View the expression matrix of the anndata object

In [None]:
adata.X

In [None]:
# Print the matrix to check whether it holds normalized or raw counts: floating-point values indicate normalized counts, integer values indicate raw counts.

In [None]:
print(adata.X)

##### **Raw counts matrix**

In [None]:
# If X has normalized counts, check for the raw counts matrix.

In [None]:
#Here the raw counts are provided in a separate object, load the raw counts matrix

In [None]:
araw = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/Raw_counts/visium-OCT_adult-8reg-revision_raw.h5ad')

In [None]:
# view raw object

In [None]:
araw

In [None]:
# view raw matrix

In [None]:
araw.X

In [None]:
print(araw.X)

In [None]:
# Since the raw object combines several samples, extract the raw counts for this dataset only.

In [None]:
# Keep only the observations whose 'sample' label is HCAHeartST8795938.
# NOTE(review): this boolean indexing presumably returns an AnnData view; it is
# materialised by the .copy() in the gene-filtering cell further down — confirm
# nothing writes to araw before that point.
araw = araw[araw.obs['sample']=='HCAHeartST8795938']

In [None]:
araw

##### **Variables(var)**

In [None]:
#View the var of anndata and raw object

In [None]:
adata.var

In [None]:
araw.var

In [None]:
# Check the index column of var. Check whether ensembl ids are provided in the index column of var or not.

In [None]:
# If the Ensembl IDs are not in the index column but are present in another column, set that column as the index.

In [None]:
adata.var['gene_symbols'] = adata.var_names

In [None]:
adata.var

In [None]:
adata.var_names = adata.var['gene_ids']

In [None]:
adata.var

In [None]:
# load the approved genes file

In [None]:
approved_genes = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/gene_info/genes_approved.csv')

In [None]:
# Create a lookup dictionary whose keys are the values of the feature_id column of the approved genes file (each value is just the placeholder 1).

In [None]:
# Lookup table of approved feature IDs; the value 1 is only a placeholder,
# the dictionary is used for membership tests on its keys.
genedict = dict.fromkeys(approved_genes.feature_id, 1)

In [None]:
genedict

In [None]:
len(genedict)

In [None]:
# Filter out the genes which are not in the approved genes file

In [None]:
def _approved_only(gene_ids):
    """Return the entries of *gene_ids* that are keys of ``genedict``, in their original order."""
    return [gene_id for gene_id in gene_ids if gene_id in genedict]


var_to_keep_adata = _approved_only(adata.var_names)
var_to_keep_araw = _approved_only(araw.var_names)

In [None]:
len(var_to_keep_adata)

In [None]:
len(var_to_keep_araw)

In [None]:
adata.var

In [None]:
araw.var

In [None]:
# Subset both AnnData objects to the approved genes kept above. (For adata, the original index values were already copied to the gene_symbols column earlier.)

In [None]:
adata = adata[:, var_to_keep_adata].copy()
araw = araw[:, var_to_keep_araw].copy()

In [None]:
#  View the var

In [None]:
adata.var

In [None]:
araw.var

feature is filtered

In [None]:
# Assign False since the feature was not filtered out in the normalized matrix (X).

In [None]:
# Flag every feature as not filtered; the scalar is broadcast to all rows of var.
adata.var['feature_is_filtered'] = False

In [None]:
#View var

In [None]:
adata.var

In [None]:
araw.var

In [None]:
#  Delete the unwanted columns in adata and araw.

In [None]:
# Remove the var columns of the raw object that are not kept in the curated file.
for unwanted_column in ('gene_ids', 'SYMBOL', 'feature_types', 'genome'):
    del araw.var[unwanted_column]

In [None]:
# Remove the var columns of the normalized object that are not kept in the curated file.
for unwanted_column in ('gene_ids', 'gene_symbols', 'feature_types', 'genome'):
    del adata.var[unwanted_column]

In [None]:
# view var

In [None]:
adata.var

In [None]:
araw.var

#### **Observations(obs) (Cell metadata)**

In [None]:
#view obs

In [None]:
adata.obs

In [None]:
adata.obs.columns

#### **assay_ontology_term_id**

In [None]:
# identify the column in adata which corresponds to assay

In [None]:
list(adata.obs['kit_10x'].unique())

In [None]:
# add the assay_ontology_term_id column

In [None]:
adata.obs['assay_ontology_term_id'] = ['EFO:0010961'] * len(adata.obs)

In [None]:
# change datatype of the column

In [None]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [None]:
# view adata.obs

In [None]:
adata.obs

#### **cell_type_ontology_term_id**

In [None]:
# Get the column in adata.obs related to the cell type annotation

In [None]:
adata.obs.columns

In [None]:
adata.obsm

In [None]:
adata.obsm['means_cell_abundance_w_sf']

In [None]:
def _columns_at_row_max(row):
    """Return the labels of every entry in *row* that equals the row maximum."""
    at_max = row == row.max()
    return row[at_max].index.tolist()


# One list of column labels per row of the abundance table.
max_columns = adata.obsm['means_cell_abundance_w_sf'].apply(_columns_at_row_max, axis=1)

In [None]:
max_columns

In [None]:
# For each row, keep the list of top-scoring columns only when the maximum is
# shared by more than one column (a tie); otherwise store an empty list.
# Ties are detected from the list length: every label occurs at most once in
# a row's list, so counting repeated labels inside the list can never flag one.
columns_with_multiple_max = max_columns.apply(lambda tied: tied if len(tied) > 1 else [])

In [None]:
columns_with_multiple_max 

In [None]:
adata.obs['columns_with_multiple_max'] = columns_with_multiple_max

In [None]:
# Keep only the rows whose list holds at least one entry.
has_entries = columns_with_multiple_max.map(len) > 0
non_empty_columns = columns_with_multiple_max[has_entries]

In [None]:
len(non_empty_columns )

In [None]:
def _top_label(tied_columns):
    """Return the first column name with the abundance prefix removed, or '' for an empty list."""
    if len(tied_columns) == 0:
        return ''
    return tied_columns[0].replace('meanscell_abundance_w_sf_','')


# Replace each per-row list with a single label (the first entry of the list).
max_columns = list(map(_top_label, max_columns))

In [None]:
adata.obs['highest_cell_Density_columns'] = max_columns

In [None]:
adata.obs['highest_cell_Density_columns']

In [None]:
list(adata.obs['highest_cell_Density_columns'].unique())

In [None]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['highest_cell_Density_columns']

In [None]:
# create a dictionary of cell type and ontology term

In [None]:
# Cell-type labels paired with the Cell Ontology term assigned to them.
# Consecutive labels that share a term are grouped; groups are listed in the
# order in which the labels are inserted into the dictionary.
_labels_by_term = (
    (('aCM1', 'aCM2', 'aCM3', 'aCM4', 'aCM5'), 'CL:0002129'),
    (('Adip1', 'Adip2', 'Adip3', 'Adip4', 'Adipocyte'), 'CL:0000136'),
    (('Atrial Cardiomyocyte',), 'CL:0002129'),
    (('AVN_bundle_cell',), 'CL:0010005'),
    (('AVN_P_cell',), 'CL:1000477'),
    (('B',), 'CL:0000236'),
    (('B_plasma',), 'CL:0000786'),
    (('CD14+Mo',), 'CL:0001054'),
    (('CD16+Mo',), 'CL:0002396'),
    (('CD4+T_act',), 'CL:0000896'),
    (('CD4+T_naive',), 'CL:0000895'),
    (('CD4+T_Th2',), 'CL:0000546'),
    (('CD8+T_cytox',), 'CL:0000794'),
    (('CD8+T_em',), 'CL:0000913'),
    (('CD8+T_te', 'CD8+T_trans'), 'CL:0000625'),
    (('DC',), 'CL:0001056'),
    (('EC1_cap',), 'CL:0002144'),
    (('EC10_CMC-like',), 'CL:0000115'),
    (('EC2_cap', 'EC3_cap'), 'CL:0002144'),
    (('EC4_immune',), 'CL:0000115'),
    (('EC5_art',), 'CL:1000413'),
    (('EC6_ven',), 'CL:0002543'),
    (('EC7_atria', 'EC7_endocardial'), 'CL:0002350'),
    (('EC8_ln',), 'CL:0002138'),
    (('Endothelial cell',), 'CL:0000115'),
    (('FB1', 'FB2', 'FB3', 'FB4', 'FB4_activated', 'FB5', 'FB6'), 'CL:0002548'),
    (('Fibroblast',), 'CL:0000057'),
    (('ILC',), 'CL:0001065'),
    (('Lymphatic Endothelial cell',), 'CL:0002138'),
    (('Lymphoid',), 'CL:0000542'),
    (('LYVE1+IGF1+MP', 'LYVE1+MP_cycling', 'LYVE1+TIMD4+MP'), 'CL:0000235'),
    (('MAIT-like',), 'CL:0000940'),
    (('Mast', 'Mast cell'), 'CL:0000097'),
    (('Meso', 'Mesothelial cell'), 'CL:0000077'),
    (('MoMP',), 'CL:0000576'),
    (('Mural cell',), 'CL:0008034'),
    (('Myeloid',), 'CL:0000763'),
    (('NC1', 'NC1_glial', 'NC2', 'NC2_glial_NGF+'), 'CL:0000125'),
    (('Neural cell',), 'CL:0002319'),
    (('Neut',), 'CL:0000775'),
    (('NK_CD16hi',), 'CL:0000939'),
    (('NK_CD56hi',), 'CL:0000938'),
    (('PC1_vent', 'PC2_atria', 'PC3_str', 'PC4_CMC-like'), 'CL:0000669'),
    (('SAN_P_cell',), 'CL:1000477'),
    (('SMC1_basic',), 'CL:0000192'),
    (('SMC2_art',), 'CL:0002591'),
    (('T/NK_cycling',), 'CL:0000814'),
    (('vCM1', 'vCM3_stressed', 'vCM4', 'Ventricular Cardiomyocyte'), 'CL:0002131'),
    (('unclassified',), 'CL:0000003'),
)

# Flatten the groups into a plain label -> term dictionary.
mapping = {label: term for labels, term in _labels_by_term for label in labels}

In [None]:
# add the cell_type_ontology_term_id column

In [None]:
# Translate each cell-type label into its ontology term.
# Series.map turns every label that is missing from `mapping` into NaN without
# any warning, so check first and fail with the offending labels instead of
# writing an incomplete column.
unmapped_labels = sorted(
    set(adata.obs['cell_type_ontology_term_id'].unique()) - set(mapping),
    key=str,  # tolerate non-string entries such as NaN when sorting for display
)
if unmapped_labels:
    raise ValueError(f"No cell type ontology term defined for: {unmapped_labels}")
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [None]:
# change datatype of the column

In [None]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [None]:
adata.obs

#### **donor_id**

In [None]:
#identify the column in adata.obs which provides donor information

In [None]:
adata.obs.columns

In [None]:
list(adata.obs['donor'].unique())

In [None]:
# add the donor_id column

In [None]:
adata.obs['donor_id'] = adata.obs['donor']

In [None]:
# change datatype of the column

In [None]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [None]:
# view unique values of donor_id column

In [None]:
list(adata.obs['donor_id'].unique())

In [None]:
#view obs

In [None]:
adata.obs

In [None]:
adata.obs.columns

#### **development_stage_ontology_term_id**

In [None]:
# identify the column in adata which corresponds to age

In [None]:
adata.obs.columns

In [None]:
list(adata.obs['age'].unique())

In [None]:
# create a dictionary for age and development stage ontology term id

In [None]:
# Age-bracket strings paired with the HsapDv term assigned to them.
# Consecutive brackets that share a term are grouped; insertion order is kept.
_age_groups = (
    (('50-55', '55-60'), 'HsapDv:0000240'),
    (('70-75',), 'HsapDv:0000242'),
    (('65-70', '60-65'), 'HsapDv:0000241'),
    (('40-45', '45-50'), 'HsapDv:0000239'),
    (('20-25',), 'HsapDv:0000237'),
)
mapping = {age: term for ages, term in _age_groups for age in ages}

In [None]:
# add the development_stage_ontology_term_id column

In [None]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['age'].map(mapping)

In [None]:
# change datatype of the column

In [None]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [None]:
# view unique values of development_stage_ontology_term_id column

In [None]:
list(adata.obs['development_stage_ontology_term_id'].unique())

In [None]:
# view adata.obs

In [None]:
adata.obs

#### **disease_ontology_term_id**

In [None]:
# Assign normal since all are healthy patients

In [None]:
# add the disease_ontology_term_id column

In [None]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461']* len(adata.obs)

In [None]:
#change data type of column

In [None]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **is_primary_data**

In [None]:
adata.obs['is_primary_data'] = [False] * len(adata.obs)

In [None]:
adata.obs

In [None]:
#change data type of column

In [None]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

#### **organism_ontology_term_id**

In [None]:
# assign organism id 

In [None]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [None]:
#change data type of column

In [None]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **self_reported_ethnicity_ontology_term_id**

In [None]:
# create a dictionary of donor and ethinic_origin

In [None]:
# Every donor ID, in dictionary insertion order.
_donors = (
    'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11',
    'H2', 'H3', 'H4', 'H5', 'H6', 'H7',
    'A61', 'AH1', 'AH1-A61', 'AH2',
    'AV10', 'AV13', 'AV14', 'AV3',
    'D8', 'AH5', 'AH6', 'AV1',
)

# Start with the most common value for all donors, then overwrite the exceptions.
# Updating an existing key keeps its position, so the key order is unchanged.
mapping = dict.fromkeys(_donors, 'Caucasian')
mapping.update({'H3': 'Asian', 'AH1-A61': 'unknown', 'AH2': 'South Asian'})

In [None]:
# add ethinic_origin column

In [None]:
adata.obs['ethinic_origin'] = adata.obs['donor'].map(mapping)

In [None]:
# create a dictionary of ethinic_origin and self_reported_ethnicity_ontology_term_id

In [None]:
# (ethnic origin label, ontology term) pairs, in insertion order.
mapping = dict([
    ('Caucasian', 'HANCESTRO:0005'),
    ('Asian', 'HANCESTRO:0008'),
    ('unknown', 'unknown'),
    ('South Asian', 'HANCESTRO:0006'),
])

In [None]:
# add self_reported_ethnicity_ontology_term_id column

In [None]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['ethinic_origin'].map(mapping)

In [None]:
# change data type

In [None]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **sex_ontology_term_id**

In [None]:
# identify the column in adata.obs which corresponds to sex

In [None]:
adata.obs.columns

In [None]:
# list the unique values 

In [None]:
list(adata.obs['gender'].unique())

In [None]:
# create a dictionary of sex and sex ontology term id

In [None]:
# Sex label -> PATO term.
mapping = dict(Female='PATO:0000383', Male='PATO:0000384')

In [None]:
# add sex_ontology_term_id column

In [None]:
adata.obs['sex_ontology_term_id'] = adata.obs['gender'].map(mapping)

In [None]:
# change data type

In [None]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [None]:
adata.obs

#### **suspension_type**

In [None]:
# Since this is a Visium dataset, the suspension type is 'na'

In [None]:
adata.obs['suspension_type'] = ['na'] * len(adata.obs)

In [None]:
# change data type

In [None]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **tissue_ontology_term_id**

In [None]:
# identify the column in adata.obs which corresponds to tissue

In [None]:
adata.obs.columns

In [None]:
# list unique values

In [None]:
list(adata.obs['region'].unique())

In [None]:
# create a dictionary with tissue and corresponding tissue ontology term id

In [None]:
# Region code -> UBERON term.
mapping = dict(
    AX='UBERON:0002098',
    LV='UBERON:0002084',
    RV='UBERON:0002080',
    LA='UBERON:0002079',
    SP='UBERON:0002094',
    RA='UBERON:0002078',
    SAN='UBERON:0002351',
    AVN='UBERON:0002352',
)

In [None]:
# add 'tissue_ontology_term_id' column

In [None]:
adata.obs['tissue_ontology_term_id'] = adata.obs['region'].map(mapping)

In [None]:
# change data type of column

In [None]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [None]:
#list the unique values in 'tissue_ontology_term_id' column

In [None]:
list(adata.obs['tissue_ontology_term_id'].unique())

In [None]:
# view obs

In [None]:
adata.obs

In [None]:
# Remove the source and intermediate obs columns that are not kept in the
# curated object, in the same order as before.
for helper_column in (
    'donor',
    'gender',
    'highest_cell_Density_columns',
    'columns_with_multiple_max',
    'array_row',
    'array_col',
    'modality',
    'kit_10x',
    'ethinic_origin',
):
    del adata.obs[helper_column]

#### **obsm (Embeddings)**

In [None]:
adata.obsm

In [None]:
adata.obsm.keys()

In [None]:
adata.obsm['means_cell_abundance_w_sf']= adata.obsm['means_cell_abundance_w_sf'].values

In [None]:
adata.obsm['stds_cell_abundance_w_sf'] = adata.obsm['stds_cell_abundance_w_sf'].values

In [None]:
adata.obsm['prop']=adata.obsm['prop'].values

In [None]:
adata.obsm['q05_cell_abundance_w_sf'] = adata.obsm['q05_cell_abundance_w_sf'].values

In [None]:
adata.obsm['q95_cell_abundance_w_sf']= adata.obsm['q95_cell_abundance_w_sf'].values

In [None]:
# Store each array a second time under the same key prefixed with 'X_'.
for key in (
    'means_cell_abundance_w_sf',
    'prop',
    'q05_cell_abundance_w_sf',
    'q95_cell_abundance_w_sf',
    'stds_cell_abundance_w_sf',
):
    adata.obsm['X_' + key] = adata.obsm[key]

In [None]:
adata.obsm

In [None]:
# Remove 'MT' and the un-prefixed obsm entries.
for stale_key in (
    'MT',
    'means_cell_abundance_w_sf',
    'prop',
    'q05_cell_abundance_w_sf',
    'q95_cell_abundance_w_sf',
    'stds_cell_abundance_w_sf',
):
    del adata.obsm[stale_key]

#### **uns (Dataset Metadata)**

In [None]:
adata.uns

In [None]:
adata.uns['image_caption'] = 'Shown here is an image of 10 μm thick slice of the interventricular septum region of the adult human heart stained with H&E'

In [None]:
adata.uns['title'] = 'Visium spatial - HCAHeartST8795938 (OCT)'

In [None]:
# Record 'X_spatial' as the default embedding in the dataset metadata.
# NOTE(review): no 'X_spatial' entry is added to adata.obsm anywhere in this
# notebook (only the 'X_*_cell_abundance_w_sf' / 'X_prop' keys are created) —
# confirm the input object already provides that key.
adata.uns['default_embedding'] = 'X_spatial'

In [None]:
adata.uns.keys()

### **Final checks and adjustments**

In [None]:
adata

In [None]:
adata.obs.dtypes

In [None]:
# Downcast 64-bit numeric var columns to 32 bit.
# The dtypes are captured once, before any column is converted; all float64
# columns are handled first, then all int64 columns.
var_dtypes = adata.var.dtypes
for wide, narrow in (('float64', 'float32'), ('int64', 'int32')):
    for c in var_dtypes[var_dtypes == wide].index.values:
        adata.var[c] = adata.var[c].astype(narrow)
        print(f"changed {c} from {wide} to {narrow}")

In [None]:
# Downcast 64-bit numeric obs columns to 32 bit and turn object columns into
# categoricals. The dtypes are captured once, before any column is converted;
# columns are processed group by group in the order listed here.
obs_dtypes = adata.obs.dtypes
for current, target in (
    ('float64', 'float32'),
    ('int64', 'int32'),
    ('object', 'category'),
):
    for c in obs_dtypes[obs_dtypes == current].index.values:
        adata.obs[c] = adata.obs[c].astype(target)
        print(f"changed {c} from {current} to {target}")

In [None]:
adata.obs

In [None]:
adata.obs.columns

In [None]:
adata.var

In [None]:
adata.obs

In [None]:
adata.obs.columns

In [None]:
#check the format of expression matrix

In [None]:
adata.X

In [None]:
araw.X

In [None]:
#Copy raw counts to adata.raw

In [None]:
# Attach the raw-counts object as adata.raw.
# NOTE(review): nothing in this notebook verifies that araw.obs_names match
# adata.obs_names in content and order — confirm before relying on the raw layer.
adata.raw = araw

In [None]:
#write the curated object to final_objects folder

In [None]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/final_objects/HCAHeartST8795938(OCT).h5ad', compression = 'gzip')