### **Curating Heart_SAN_atrial_cardiomyocyte.h5ad**

Article: Spatially resolved multiomics of human cardiac niches

DOI: https://doi.org/10.1038/s41586-023-06311-1 

Data Source : https://www.heartcellatlas.org

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
# Location of the Heart SAN atrial cardiomyocyte object that this notebook curates.
source_h5ad = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/data/Heart_SAN_atrial_cardiomyocyte.h5ad'
adata = sc.read_h5ad(source_h5ad)

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 9857 × 32732
    obs: 'sangerID', 'donor', 'donor_type', 'region', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'cell_state', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'scrublet_score'
    var: 'gene_name_scRNA-0-original', 'gene_name_snRNA-1-original', 'gene_name_multiome-2-original', 'gene_id'
    uns: 'cell_state_colors'
    obsm: 'X_umap'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<9857x32732 sparse matrix of type '<class 'numpy.float32'>'
	with 29754014 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether it holds normalized or raw counts: floating-point values indicate normalized counts, integer values indicate raw counts.

In [10]:
print(adata.X)

  (0, 32719)	0.2546146
  (0, 32724)	0.2546146
  (0, 32715)	0.2546146
  (0, 32713)	0.2546146
  (0, 32696)	0.4573797
  (0, 14132)	1.4326072
  (0, 14130)	0.2546146
  (0, 14121)	0.7700422
  (0, 14120)	0.2546146
  (0, 14118)	0.2546146
  (0, 14113)	0.62588125
  (0, 14111)	0.4573797
  (0, 14105)	0.2546146
  (0, 14103)	0.2546146
  (0, 14091)	0.62588125
  (0, 14088)	0.4573797
  (0, 14080)	0.62588125
  (0, 14078)	0.2546146
  (0, 14067)	0.2546146
  (0, 14044)	0.4573797
  (0, 14014)	0.4573797
  (0, 14013)	0.7700422
  (0, 14012)	0.4573797
  (0, 14010)	0.4573797
  (0, 14001)	0.4573797
  :	:
  (9856, 4465)	2.85147
  (9856, 4433)	2.85147
  (9856, 4125)	2.85147
  (9856, 3797)	2.85147
  (9856, 3550)	2.85147
  (9856, 2965)	4.413599
  (9856, 2962)	2.85147
  (9856, 2849)	2.85147
  (9856, 2711)	2.85147
  (9856, 2512)	4.413599
  (9856, 2413)	2.85147
  (9856, 2392)	2.85147
  (9856, 2356)	2.85147
  (9856, 2230)	2.85147
  (9856, 1951)	2.85147
  (9856, 1530)	2.85147
  (9856, 1470)	2.85147
  (9856, 1404)	2.85147


##### **Raw counts matrix**

In [11]:
# If X has normalized counts, check for the raw counts matrix.

In [12]:
# check whether raw counts are present in adata.raw

In [13]:
adata.raw

In [14]:
# print(adata.raw.X)

In [15]:
# The raw count matrix is not present in adata.raw. The raw counts are provided in a separate object, so load that raw counts matrix.

In [None]:
# Location of the separate object that carries the raw counts.
raw_counts_h5ad = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/Raw_counts/RNA_adult-8reg_full_raw_cellstate-annotated.h5ad'
araw = sc.read_h5ad(raw_counts_h5ad)

In [None]:
araw

In [None]:
# get raw count matrix only for this data

In [None]:
# Keep only the rows whose 'region' annotation is 'SAN'.
in_san_region = araw.obs['region'] == 'SAN'
araw = araw[in_san_region]

In [None]:
# Keep only the rows whose 'cell_type' annotation is 'Atrial Cardiomyocyte'.
is_atrial_cardiomyocyte = araw.obs['cell_type'] == 'Atrial Cardiomyocyte'
araw = araw[is_atrial_cardiomyocyte]

In [None]:
# Check whether adata and araw have the same dimensions.

In [None]:
araw.X

In [None]:
# print raw matrix

In [None]:
print(araw.X)

##### **Variables(var)**

In [None]:
# View the var of anndata and raw object

In [None]:
adata.var

In [None]:
araw.var

In [None]:
# If the Ensembl IDs are not in the index but are present in another column, set that column as the index.

In [None]:
# Preserve the current var index labels in a column before the index is replaced.
adata.var['gene_symbols'] = adata.var.index

In [None]:
adata.var

In [None]:
# Re-index var by the identifiers held in the 'gene_id' column, so the
# approved-gene filter below can match on var_names.
adata.var_names = adata.var['gene_id']

In [None]:
adata.var

In [None]:
# Load the approved genes file.

In [None]:
# Table of approved genes; its 'feature_id' column is used for filtering below.
approved_genes_csv = '/home/jovyan/CXG_DATASETS_PORTAL/gene_info/genes_approved.csv'
approved_genes = pd.read_csv(approved_genes_csv)

In [None]:
#Create a dictionary from the approved genes file 

In [None]:
# Lookup table keyed by approved feature ID (every value is 1) for fast membership tests.
genedict = dict.fromkeys(approved_genes.feature_id, 1)

In [None]:
# Preview a handful of entries only; displaying the whole dictionary floods
# the notebook output with one line per approved gene.
list(genedict.items())[:5]

In [None]:
len(genedict)

In [None]:
#Filter out the genes which are not in the approved genes file.

In [None]:
# Keep, in their existing order, only the feature IDs that appear in the approved-gene lookup.
var_to_keep_adata = list(filter(genedict.__contains__, adata.var_names))
var_to_keep_araw = list(filter(genedict.__contains__, araw.var_names))

In [None]:
len(var_to_keep_adata)

In [None]:
len(var_to_keep_araw)

In [None]:
adata.var

In [None]:
araw.var

In [None]:
# Modify the anndata object by filtering out the filtered genes.

In [None]:
# Restrict both objects to their approved genes. .copy() turns each sliced
# view into an independent AnnData object so later edits to var/obs apply
# to real data rather than to a view.
adata = adata[:, var_to_keep_adata].copy()
araw = araw[:, var_to_keep_araw].copy()

In [None]:
adata.var

In [None]:
araw.var

##### **feature_is_filtered**

In [None]:
#Since feature was not filtered out in the normalized matrix (X), assign false

In [None]:
# Mark every gene as not filtered (constant False broadcast to all rows of var).
adata.var['feature_is_filtered'] = False

In [None]:
#View var

In [None]:
adata.var

In [None]:
araw.var

In [None]:
# Drop the gene-name bookkeeping columns from the raw object's var table.
for unused_col in ('gene_name-new',
                   'gene_name_scRNA-0-original',
                   'gene_name_snRNA-1-original',
                   'gene_name_multiome-2-original'):
    del araw.var[unused_col]

In [None]:
# Drop the original gene-name columns and the helper columns that are no
# longer needed now that the gene IDs live in the var index.
for unused_col in ('gene_name_scRNA-0-original',
                   'gene_name_snRNA-1-original',
                   'gene_name_multiome-2-original',
                   'gene_id',
                   'gene_symbols'):
    del adata.var[unused_col]

In [None]:
# View var

In [None]:
adata.var

In [None]:
araw.var

#### **obs (Cell metadata)**

In [None]:
#view obs

In [None]:
adata.obs

In [None]:
# view the column names in obs

In [None]:
adata.obs.columns

#### **assay_ontology_term_id**

In [None]:
# identify the column in adata which corresponds to assay

In [None]:
list(adata.obs['kit_10x'].unique())

In [None]:
# create a dictionary with assay and corresponding assay ontology term id

In [None]:
# 10x kit label -> assay ontology (EFO) term, applied to obs['kit_10x'] in the next cell.
kit_labels = ('3prime-v2', '3prime-v3', 'Multiome-v1')
efo_terms = ('EFO:0009899', 'EFO:0009922', 'EFO:0030059')
mapping = dict(zip(kit_labels, efo_terms))

In [None]:
# add the assay_ontology_term_id column

In [None]:
# Translate each cell's 10x kit label into its assay ontology term using the
# `mapping` dictionary defined in the preceding cell.
assay_terms = adata.obs['kit_10x'].map(mapping)
# Series.map leaves NaN for any value missing from the dictionary; stop here
# instead of silently writing missing term IDs into the curated object.
unmapped_kits = adata.obs.loc[assay_terms.isna(), 'kit_10x'].unique()
if len(unmapped_kits) > 0:
    raise ValueError(f"No assay ontology term defined for kit_10x values: {list(unmapped_kits)}")
adata.obs['assay_ontology_term_id'] = assay_terms

In [None]:
# change datatype of the column

In [None]:
# Store the assay term IDs as a pandas categorical column.
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [None]:
# view adata.obs

In [None]:
adata.obs

#### **cell_type_ontology_term_id**

In [None]:
# identify the column in adata.obs related to cell type annotation

In [None]:
adata.obs.columns

In [None]:
list(adata.obs['cell_state'].unique())

In [None]:
# add the cell_type_ontology_term_id column

In [None]:
# Every cell in this object gets the same cell type term (constant broadcast to all rows).
adata.obs['cell_type_ontology_term_id'] = 'CL:0002129'

In [None]:
# change datatype of the column

In [None]:
# Store the cell type term IDs as a pandas categorical column.
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [None]:
# view adata.obs

In [None]:
adata.obs

#### **development_stage_ontology_term_id**

In [None]:
# identify the column in adata which corresponds to age

In [None]:
adata.obs.columns

In [None]:
list(adata.obs['age'].unique())

In [None]:
# modify the age of donors based on supplementary info

In [None]:
# Donor-specific age-bin corrections, applied one donor at a time; cells from
# other donors keep their existing 'age' value.
age_corrections = (('D6', '70-75'), ('AH1', '45-50'), ('A61', '70-75'))
for donor_label, corrected_age in age_corrections:
    is_donor = adata.obs['donor'] == donor_label
    adata.obs['age'] = np.where(is_donor, corrected_age, adata.obs['age'])

In [None]:
# view the modified age

In [None]:
# Look up the age recorded for donor A61 after the corrections above.
# The selection is empty if this object holds no cells from A61; fall back to
# None in that case instead of raising IndexError on `.values[0]`.
a61_ages = adata.obs.loc[adata.obs['donor'] == 'A61', 'age']
age_value = a61_ages.values[0] if len(a61_ages) > 0 else None

In [None]:
age_value

In [None]:
# Get unique values of age column

In [None]:
list(adata.obs['age'].unique())

In [None]:
# create a dictionary for age and development stage ontology term id

In [None]:
# Development-stage (HsapDv) term -> the five-year age bins that fall under it.
stage_to_age_bins = (
    ('HsapDv:0000240', ('50-55', '55-60')),
    ('HsapDv:0000242', ('70-75',)),
    ('HsapDv:0000241', ('65-70', '60-65')),
    ('HsapDv:0000239', ('40-45', '45-50')),
    ('HsapDv:0000237', ('20-25',)),
)
# Invert into the age-bin -> term lookup applied to obs['age'] in the next cell.
mapping = {age_bin: stage for stage, age_bins in stage_to_age_bins for age_bin in age_bins}

In [None]:
# add the development_stage_ontology_term_id column

In [None]:
# Translate each cell's age bin into its development-stage term using the
# `mapping` dictionary defined in the preceding cell.
stage_terms = adata.obs['age'].map(mapping)
# Series.map leaves NaN for any age bin missing from the dictionary; fail
# loudly rather than writing missing term IDs into the curated object.
unmapped_ages = adata.obs.loc[stage_terms.isna(), 'age'].unique()
if len(unmapped_ages) > 0:
    raise ValueError(f"No development stage term defined for age values: {list(unmapped_ages)}")
adata.obs['development_stage_ontology_term_id'] = stage_terms

In [None]:
# change datatype of the column

In [None]:
# Store the development-stage term IDs as a pandas categorical column.
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [None]:
# view unique values of development_stage_ontology_term_id column

In [None]:
list(adata.obs['development_stage_ontology_term_id'].unique())

In [None]:
# view adata.obs

In [None]:
adata.obs

#### **donor_id**

In [None]:
#identify the column in adata.obs which provides donor information

In [None]:
adata.obs.columns

In [None]:
list(adata.obs['donor'].unique())

In [None]:
# add the donor_id column

In [None]:
# Duplicate the existing 'donor' labels under the 'donor_id' column name.
adata.obs['donor_id'] = adata.obs['donor']

In [None]:
# change datatype of the column

In [None]:
# Store the donor IDs as a pandas categorical column.
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [None]:
# view unique values of donor_id column

In [None]:
list(adata.obs['donor_id'].unique())

In [None]:
#view obs

In [None]:
adata.obs

#### **disease_ontology_term_id**

In [None]:
# Assign normal since all are healthy patients

In [None]:
# add the disease_ontology_term_id column

In [None]:
# Every cell gets the same disease term (constant broadcast to all rows).
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [None]:
# change datatype of the column

In [None]:
# Store the disease term IDs as a pandas categorical column.
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **is_primary_data**

In [None]:
# Flag every cell as not being primary data (constant False broadcast to all rows).
adata.obs['is_primary_data'] = False

In [None]:
# view 'is_primary_data' unique values

In [None]:
list(adata.obs['is_primary_data'].unique())

In [None]:
#change data type of column

In [None]:
# Make sure the flag column has the boolean dtype.
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [None]:
# view obs

In [None]:
adata.obs

#### **organism_ontology_term_id**

In [None]:
# assign organism id 

In [None]:
# Every cell gets the same organism term (constant broadcast to all rows).
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [None]:
#change data type of column

In [None]:
# Store the organism term IDs as a pandas categorical column.
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **self_reported_ethnicity_ontology_term_id**

In [None]:
# read the supplementary information from article

In [None]:
# create a dictionary of donor and ethinic_origin

In [None]:
# Donor -> ethnic origin lookup. Donors are 'Caucasian' unless listed in the
# exceptions table below.
donor_labels = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11',
                'H2', 'H3', 'H4', 'H5', 'H6', 'H7',
                'A61', 'AH1', 'AH1-A61', 'AH2',
                'AV10', 'AV13', 'AV14', 'AV3',
                'D8', 'AH5', 'AH6', 'AV1']
non_default_origin = {'H3': 'Asian', 'AH1-A61': 'unknown', 'AH2': 'South Asian'}
mapping = {donor: non_default_origin.get(donor, 'Caucasian') for donor in donor_labels}

In [None]:
# add ethinic_origin column

In [None]:
# Look up each cell's ethnic origin from its donor using the `mapping`
# dictionary defined in the preceding cell.
donor_origin = adata.obs['donor'].map(mapping)
# Series.map leaves NaN for donors missing from the dictionary; fail loudly so
# a forgotten donor does not end up with a missing ethnicity term.
unmapped_donors = adata.obs.loc[donor_origin.isna(), 'donor'].unique()
if len(unmapped_donors) > 0:
    raise ValueError(f"No ethnic origin defined for donor values: {list(unmapped_donors)}")
adata.obs['ethinic_origin'] = donor_origin

In [None]:
# Ethnic origin label -> self-reported ethnicity term ('unknown' is passed through unchanged).
origin_term_pairs = [
    ('Caucasian', 'HANCESTRO:0005'),
    ('Asian', 'HANCESTRO:0008'),
    ('unknown', 'unknown'),
    ('South Asian', 'HANCESTRO:0006'),
]
mapping = dict(origin_term_pairs)

In [None]:
# add self_reported_ethnicity_ontology_term_id column

In [None]:
# Translate each cell's ethnic origin label into its ethnicity term using the
# `mapping` dictionary defined in the preceding cell.
ethnicity_terms = adata.obs['ethinic_origin'].map(mapping)
# Series.map leaves NaN for labels missing from the dictionary; fail loudly
# rather than writing missing term IDs into the curated object.
unmapped_origins = adata.obs.loc[ethnicity_terms.isna(), 'ethinic_origin'].unique()
if len(unmapped_origins) > 0:
    raise ValueError(f"No ethnicity term defined for ethinic_origin values: {list(unmapped_origins)}")
adata.obs['self_reported_ethnicity_ontology_term_id'] = ethnicity_terms

In [None]:
# change data type

In [None]:
# Store the ethnicity term IDs as a pandas categorical column.
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **sex_ontology_term_id**

In [None]:
# identify the column in adata.obs which corresponds to sex

In [None]:
adata.obs.columns

In [None]:
# list the unique values 

In [None]:
list(adata.obs['gender'].unique())

In [None]:
# create a dictionary of sex and sex ontology term id

In [None]:
# 'gender' label -> sex ontology (PATO) term, applied to obs['gender'] in the next cell.
mapping = dict(Female='PATO:0000383', Male='PATO:0000384')

In [None]:
# add sex_ontology_term_id column

In [None]:
# Translate each cell's 'gender' label into its sex ontology term using the
# `mapping` dictionary defined in the preceding cell.
sex_terms = adata.obs['gender'].map(mapping)
# Series.map leaves NaN for labels missing from the dictionary; fail loudly
# rather than writing missing term IDs into the curated object.
unmapped_genders = adata.obs.loc[sex_terms.isna(), 'gender'].unique()
if len(unmapped_genders) > 0:
    raise ValueError(f"No sex ontology term defined for gender values: {list(unmapped_genders)}")
adata.obs['sex_ontology_term_id'] = sex_terms

In [None]:
# change data type

In [None]:
# Store the sex term IDs as a pandas categorical column.
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [None]:
adata.obs

#### **suspension_type**

In [None]:
# identify the column in adata.obs which corresponds to suspension type

In [None]:
adata.obs.columns

In [None]:
# list the unique values in the column

In [None]:
list(adata.obs['cell_or_nuclei'].unique())

In [None]:
# create a mapping dictionary

In [None]:
# 'cell_or_nuclei' label -> suspension_type value, applied in the next cell.
mapping = dict(Cell='cell', Nuclei='nucleus')

In [None]:
# add 'suspension_type' column

In [None]:
# Translate each cell's 'cell_or_nuclei' label into its suspension type using
# the `mapping` dictionary defined in the preceding cell.
suspension_types = adata.obs['cell_or_nuclei'].map(mapping)
# Series.map leaves NaN for labels missing from the dictionary; fail loudly
# rather than writing missing values into the curated object.
unmapped_suspensions = adata.obs.loc[suspension_types.isna(), 'cell_or_nuclei'].unique()
if len(unmapped_suspensions) > 0:
    raise ValueError(f"No suspension type defined for cell_or_nuclei values: {list(unmapped_suspensions)}")
adata.obs['suspension_type'] = suspension_types

In [None]:
# change data type of column

In [None]:
# Store the suspension types as a pandas categorical column.
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **tissue_ontology_term_id**

In [None]:
# identify the column in adata.obs which corresponds to tissue

In [None]:
adata.obs.columns

In [None]:
list(adata.obs['region'].unique())

In [None]:
# create a dictionary with tissue and corresponding tissue ontology term id

In [None]:
# Heart region code -> tissue ontology (UBERON) term, applied to obs['region'] in the next cell.
mapping = dict(
    AX='UBERON:0002098',
    LV='UBERON:0002084',
    RV='UBERON:0002080',
    LA='UBERON:0002079',
    SP='UBERON:0002094',
    RA='UBERON:0002078',
    SAN='UBERON:0002351',
    AVN='UBERON:0002352',
)

In [None]:
# add 'tissue_ontology_term_id' column

In [None]:
# Translate each cell's region code into its tissue ontology term using the
# `mapping` dictionary defined in the preceding cell.
tissue_terms = adata.obs['region'].map(mapping)
# Series.map leaves NaN for region codes missing from the dictionary; fail
# loudly rather than writing missing term IDs into the curated object.
unmapped_regions = adata.obs.loc[tissue_terms.isna(), 'region'].unique()
if len(unmapped_regions) > 0:
    raise ValueError(f"No tissue ontology term defined for region values: {list(unmapped_regions)}")
adata.obs['tissue_ontology_term_id'] = tissue_terms

In [None]:
# change data type of column

In [None]:
# Store the tissue term IDs as a pandas categorical column.
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [None]:
#list the unique values in 'tissue_ontology_term_id' column

In [None]:
list(adata.obs['tissue_ontology_term_id'].unique())

In [None]:
# view obs

In [None]:
adata.obs

#### **obsm (Embeddings)**

In [None]:
# view obsm

In [None]:
# check whether all columns are prefixed with X

In [None]:
adata.obsm

#### **uns (Dataset Metadata)**

In [None]:
# View

In [None]:
adata.uns

In [None]:
# Call keys() so the key names are displayed; the bare attribute only shows
# the bound method object.
list(adata.uns.keys())

In [None]:
# Give a title for the dataset

In [None]:
# Dataset title stored in uns.
# NOTE(review): the title says snRNA-seq, while the suspension_type mapping in
# this notebook handles both 'Cell' and 'Nuclei' - confirm the title matches the data.
adata.uns['title'] = 'snRNA-seq data - Heart SAN atrial cardiomyocytes'

In [None]:
# Set the default embedding

In [None]:
# Name of the default embedding; 'X_umap' is the key present in adata.obsm.
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [None]:
# view anndata object

In [None]:
adata

In [None]:
# view obs and var data types

In [None]:
adata.obs.dtypes

In [None]:
# Downcast 64-bit numeric columns of var to their 32-bit counterparts.
# The dtype table is captured once, before any column is converted.
var_dtypes = adata.var.dtypes
float64_cols = var_dtypes.index[var_dtypes == 'float64']
int64_cols = var_dtypes.index[var_dtypes == 'int64']
for c in float64_cols:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in int64_cols:
    adata.var[c] = adata.var[c].astype('int32')
    print(f"changed {c} from int64 to int32")

In [None]:
# Downcast 64-bit numeric columns of obs to 32-bit and turn object columns
# into categoricals. The dtype table is captured once, before any conversion.
obs_dtypes = adata.obs.dtypes
float64_cols = obs_dtypes.index[obs_dtypes == 'float64']
int64_cols = obs_dtypes.index[obs_dtypes == 'int64']
object_cols = obs_dtypes.index[obs_dtypes == 'object']
for c in float64_cols:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in int64_cols:
    adata.obs[c] = adata.obs[c].astype('int32')
    print(f"changed {c} from int64 to int32")
for c in object_cols:
    adata.obs[c] = adata.obs[c].astype('category')
    print(f"changed {c} from object to category")

In [None]:
# view obs

In [None]:
adata.obs

In [None]:
adata.obs.columns

In [None]:
# delete unwanted columns in obs

In [None]:
# Remove the source columns that have been replaced by the curated columns above.
for obsolete_col in ('donor', 'cell_or_nuclei', 'kit_10x', 'gender', 'ethinic_origin', 'modality'):
    del adata.obs[obsolete_col]

In [None]:
# view obs

In [None]:
adata.obs

In [None]:
adata.obs.columns

In [None]:
# view var

In [None]:
adata.var

In [None]:
#view uns

In [None]:
adata.uns

In [None]:
list(adata.uns.keys())

In [None]:
adata.obs.columns

In [None]:
#check the format of expression matrix

In [None]:
adata.X

In [None]:
araw.X

In [None]:
#Copy raw counts to adata.raw

In [None]:
# Attach the raw counts as adata.raw. The raw object was subset independently
# of adata (by region and cell type), so first make sure its rows describe
# the same cells in the same order; otherwise raw counts would be paired with
# the wrong cells without any error.
if araw.n_obs != adata.n_obs:
    raise ValueError(
        f"Raw counts have {araw.n_obs} cells but adata has {adata.n_obs}; cannot attach as adata.raw"
    )
if not araw.obs_names.equals(adata.obs_names):
    if araw.obs_names.is_unique and set(araw.obs_names) == set(adata.obs_names):
        # Same cells, different order: reorder the raw object to follow adata.
        araw = araw[adata.obs_names].copy()
    else:
        # Names cannot be matched; keep the positional pairing but make it visible.
        print("WARNING: obs_names of araw and adata differ; raw counts are attached by position only")
adata.raw = araw

In [None]:
# Destination of the curated object; written with gzip compression.
curated_h5ad = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/final_objects/Heart-SAN-atrial-cardiomyocyte.h5ad'
adata.write(curated_h5ad, compression = 'gzip')