### **Curating HCAHeartST11702010(FFPE).h5ad**

Article: Spatially resolved multiomics of human cardiac niches

DOI: https://doi.org/10.1038/s41586-023-06311-1 

Data Source : https://www.heartcellatlas.org

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
# Read the per-sample AnnData object for HCAHeartST11702010 that this notebook curates.
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/data/HCAHeartST11702010.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 3797 × 17922
    obs: 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'sangerID', 'region', 'donor', 'donor_type', 'age', 'gender', 'facility', 'modality', 'kit_10x', 'flushed', 'annotation_final', 'prop_Adip1', 'prop_Adip2', 'prop_B', 'prop_B_plasma', 'prop_CD14+Mo', 'prop_CD16+Mo', 'prop_CD4+T_act', 'prop_CD4+T_naive', 'prop_CD8+T_cytox', 'prop_CD8+T_em', 'prop_CD8+T_te', 'prop_CD8+T_trans', 'prop_DC', 'prop_EC10_CMC-like', 'prop_EC1_cap', 'prop_EC2_cap', 'prop_EC3_cap', 'prop_EC4_immune', 'prop_EC5_art', 'prop_EC6_ven', 'prop_EC7_endocardial', 'prop_EC8_ln', 'prop_FB1', 'prop_FB2', 'prop_FB3', 'prop_FB4_activated', 'prop_FB5', 'prop_FB6', 'prop_ILC', 'prop_LYVE1+IGF1+MP', 'prop_LYVE1+MP_cycling', 'prop_LYVE1+TIMD4+MP', 'prop_MAIT-like', 'prop_Mast', 'prop_MoMP', 'prop_NC1_glial', 'prop_NC2_glial_NGF+', 'prop_NK_CD16hi', 'prop_NK_CD56hi', 'prop_Neut', 'prop_PC2_atria', 'prop_PC3_str',

##### **X- expression matrix**

In [7]:
# View the expression matrix of the anndata object

In [8]:
adata.X

<3797x17922 sparse matrix of type '<class 'numpy.float32'>'
	with 12824133 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether it holds normalized or raw counts: floating-point values indicate normalized counts, integer values indicate raw counts.

In [10]:
print(adata.X)

  (0, 1)	0.88374704
  (0, 5)	0.88374704
  (0, 6)	0.5364789
  (0, 12)	0.5364789
  (0, 13)	1.1410092
  (0, 18)	0.5364789
  (0, 20)	1.1410092
  (0, 23)	0.5364789
  (0, 24)	0.88374704
  (0, 25)	0.5364789
  (0, 26)	1.7867179
  (0, 29)	0.88374704
  (0, 31)	0.5364789
  (0, 32)	1.5151
  (0, 34)	0.5364789
  (0, 36)	0.5364789
  (0, 37)	0.88374704
  (0, 38)	0.5364789
  (0, 39)	0.5364789
  (0, 40)	1.1410092
  (0, 47)	1.1410092
  (0, 52)	0.5364789
  (0, 59)	0.5364789
  (0, 63)	0.5364789
  (0, 66)	0.5364789
  :	:
  (3796, 17822)	0.51640725
  (3796, 17826)	1.6205702
  (3796, 17832)	0.51640725
  (3796, 17835)	0.51640725
  (3796, 17838)	0.51640725
  (3796, 17843)	0.51640725
  (3796, 17844)	0.51640725
  (3796, 17845)	0.51640725
  (3796, 17846)	1.1078978
  (3796, 17854)	0.51640725
  (3796, 17856)	0.85526216
  (3796, 17871)	0.51640725
  (3796, 17874)	2.1325023
  (3796, 17878)	0.85526216
  (3796, 17881)	0.85526216
  (3796, 17900)	0.51640725
  (3796, 17901)	1.1078978
  (3796, 17903)	0.85526216
  (3796, 1790

##### **Raw counts matrix**

In [11]:
# If X has normalized counts, check for the raw counts matrix.

In [12]:
#Here the raw counts are provided in a separate object, load the raw counts matrix

In [13]:
# Read the separate raw-counts object; it is subset to this sample by its 'sample' column further down.
araw = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/Raw_counts/visium-FFPE_adult-8reg_raw.h5ad')

In [14]:
# view raw object

In [15]:
araw

AnnData object with n_obs × n_vars = 11312 × 17922
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac', 'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'region_cell2loc'
    var: 'gene_ids', 'feature_types', 'genome', 'SYMBOL'
    uns: 'spatial'
    obsm: 'MT', 'means_cell_abundance_w_sf', 'q05_cell_abundance_w_sf', 'q95_cell_abundance_w_sf', 'spatial', 'stds_cell_abundance_w_sf'

In [16]:
# view raw matrix

In [17]:
araw.X

<11312x17922 sparse matrix of type '<class 'numpy.float32'>'
	with 36178068 stored elements in Compressed Sparse Row format>

In [18]:
print(araw.X)

  (0, 1)	2.0
  (0, 4)	2.0
  (0, 5)	4.0
  (0, 12)	1.0
  (0, 13)	6.0
  (0, 14)	2.0
  (0, 16)	1.0
  (0, 18)	3.0
  (0, 20)	1.0
  (0, 23)	3.0
  (0, 24)	1.0
  (0, 25)	3.0
  (0, 26)	2.0
  (0, 29)	1.0
  (0, 32)	5.0
  (0, 36)	1.0
  (0, 39)	2.0
  (0, 40)	3.0
  (0, 45)	1.0
  (0, 47)	2.0
  (0, 49)	1.0
  (0, 67)	5.0
  (0, 68)	2.0
  (0, 72)	1.0
  (0, 75)	1.0
  :	:
  (11311, 17822)	1.0
  (11311, 17826)	6.0
  (11311, 17832)	1.0
  (11311, 17835)	1.0
  (11311, 17838)	1.0
  (11311, 17843)	1.0
  (11311, 17844)	1.0
  (11311, 17845)	1.0
  (11311, 17846)	3.0
  (11311, 17854)	1.0
  (11311, 17856)	2.0
  (11311, 17871)	1.0
  (11311, 17874)	11.0
  (11311, 17878)	2.0
  (11311, 17881)	2.0
  (11311, 17900)	1.0
  (11311, 17901)	3.0
  (11311, 17903)	2.0
  (11311, 17904)	1.0
  (11311, 17907)	1.0
  (11311, 17910)	3.0
  (11311, 17911)	8.0
  (11311, 17913)	1.0
  (11311, 17914)	1.0
  (11311, 17919)	1.0


In [19]:
# Since the raw object combines several samples, extract the raw counts for this dataset only.

In [20]:
# Keep only the spots whose 'sample' label is this dataset's ID.
# NOTE(review): nothing here checks that the selected spots are in the same
# order as `adata` — confirm if the two objects are combined later.
is_this_sample = araw.obs['sample'] == 'HCAHeartST11702010'
araw = araw[is_this_sample]

In [21]:
araw

View of AnnData object with n_obs × n_vars = 3797 × 17922
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac', 'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'region_cell2loc'
    var: 'gene_ids', 'feature_types', 'genome', 'SYMBOL'
    uns: 'spatial'
    obsm: 'MT', 'means_cell_abundance_w_sf', 'q05_cell_abundance_w_sf', 'q95_cell_abundance_w_sf', 'spatial', 'stds_cell_abundance_w_sf'

##### **Variables(var)**

In [22]:
#View the var of anndata and raw object

In [23]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome
SYMBOL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SAMD11,ENSG00000187634,Gene Expression,GRCh38
NOC2L,ENSG00000188976,Gene Expression,GRCh38
KLHL17,ENSG00000187961,Gene Expression,GRCh38
PLEKHN1,ENSG00000187583,Gene Expression,GRCh38
PERM1,ENSG00000187642,Gene Expression,GRCh38
...,...,...,...
C21orf58,ENSG00000160298,Gene Expression,GRCh38
PCNT,ENSG00000160299,Gene Expression,GRCh38
DIP2A,ENSG00000160305,Gene Expression,GRCh38
S100B,ENSG00000160307,Gene Expression,GRCh38


In [24]:
araw.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [25]:
# Check the index column of var. Check whether ensembl ids are provided in the index column of var or not.

In [26]:
# If the Ensembl IDs are not in the index of var but are present in another column, keep the current index (gene symbols) in a new column and set the Ensembl ID column as the index.

In [27]:
# Preserve the current var index (gene symbols) in its own column before the index is replaced.
adata.var['gene_symbols'] = adata.var.index

In [28]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
SYMBOL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SAMD11,ENSG00000187634,Gene Expression,GRCh38,SAMD11
NOC2L,ENSG00000188976,Gene Expression,GRCh38,NOC2L
KLHL17,ENSG00000187961,Gene Expression,GRCh38,KLHL17
PLEKHN1,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
PERM1,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
C21orf58,ENSG00000160298,Gene Expression,GRCh38,C21orf58
PCNT,ENSG00000160299,Gene Expression,GRCh38,PCNT
DIP2A,ENSG00000160305,Gene Expression,GRCh38,DIP2A
S100B,ENSG00000160307,Gene Expression,GRCh38,S100B


In [29]:
# Re-index var by the Ensembl IDs held in the 'gene_ids' column
# (the index is then named 'gene_ids', as the following output shows).
adata.var_names = adata.var['gene_ids']

In [30]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [31]:
# load the approved genes file

In [32]:
# Table of approved genes; its `feature_id` column is used below to build the lookup.
approved_genes = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/gene_info/genes_approved.csv')

In [33]:
# Create a lookup dictionary keyed by the feature_id column of the approved genes file (the values are placeholders; only key membership is used).

In [34]:
# Membership lookup: every approved feature_id becomes a key with the placeholder value 1.
genedict = dict.fromkeys(approved_genes.feature_id, 1)

In [35]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [36]:
len(genedict)

116184

In [37]:
# Filter out the genes which are not in the approved genes file

In [38]:
def _approved_only(gene_ids):
    """Return the IDs from `gene_ids` that are keys of `genedict`, in their original order."""
    return [gene_id for gene_id in gene_ids if gene_id in genedict]


var_to_keep_adata = _approved_only(adata.var_names)
var_to_keep_araw = _approved_only(araw.var_names)

In [39]:
len(var_to_keep_adata)

17912

In [40]:
len(var_to_keep_araw)

17912

In [41]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [42]:
araw.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [43]:
# Subset both AnnData objects to the approved genes collected above (the gene symbols were already copied to the gene_symbols column earlier).

In [44]:
# Restrict both objects to the approved genes; .copy() turns each sliced
# result into an independent AnnData object instead of a view.
adata = adata[:, var_to_keep_adata].copy()
araw = araw[:, var_to_keep_araw].copy()

In [45]:
#  View the var

In [46]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [47]:
araw.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


##### **feature_is_filtered**

In [48]:
# Assign False since the feature was not filtered out in the normalized matrix (X).

In [49]:
# Every remaining gene is flagged as not filtered; the scalar is broadcast to all rows of var.
adata.var['feature_is_filtered'] = False

In [50]:
#View var

In [51]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11,False
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L,False
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17,False
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1,False
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1,False
...,...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58,False
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT,False
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A,False
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B,False


In [52]:
araw.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [53]:
#  Delete the unwanted columns in adata and araw.

In [54]:
# Remove every listed column from araw.var, one at a time and in the original order.
for unwanted_column in ('gene_ids', 'SYMBOL', 'feature_types', 'genome'):
    del araw.var[unwanted_column]

In [55]:
# Remove the source annotation columns from adata.var; 'feature_is_filtered' is not in the list and stays.
for unwanted_column in ('gene_ids', 'gene_symbols', 'feature_types', 'genome'):
    del adata.var[unwanted_column]

In [56]:
# view var

In [57]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
gene_ids,Unnamed: 1_level_1
ENSG00000187634,False
ENSG00000188976,False
ENSG00000187961,False
ENSG00000187583,False
ENSG00000187642,False
...,...
ENSG00000160298,False
ENSG00000160299,False
ENSG00000160305,False
ENSG00000160307,False


In [58]:
araw.var

ENSG00000187634
ENSG00000188976
ENSG00000187961
ENSG00000187583
ENSG00000187642
...
ENSG00000160298
ENSG00000160299
ENSG00000160305
ENSG00000160307
ENSG00000160310


#### **Observations(obs) (Cell metadata)**

In [59]:
#view obs

In [60]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,Neut_abundance,PC2_atria_abundance,PC3_str_abundance,SAN_P_cell_abundance,SMC1_basic_abundance,SMC2_art_abundance,aCM1_abundance,aCM2_abundance,aCM3_abundance,aCM4_abundance
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,0.086832,0.476005,0.057782,0.091358,0.655052,0.191568,0.188451,0.449563,0.514563,0.936007
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,0.097314,0.003057,0.002457,0.000478,0.002280,0.002917,0.000458,0.000582,0.002587,0.000734
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,0.110701,0.020407,0.458436,0.018573,0.015793,0.019516,0.066988,0.012308,0.228417,3.620176
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,0.059135,0.021715,0.076623,0.028184,0.008493,0.070514,0.006749,0.131797,0.228558,0.126935
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,0.093987,0.021712,0.201236,0.372203,0.028267,0.098915,0.551456,1.024366,0.154369,4.667572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,0.031198,0.007878,0.003221,0.066894,0.088568,0.003118,0.175014,0.104038,0.075937,0.027006
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,0.191738,0.026667,0.022528,0.036427,0.073528,0.011448,0.507058,0.069567,0.217755,0.843986
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,0.018993,0.081644,0.007158,0.008435,0.019979,0.093562,0.016823,0.197080,0.110074,0.185447
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,0.006262,0.009663,0.005767,0.051473,0.009939,0.001094,0.009529,0.382378,0.357928,0.007438


In [61]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'Neut_abundance', 'PC2_atria_abundance', 'PC3_str_abundance',
       'SAN_P_cell_abundance', 'SMC1_basic_abundance', 'SMC2_art_abundance',
       'aCM1_abundance', 'aCM2_abundance', 'aCM3_abundance', 'aCM4_abundance'],
      dtype='object', length=115)

#### **assay_ontology_term_id**

In [62]:
# identify the column in adata which corresponds to assay

In [63]:
list(adata.obs['kit_10x'].unique())

['Visium-v1-FFPE']

In [64]:
# add the assay_ontology_term_id column

In [65]:
# All spots come from the single kit listed above, so one assay term is broadcast to every row.
adata.obs['assay_ontology_term_id'] = 'EFO:0010961'

In [66]:
# change datatype of the column

In [67]:
# Store the assay term as a pandas categorical column.
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [68]:
# view adata.obs

In [69]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,PC2_atria_abundance,PC3_str_abundance,SAN_P_cell_abundance,SMC1_basic_abundance,SMC2_art_abundance,aCM1_abundance,aCM2_abundance,aCM3_abundance,aCM4_abundance,assay_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,0.476005,0.057782,0.091358,0.655052,0.191568,0.188451,0.449563,0.514563,0.936007,EFO:0010961
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,0.003057,0.002457,0.000478,0.002280,0.002917,0.000458,0.000582,0.002587,0.000734,EFO:0010961
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,0.020407,0.458436,0.018573,0.015793,0.019516,0.066988,0.012308,0.228417,3.620176,EFO:0010961
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,0.021715,0.076623,0.028184,0.008493,0.070514,0.006749,0.131797,0.228558,0.126935,EFO:0010961
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,0.021712,0.201236,0.372203,0.028267,0.098915,0.551456,1.024366,0.154369,4.667572,EFO:0010961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,0.007878,0.003221,0.066894,0.088568,0.003118,0.175014,0.104038,0.075937,0.027006,EFO:0010961
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,0.026667,0.022528,0.036427,0.073528,0.011448,0.507058,0.069567,0.217755,0.843986,EFO:0010961
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,0.081644,0.007158,0.008435,0.019979,0.093562,0.016823,0.197080,0.110074,0.185447,EFO:0010961
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,0.009663,0.005767,0.051473,0.009939,0.001094,0.009529,0.382378,0.357928,0.007438,EFO:0010961


#### **cell_type_ontology_term_id**

In [70]:
# Find the data in adata (obs columns or obsm entries) that relates to cell type annotation

In [71]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'PC2_atria_abundance', 'PC3_str_abundance', 'SAN_P_cell_abundance',
       'SMC1_basic_abundance', 'SMC2_art_abundance', 'aCM1_abundance',
       'aCM2_abundance', 'aCM3_abundance', 'aCM4_abundance',
       'assay_ontology_term_id'],
      dtype='object', length=116)

In [72]:
adata.obsm

AxisArrays with keys: MT, X_spatial, means_cell_abundance_w_sf, prop, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, stds_cell_abundance_w_sf

In [73]:
adata.obsm['means_cell_abundance_w_sf']

Unnamed: 0_level_0,meanscell_abundance_w_sf_Adip1,meanscell_abundance_w_sf_Adip2,meanscell_abundance_w_sf_B,meanscell_abundance_w_sf_B_plasma,meanscell_abundance_w_sf_CD14+Mo,meanscell_abundance_w_sf_CD16+Mo,meanscell_abundance_w_sf_CD4+T_act,meanscell_abundance_w_sf_CD4+T_naive,meanscell_abundance_w_sf_CD8+T_cytox,meanscell_abundance_w_sf_CD8+T_em,...,meanscell_abundance_w_sf_Neut,meanscell_abundance_w_sf_PC2_atria,meanscell_abundance_w_sf_PC3_str,meanscell_abundance_w_sf_SAN_P_cell,meanscell_abundance_w_sf_SMC1_basic,meanscell_abundance_w_sf_SMC2_art,meanscell_abundance_w_sf_aCM1,meanscell_abundance_w_sf_aCM2,meanscell_abundance_w_sf_aCM3,meanscell_abundance_w_sf_aCM4
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0.074412,0.144087,0.131054,0.037014,0.154744,0.049578,0.081705,0.086086,0.121621,0.086695,...,0.203768,0.684926,0.174043,0.142312,0.866173,0.258087,0.313007,0.512560,0.619022,1.026571
HCAHeartST11702010_AAACAAGTATCTCCCA-1,0.348446,0.027221,0.023319,0.072910,0.116913,0.010248,0.045563,0.023478,0.021933,0.032655,...,0.204583,0.031665,0.025423,0.004780,0.019272,0.011628,0.006795,0.004744,0.012084,0.005961
HCAHeartST11702010_AAACAATCTACTAGCA-1,0.960353,0.072362,0.112037,0.154412,0.112066,0.019793,0.091912,0.094850,0.062719,0.076172,...,0.275649,0.106953,0.675461,0.062937,0.081329,0.053709,0.175614,0.044214,0.346346,3.735509
HCAHeartST11702010_AAACACCAATAACTGC-1,0.012572,0.011661,0.050159,0.012059,0.114915,0.052948,0.094921,0.089876,0.040625,0.075124,...,0.162308,0.098313,0.180480,0.055936,0.054595,0.108493,0.035930,0.172386,0.293320,0.175041
HCAHeartST11702010_AAACAGAGCGACTCCT-1,0.020833,0.016312,0.170944,0.262811,0.094762,0.134897,0.146473,0.179990,0.074068,0.140172,...,0.248619,0.132999,0.413460,0.488555,0.133858,0.186951,0.824711,1.169644,0.298196,4.853562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,0.014647,0.016158,0.141416,0.026997,0.043420,0.036142,0.043286,0.089484,0.101932,0.072491,...,0.095902,0.063026,0.029871,0.098360,0.168560,0.017221,0.249792,0.139493,0.126452,0.055456
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,0.009754,0.006928,0.056721,0.040988,0.156312,0.039324,0.064720,0.066373,0.109144,0.069932,...,0.391903,0.131205,0.100241,0.080365,0.177255,0.041504,0.647972,0.127906,0.309471,0.924165
HCAHeartST11702010_TTGTTTCCATACAACT-1,0.026111,0.025188,0.059657,0.016857,0.088733,0.037009,0.056217,0.059852,0.408165,0.053619,...,0.087774,0.201910,0.046899,0.029533,0.076989,0.132591,0.058321,0.238574,0.170422,0.237617
HCAHeartST11702010_TTGTTTGTATTACACG-1,0.042995,0.079800,0.129303,0.012647,0.023697,0.034052,0.042160,0.062178,0.048363,0.051912,...,0.056953,0.086132,0.064089,0.121906,0.080216,0.012073,0.068237,0.466279,0.494972,0.032782


In [74]:
def _columns_at_row_max(row):
    """Return the labels of every entry in `row` that equals the row's maximum value."""
    is_max = row == row.max()
    return row[is_max].index.tolist()


# For each spot (row), list the abundance column(s) holding the largest value.
max_columns = adata.obsm['means_cell_abundance_w_sf'].apply(_columns_at_row_max, axis=1)

In [75]:
max_columns

spot_id
HCAHeartST11702010_AAACAACGAATAGTTC-1      [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702010_AAACAAGTATCTCCCA-1       [meanscell_abundance_w_sf_FB3]
HCAHeartST11702010_AAACAATCTACTAGCA-1      [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702010_AAACACCAATAACTGC-1       [meanscell_abundance_w_sf_FB3]
HCAHeartST11702010_AAACAGAGCGACTCCT-1      [meanscell_abundance_w_sf_aCM4]
                                                       ...                
HCAHeartST11702010_TTGTTTCACATCCAGG-1    [meanscell_abundance_w_sf_EC8_ln]
HCAHeartST11702010_TTGTTTCATTAGTCTA-1      [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702010_TTGTTTCCATACAACT-1       [meanscell_abundance_w_sf_FB3]
HCAHeartST11702010_TTGTTTGTATTACACG-1       [meanscell_abundance_w_sf_FB6]
HCAHeartST11702010_TTGTTTGTGTAAATTC-1      [meanscell_abundance_w_sf_aCM4]
Length: 3797, dtype: object

In [76]:
# A spot has a tie when more than one column shares the row maximum, i.e. when
# its entry in `max_columns` lists more than one column name. For such spots
# keep the full list of tied columns; for all others store an empty list.
# (Counting how often a name repeats inside one entry cannot detect ties,
# because each column name appears at most once per entry.)
columns_with_multiple_max = max_columns.apply(
    lambda tied_columns: list(tied_columns) if len(tied_columns) > 1 else []
)

In [77]:
columns_with_multiple_max 

spot_id
HCAHeartST11702010_AAACAACGAATAGTTC-1    []
HCAHeartST11702010_AAACAAGTATCTCCCA-1    []
HCAHeartST11702010_AAACAATCTACTAGCA-1    []
HCAHeartST11702010_AAACACCAATAACTGC-1    []
HCAHeartST11702010_AAACAGAGCGACTCCT-1    []
                                         ..
HCAHeartST11702010_TTGTTTCACATCCAGG-1    []
HCAHeartST11702010_TTGTTTCATTAGTCTA-1    []
HCAHeartST11702010_TTGTTTCCATACAACT-1    []
HCAHeartST11702010_TTGTTTGTATTACACG-1    []
HCAHeartST11702010_TTGTTTGTGTAAATTC-1    []
Length: 3797, dtype: object

In [78]:
# Keep the per-spot tie lists next to the other spot metadata for inspection.
# NOTE(review): this puts Python lists into an obs column — confirm the column
# is removed before the object is written, as list values may not serialize.
adata.obs['columns_with_multiple_max'] = columns_with_multiple_max

In [79]:
# Select only the spots whose tie list contains at least one column.
has_entries = columns_with_multiple_max.map(len) > 0
non_empty_columns = columns_with_multiple_max[has_entries]

In [80]:
len(non_empty_columns )

0

In [81]:
# Reduce each entry to a bare cell-type label: take its first column name and
# strip the abundance prefix; an empty entry becomes ''.
# NOTE: this rebinds `max_columns` from a Series of lists to a plain list of
# strings, so the cell should not be re-run without re-running the one that
# builds `max_columns`.
abundance_prefix = 'meanscell_abundance_w_sf_'
max_columns = [
    cols[0].replace(abundance_prefix, '') if cols else ''
    for cols in max_columns
]

In [82]:
# Store one label per spot; `max_columns` is a plain list here, so values are assigned by position.
adata.obs['highest_cell_Density_columns'] = max_columns

In [83]:
adata.obs['highest_cell_Density_columns']

spot_id
HCAHeartST11702010_AAACAACGAATAGTTC-1      aCM4
HCAHeartST11702010_AAACAAGTATCTCCCA-1       FB3
HCAHeartST11702010_AAACAATCTACTAGCA-1      aCM4
HCAHeartST11702010_AAACACCAATAACTGC-1       FB3
HCAHeartST11702010_AAACAGAGCGACTCCT-1      aCM4
                                          ...  
HCAHeartST11702010_TTGTTTCACATCCAGG-1    EC8_ln
HCAHeartST11702010_TTGTTTCATTAGTCTA-1      aCM4
HCAHeartST11702010_TTGTTTCCATACAACT-1       FB3
HCAHeartST11702010_TTGTTTGTATTACACG-1       FB6
HCAHeartST11702010_TTGTTTGTGTAAATTC-1      aCM4
Name: highest_cell_Density_columns, Length: 3797, dtype: object

In [84]:
list(adata.obs['highest_cell_Density_columns'].unique())

['aCM4',
 'FB3',
 'FB4',
 'EC5_art',
 'Adip1',
 'aCM2',
 'Mast',
 'FB5',
 'SMC1_basic',
 'Adip2',
 'B',
 'CD8+T_cytox',
 'EC4_immune',
 'NC2',
 'NC1',
 'LYVE1+IGF1+MP',
 'Neut',
 'NK_CD16hi',
 'SMC2_art',
 'aCM3',
 'CD16+Mo',
 'EC2_cap',
 'EC7_atria',
 'EC8_ln',
 'EC1_cap',
 'EC10_CMC-like',
 'FB6',
 'FB2',
 'MoMP',
 'CD4+T_naive',
 'DC',
 'EC6_ven',
 'CD8+T_te',
 'SAN_P_cell',
 'FB1',
 'ILC',
 'LYVE1+MP_cycling',
 'PC2_atria',
 'PC3_str',
 'LYVE1+TIMD4+MP',
 'aCM1',
 'CD8+T_trans',
 'CD4+T_act',
 'CD8+T_em',
 'CD14+Mo']

In [85]:
# Start the ontology column from the per-spot cell-type labels; the labels are
# replaced by CL ontology IDs a few cells further down.
adata.obs['cell_type_ontology_term_id'] = adata.obs['highest_cell_Density_columns']

In [86]:
# create a dictionary of cell type and ontology term

In [87]:
# Lookup from cell-type / cell-state labels to Cell Ontology (CL) term IDs.
# Several fine-grained states share one CL term (e.g. aCM1-aCM5 all map to
# CL:0002129), and the table holds more labels than the ones listed for this
# sample above.
mapping= {'aCM1': 'CL:0002129',
 'aCM2': 'CL:0002129',
 'aCM3': 'CL:0002129',
 'aCM4': 'CL:0002129',
 'aCM5': 'CL:0002129',
 'Adip1': 'CL:0000136',
 'Adip2': 'CL:0000136',
 'Adip3': 'CL:0000136',
 'Adip4': 'CL:0000136',
 'Adipocyte': 'CL:0000136',
 'Atrial Cardiomyocyte': 'CL:0002129',
 'AVN_bundle_cell': 'CL:0010005',
 'AVN_P_cell': 'CL:1000477',
 'B': 'CL:0000236',
 'B_plasma': 'CL:0000786',
 'CD14+Mo': 'CL:0001054',
 'CD16+Mo': 'CL:0002396',
 'CD4+T_act': 'CL:0000896',
 'CD4+T_naive': 'CL:0000895',
 'CD4+T_Th2': 'CL:0000546',
 'CD8+T_cytox': 'CL:0000794',
 'CD8+T_em': 'CL:0000913',
 'CD8+T_te': 'CL:0000625',
 'CD8+T_trans': 'CL:0000625',
 'DC': 'CL:0001056',
 'EC1_cap': 'CL:0002144',
 'EC10_CMC-like': 'CL:0000115',
 'EC2_cap': 'CL:0002144',
 'EC3_cap': 'CL:0002144',
 'EC4_immune': 'CL:0000115',
 'EC5_art': 'CL:1000413',
 'EC6_ven': 'CL:0002543',
 'EC7_atria': 'CL:0002350',
 'EC7_endocardial': 'CL:0002350',
 'EC8_ln': 'CL:0002138',
 'Endothelial cell': 'CL:0000115',
 'FB1': 'CL:0002548',
 'FB2': 'CL:0002548',
 'FB3': 'CL:0002548',
 'FB4': 'CL:0002548',
 'FB4_activated': 'CL:0002548',
 'FB5': 'CL:0002548',
 'FB6': 'CL:0002548',
 'Fibroblast': 'CL:0000057',
 'ILC': 'CL:0001065',
 'Lymphatic Endothelial cell': 'CL:0002138',
 'Lymphoid': 'CL:0000542',
 'LYVE1+IGF1+MP': 'CL:0000235',
 'LYVE1+MP_cycling': 'CL:0000235',
 'LYVE1+TIMD4+MP': 'CL:0000235',
 'MAIT-like': 'CL:0000940',
 'Mast': 'CL:0000097',
 'Mast cell': 'CL:0000097',
 'Meso': 'CL:0000077',
 'Mesothelial cell': 'CL:0000077',
 'MoMP': 'CL:0000576',
 'Mural cell': 'CL:0008034',
 'Myeloid': 'CL:0000763',
 'NC1': 'CL:0000125',
 'NC1_glial': 'CL:0000125',
 'NC2': 'CL:0000125',
 'NC2_glial_NGF+': 'CL:0000125',
 'Neural cell': 'CL:0002319',
 'Neut': 'CL:0000775',
 'NK_CD16hi': 'CL:0000939',
 'NK_CD56hi': 'CL:0000938',
 'PC1_vent': 'CL:0000669',
 'PC2_atria': 'CL:0000669',
 'PC3_str': 'CL:0000669',
 'PC4_CMC-like': 'CL:0000669',
 'SAN_P_cell': 'CL:1000477',
 'SMC1_basic': 'CL:0000192',
 'SMC2_art': 'CL:0002591',
 'T/NK_cycling': 'CL:0000814',
 'vCM1': 'CL:0002131',
 'vCM3_stressed': 'CL:0002131',
 'vCM4': 'CL:0002131',
 'Ventricular Cardiomyocyte': 'CL:0002131',
 'unclassified': 'CL:0000003'}

In [88]:
# add the cell_type_ontology_term_id column

In [89]:
# Translate the per-spot cell-type labels into CL ontology IDs.
# The lookup reads from the untouched label column rather than from the target
# column itself, so re-running this cell gives the same result instead of
# mapping already-translated CL IDs to NaN.
cell_type_terms = adata.obs['highest_cell_Density_columns'].map(mapping)

# Series.map yields NaN for any label missing from `mapping`; stop here with
# the offending labels rather than carrying missing ontology terms forward.
unmapped_labels = sorted(
    adata.obs.loc[cell_type_terms.isna(), 'highest_cell_Density_columns'].astype(str).unique()
)
if unmapped_labels:
    raise ValueError(f"No CL term in `mapping` for cell type label(s): {unmapped_labels}")

adata.obs['cell_type_ontology_term_id'] = cell_type_terms

In [90]:
# change datatype of the column

In [91]:
# Store the CL term IDs as a pandas categorical column.
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [92]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,SMC1_basic_abundance,SMC2_art_abundance,aCM1_abundance,aCM2_abundance,aCM3_abundance,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,0.655052,0.191568,0.188451,0.449563,0.514563,0.936007,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,0.002280,0.002917,0.000458,0.000582,0.002587,0.000734,EFO:0010961,[],FB3,CL:0002548
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,0.015793,0.019516,0.066988,0.012308,0.228417,3.620176,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,0.008493,0.070514,0.006749,0.131797,0.228558,0.126935,EFO:0010961,[],FB3,CL:0002548
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,0.028267,0.098915,0.551456,1.024366,0.154369,4.667572,EFO:0010961,[],aCM4,CL:0002129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,0.088568,0.003118,0.175014,0.104038,0.075937,0.027006,EFO:0010961,[],EC8_ln,CL:0002138
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,0.073528,0.011448,0.507058,0.069567,0.217755,0.843986,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,0.019979,0.093562,0.016823,0.197080,0.110074,0.185447,EFO:0010961,[],FB3,CL:0002548
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,0.009939,0.001094,0.009529,0.382378,0.357928,0.007438,EFO:0010961,[],FB6,CL:0002548


#### **donor_id**

In [93]:
#identify the column in adata.obs which provides donor information

In [94]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'SMC1_basic_abundance', 'SMC2_art_abundance', 'aCM1_abundance',
       'aCM2_abundance', 'aCM3_abundance', 'aCM4_abundance',
       'assay_ontology_term_id', 'columns_with_multiple_max',
       'highest_cell_Density_columns', 'cell_type_ontology_term_id'],
      dtype='object', length=119)

In [95]:
list(adata.obs['donor'].unique())

['AV1']

In [96]:
# add the donor_id column

In [97]:
# Populate donor_id with an independent copy of the existing 'donor' labels.
adata.obs['donor_id'] = adata.obs['donor'].copy()

In [98]:
# change datatype of the column

In [99]:
# Cast donor_id to pandas' categorical dtype, re-assigning the column in adata.obs.
# NOTE(review): presumably the dtype the target curation schema expects — confirm.
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [100]:
# view unique values of donor_id column

In [101]:
list(adata.obs['donor_id'].unique())

['AV1']

In [102]:
#view obs

In [103]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,SMC2_art_abundance,aCM1_abundance,aCM2_abundance,aCM3_abundance,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,0.191568,0.188451,0.449563,0.514563,0.936007,EFO:0010961,[],aCM4,CL:0002129,AV1
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,0.002917,0.000458,0.000582,0.002587,0.000734,EFO:0010961,[],FB3,CL:0002548,AV1
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,0.019516,0.066988,0.012308,0.228417,3.620176,EFO:0010961,[],aCM4,CL:0002129,AV1
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,0.070514,0.006749,0.131797,0.228558,0.126935,EFO:0010961,[],FB3,CL:0002548,AV1
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,0.098915,0.551456,1.024366,0.154369,4.667572,EFO:0010961,[],aCM4,CL:0002129,AV1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,0.003118,0.175014,0.104038,0.075937,0.027006,EFO:0010961,[],EC8_ln,CL:0002138,AV1
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,0.011448,0.507058,0.069567,0.217755,0.843986,EFO:0010961,[],aCM4,CL:0002129,AV1
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,0.093562,0.016823,0.197080,0.110074,0.185447,EFO:0010961,[],FB3,CL:0002548,AV1
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,0.001094,0.009529,0.382378,0.357928,0.007438,EFO:0010961,[],FB6,CL:0002548,AV1


In [104]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'SMC2_art_abundance', 'aCM1_abundance', 'aCM2_abundance',
       'aCM3_abundance', 'aCM4_abundance', 'assay_ontology_term_id',
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id', 'donor_id'],
      dtype='object', length=120)

#### **development_stage_ontology_term_id**

In [105]:
# identify the column in adata which corresponds to age

In [106]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'SMC2_art_abundance', 'aCM1_abundance', 'aCM2_abundance',
       'aCM3_abundance', 'aCM4_abundance', 'assay_ontology_term_id',
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id', 'donor_id'],
      dtype='object', length=120)

In [107]:
list(adata.obs['age'].unique())

['nan']

In [108]:
# Read the supplementary table from CSV. Later cells use its 'donor',
# 'development_stage_ontology_term_id' and 'sex_ontology_term_id' columns.
# NOTE(review): the path is absolute (under /home/jovyan), so this cell only
# runs on a machine where that exact file exists.
suppl_info = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/heart_cell_atlas/suppl_info_heart.csv')

In [109]:
# create a dictionary mapping each donor to its development stage ontology term id

In [110]:
# Build the donor -> development_stage_ontology_term_id lookup by pairing the two
# suppl_info columns row by row (a repeated donor keeps its last row's value).
mapping = {
    donor: stage_term
    for donor, stage_term in zip(
        suppl_info['donor'], suppl_info['development_stage_ontology_term_id']
    )
}

In [111]:
# add the development_stage_ontology_term_id column

In [112]:
# Look up each spot's development-stage term via its donor label.
# Series.map yields NaN for any donor missing from `mapping` (which is rebound
# several times in this notebook), so check that every spot received a term
# instead of silently carrying NaN forward.
adata.obs['development_stage_ontology_term_id'] = adata.obs['donor'].map(mapping)
assert adata.obs['development_stage_ontology_term_id'].notna().all(), \
    "development_stage_ontology_term_id is missing for some spots: check that every donor has a term in suppl_info"

In [113]:
# change datatype of the column

In [114]:
# Cast the mapped terms to pandas' categorical dtype, re-assigning the column.
# NOTE(review): presumably the dtype the target curation schema expects — confirm.
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [115]:
# view unique values of development_stage_ontology_term_id column

In [116]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000239']

In [117]:
# view adata.obs

In [118]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,aCM1_abundance,aCM2_abundance,aCM3_abundance,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,0.188451,0.449563,0.514563,0.936007,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,0.000458,0.000582,0.002587,0.000734,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,0.066988,0.012308,0.228417,3.620176,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,0.006749,0.131797,0.228558,0.126935,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,0.551456,1.024366,0.154369,4.667572,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,0.175014,0.104038,0.075937,0.027006,EFO:0010961,[],EC8_ln,CL:0002138,AV1,HsapDv:0000239
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,0.507058,0.069567,0.217755,0.843986,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,0.016823,0.197080,0.110074,0.185447,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,0.009529,0.382378,0.357928,0.007438,EFO:0010961,[],FB6,CL:0002548,AV1,HsapDv:0000239


#### **disease_ontology_term_id**

In [119]:
# Assign normal since all are healthy patients

In [120]:
# add the disease_ontology_term_id column

In [121]:
# Every spot gets the same disease term, so broadcast the single value down the column.
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [122]:
#change data type of column

In [123]:
# Cast the constant disease term column to pandas' categorical dtype.
# NOTE(review): presumably the dtype the target curation schema expects — confirm.
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [124]:
# view obs

In [125]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,aCM2_abundance,aCM3_abundance,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,0.449563,0.514563,0.936007,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,0.000582,0.002587,0.000734,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,0.012308,0.228417,3.620176,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,0.131797,0.228558,0.126935,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,1.024366,0.154369,4.667572,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,0.104038,0.075937,0.027006,EFO:0010961,[],EC8_ln,CL:0002138,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,0.069567,0.217755,0.843986,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,0.197080,0.110074,0.185447,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,0.382378,0.357928,0.007438,EFO:0010961,[],FB6,CL:0002548,AV1,HsapDv:0000239,PATO:0000461


#### **is_primary_data**

In [126]:
# Flag every spot as non-primary data by broadcasting a single False down the column.
adata.obs['is_primary_data'] = False

In [127]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,aCM3_abundance,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,0.514563,0.936007,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,0.002587,0.000734,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,0.228417,3.620176,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,0.228558,0.126935,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,0.154369,4.667572,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,0.075937,0.027006,EFO:0010961,[],EC8_ln,CL:0002138,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,0.217755,0.843986,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,0.110074,0.185447,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,0.357928,0.007438,EFO:0010961,[],FB6,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False


In [128]:
#change data type of column

In [129]:
# Explicitly cast the flag column to bool; it was filled with False for every
# spot in the is_primary_data assignment cell above.
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

#### **organism_ontology_term_id**

In [130]:
# assign organism id 

In [131]:
# All spots share one organism term, so broadcast the single value down the column.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [132]:
#change data type of column

In [133]:
# Cast the constant organism term column to pandas' categorical dtype.
# NOTE(review): presumably the dtype the target curation schema expects — confirm.
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [134]:
# view obs

In [135]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,0.936007,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,0.000734,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,3.620176,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,0.126935,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,4.667572,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,0.027006,EFO:0010961,[],EC8_ln,CL:0002138,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,0.843986,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,0.185447,EFO:0010961,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,0.007438,EFO:0010961,[],FB6,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [136]:
# create a dictionary mapping each donor to their ethnic origin

In [137]:
# Donor label -> ethnic origin label, hard-coded for the donors listed here.
# 'AV1' is the only donor present in this dataset (see the donor unique-values
# cell above).
# NOTE(review): the provenance of these values is not recorded in this notebook —
# confirm against the study's donor metadata.
mapping = {
    'D1': 'Caucasian',
    'D2': 'Caucasian',
    'D3': 'Caucasian',
    'D4': 'Caucasian',
    'D5': 'Caucasian',
    'D6': 'Caucasian',
    'D7': 'Caucasian',
    'D11': 'Caucasian',
    'H2': 'Caucasian',
    'H3': 'Asian',
    'H4': 'Caucasian',
    'H5': 'Caucasian',
    'H6': 'Caucasian',
    'H7': 'Caucasian',
    'A61': 'Caucasian',
    'AH1': 'Caucasian',
    'AH1-A61': 'unknown',
    'AH2': 'South Asian',
    'AV10': 'Caucasian',
    'AV13': 'Caucasian',
    'AV14': 'Caucasian',
    'AV3': 'Caucasian',
    'D8': 'Caucasian',
    'AH5': 'Caucasian',
    'AH6': 'Caucasian',
    'AV1': 'Caucasian',
}

In [138]:
# add ethinic_origin column

In [139]:
# Translate each spot's donor label into the donor's ethnic origin label.
# `mapping` is rebound several times in this notebook and Series.map returns NaN
# for unknown keys, so verify that every spot was matched before moving on.
adata.obs['ethinic_origin'] = adata.obs['donor'].map(mapping)
assert adata.obs['ethinic_origin'].notna().all(), \
    "some donors have no entry in the donor -> ethnic origin mapping"

In [140]:
# create a dictionary mapping each ethnic origin to its self_reported_ethnicity_ontology_term_id

In [141]:
# Ethnic origin label (as stored in adata.obs['ethinic_origin']) -> HANCESTRO term id.
# 'unknown' is passed through unchanged.
mapping = {
    'Caucasian': 'HANCESTRO:0005',
    'Asian': 'HANCESTRO:0008',
    'unknown': 'unknown',
    'South Asian': 'HANCESTRO:0006',
}

In [142]:
# add self_reported_ethnicity_ontology_term_id column

In [143]:
# Convert each spot's ethnic origin label into its ontology term id.
# Series.map leaves NaN for labels absent from `mapping` (for example if the cells
# were run out of order and `mapping` holds a different lookup), so fail loudly.
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['ethinic_origin'].map(mapping)
assert adata.obs['self_reported_ethnicity_ontology_term_id'].notna().all(), \
    "some 'ethinic_origin' values have no self_reported_ethnicity_ontology_term_id in the mapping"

In [144]:
# change data type

In [145]:
# Cast the mapped ethnicity terms to pandas' categorical dtype, re-assigning the column.
# NOTE(review): presumably the dtype the target curation schema expects — confirm.
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [146]:
# view obs

In [147]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,[],EC8_ln,CL:0002138,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,[],FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,[],FB6,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005


#### **sex_ontology_term_id**

In [148]:
# identify the column in adata.obs which corresponds to sex

In [149]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id', 'donor_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object', length=126)

In [150]:
# list the unique values 

In [151]:
list(adata.obs['gender'].unique())

['nan']

In [152]:
# create a dictionary mapping each donor to its sex ontology term id

In [153]:
# Build the donor -> sex_ontology_term_id lookup by pairing the two suppl_info
# columns row by row (a repeated donor keeps its last row's value).
mapping = {
    donor: sex_term
    for donor, sex_term in zip(
        suppl_info['donor'], suppl_info['sex_ontology_term_id']
    )
}

In [154]:
# add sex_ontology_term_id column

In [155]:
# Look up each spot's sex term via its donor label.
# Series.map yields NaN for any donor missing from `mapping` (which is rebound
# several times in this notebook), so check that every spot received a term.
adata.obs['sex_ontology_term_id'] = adata.obs['donor'].map(mapping)
assert adata.obs['sex_ontology_term_id'].notna().all(), \
    "sex_ontology_term_id is missing for some spots: check that every donor has a term in suppl_info"

In [156]:
# change data type

In [157]:
# Cast the mapped sex terms to pandas' categorical dtype, re-assigning the column.
# NOTE(review): presumably the dtype the target curation schema expects — confirm.
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [158]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,EC8_ln,CL:0002138,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,FB3,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,FB6,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383


#### **suspension_type**

In [159]:
# suspension_type is 'na' because this is a Visium dataset

In [160]:
# Every spot gets the same suspension type, so broadcast the single value down the column.
adata.obs['suspension_type'] = 'na'

In [161]:
# change data type

In [162]:
# Cast the constant suspension_type column to pandas' categorical dtype.
# NOTE(review): presumably the dtype the target curation schema expects — confirm.
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [163]:
# view obs

In [164]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,CL:0002138,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na


#### **tissue_ontology_term_id**

In [165]:
# identify the column in adata.obs which corresponds to tissue

In [166]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'cell_type_ontology_term_id', 'donor_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type'],
      dtype='object', length=128)

In [167]:
# list unique values

In [168]:
list(adata.obs['region'].unique())

['SAN']

In [169]:
# create a dictionary with tissue and corresponding tissue ontology term id

In [170]:
# Region code (as found in adata.obs['region']) -> UBERON tissue term id.
# Only 'SAN' occurs in this dataset (see the unique-values cell above); the other
# codes are carried in the lookup but unused here.
mapping = {
    'AX': 'UBERON:0002098',
    'LV': 'UBERON:0002084',
    'RV': 'UBERON:0002080',
    'LA': 'UBERON:0002079',
    'SP': 'UBERON:0002094',
    'RA': 'UBERON:0002078',
    'SAN': 'UBERON:0002351',
    'AVN': 'UBERON:0002352',
}

In [171]:
# add 'tissue_ontology_term_id' column

In [172]:
# Translate each spot's region code into its tissue ontology term id.
# Series.map returns NaN for region codes absent from `mapping` (which is rebound
# several times in this notebook), so verify that every spot was matched.
adata.obs['tissue_ontology_term_id'] = adata.obs['region'].map(mapping)
assert adata.obs['tissue_ontology_term_id'].notna().all(), \
    "some 'region' values have no tissue_ontology_term_id in the mapping"

In [173]:
# change data type of column

In [174]:
# Cast the mapped tissue terms to pandas' categorical dtype, re-assigning the column.
# NOTE(review): presumably the dtype the target curation schema expects — confirm.
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [175]:
#list the unique values in 'tissue_ontology_term_id' column

In [176]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002351']

In [177]:
# view obs

In [178]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,0,16,5023,8.521982,14085.0,9.552937,HCAHeartST11702010,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACAAGTATCTCCCA-1,50,102,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACAATCTACTAGCA-1,3,43,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACACCAATAACTGC-1,59,19,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACAGAGCGACTCCT-1,14,94,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,58,42,2707,7.903966,4686.0,8.452548,HCAHeartST11702010,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,60,30,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCCATACAACT-1,45,27,2435,7.798113,4555.0,8.424200,HCAHeartST11702010,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTGTATTACACG-1,73,41,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351


In [179]:
# Remove obs columns that are not kept in the curated object.
# NOTE(review): 'donor' and 'gender' are presumably superseded by the curated
# 'donor_id' / 'sex_ontology_term_id' columns — confirm before relying on it.
# Columns are deleted one at a time, in order, so a missing column raises
# KeyError at that point.
obs_columns_to_drop = [
    'donor',
    'gender',
    'highest_cell_Density_columns',
    'columns_with_multiple_max',
    'array_row',
    'array_col',
    'modality',
    'kit_10x',
]
for column_name in obs_columns_to_drop:
    del adata.obs[column_name]

#### **obsm (Embeddings)**

In [180]:
# Display the obsm container to see which keys are present before editing it.
adata.obsm

AxisArrays with keys: MT, X_spatial, means_cell_abundance_w_sf, prop, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, stds_cell_abundance_w_sf

In [181]:
# List the obsm keys only.
adata.obsm.keys()

KeysView(AxisArrays with keys: MT, X_spatial, means_cell_abundance_w_sf, prop, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, stds_cell_abundance_w_sf)

In [182]:
# Replace this obsm entry with its `.values` attribute
# (presumably a DataFrame -> NumPy array conversion — confirm).
adata.obsm['means_cell_abundance_w_sf']= adata.obsm['means_cell_abundance_w_sf'].values

In [183]:
# Replace this obsm entry with its `.values` attribute
# (presumably a DataFrame -> NumPy array conversion — confirm).
adata.obsm['stds_cell_abundance_w_sf'] = adata.obsm['stds_cell_abundance_w_sf'].values

In [184]:
# Replace this obsm entry with its `.values` attribute
# (presumably a DataFrame -> NumPy array conversion — confirm).
adata.obsm['prop']=adata.obsm['prop'].values

In [185]:
# Replace this obsm entry with its `.values` attribute
# (presumably a DataFrame -> NumPy array conversion — confirm).
adata.obsm['q05_cell_abundance_w_sf'] = adata.obsm['q05_cell_abundance_w_sf'].values

In [186]:
# Replace this obsm entry with its `.values` attribute
# (presumably a DataFrame -> NumPy array conversion — confirm).
adata.obsm['q95_cell_abundance_w_sf']= adata.obsm['q95_cell_abundance_w_sf'].values

In [187]:
# Store each cell-abundance / proportion matrix a second time under an
# 'X_'-prefixed key. The un-prefixed originals are removed in a later cell.
# Pairs are processed in order, so the new keys are appended to obsm in the
# same sequence as listed here.
obsm_key_pairs = (
    ('means_cell_abundance_w_sf', 'X_means_cell_abundance_w_sf'),
    ('prop', 'X_prop'),
    ('q05_cell_abundance_w_sf', 'X_q05_cell_abundance_w_sf'),
    ('q95_cell_abundance_w_sf', 'X_q95_cell_abundance_w_sf'),
    ('stds_cell_abundance_w_sf', 'X_stds_cell_abundance_w_sf'),
)
for source_key, prefixed_key in obsm_key_pairs:
    adata.obsm[prefixed_key] = adata.obsm[source_key]

In [188]:
# Re-display obsm to confirm the 'X_'-prefixed keys were added.
adata.obsm

AxisArrays with keys: MT, X_spatial, means_cell_abundance_w_sf, prop, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, stds_cell_abundance_w_sf, X_means_cell_abundance_w_sf, X_prop, X_q05_cell_abundance_w_sf, X_q95_cell_abundance_w_sf, X_stds_cell_abundance_w_sf

In [189]:
# Remove 'MT' and the un-prefixed abundance/proportion entries from obsm,
# leaving 'X_spatial' and the 'X_'-prefixed copies. Keys are deleted one at a
# time, in order, so a missing key raises KeyError at that point.
obsm_keys_to_drop = [
    'MT',
    'means_cell_abundance_w_sf',
    'prop',
    'q05_cell_abundance_w_sf',
    'q95_cell_abundance_w_sf',
    'stds_cell_abundance_w_sf',
]
for obsm_key in obsm_keys_to_drop:
    del adata.obsm[obsm_key]

#### **uns (Dataset Metadata)**

In [190]:
# Display uns. The output includes the full 'spatial' image arrays, so it is long.
adata.uns

OverloadedDict, wrapping:
	{'log1p': {}, 'spatial': {'HCAHeartST11702010': {'images': {'hires': array([[[0.9490196 , 0.9254902 , 0.92941177],
        [0.9490196 , 0.9254902 , 0.92941177],
        [0.9490196 , 0.9254902 , 0.92941177],
        ...,
        [0.93333334, 0.90588236, 0.92156863],
        [0.93333334, 0.90588236, 0.92156863],
        [0.93333334, 0.90588236, 0.92156863]],

       [[0.9490196 , 0.9254902 , 0.92941177],
        [0.9490196 , 0.9254902 , 0.92941177],
        [0.9490196 , 0.9254902 , 0.92941177],
        ...,
        [0.93333334, 0.90588236, 0.92156863],
        [0.93333334, 0.90588236, 0.92156863],
        [0.93333334, 0.90588236, 0.92156863]],

       [[0.9490196 , 0.9254902 , 0.92941177],
        [0.9490196 , 0.9254902 , 0.92941177],
        [0.9490196 , 0.9254902 , 0.92941177],
        ...,
        [0.93333334, 0.90588236, 0.92156863],
        [0.93333334, 0.90588236, 0.92156863],
        [0.93333334, 0.90588236, 0.92156863]],

       ...,

       [[0.9411765

In [191]:
# Store a free-text description of the tissue image under uns['image_caption'].
adata.uns['image_caption'] = 'Shown here is an image of 10 μm thick slice of sinoatrial node region of the adult human heart stained with H&E'

In [192]:
# Store the dataset title string under uns['title'].
adata.uns['title'] = 'Visium spatial - HCAHeartST11702010 (FFPE)'

In [193]:
# Point uns['default_embedding'] at 'X_spatial', a key that is present in adata.obsm.
adata.uns['default_embedding'] = 'X_spatial'

In [194]:
# List the uns keys to confirm the three entries added above are present.
adata.uns.keys()

dict_keys(['log1p', 'spatial', 'image_caption', 'title', 'default_embedding'])

### **Final checks and adjustments**

In [195]:
# Display the AnnData summary after the curation edits.
adata

AnnData object with n_obs × n_vars = 3797 × 17912
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'sangerID', 'region', 'donor_type', 'age', 'facility', 'flushed', 'annotation_final', 'prop_Adip1', 'prop_Adip2', 'prop_B', 'prop_B_plasma', 'prop_CD14+Mo', 'prop_CD16+Mo', 'prop_CD4+T_act', 'prop_CD4+T_naive', 'prop_CD8+T_cytox', 'prop_CD8+T_em', 'prop_CD8+T_te', 'prop_CD8+T_trans', 'prop_DC', 'prop_EC10_CMC-like', 'prop_EC1_cap', 'prop_EC2_cap', 'prop_EC3_cap', 'prop_EC4_immune', 'prop_EC5_art', 'prop_EC6_ven', 'prop_EC7_endocardial', 'prop_EC8_ln', 'prop_FB1', 'prop_FB2', 'prop_FB3', 'prop_FB4_activated', 'prop_FB5', 'prop_FB6', 'prop_ILC', 'prop_LYVE1+IGF1+MP', 'prop_LYVE1+MP_cycling', 'prop_LYVE1+TIMD4+MP', 'prop_MAIT-like', 'prop_Mast', 'prop_MoMP', 'prop_NC1_glial', 'prop_NC2_glial_NGF+', 'prop_NK_CD16hi', 'prop_NK_CD56hi', 'prop_Neut', 'prop_PC2_atria', 'prop_PC3_str', 'prop_SAN_P_cell', 'prop_SMC1_basic', 'prop_SMC2_art', 'prop_aCM1',

In [196]:
# List the dtype of every obs column ahead of the downcasting cells below.
adata.obs.dtypes

n_genes_by_counts                              int32
log1p_n_genes_by_counts                      float64
total_counts                                 float32
log1p_total_counts                           float32
sangerID                                    category
                                              ...   
ethinic_origin                              category
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
suspension_type                             category
tissue_ontology_term_id                     category
Length: 121, dtype: object

In [197]:
# Downcast 64-bit numeric columns of adata.var to their 32-bit counterparts.
# The dtypes are captured once up front, so the int64 selection is not
# affected by the float conversions that run first.
var_dtypes = adata.var.dtypes
var_float64_columns = [name for name, kind in var_dtypes.items() if kind == 'float64']
var_int64_columns = [name for name, kind in var_dtypes.items() if kind == 'int64']

# float64 -> float32
for c in var_float64_columns:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")

# int64 -> int32
for c in var_int64_columns:
    adata.var[c] = adata.var[c].astype('int32')
    print(f"changed {c} from int64 to int32")

In [198]:
# Normalise adata.obs column dtypes: 64-bit numerics become 32-bit and
# object columns become categoricals. The dtypes are captured once up front,
# so each selection reflects the state before any conversion in this cell.
obs_dtypes = adata.obs.dtypes
obs_float64_columns = [name for name, kind in obs_dtypes.items() if kind == 'float64']
obs_int64_columns = [name for name, kind in obs_dtypes.items() if kind == 'int64']
obs_object_columns = [name for name, kind in obs_dtypes.items() if kind == 'object']

# float64 -> float32
for c in obs_float64_columns:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")

# int64 -> int32
for c in obs_int64_columns:
    adata.obs[c] = adata.obs[c].astype('int32')
    print(f"changed {c} from int64 to int32")

# object -> category
for c in obs_object_columns:
    adata.obs[c] = adata.obs[c].astype('category')
    print(f"changed {c} from object to category")

changed log1p_n_genes_by_counts from float64 to float32
changed prop_Adip1 from float64 to float32
changed prop_Adip2 from float64 to float32
changed prop_B from float64 to float32
changed prop_B_plasma from float64 to float32
changed prop_CD14+Mo from float64 to float32
changed prop_CD16+Mo from float64 to float32
changed prop_CD4+T_act from float64 to float32
changed prop_CD4+T_naive from float64 to float32
changed prop_CD8+T_cytox from float64 to float32
changed prop_CD8+T_em from float64 to float32
changed prop_CD8+T_te from float64 to float32
changed prop_CD8+T_trans from float64 to float32
changed prop_DC from float64 to float32
changed prop_EC10_CMC-like from float64 to float32
changed prop_EC1_cap from float64 to float32
changed prop_EC2_cap from float64 to float32
changed prop_EC3_cap from float64 to float32
changed prop_EC4_immune from float64 to float32
changed prop_EC5_art from float64 to float32
changed prop_EC6_ven from float64 to float32
changed prop_EC7_endocardial from

In [199]:
# Display obs after the dtype conversions above.
adata.obs

Unnamed: 0_level_0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor_type,age,facility,flushed,...,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,5023,8.521981,14085.0,9.552937,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACAAGTATCTCCCA-1,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACAATCTACTAGCA-1,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACACCAATAACTGC-1,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACAGAGCGACTCCT-1,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,2707,7.903965,4686.0,8.452548,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCCATACAACT-1,2435,7.798112,4555.0,8.424200,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTGTATTACACG-1,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351


In [200]:
# List the obs column names remaining after the deletions.
adata.obs.columns

Index(['n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
       'log1p_total_counts', 'sangerID', 'region', 'donor_type', 'age',
       'facility', 'flushed',
       ...
       'donor_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=121)

In [201]:
# Display var (gene-level annotations, indexed by gene_ids).
adata.var

Unnamed: 0_level_0,feature_is_filtered
gene_ids,Unnamed: 1_level_1
ENSG00000187634,False
ENSG00000188976,False
ENSG00000187961,False
ENSG00000187583,False
ENSG00000187642,False
...,...
ENSG00000160298,False
ENSG00000160299,False
ENSG00000160305,False
ENSG00000160307,False


In [202]:
# Re-display obs (no changes since the previous obs display above).
adata.obs

Unnamed: 0_level_0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor_type,age,facility,flushed,...,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702010_AAACAACGAATAGTTC-1,5023,8.521981,14085.0,9.552937,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACAAGTATCTCCCA-1,1888,7.543803,3137.0,8.051341,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACAATCTACTAGCA-1,4784,8.473241,14628.0,9.590761,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACACCAATAACTGC-1,2428,7.795235,4629.0,8.440312,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_AAACAGAGCGACTCCT-1,3969,8.286521,9764.0,9.186560,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,2707,7.903965,4686.0,8.452548,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,2518,7.831617,4859.0,8.488793,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCCATACAACT-1,2435,7.798112,4555.0,8.424200,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTGTATTACACG-1,929,6.835185,1201.0,7.091742,HCAHeartST11702010,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351


In [203]:
# Re-display the obs column names (no changes since the previous listing above).
adata.obs.columns

Index(['n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
       'log1p_total_counts', 'sangerID', 'region', 'donor_type', 'age',
       'facility', 'flushed',
       ...
       'donor_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=121)

In [204]:
# check the format of the expression matrix

In [205]:
adata.X

<3797x17912 sparse matrix of type '<class 'numpy.float32'>'
	with 12820350 stored elements in Compressed Sparse Row format>

In [206]:
# Inspect the matrix of `araw` (created earlier in the notebook) before it is
# assigned to adata.raw below.
araw.X

<3797x17912 sparse matrix of type '<class 'numpy.float32'>'
	with 12820350 stored elements in Compressed Sparse Row format>

In [207]:
#Copy raw counts to adata.raw

In [208]:
adata.raw = araw

In [209]:
# write the curated object to the final_objects folder

In [210]:
# Write the curated AnnData to an .h5ad file with gzip compression.
# NOTE(review): the destination is a hard-coded absolute path; consider a
# configurable output directory defined near the top of the notebook.
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/final_objects/HCAHeartST11702010(FFPE).h5ad', compression = 'gzip')