### **Curating HCAHeartST11702008(FFPE).h5ad**

Article: Spatially resolved multiomics of human cardiac niches

DOI: https://doi.org/10.1038/s41586-023-06311-1 

Data Source : https://www.heartcellatlas.org

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
# Location of the AnnData file being curated in this notebook.
adata_path = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/data/HCAHeartST11702008.h5ad'
adata = sc.read_h5ad(adata_path)

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 3533 × 17922
    obs: 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'sangerID', 'region', 'donor', 'donor_type', 'age', 'gender', 'facility', 'modality', 'kit_10x', 'flushed', 'annotation_final', 'prop_Adip1', 'prop_Adip2', 'prop_B', 'prop_B_plasma', 'prop_CD14+Mo', 'prop_CD16+Mo', 'prop_CD4+T_act', 'prop_CD4+T_naive', 'prop_CD8+T_cytox', 'prop_CD8+T_em', 'prop_CD8+T_te', 'prop_CD8+T_trans', 'prop_DC', 'prop_EC10_CMC-like', 'prop_EC1_cap', 'prop_EC2_cap', 'prop_EC3_cap', 'prop_EC4_immune', 'prop_EC5_art', 'prop_EC6_ven', 'prop_EC7_endocardial', 'prop_EC8_ln', 'prop_FB1', 'prop_FB2', 'prop_FB3', 'prop_FB4_activated', 'prop_FB5', 'prop_FB6', 'prop_ILC', 'prop_LYVE1+IGF1+MP', 'prop_LYVE1+MP_cycling', 'prop_LYVE1+TIMD4+MP', 'prop_MAIT-like', 'prop_Mast', 'prop_MoMP', 'prop_NC1_glial', 'prop_NC2_glial_NGF+', 'prop_NK_CD16hi', 'prop_NK_CD56hi', 'prop_Neut', 'prop_PC2_atria', 'prop_PC3_str',

##### **X- expression matrix**

In [7]:
# View the expression matrix of the anndata object

In [8]:
adata.X

<3533x17922 sparse matrix of type '<class 'numpy.float32'>'
	with 12257294 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether it holds normalized counts or raw counts. If the values are non-integer (e.g. 0.8626), they are normalized counts; if every value is a whole number (even when stored as float, e.g. 2.0), they are raw counts.

In [10]:
print(adata.X)

  (0, 1)	0.86263424
  (0, 4)	0.86263424
  (0, 5)	1.3187616
  (0, 12)	0.52158576
  (0, 13)	1.6308435
  (0, 14)	0.86263424
  (0, 16)	0.52158576
  (0, 18)	1.116482
  (0, 20)	0.52158576
  (0, 23)	1.116482
  (0, 24)	0.52158576
  (0, 25)	1.116482
  (0, 26)	0.86263424
  (0, 29)	0.52158576
  (0, 32)	1.4869279
  (0, 36)	0.52158576
  (0, 39)	0.86263424
  (0, 40)	1.116482
  (0, 45)	0.52158576
  (0, 47)	0.86263424
  (0, 49)	0.52158576
  (0, 67)	1.4869279
  (0, 68)	0.86263424
  (0, 72)	0.52158576
  (0, 75)	0.52158576
  :	:
  (3532, 17216)	4.7440205
  (3532, 17222)	2.5168512
  (3532, 17293)	2.5168512
  (3532, 17295)	2.5168512
  (3532, 17353)	2.5168512
  (3532, 17440)	3.1688046
  (3532, 17489)	2.5168512
  (3532, 17491)	2.5168512
  (3532, 17520)	2.5168512
  (3532, 17539)	4.0595393
  (3532, 17543)	2.5168512
  (3532, 17573)	2.5168512
  (3532, 17586)	2.5168512
  (3532, 17597)	2.5168512
  (3532, 17599)	2.5168512
  (3532, 17614)	2.5168512
  (3532, 17615)	2.5168512
  (3532, 17660)	2.5168512
  (3532, 17727)	

##### **Raw counts matrix**

In [11]:
# If X has normalized counts, check for the raw counts matrix.

In [12]:
#Here the raw counts are provided in a separate object, load the raw counts matrix

In [13]:
# Location of the separate AnnData file that holds the raw counts.
raw_counts_path = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/Raw_counts/visium-FFPE_adult-8reg_raw.h5ad'
araw = sc.read_h5ad(raw_counts_path)

In [14]:
# view raw object

In [15]:
araw

AnnData object with n_obs × n_vars = 11312 × 17922
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac', 'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'region_cell2loc'
    var: 'gene_ids', 'feature_types', 'genome', 'SYMBOL'
    uns: 'spatial'
    obsm: 'MT', 'means_cell_abundance_w_sf', 'q05_cell_abundance_w_sf', 'q95_cell_abundance_w_sf', 'spatial', 'stds_cell_abundance_w_sf'

In [16]:
araw.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,region,region_finest,age,gender,facility,cell_or_nuclei,modality,kit_10x,flushed,region_cell2loc
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,SAN-FFPE,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,SAN-FFPE,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,SAN-FFPE,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,SAN-FFPE,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,SAN-FFPE,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,SAN-FFPE,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,SAN-FFPE,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,SAN-FFPE,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,SAN-FFPE,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE


In [17]:
list(araw.obs['region'].unique())

['SAN-FFPE']

In [18]:
# view raw matrix

In [19]:
araw.X

<11312x17922 sparse matrix of type '<class 'numpy.float32'>'
	with 36178068 stored elements in Compressed Sparse Row format>

In [20]:
print(araw.X)

  (0, 1)	2.0
  (0, 4)	2.0
  (0, 5)	4.0
  (0, 12)	1.0
  (0, 13)	6.0
  (0, 14)	2.0
  (0, 16)	1.0
  (0, 18)	3.0
  (0, 20)	1.0
  (0, 23)	3.0
  (0, 24)	1.0
  (0, 25)	3.0
  (0, 26)	2.0
  (0, 29)	1.0
  (0, 32)	5.0
  (0, 36)	1.0
  (0, 39)	2.0
  (0, 40)	3.0
  (0, 45)	1.0
  (0, 47)	2.0
  (0, 49)	1.0
  (0, 67)	5.0
  (0, 68)	2.0
  (0, 72)	1.0
  (0, 75)	1.0
  :	:
  (11311, 17822)	1.0
  (11311, 17826)	6.0
  (11311, 17832)	1.0
  (11311, 17835)	1.0
  (11311, 17838)	1.0
  (11311, 17843)	1.0
  (11311, 17844)	1.0
  (11311, 17845)	1.0
  (11311, 17846)	3.0
  (11311, 17854)	1.0
  (11311, 17856)	2.0
  (11311, 17871)	1.0
  (11311, 17874)	11.0
  (11311, 17878)	2.0
  (11311, 17881)	2.0
  (11311, 17900)	1.0
  (11311, 17901)	3.0
  (11311, 17903)	2.0
  (11311, 17904)	1.0
  (11311, 17907)	1.0
  (11311, 17910)	3.0
  (11311, 17911)	8.0
  (11311, 17913)	1.0
  (11311, 17914)	1.0
  (11311, 17919)	1.0


In [21]:
# Since the raw object combines several samples, extract the raw counts for this sample only.

In [22]:
# The raw object pools several samples; keep only the spots of this sample.
sample_spots = araw.obs['sample'] == 'HCAHeartST11702008'
araw = araw[sample_spots]
# Put the raw spots in exactly the same order as adata so that the raw counts
# stay row-aligned with the normalized matrix. Indexing by name raises a
# KeyError if any adata spot is missing from the raw object, instead of
# letting a mismatch pass silently.
araw = araw[adata.obs_names]

In [23]:
araw

View of AnnData object with n_obs × n_vars = 3533 × 17922
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac', 'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'region_cell2loc'
    var: 'gene_ids', 'feature_types', 'genome', 'SYMBOL'
    uns: 'spatial'
    obsm: 'MT', 'means_cell_abundance_w_sf', 'q05_cell_abundance_w_sf', 'q95_cell_abundance_w_sf', 'spatial', 'stds_cell_abundance_w_sf'

##### **Variables(var)**

In [24]:
#View the var of anndata and raw object

In [25]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome
SYMBOL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SAMD11,ENSG00000187634,Gene Expression,GRCh38
NOC2L,ENSG00000188976,Gene Expression,GRCh38
KLHL17,ENSG00000187961,Gene Expression,GRCh38
PLEKHN1,ENSG00000187583,Gene Expression,GRCh38
PERM1,ENSG00000187642,Gene Expression,GRCh38
...,...,...,...
C21orf58,ENSG00000160298,Gene Expression,GRCh38
PCNT,ENSG00000160299,Gene Expression,GRCh38
DIP2A,ENSG00000160305,Gene Expression,GRCh38
S100B,ENSG00000160307,Gene Expression,GRCh38


In [26]:
araw.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [27]:
# Check the index column of var. Check whether ensembl ids are provided in the index column of var or not.

In [28]:
# If the Ensembl IDs are not in the index column but are present in another column, set that Ensembl ID column as the index column.

In [29]:
# Keep the gene symbols (currently the var index) in their own column
# before the index is replaced.
adata.var['gene_symbols'] = adata.var.index

In [30]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
SYMBOL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SAMD11,ENSG00000187634,Gene Expression,GRCh38,SAMD11
NOC2L,ENSG00000188976,Gene Expression,GRCh38,NOC2L
KLHL17,ENSG00000187961,Gene Expression,GRCh38,KLHL17
PLEKHN1,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
PERM1,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
C21orf58,ENSG00000160298,Gene Expression,GRCh38,C21orf58
PCNT,ENSG00000160299,Gene Expression,GRCh38,PCNT
DIP2A,ENSG00000160305,Gene Expression,GRCh38,DIP2A
S100B,ENSG00000160307,Gene Expression,GRCh38,S100B


In [31]:
# Use the Ensembl IDs stored in the 'gene_ids' column as the var index.
adata.var_names = adata.var['gene_ids']

In [32]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [33]:
# load the approved genes file

In [34]:
# CSV listing the approved genes; its feature_id column is used for filtering.
approved_genes_csv = '/home/jovyan/CXG_DATASETS_PORTAL/gene_info/genes_approved.csv'
approved_genes = pd.read_csv(approved_genes_csv)

In [35]:
# Create a lookup dictionary keyed by the feature_id column of the approved genes file (the value 1 is only a placeholder).

In [36]:
# Map every approved feature_id to the placeholder value 1; the gene
# filtering step only tests key membership.
genedict = dict.fromkeys(approved_genes['feature_id'], 1)

In [37]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [38]:
len(genedict)

116184

In [39]:
# Filter out the genes which are not in the approved genes file

In [40]:
def _approved_ids(var_names):
    """Return the entries of var_names that are keys of genedict, in their original order."""
    kept = []
    for gene_id in var_names:
        if gene_id in genedict:
            kept.append(gene_id)
    return kept


var_to_keep_adata = _approved_ids(adata.var_names)
var_to_keep_araw = _approved_ids(araw.var_names)

In [41]:
len(var_to_keep_adata)

17912

In [42]:
len(var_to_keep_araw)

17912

In [43]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [44]:
araw.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [45]:
# Modify both AnnData objects by keeping only the approved genes. (The original index values were already copied to the gene_symbols column earlier.)

In [46]:
# Subset both objects to the approved genes. Indexing yields a view, so
# .copy() is used to obtain independent AnnData objects that can be edited.
adata = adata[:, var_to_keep_adata].copy()
araw = araw[:, var_to_keep_araw].copy()

In [47]:
#  View the var

In [48]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [49]:
araw.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


#### **feature_is_filtered**

In [50]:
# Assign False since the feature was not filtered out in the normalized matrix (X).

In [51]:
# Broadcast a single False to every gene: none were filtered out of X.
adata.var['feature_is_filtered'] = False

In [52]:
#View var

In [53]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11,False
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L,False
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17,False
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1,False
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1,False
...,...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58,False
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT,False
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A,False
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B,False


In [54]:
araw.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [55]:
#  Delete the unwanted columns in adata and araw.

In [56]:
# Remove every annotation column from the raw var table, leaving only the index.
for unwanted_column in ('gene_ids', 'SYMBOL', 'feature_types', 'genome'):
    del araw.var[unwanted_column]

In [57]:
# Remove the original annotation columns from adata.var; feature_is_filtered is kept.
for unwanted_column in ('gene_ids', 'gene_symbols', 'feature_types', 'genome'):
    del adata.var[unwanted_column]

In [58]:
# view var

In [59]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
gene_ids,Unnamed: 1_level_1
ENSG00000187634,False
ENSG00000188976,False
ENSG00000187961,False
ENSG00000187583,False
ENSG00000187642,False
...,...
ENSG00000160298,False
ENSG00000160299,False
ENSG00000160305,False
ENSG00000160307,False


In [60]:
araw.var

ENSG00000187634
ENSG00000188976
ENSG00000187961
ENSG00000187583
ENSG00000187642
...
ENSG00000160298
ENSG00000160299
ENSG00000160305
ENSG00000160307
ENSG00000160310


#### **Observations(obs) (Cell metadata)**

In [61]:
#view obs

In [62]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,Neut_abundance,PC2_atria_abundance,PC3_str_abundance,SAN_P_cell_abundance,SMC1_basic_abundance,SMC2_art_abundance,aCM1_abundance,aCM2_abundance,aCM3_abundance,aCM4_abundance
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,0.272716,0.038873,0.807573,0.475853,0.139569,0.253794,0.308251,0.035986,0.883052,8.300804
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,0.091488,0.012746,0.005400,0.021924,0.006722,0.072508,0.043105,0.376688,0.009017,0.099823
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,0.015911,0.005320,0.008204,0.116061,0.032636,0.023831,0.035329,0.202949,0.020117,0.039279
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,0.036688,0.010357,0.062135,0.168132,0.051892,0.155938,0.485509,1.166382,0.077411,4.716860
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,0.036299,0.010622,0.022249,0.044962,0.363823,0.014000,0.520046,2.091609,0.012129,1.431818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,0.008482,0.018499,0.011757,0.115073,0.065241,0.031092,0.028939,0.218065,0.058754,0.271111
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,0.021980,0.029552,0.050885,0.169612,0.034588,0.009102,0.033021,0.114498,0.051356,0.538820
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,0.198908,0.048121,0.414469,0.119339,0.039221,0.058593,0.211054,0.085760,0.371442,5.962911
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,0.051066,0.031652,0.120353,0.056511,0.085419,0.033264,0.172424,1.122119,0.021544,2.080328


In [63]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'Neut_abundance', 'PC2_atria_abundance', 'PC3_str_abundance',
       'SAN_P_cell_abundance', 'SMC1_basic_abundance', 'SMC2_art_abundance',
       'aCM1_abundance', 'aCM2_abundance', 'aCM3_abundance', 'aCM4_abundance'],
      dtype='object', length=115)

#### **assay_ontology_term_id**

In [64]:
# identify the column in adata which corresponds to assay

In [65]:
list(adata.obs['kit_10x'].unique())

['Visium-v1-FFPE']

In [66]:
# add the assay_ontology_term_id column

In [67]:
# Every spot in this object has the same kit_10x value, so a single
# assay term is broadcast to all rows.
adata.obs['assay_ontology_term_id'] = 'EFO:0010961'

In [68]:
# change datatype of the column

In [69]:
# Store the assay term as a categorical column.
assay_terms = adata.obs['assay_ontology_term_id']
adata.obs['assay_ontology_term_id'] = assay_terms.astype('category')

In [70]:
# view adata.obs

In [71]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,PC2_atria_abundance,PC3_str_abundance,SAN_P_cell_abundance,SMC1_basic_abundance,SMC2_art_abundance,aCM1_abundance,aCM2_abundance,aCM3_abundance,aCM4_abundance,assay_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,0.038873,0.807573,0.475853,0.139569,0.253794,0.308251,0.035986,0.883052,8.300804,EFO:0010961
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,0.012746,0.005400,0.021924,0.006722,0.072508,0.043105,0.376688,0.009017,0.099823,EFO:0010961
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,0.005320,0.008204,0.116061,0.032636,0.023831,0.035329,0.202949,0.020117,0.039279,EFO:0010961
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,0.010357,0.062135,0.168132,0.051892,0.155938,0.485509,1.166382,0.077411,4.716860,EFO:0010961
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,0.010622,0.022249,0.044962,0.363823,0.014000,0.520046,2.091609,0.012129,1.431818,EFO:0010961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,0.018499,0.011757,0.115073,0.065241,0.031092,0.028939,0.218065,0.058754,0.271111,EFO:0010961
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,0.029552,0.050885,0.169612,0.034588,0.009102,0.033021,0.114498,0.051356,0.538820,EFO:0010961
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,0.048121,0.414469,0.119339,0.039221,0.058593,0.211054,0.085760,0.371442,5.962911,EFO:0010961
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,0.031652,0.120353,0.056511,0.085419,0.033264,0.172424,1.122119,0.021544,2.080328,EFO:0010961


#### **cell_type_ontology_term_id**

In [72]:
# get the column in adata.obs related to cell type annotation

In [73]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'PC2_atria_abundance', 'PC3_str_abundance', 'SAN_P_cell_abundance',
       'SMC1_basic_abundance', 'SMC2_art_abundance', 'aCM1_abundance',
       'aCM2_abundance', 'aCM3_abundance', 'aCM4_abundance',
       'assay_ontology_term_id'],
      dtype='object', length=116)

In [74]:
adata.obsm

AxisArrays with keys: MT, X_spatial, means_cell_abundance_w_sf, prop, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, stds_cell_abundance_w_sf

In [75]:
adata.obsm['means_cell_abundance_w_sf']

Unnamed: 0_level_0,meanscell_abundance_w_sf_Adip1,meanscell_abundance_w_sf_Adip2,meanscell_abundance_w_sf_B,meanscell_abundance_w_sf_B_plasma,meanscell_abundance_w_sf_CD14+Mo,meanscell_abundance_w_sf_CD16+Mo,meanscell_abundance_w_sf_CD4+T_act,meanscell_abundance_w_sf_CD4+T_naive,meanscell_abundance_w_sf_CD8+T_cytox,meanscell_abundance_w_sf_CD8+T_em,...,meanscell_abundance_w_sf_Neut,meanscell_abundance_w_sf_PC2_atria,meanscell_abundance_w_sf_PC3_str,meanscell_abundance_w_sf_SAN_P_cell,meanscell_abundance_w_sf_SMC1_basic,meanscell_abundance_w_sf_SMC2_art,meanscell_abundance_w_sf_aCM1,meanscell_abundance_w_sf_aCM2,meanscell_abundance_w_sf_aCM3,meanscell_abundance_w_sf_aCM4
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,0.034087,0.014062,0.131219,0.204413,0.202151,0.055197,0.221306,0.234631,0.074122,0.182979,...,0.557784,0.206627,1.181703,0.628595,0.364029,0.371932,0.577294,0.119928,1.146386,8.548860
HCAHeartST11702008_AAACACCAATAACTGC-1,0.012364,0.013477,0.147598,0.100063,0.054316,0.039809,0.081068,0.103140,0.078240,0.065698,...,0.211951,0.084845,0.047951,0.059058,0.050788,0.119662,0.120812,0.434308,0.042894,0.156782
HCAHeartST11702008_AAACAGAGCGACTCCT-1,0.007575,0.004719,0.087094,0.007829,0.081405,0.031330,0.051603,0.063728,0.108002,0.056766,...,0.071347,0.047041,0.054377,0.159433,0.106617,0.054258,0.098117,0.249715,0.058340,0.079322
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,0.017588,0.018783,0.086528,0.183383,0.190777,0.064008,0.086374,0.083343,0.057750,0.069885,...,0.134450,0.075085,0.188253,0.269785,0.172557,0.239707,0.754950,1.302178,0.215141,4.904526
HCAHeartST11702008_AAACAGGGTCTATATT-1,0.004345,0.002594,0.077050,0.020789,0.066558,0.097732,0.071563,0.117570,0.067732,0.069363,...,0.117135,0.072369,0.108168,0.097635,0.537598,0.047743,0.739938,2.193104,0.060727,1.561950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,0.006748,0.006050,0.072308,0.015206,0.013481,0.009935,0.049556,0.043300,0.120438,0.051399,...,0.049979,0.100049,0.081720,0.165691,0.177858,0.073830,0.099811,0.277975,0.125305,0.347445
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,0.014115,0.011588,0.040330,0.023690,0.014876,0.085471,0.041050,0.037960,0.050517,0.040234,...,0.078828,0.094768,0.137466,0.213180,0.097833,0.028563,0.092342,0.154893,0.100258,0.600970
HCAHeartST11702008_TTGTTTCACATCCAGG-1,0.003854,0.004211,0.275567,0.134961,0.225735,0.029708,0.259745,0.204421,0.106343,0.184463,...,0.386002,0.188070,0.691112,0.204387,0.146474,0.120207,0.417416,0.169662,0.540152,6.144556
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,0.006100,0.006970,0.121688,0.039102,0.107918,0.098166,0.111263,0.151679,0.140548,0.100378,...,0.155624,0.134518,0.271182,0.111799,0.202389,0.077980,0.318174,1.206596,0.077782,2.214696


In [76]:
# For every spot (row), list the abundance column(s) whose value equals the row
# maximum; a list with more than one name means several columns tie for the top.
max_columns = adata.obsm['means_cell_abundance_w_sf'].apply(lambda x: x[x == x.max()].index.tolist(), axis=1)

In [77]:
max_columns

spot_id
HCAHeartST11702008_AAACAAGTATCTCCCA-1          [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702008_AAACACCAATAACTGC-1    [meanscell_abundance_w_sf_EC4_immune]
HCAHeartST11702008_AAACAGAGCGACTCCT-1           [meanscell_abundance_w_sf_FB4]
HCAHeartST11702008_AAACAGCTTTCAGAAG-1          [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702008_AAACAGGGTCTATATT-1          [meanscell_abundance_w_sf_aCM2]
                                                         ...                  
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1          [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1          [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702008_TTGTTTCACATCCAGG-1          [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702008_TTGTTTCATTAGTCTA-1          [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702008_TTGTTTGTGTAAATTC-1    [meanscell_abundance_w_sf_SAN_P_cell]
Length: 3533, dtype: object

In [78]:
# Keep a spot's list of top-scoring columns only when it holds more than one
# name, i.e. when several columns tie for the row maximum; otherwise store [].
# Counting how often a name occurs inside its own list cannot flag a tie: with
# unique column names every name occurs exactly once, so that test is always
# False and ties would go unnoticed.
columns_with_multiple_max = max_columns.apply(lambda cols: list(cols) if len(cols) > 1 else [])

In [79]:
columns_with_multiple_max 

spot_id
HCAHeartST11702008_AAACAAGTATCTCCCA-1    []
HCAHeartST11702008_AAACACCAATAACTGC-1    []
HCAHeartST11702008_AAACAGAGCGACTCCT-1    []
HCAHeartST11702008_AAACAGCTTTCAGAAG-1    []
HCAHeartST11702008_AAACAGGGTCTATATT-1    []
                                         ..
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1    []
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1    []
HCAHeartST11702008_TTGTTTCACATCCAGG-1    []
HCAHeartST11702008_TTGTTTCATTAGTCTA-1    []
HCAHeartST11702008_TTGTTTGTGTAAATTC-1    []
Length: 3533, dtype: object

In [80]:
# Attach the per-spot tie lists to obs for inspection.
# NOTE(review): this stores Python lists in an obs column. AnnData's h5ad writer
# may refuse list-valued object columns — confirm the column is dropped (or
# converted to strings) before the object is saved.
adata.obs['columns_with_multiple_max'] = columns_with_multiple_max

In [81]:
# Retain only the spots whose tie list is not empty.
non_empty_columns = columns_with_multiple_max.loc[columns_with_multiple_max.map(len).gt(0)]

In [82]:
len(non_empty_columns )

0

In [83]:
# Reduce each spot's list of top-scoring columns to one bare cell-state label:
# take the first column name and strip the shared abundance prefix; spots with
# an empty list get ''.
# NOTE: this rebinds `max_columns` from a Series of lists to a list of strings,
# so do not re-run this cell without first re-running the cell that builds the
# Series (a second pass would index into the label strings themselves).
max_columns = [
    '' if len(top_cols) == 0 else top_cols[0].replace('meanscell_abundance_w_sf_', '')
    for top_cols in max_columns
]

In [84]:
adata.obs['highest_cell_Density_columns'] = max_columns

In [85]:
adata.obs['highest_cell_Density_columns']

spot_id
HCAHeartST11702008_AAACAAGTATCTCCCA-1          aCM4
HCAHeartST11702008_AAACACCAATAACTGC-1    EC4_immune
HCAHeartST11702008_AAACAGAGCGACTCCT-1           FB4
HCAHeartST11702008_AAACAGCTTTCAGAAG-1          aCM4
HCAHeartST11702008_AAACAGGGTCTATATT-1          aCM2
                                            ...    
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1          aCM4
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1          aCM4
HCAHeartST11702008_TTGTTTCACATCCAGG-1          aCM4
HCAHeartST11702008_TTGTTTCATTAGTCTA-1          aCM4
HCAHeartST11702008_TTGTTTGTGTAAATTC-1    SAN_P_cell
Name: highest_cell_Density_columns, Length: 3533, dtype: object

In [86]:
list(adata.obs['highest_cell_Density_columns'].unique())

['aCM4',
 'EC4_immune',
 'FB4',
 'aCM2',
 'CD8+T_cytox',
 'ILC',
 'FB3',
 'Neut',
 'Adip1',
 'FB1',
 'aCM3',
 'DC',
 'NC1',
 'SAN_P_cell',
 'NC2',
 'EC10_CMC-like',
 'EC7_atria',
 'SMC2_art',
 'EC2_cap',
 'B',
 'PC2_atria',
 'EC5_art',
 'FB5',
 'Mast',
 'FB2',
 'FB6',
 'SMC1_basic',
 'EC1_cap',
 'NK_CD56hi',
 'aCM1',
 'EC3_cap',
 'EC8_ln',
 'PC3_str',
 'NK_CD16hi',
 'EC6_ven',
 'LYVE1+TIMD4+MP',
 'CD4+T_naive',
 'MoMP']

In [87]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['highest_cell_Density_columns']

In [88]:
# create a dictionary of cell type and ontology term

In [89]:
# Lookup from cell type / cell-state label to its 'CL:' ontology term ID.
# Several fine-grained states intentionally share one term (e.g. every aCM*
# key points at the same ID).
# NOTE: the name `mapping` is reassigned further down (donor -> development
# stage), so the cell-type `.map(mapping)` step must run before that cell.
mapping= {'aCM1': 'CL:0002129',
 'aCM2': 'CL:0002129',
 'aCM3': 'CL:0002129',
 'aCM4': 'CL:0002129',
 'aCM5': 'CL:0002129',
 'Adip1': 'CL:0000136',
 'Adip2': 'CL:0000136',
 'Adip3': 'CL:0000136',
 'Adip4': 'CL:0000136',
 'Adipocyte': 'CL:0000136',
 'Atrial Cardiomyocyte': 'CL:0002129',
 'AVN_bundle_cell': 'CL:0010005',
 'AVN_P_cell': 'CL:1000477',
 'B': 'CL:0000236',
 'B_plasma': 'CL:0000786',
 'CD14+Mo': 'CL:0001054',
 'CD16+Mo': 'CL:0002396',
 'CD4+T_act': 'CL:0000896',
 'CD4+T_naive': 'CL:0000895',
 'CD4+T_Th2': 'CL:0000546',
 'CD8+T_cytox': 'CL:0000794',
 'CD8+T_em': 'CL:0000913',
 'CD8+T_te': 'CL:0000625',
 'CD8+T_trans': 'CL:0000625',
 'DC': 'CL:0001056',
 'EC1_cap': 'CL:0002144',
 'EC10_CMC-like': 'CL:0000115',
 'EC2_cap': 'CL:0002144',
 'EC3_cap': 'CL:0002144',
 'EC4_immune': 'CL:0000115',
 'EC5_art': 'CL:1000413',
 'EC6_ven': 'CL:0002543',
 'EC7_atria': 'CL:0002350',
 'EC7_endocardial': 'CL:0002350',
 'EC8_ln': 'CL:0002138',
 'Endothelial cell': 'CL:0000115',
 'FB1': 'CL:0002548',
 'FB2': 'CL:0002548',
 'FB3': 'CL:0002548',
 'FB4': 'CL:0002548',
 'FB4_activated': 'CL:0002548',
 'FB5': 'CL:0002548',
 'FB6': 'CL:0002548',
 'Fibroblast': 'CL:0000057',
 'ILC': 'CL:0001065',
 'Lymphatic Endothelial cell': 'CL:0002138',
 'Lymphoid': 'CL:0000542',
 'LYVE1+IGF1+MP': 'CL:0000235',
 'LYVE1+MP_cycling': 'CL:0000235',
 'LYVE1+TIMD4+MP': 'CL:0000235',
 'MAIT-like': 'CL:0000940',
 'Mast': 'CL:0000097',
 'Mast cell': 'CL:0000097',
 'Meso': 'CL:0000077',
 'Mesothelial cell': 'CL:0000077',
 'MoMP': 'CL:0000576',
 'Mural cell': 'CL:0008034',
 'Myeloid': 'CL:0000763',
 'NC1': 'CL:0000125',
 'NC1_glial': 'CL:0000125',
 'NC2': 'CL:0000125',
 'NC2_glial_NGF+': 'CL:0000125',
 'Neural cell': 'CL:0002319',
 'Neut': 'CL:0000775',
 'NK_CD16hi': 'CL:0000939',
 'NK_CD56hi': 'CL:0000938',
 'PC1_vent': 'CL:0000669',
 'PC2_atria': 'CL:0000669',
 'PC3_str': 'CL:0000669',
 'PC4_CMC-like': 'CL:0000669',
 'SAN_P_cell': 'CL:1000477',
 'SMC1_basic': 'CL:0000192',
 'SMC2_art': 'CL:0002591',
 'T/NK_cycling': 'CL:0000814',
 'vCM1': 'CL:0002131',
 'vCM3_stressed': 'CL:0002131',
 'vCM4': 'CL:0002131',
 'Ventricular Cardiomyocyte': 'CL:0002131',
 'unclassified': 'CL:0000003'}

In [90]:
# add the cell_type_ontology_term_id column

In [91]:
# Translate each spot's dominant cell-state label into its ontology term ID.
# Mapping from the label column (rather than from the target column itself)
# keeps the cell idempotent: on a re-run the original form pushed the already
# mapped CL IDs through the lookup again and turned every value into NaN.
adata.obs['cell_type_ontology_term_id'] = adata.obs['highest_cell_Density_columns'].map(mapping)

# Fail loudly on labels missing from the lookup instead of silently writing NaN.
unmapped_cell_labels = sorted(
    adata.obs.loc[
        adata.obs['cell_type_ontology_term_id'].isna(), 'highest_cell_Density_columns'
    ].unique()
)
assert not unmapped_cell_labels, f"cell types without an ontology term: {unmapped_cell_labels}"

In [92]:
# change datatype of the column

In [93]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [94]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,SMC1_basic_abundance,SMC2_art_abundance,aCM1_abundance,aCM2_abundance,aCM3_abundance,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,0.139569,0.253794,0.308251,0.035986,0.883052,8.300804,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,0.006722,0.072508,0.043105,0.376688,0.009017,0.099823,EFO:0010961,[],EC4_immune,CL:0000115
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,0.032636,0.023831,0.035329,0.202949,0.020117,0.039279,EFO:0010961,[],FB4,CL:0002548
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,0.051892,0.155938,0.485509,1.166382,0.077411,4.716860,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,0.363823,0.014000,0.520046,2.091609,0.012129,1.431818,EFO:0010961,[],aCM2,CL:0002129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,0.065241,0.031092,0.028939,0.218065,0.058754,0.271111,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,0.034588,0.009102,0.033021,0.114498,0.051356,0.538820,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,0.039221,0.058593,0.211054,0.085760,0.371442,5.962911,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,0.085419,0.033264,0.172424,1.122119,0.021544,2.080328,EFO:0010961,[],aCM4,CL:0002129


#### **donor_id**

In [95]:
#identify the column in adata.obs which provides donor information

In [96]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'SMC1_basic_abundance', 'SMC2_art_abundance', 'aCM1_abundance',
       'aCM2_abundance', 'aCM3_abundance', 'aCM4_abundance',
       'assay_ontology_term_id', 'columns_with_multiple_max',
       'highest_cell_Density_columns', 'cell_type_ontology_term_id'],
      dtype='object', length=119)

In [97]:
list(adata.obs['donor'].unique())

['AV1']

In [98]:
# add the donor_id column

In [99]:
adata.obs['donor_id'] = adata.obs['donor']

In [100]:
# change datatype of the column

In [101]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [102]:
# view unique values of donor_id column

In [103]:
list(adata.obs['donor_id'].unique())

['AV1']

In [104]:
#view obs

In [105]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,SMC2_art_abundance,aCM1_abundance,aCM2_abundance,aCM3_abundance,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,0.253794,0.308251,0.035986,0.883052,8.300804,EFO:0010961,[],aCM4,CL:0002129,AV1
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,0.072508,0.043105,0.376688,0.009017,0.099823,EFO:0010961,[],EC4_immune,CL:0000115,AV1
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,0.023831,0.035329,0.202949,0.020117,0.039279,EFO:0010961,[],FB4,CL:0002548,AV1
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,0.155938,0.485509,1.166382,0.077411,4.716860,EFO:0010961,[],aCM4,CL:0002129,AV1
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,0.014000,0.520046,2.091609,0.012129,1.431818,EFO:0010961,[],aCM2,CL:0002129,AV1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,0.031092,0.028939,0.218065,0.058754,0.271111,EFO:0010961,[],aCM4,CL:0002129,AV1
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,0.009102,0.033021,0.114498,0.051356,0.538820,EFO:0010961,[],aCM4,CL:0002129,AV1
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,0.058593,0.211054,0.085760,0.371442,5.962911,EFO:0010961,[],aCM4,CL:0002129,AV1
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,0.033264,0.172424,1.122119,0.021544,2.080328,EFO:0010961,[],aCM4,CL:0002129,AV1


In [106]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'SMC2_art_abundance', 'aCM1_abundance', 'aCM2_abundance',
       'aCM3_abundance', 'aCM4_abundance', 'assay_ontology_term_id',
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id', 'donor_id'],
      dtype='object', length=120)

#### **development_stage_ontology_term_id**

In [107]:
# identify the column in adata which corresponds to age

In [108]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'SMC2_art_abundance', 'aCM1_abundance', 'aCM2_abundance',
       'aCM3_abundance', 'aCM4_abundance', 'assay_ontology_term_id',
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id', 'donor_id'],
      dtype='object', length=120)

In [109]:
suppl_info = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/heart_cell_atlas/suppl_info_heart.csv')

In [110]:
# Build the donor -> development-stage term lookup from the supplementary table.
# NOTE: this rebinds `mapping`, which previously held the cell-type lookup.
mapping = suppl_info.set_index('donor')['development_stage_ontology_term_id'].to_dict()

In [111]:
mapping

{'A61': 'HsapDv:0000242',
 'AH1': 'HsapDv:0000239',
 'AH1-A61': 'unknown',
 'AH2': 'HsapDv:0000239',
 'AH5': 'HsapDv:0000237',
 'AH6': 'HsapDv:0000240',
 'AV1': 'HsapDv:0000239',
 'AV10': 'HsapDv:0000237',
 'AV13': 'HsapDv:0000242',
 'AV14': 'HsapDv:0000239',
 'AV3': 'HsapDv:0000241',
 'D1': 'HsapDv:0000240',
 'D11': 'HsapDv:0000241',
 'D2': 'HsapDv:0000241',
 'D3': 'HsapDv:0000240',
 'D4': 'HsapDv:0000242',
 'D5': 'HsapDv:0000241',
 'D6': 'HsapDv:0000242',
 'D7': 'HsapDv:0000241',
 'D8': 'HsapDv:0000239',
 'H2': 'HsapDv:0000240',
 'H3': 'HsapDv:0000240',
 'H4': 'HsapDv:0000240',
 'H5': 'HsapDv:0000240',
 'H6': 'HsapDv:0000239',
 'H7': 'HsapDv:0000239'}

In [112]:
# add the development_stage_ontology_term_id column

In [113]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['donor'].map(mapping)

In [114]:
# change datatype of the column

In [115]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [116]:
# view unique values of development_stage_ontology_term_id column

In [117]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000239']

In [118]:
# view adata.obs

In [119]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,aCM1_abundance,aCM2_abundance,aCM3_abundance,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,0.308251,0.035986,0.883052,8.300804,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,0.043105,0.376688,0.009017,0.099823,EFO:0010961,[],EC4_immune,CL:0000115,AV1,HsapDv:0000239
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,0.035329,0.202949,0.020117,0.039279,EFO:0010961,[],FB4,CL:0002548,AV1,HsapDv:0000239
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,0.485509,1.166382,0.077411,4.716860,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,0.520046,2.091609,0.012129,1.431818,EFO:0010961,[],aCM2,CL:0002129,AV1,HsapDv:0000239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,0.028939,0.218065,0.058754,0.271111,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,0.033021,0.114498,0.051356,0.538820,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,0.211054,0.085760,0.371442,5.962911,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,0.172424,1.122119,0.021544,2.080328,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239


#### **disease_ontology_term_id**

In [120]:
# Assign normal since all are healthy patients

In [121]:
# add the disease_ontology_term_id column

In [122]:
# Every spot gets the same disease term; pandas broadcasts the scalar to all rows.
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [123]:
#change data type of column

In [124]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [125]:
# view obs

In [126]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,aCM2_abundance,aCM3_abundance,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,0.035986,0.883052,8.300804,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,0.376688,0.009017,0.099823,EFO:0010961,[],EC4_immune,CL:0000115,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,0.202949,0.020117,0.039279,EFO:0010961,[],FB4,CL:0002548,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,1.166382,0.077411,4.716860,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,2.091609,0.012129,1.431818,EFO:0010961,[],aCM2,CL:0002129,AV1,HsapDv:0000239,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,0.218065,0.058754,0.271111,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,0.114498,0.051356,0.538820,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,0.085760,0.371442,5.962911,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,1.122119,0.021544,2.080328,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461


#### **is_primary_data**

In [127]:
# Flag every spot as non-primary data; the scalar is broadcast to all rows.
adata.obs['is_primary_data'] = False

In [128]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,aCM3_abundance,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,0.883052,8.300804,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,0.009017,0.099823,EFO:0010961,[],EC4_immune,CL:0000115,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,0.020117,0.039279,EFO:0010961,[],FB4,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,0.077411,4.716860,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,0.012129,1.431818,EFO:0010961,[],aCM2,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,0.058754,0.271111,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,0.051356,0.538820,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,0.371442,5.962911,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,0.021544,2.080328,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False


In [129]:
#change data type of column

In [130]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

#### **organism_ontology_term_id**

In [131]:
# assign organism id 

In [132]:
# Same organism term for every spot; the scalar is broadcast to all rows.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [133]:
#change data type of column

In [134]:
# Convert the organism term column to a categorical dtype
organism_col = 'organism_ontology_term_id'
adata.obs[organism_col] = adata.obs[organism_col].astype('category')

In [135]:
# view obs

In [136]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,aCM4_abundance,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,8.300804,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,0.099823,EFO:0010961,[],EC4_immune,CL:0000115,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,0.039279,EFO:0010961,[],FB4,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,4.716860,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,1.431818,EFO:0010961,[],aCM2,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,0.271111,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,0.538820,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,5.962911,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,2.080328,EFO:0010961,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [137]:
# create a dictionary of donor and ethinic_origin

In [138]:
mapping= {'D1': 'Caucasian',
 'D2': 'Caucasian',
 'D3': 'Caucasian',
 'D4': 'Caucasian',
 'D5': 'Caucasian',
 'D6': 'Caucasian',
 'D7': 'Caucasian',
 'D11': 'Caucasian',
 'H2': 'Caucasian',
 'H3': 'Asian',
 'H4': 'Caucasian',
 'H5': 'Caucasian',
 'H6': 'Caucasian',
 'H7': 'Caucasian',
 'A61': 'Caucasian',
 'AH1': 'Caucasian',
 'AH1-A61': 'unknown',
 'AH2': 'South Asian',
 'AV10': 'Caucasian',
 'AV13': 'Caucasian',
 'AV14': 'Caucasian',
 'AV3': 'Caucasian',
 'D8': 'Caucasian',
 'AH5': 'Caucasian',
 'AH6': 'Caucasian',
 'AV1': 'Caucasian'}

In [139]:
# add ethinic_origin column

In [140]:
# Look up the ethnic origin of every row from its donor ID.
adata.obs['ethinic_origin'] = adata.obs['donor'].map(mapping)
# Series.map leaves NaN for any donor that has no entry in `mapping`;
# fail here instead of carrying missing values into the curated object.
assert adata.obs['ethinic_origin'].notna().all(), (
    "donors missing from the ethnic-origin mapping: "
    f"{adata.obs.loc[adata.obs['ethinic_origin'].isna(), 'donor'].unique().tolist()}"
)

In [141]:
# create a dictionary of ethinic_origin and self_reported_ethnicity_ontology_term_id

In [142]:
# Ethnic origin label -> self_reported_ethnicity_ontology_term_id value
# ('unknown' is passed through unchanged).
mapping = dict(zip(
    ('Caucasian', 'Asian', 'unknown', 'South Asian'),
    ('HANCESTRO:0005', 'HANCESTRO:0008', 'unknown', 'HANCESTRO:0006'),
))

In [143]:
# add self_reported_ethnicity_ontology_term_id column

In [144]:
# Translate the ethnic origin label of every row into its ontology term.
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['ethinic_origin'].map(mapping)
# Series.map leaves NaN for labels that have no entry in `mapping`;
# fail here instead of carrying missing values into the curated object.
assert adata.obs['self_reported_ethnicity_ontology_term_id'].notna().all(), (
    "ethnic origins missing from the ontology mapping: "
    f"{adata.obs.loc[adata.obs['self_reported_ethnicity_ontology_term_id'].isna(), 'ethinic_origin'].unique().tolist()}"
)

In [145]:
# change data type

In [146]:
# Convert the ethnicity term column to a categorical dtype
ethnicity_col = 'self_reported_ethnicity_ontology_term_id'
adata.obs[ethnicity_col] = adata.obs[ethnicity_col].astype('category')

In [147]:
# view unique values

In [148]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

['HANCESTRO:0005']

In [149]:
# view obs

In [150]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,[],EC4_immune,CL:0000115,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,[],FB4,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,[],aCM2,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,[],aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005


#### **sex_ontology_term_id**

In [151]:
# identify the column in adata.obs which corresponds to sex

In [152]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id', 'donor_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object', length=126)

In [153]:
# list the unique values 

In [154]:
list(adata.obs['gender'].unique())

['nan']

In [155]:
# create a dictionary of sex and sex ontology term id

In [156]:
# Donor ID -> sex ontology term, paired row by row from suppl_info
# (if a donor appears more than once, the last row wins)
mapping = {
    donor: sex_term
    for donor, sex_term in zip(suppl_info['donor'], suppl_info['sex_ontology_term_id'])
}

In [157]:
# add sex_ontology_term_id column

In [158]:
# Look up the sex ontology term of every row from its donor ID.
adata.obs['sex_ontology_term_id'] = adata.obs['donor'].map(mapping)
# Series.map leaves NaN for any donor that has no entry in `mapping`;
# fail here instead of carrying missing values into the curated object.
assert adata.obs['sex_ontology_term_id'].notna().all(), (
    "donors missing from the sex mapping: "
    f"{adata.obs.loc[adata.obs['sex_ontology_term_id'].isna(), 'donor'].unique().tolist()}"
)

In [159]:
# change data type

In [160]:
# Convert the sex term column to a categorical dtype
sex_col = 'sex_ontology_term_id'
adata.obs[sex_col] = adata.obs[sex_col].astype('category')

In [161]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,EC4_immune,CL:0000115,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,FB4,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,aCM2,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,aCM4,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383


In [162]:
list(adata.obs['sex_ontology_term_id'].unique())

['PATO:0000383']

#### **suspension_type**

In [163]:
# Visium is a spatial assay (no cell suspension), so suspension_type is 'na'

In [164]:
# Same value for every row: a scalar is broadcast over the whole column
adata.obs['suspension_type'] = 'na'

In [165]:
# change data type

In [166]:
# Convert the suspension type column to a categorical dtype
suspension_col = 'suspension_type'
adata.obs[suspension_col] = adata.obs[suspension_col].astype('category')

In [167]:
# view obs

In [168]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,CL:0000115,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,CL:0002548,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,CL:0002129,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na


#### **tissue_ontology_term_id**

In [169]:
# identify the column in adata.obs which corresponds to tissue

In [170]:
adata.obs.columns

Index(['array_row', 'array_col', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'sangerID', 'region', 'donor', 'donor_type',
       ...
       'cell_type_ontology_term_id', 'donor_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type'],
      dtype='object', length=128)

In [171]:
# list unique values

In [172]:
list(adata.obs['region'].unique())

['SAN']

In [173]:
# create a dictionary with tissue and corresponding tissue ontology term id

In [174]:
# Region code (values of the 'region' column) -> UBERON tissue ontology term
mapping = dict([
    ('AX', 'UBERON:0002098'),
    ('LV', 'UBERON:0002084'),
    ('RV', 'UBERON:0002080'),
    ('LA', 'UBERON:0002079'),
    ('SP', 'UBERON:0002094'),
    ('RA', 'UBERON:0002078'),
    ('SAN', 'UBERON:0002351'),
    ('AVN', 'UBERON:0002352'),
])

In [175]:
# add 'tissue_ontology_term_id' column

In [176]:
# Translate the region code of every row into its UBERON tissue term.
adata.obs['tissue_ontology_term_id'] = adata.obs['region'].map(mapping)
# Series.map leaves NaN for any region that has no entry in `mapping`;
# fail here instead of carrying missing values into the curated object.
assert adata.obs['tissue_ontology_term_id'].notna().all(), (
    "regions missing from the tissue mapping: "
    f"{adata.obs.loc[adata.obs['tissue_ontology_term_id'].isna(), 'region'].unique().tolist()}"
)

In [177]:
# change data type of column

In [178]:
# Convert the tissue term column to a categorical dtype
tissue_col = 'tissue_ontology_term_id'
adata.obs[tissue_col] = adata.obs[tissue_col].astype('category')

In [179]:
#list the unique values in 'tissue_ontology_term_id' column

In [180]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002351']

In [181]:
# view obs

In [182]:
adata.obs

Unnamed: 0_level_0,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor,donor_type,...,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,50,102,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACACCAATAACTGC-1,59,19,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGAGCGACTCCT-1,14,94,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,43,9,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGGGTCTATATT-1,47,13,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,24,64,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,31,77,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_TTGTTTCACATCCAGG-1,58,42,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,60,30,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,AV1,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351


In [183]:
# Remove obs columns that are not kept in the curated object.
# NOTE(review): the intermediate 'ethinic_origin' column is not in this list and
# therefore stays in the final obs — confirm that is intended.
for obsolete_col in ['donor', 'gender',
                     'highest_cell_Density_columns', 'columns_with_multiple_max',
                     'array_row', 'array_col',
                     'modality', 'kit_10x']:
    del adata.obs[obsolete_col]

#### **obsm (Embeddings)**

In [184]:
adata.obsm

AxisArrays with keys: MT, X_spatial, means_cell_abundance_w_sf, prop, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, stds_cell_abundance_w_sf

In [185]:
adata.obsm.keys()

KeysView(AxisArrays with keys: MT, X_spatial, means_cell_abundance_w_sf, prop, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, stds_cell_abundance_w_sf)

In [186]:
# Replace the obsm entry (presumably a DataFrame, since it exposes .values) with a plain array.
# np.asarray returns an ndarray unchanged, so re-running this cell no longer
# fails with AttributeError as `.values` did on the second run.
adata.obsm['means_cell_abundance_w_sf'] = np.asarray(adata.obsm['means_cell_abundance_w_sf'])

In [187]:
# Replace the obsm entry (presumably a DataFrame, since it exposes .values) with a plain array.
# np.asarray returns an ndarray unchanged, so re-running this cell no longer
# fails with AttributeError as `.values` did on the second run.
adata.obsm['stds_cell_abundance_w_sf'] = np.asarray(adata.obsm['stds_cell_abundance_w_sf'])

In [188]:
# Replace the obsm entry (presumably a DataFrame, since it exposes .values) with a plain array.
# np.asarray returns an ndarray unchanged, so re-running this cell no longer
# fails with AttributeError as `.values` did on the second run.
adata.obsm['prop'] = np.asarray(adata.obsm['prop'])

In [189]:
# Replace the obsm entry (presumably a DataFrame, since it exposes .values) with a plain array.
# np.asarray returns an ndarray unchanged, so re-running this cell no longer
# fails with AttributeError as `.values` did on the second run.
adata.obsm['q05_cell_abundance_w_sf'] = np.asarray(adata.obsm['q05_cell_abundance_w_sf'])

In [190]:
# Replace the obsm entry (presumably a DataFrame, since it exposes .values) with a plain array.
# np.asarray returns an ndarray unchanged, so re-running this cell no longer
# fails with AttributeError as `.values` did on the second run.
adata.obsm['q95_cell_abundance_w_sf'] = np.asarray(adata.obsm['q95_cell_abundance_w_sf'])

In [191]:
# Register each array a second time under an 'X_'-prefixed key
# (presumably because the target schema expects that prefix for embeddings — confirm).
for obsm_key in ('means_cell_abundance_w_sf',
                 'prop',
                 'q05_cell_abundance_w_sf',
                 'q95_cell_abundance_w_sf',
                 'stds_cell_abundance_w_sf'):
    adata.obsm['X_' + obsm_key] = adata.obsm[obsm_key]

In [192]:
adata.obsm

AxisArrays with keys: MT, X_spatial, means_cell_abundance_w_sf, prop, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, stds_cell_abundance_w_sf, X_means_cell_abundance_w_sf, X_prop, X_q05_cell_abundance_w_sf, X_q95_cell_abundance_w_sf, X_stds_cell_abundance_w_sf

In [193]:
# Remove 'MT' and the un-prefixed originals so only 'X_'-prefixed keys remain in obsm
for stale_key in ('MT',
                  'means_cell_abundance_w_sf',
                  'prop',
                  'q05_cell_abundance_w_sf',
                  'q95_cell_abundance_w_sf',
                  'stds_cell_abundance_w_sf'):
    del adata.obsm[stale_key]

In [194]:
adata.obsm

AxisArrays with keys: X_spatial, X_means_cell_abundance_w_sf, X_prop, X_q05_cell_abundance_w_sf, X_q95_cell_abundance_w_sf, X_stds_cell_abundance_w_sf

#### **uns (Dataset Metadata)**

In [195]:
adata.uns

OverloadedDict, wrapping:
	{'log1p': {}, 'spatial': {'HCAHeartST11702008': {'images': {'hires': array([[[0.94509804, 0.9254902 , 0.92941177],
        [0.94509804, 0.92156863, 0.92941177],
        [0.94509804, 0.92156863, 0.92941177],
        ...,
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ]],

       [[0.9490196 , 0.9254902 , 0.92941177],
        [0.94509804, 0.9254902 , 0.92941177],
        [0.94509804, 0.92156863, 0.92941177],
        ...,
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ]],

       [[0.9490196 , 0.9254902 , 0.92941177],
        [0.9490196 , 0.9254902 , 0.92941177],
        [0.94509804, 0.9254902 , 0.92941177],
        ...,
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ]],

       ...,

       [[0.9450980

In [196]:
# Free-text caption for the tissue image; the original wording had a double space and missing articles.
# NOTE(review): confirm that the 10 μm section thickness applies to this FFPE sample.
adata.uns['image_caption'] = 'Shown here is an image of a 10 μm thick slice of the sinoatrial node region of the adult human heart stained with H&E'

In [197]:
# Dataset title stored in uns: assay, sample ID and preservation type (FFPE)
adata.uns['title'] = 'Visium spatial - HCAHeartST11702008 (FFPE)'

In [198]:
# Name the embedding to use by default; 'X_spatial' is one of the keys present in adata.obsm
adata.uns['default_embedding'] = 'X_spatial'

In [199]:
adata.uns.keys()

dict_keys(['log1p', 'spatial', 'image_caption', 'title', 'default_embedding'])

### **Final checks and adjustments**

In [200]:
adata

AnnData object with n_obs × n_vars = 3533 × 17912
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'sangerID', 'region', 'donor_type', 'age', 'facility', 'flushed', 'annotation_final', 'prop_Adip1', 'prop_Adip2', 'prop_B', 'prop_B_plasma', 'prop_CD14+Mo', 'prop_CD16+Mo', 'prop_CD4+T_act', 'prop_CD4+T_naive', 'prop_CD8+T_cytox', 'prop_CD8+T_em', 'prop_CD8+T_te', 'prop_CD8+T_trans', 'prop_DC', 'prop_EC10_CMC-like', 'prop_EC1_cap', 'prop_EC2_cap', 'prop_EC3_cap', 'prop_EC4_immune', 'prop_EC5_art', 'prop_EC6_ven', 'prop_EC7_endocardial', 'prop_EC8_ln', 'prop_FB1', 'prop_FB2', 'prop_FB3', 'prop_FB4_activated', 'prop_FB5', 'prop_FB6', 'prop_ILC', 'prop_LYVE1+IGF1+MP', 'prop_LYVE1+MP_cycling', 'prop_LYVE1+TIMD4+MP', 'prop_MAIT-like', 'prop_Mast', 'prop_MoMP', 'prop_NC1_glial', 'prop_NC2_glial_NGF+', 'prop_NK_CD16hi', 'prop_NK_CD56hi', 'prop_Neut', 'prop_PC2_atria', 'prop_PC3_str', 'prop_SAN_P_cell', 'prop_SMC1_basic', 'prop_SMC2_art', 'prop_aCM1',

In [201]:
adata.obs.dtypes

n_genes_by_counts                              int32
log1p_n_genes_by_counts                      float64
total_counts                                 float32
log1p_total_counts                           float32
sangerID                                    category
                                              ...   
ethinic_origin                              category
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
suspension_type                             category
tissue_ontology_term_id                     category
Length: 121, dtype: object

In [202]:
# Downcast 64-bit numeric columns of adata.var to their 32-bit counterparts,
# reporting every column that is converted.
dty = adata.var.dtypes.to_frame(name='dtype')
for old_dtype, new_dtype in (('float64', 'float32'), ('int64', 'int32')):
    for c in dty[dty['dtype'] == old_dtype].index:
        adata.var[c] = adata.var[c].astype(new_dtype)
        print(f"changed {c} from {old_dtype} to {new_dtype}")

In [203]:
# Normalise adata.obs dtypes: 64-bit numerics become 32-bit and object columns
# become categorical; every converted column is reported.
dty = adata.obs.dtypes.to_frame(name='dtype')
for old_dtype, new_dtype in (('float64', 'float32'),
                             ('int64', 'int32'),
                             ('object', 'category')):
    for c in dty[dty['dtype'] == old_dtype].index:
        adata.obs[c] = adata.obs[c].astype(new_dtype)
        print(f"changed {c} from {old_dtype} to {new_dtype}")

changed log1p_n_genes_by_counts from float64 to float32
changed prop_Adip1 from float64 to float32
changed prop_Adip2 from float64 to float32
changed prop_B from float64 to float32
changed prop_B_plasma from float64 to float32
changed prop_CD14+Mo from float64 to float32
changed prop_CD16+Mo from float64 to float32
changed prop_CD4+T_act from float64 to float32
changed prop_CD4+T_naive from float64 to float32
changed prop_CD8+T_cytox from float64 to float32
changed prop_CD8+T_em from float64 to float32
changed prop_CD8+T_te from float64 to float32
changed prop_CD8+T_trans from float64 to float32
changed prop_DC from float64 to float32
changed prop_EC10_CMC-like from float64 to float32
changed prop_EC1_cap from float64 to float32
changed prop_EC2_cap from float64 to float32
changed prop_EC3_cap from float64 to float32
changed prop_EC4_immune from float64 to float32
changed prop_EC5_art from float64 to float32
changed prop_EC6_ven from float64 to float32
changed prop_EC7_endocardial from

In [204]:
adata.obs

Unnamed: 0_level_0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor_type,age,facility,flushed,...,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACACCAATAACTGC-1,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGAGCGACTCCT-1,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGGGTCTATATT-1,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_TTGTTTCACATCCAGG-1,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351


In [205]:
adata.obs.columns

Index(['n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
       'log1p_total_counts', 'sangerID', 'region', 'donor_type', 'age',
       'facility', 'flushed',
       ...
       'donor_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=121)

In [206]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
gene_ids,Unnamed: 1_level_1
ENSG00000187634,False
ENSG00000188976,False
ENSG00000187961,False
ENSG00000187583,False
ENSG00000187642,False
...,...
ENSG00000160298,False
ENSG00000160299,False
ENSG00000160305,False
ENSG00000160307,False


In [207]:
adata.obs

Unnamed: 0_level_0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,sangerID,region,donor_type,age,facility,flushed,...,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,4557,8.424639,14605.0,9.589188,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACACCAATAACTGC-1,2134,7.666222,3628.0,8.196712,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGAGCGACTCCT-1,2004,7.603399,3318.0,8.107419,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,4028,8.301273,10940.0,9.300272,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGGGTCTATATT-1,4049,8.306472,10116.0,9.221972,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702008_TTGTTCAGTGTGCTAC-1,1614,7.387090,2548.0,7.843456,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_TTGTTGTGTGTCAAGA-1,2897,7.971776,5786.0,8.663369,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_TTGTTTCACATCCAGG-1,4646,8.443977,14860.0,9.606496,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_TTGTTTCATTAGTCTA-1,5269,8.569786,15816.0,9.668840,HCAHeartST11702008,SAN,,,Sanger,,...,AV1,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351


In [208]:
adata.obs.columns

Index(['n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
       'log1p_total_counts', 'sangerID', 'region', 'donor_type', 'age',
       'facility', 'flushed',
       ...
       'donor_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=121)

In [209]:
#check the format of expression matrix

In [210]:
adata.X

<3533x17912 sparse matrix of type '<class 'numpy.float32'>'
	with 12253655 stored elements in Compressed Sparse Row format>

In [211]:
araw.X

<3533x17912 sparse matrix of type '<class 'numpy.float32'>'
	with 12253655 stored elements in Compressed Sparse Row format>

In [212]:
#Copy raw counts to adata.raw

In [213]:
# Attach `araw` as adata.raw. `araw` is created outside this section —
# presumably the AnnData holding the raw counts (TODO confirm); the cell above
# shows its X has the same shape as adata.X.
adata.raw = araw

In [214]:
#write the curated object to final_objects folder

In [215]:
# Save the curated object as a gzip-compressed .h5ad in the final_objects folder
final_h5ad_path = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/final_objects/HCAHeartST11702008(FFPE).h5ad'
adata.write(final_h5ad_path, compression='gzip')