### **Curating visium-FFPE.h5ad**

Article: Spatially resolved multiomics of human cardiac niches

DOI: https://doi.org/10.1038/s41586-023-06311-1 

Data Source : https://www.heartcellatlas.org

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
# Read the Visium FFPE AnnData object from the shared scratch area.
# NOTE(review): absolute, site-specific path — the notebook only runs where this mount exists.
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/Raw_counts/visium-FFPE_adult-8reg_raw.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 11312 × 17922
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac', 'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'region_cell2loc'
    var: 'gene_ids', 'feature_types', 'genome', 'SYMBOL'
    uns: 'spatial'
    obsm: 'MT', 'means_cell_abundance_w_sf', 'q05_cell_abundance_w_sf', 'q95_cell_abundance_w_sf', 'spatial', 'stds_cell_abundance_w_sf'

##### **X- expression matrix**

In [7]:
# View the expression matrix of the anndata object

In [8]:
adata.X

<11312x17922 sparse matrix of type '<class 'numpy.float32'>'
	with 36178068 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether it holds normalized or raw counts. If the values are non-integer, they are normalized counts; if they are integer-valued (even when stored as floats, e.g. 2.0), they are raw counts.

In [10]:
print(adata.X)

  (0, 1)	2.0
  (0, 4)	2.0
  (0, 5)	4.0
  (0, 12)	1.0
  (0, 13)	6.0
  (0, 14)	2.0
  (0, 16)	1.0
  (0, 18)	3.0
  (0, 20)	1.0
  (0, 23)	3.0
  (0, 24)	1.0
  (0, 25)	3.0
  (0, 26)	2.0
  (0, 29)	1.0
  (0, 32)	5.0
  (0, 36)	1.0
  (0, 39)	2.0
  (0, 40)	3.0
  (0, 45)	1.0
  (0, 47)	2.0
  (0, 49)	1.0
  (0, 67)	5.0
  (0, 68)	2.0
  (0, 72)	1.0
  (0, 75)	1.0
  :	:
  (11311, 17822)	1.0
  (11311, 17826)	6.0
  (11311, 17832)	1.0
  (11311, 17835)	1.0
  (11311, 17838)	1.0
  (11311, 17843)	1.0
  (11311, 17844)	1.0
  (11311, 17845)	1.0
  (11311, 17846)	3.0
  (11311, 17854)	1.0
  (11311, 17856)	2.0
  (11311, 17871)	1.0
  (11311, 17874)	11.0
  (11311, 17878)	2.0
  (11311, 17881)	2.0
  (11311, 17900)	1.0
  (11311, 17901)	3.0
  (11311, 17903)	2.0
  (11311, 17904)	1.0
  (11311, 17907)	1.0
  (11311, 17910)	3.0
  (11311, 17911)	8.0
  (11311, 17913)	1.0
  (11311, 17914)	1.0
  (11311, 17919)	1.0


##### **Variables(var)**

In [11]:
# View the var of the AnnData object

In [12]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [13]:
# load the approved genes file

In [14]:
# Load the table of approved genes; its 'feature_id' column is used below to filter adata.var_names.
# NOTE(review): absolute home-directory path — adjust when running outside this environment.
approved_genes = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/gene_info/genes_approved.csv')

In [15]:
# Create a lookup dictionary keyed by the feature_id column of the approved genes file.

In [16]:
# Membership lookup: every approved feature_id becomes a key (the value 1 is only a placeholder).
genedict = dict.fromkeys(approved_genes.feature_id, 1)

In [17]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [18]:
len(genedict)

116184

In [19]:
# Filter out the genes which are not in the approved genes file

In [20]:
# Keep, in their original order, only the var names that appear among the approved feature IDs.
var_to_keep_adata = list(filter(lambda gene_id: gene_id in genedict, adata.var_names))

In [21]:
len(var_to_keep_adata)

17912

In [22]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


In [23]:
# Subset the AnnData object to the approved genes selected above, discarding all other genes.

In [24]:
# Restrict the object to the approved genes; .copy() turns the sliced view into an independent AnnData.
# Note: this rebinds `adata`, so the unfiltered object is no longer reachable after this cell.
adata = adata[:, var_to_keep_adata].copy()

In [25]:
#  View the var

In [26]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1
...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B


feature is filtered

In [27]:
# Assign False to every gene, since no feature was filtered out of the expression matrix (X).

In [28]:
# Broadcast a single False to every gene: no feature is marked as filtered.
adata.var['feature_is_filtered'] = False

In [29]:
#View var

In [30]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome,SYMBOL,feature_is_filtered
ENSG00000187634,ENSG00000187634,Gene Expression,GRCh38,SAMD11,False
ENSG00000188976,ENSG00000188976,Gene Expression,GRCh38,NOC2L,False
ENSG00000187961,ENSG00000187961,Gene Expression,GRCh38,KLHL17,False
ENSG00000187583,ENSG00000187583,Gene Expression,GRCh38,PLEKHN1,False
ENSG00000187642,ENSG00000187642,Gene Expression,GRCh38,PERM1,False
...,...,...,...,...,...
ENSG00000160298,ENSG00000160298,Gene Expression,GRCh38,C21orf58,False
ENSG00000160299,ENSG00000160299,Gene Expression,GRCh38,PCNT,False
ENSG00000160305,ENSG00000160305,Gene Expression,GRCh38,DIP2A,False
ENSG00000160307,ENSG00000160307,Gene Expression,GRCh38,S100B,False


In [31]:
# Delete the unwanted columns from adata.var.

In [32]:
# Remove the source annotation columns in one step, leaving only 'feature_is_filtered' in var.
adata.var = adata.var.drop(columns=['gene_ids', 'SYMBOL', 'feature_types', 'genome'])

In [33]:
# view var

In [34]:
adata.var

Unnamed: 0,feature_is_filtered
ENSG00000187634,False
ENSG00000188976,False
ENSG00000187961,False
ENSG00000187583,False
ENSG00000187642,False
...,...
ENSG00000160298,False
ENSG00000160299,False
ENSG00000160305,False
ENSG00000160307,False


#### **Observations(obs) (Cell metadata)**

In [35]:
#view obs

In [36]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,region,region_finest,age,gender,facility,cell_or_nuclei,modality,kit_10x,flushed,region_cell2loc
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,SAN-FFPE,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,SAN-FFPE,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,SAN-FFPE,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,SAN-FFPE,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,SAN-FFPE,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,SAN-FFPE,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,SAN-FFPE,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,SAN-FFPE,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,SAN-FFPE,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE


In [37]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
       'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac',
       'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication',
       'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age',
       'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x',
       'flushed', 'region_cell2loc'],
      dtype='object')

#### **assay_ontology_term_id**

In [38]:
# identify the column in adata which corresponds to assay

In [39]:
list(adata.obs['kit_10x'].unique())

['Visium-v1-FFPE']

In [40]:
# add the assay_ontology_term_id column

In [41]:
# All spots come from the single kit listed above ('Visium-v1-FFPE'), so one EFO term is broadcast to every row.
adata.obs['assay_ontology_term_id'] = 'EFO:0010961'

In [42]:
# change datatype of the column

In [43]:
# Store the term as a pandas categorical instead of plain strings.
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [44]:
# view adata.obs

In [45]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,region_finest,age,gender,facility,cell_or_nuclei,modality,kit_10x,flushed,region_cell2loc,assay_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,SAN CRAN CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,SAN MID CRAN,,,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961


#### **cell_type_ontology_term_id**

In [46]:
# Get the column in adata.obs related to cell type annotation

In [47]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
       'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac',
       'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication',
       'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age',
       'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x',
       'flushed', 'region_cell2loc', 'assay_ontology_term_id'],
      dtype='object')

In [48]:
adata.obsm

AxisArrays with keys: MT, means_cell_abundance_w_sf, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, spatial, stds_cell_abundance_w_sf

In [49]:
adata.obsm['means_cell_abundance_w_sf']

Unnamed: 0_level_0,meanscell_abundance_w_sf_Adip1,meanscell_abundance_w_sf_Adip2,meanscell_abundance_w_sf_B,meanscell_abundance_w_sf_B_plasma,meanscell_abundance_w_sf_CD14+Mo,meanscell_abundance_w_sf_CD16+Mo,meanscell_abundance_w_sf_CD4+T_act,meanscell_abundance_w_sf_CD4+T_naive,meanscell_abundance_w_sf_CD8+T_cytox,meanscell_abundance_w_sf_CD8+T_em,...,meanscell_abundance_w_sf_Neut,meanscell_abundance_w_sf_PC2_atria,meanscell_abundance_w_sf_PC3_str,meanscell_abundance_w_sf_SAN_P_cell,meanscell_abundance_w_sf_SMC1_basic,meanscell_abundance_w_sf_SMC2_art,meanscell_abundance_w_sf_aCM1,meanscell_abundance_w_sf_aCM2,meanscell_abundance_w_sf_aCM3,meanscell_abundance_w_sf_aCM4
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,0.034087,0.014062,0.131219,0.204413,0.202151,0.055197,0.221306,0.234631,0.074122,0.182979,...,0.557784,0.206627,1.181703,0.628595,0.364029,0.371932,0.577294,0.119928,1.146386,8.548860
HCAHeartST11702008_AAACACCAATAACTGC-1,0.012364,0.013477,0.147598,0.100063,0.054316,0.039809,0.081068,0.103140,0.078240,0.065698,...,0.211951,0.084845,0.047951,0.059058,0.050788,0.119662,0.120812,0.434308,0.042894,0.156782
HCAHeartST11702008_AAACAGAGCGACTCCT-1,0.007575,0.004719,0.087094,0.007829,0.081405,0.031330,0.051603,0.063728,0.108002,0.056766,...,0.071347,0.047041,0.054377,0.159433,0.106617,0.054258,0.098117,0.249715,0.058340,0.079322
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,0.017588,0.018783,0.086528,0.183383,0.190777,0.064008,0.086374,0.083343,0.057750,0.069885,...,0.134450,0.075085,0.188253,0.269785,0.172557,0.239707,0.754950,1.302178,0.215141,4.904526
HCAHeartST11702008_AAACAGGGTCTATATT-1,0.004345,0.002594,0.077050,0.020789,0.066558,0.097732,0.071563,0.117570,0.067732,0.069363,...,0.117135,0.072369,0.108168,0.097635,0.537598,0.047743,0.739938,2.193104,0.060727,1.561950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,0.014647,0.016158,0.141416,0.026997,0.043420,0.036142,0.043286,0.089484,0.101932,0.072491,...,0.095902,0.063026,0.029871,0.098360,0.168560,0.017221,0.249792,0.139493,0.126452,0.055456
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,0.009754,0.006928,0.056721,0.040988,0.156312,0.039324,0.064720,0.066373,0.109144,0.069932,...,0.391903,0.131205,0.100241,0.080365,0.177255,0.041504,0.647972,0.127906,0.309471,0.924165
HCAHeartST11702010_TTGTTTCCATACAACT-1,0.026111,0.025188,0.059657,0.016857,0.088733,0.037009,0.056217,0.059852,0.408165,0.053619,...,0.087774,0.201910,0.046899,0.029533,0.076989,0.132591,0.058321,0.238574,0.170422,0.237617
HCAHeartST11702010_TTGTTTGTATTACACG-1,0.042995,0.079800,0.129303,0.012647,0.023697,0.034052,0.042160,0.062178,0.048363,0.051912,...,0.056953,0.086132,0.064089,0.121906,0.080216,0.012073,0.068237,0.466279,0.494972,0.032782


In [50]:
# For every spot (row), collect the name(s) of the abundance column(s) whose value equals
# that row's maximum; more than one name in a list means the maximum is tied.
max_columns = adata.obsm['means_cell_abundance_w_sf'].apply(
    lambda row: row.index[(row == row.max()).to_numpy()].tolist(),
    axis=1,
)

In [51]:
max_columns

spot_id
HCAHeartST11702008_AAACAAGTATCTCCCA-1          [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702008_AAACACCAATAACTGC-1    [meanscell_abundance_w_sf_EC4_immune]
HCAHeartST11702008_AAACAGAGCGACTCCT-1           [meanscell_abundance_w_sf_FB4]
HCAHeartST11702008_AAACAGCTTTCAGAAG-1          [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702008_AAACAGGGTCTATATT-1          [meanscell_abundance_w_sf_aCM2]
                                                         ...                  
HCAHeartST11702010_TTGTTTCACATCCAGG-1        [meanscell_abundance_w_sf_EC8_ln]
HCAHeartST11702010_TTGTTTCATTAGTCTA-1          [meanscell_abundance_w_sf_aCM4]
HCAHeartST11702010_TTGTTTCCATACAACT-1           [meanscell_abundance_w_sf_FB3]
HCAHeartST11702010_TTGTTTGTATTACACG-1           [meanscell_abundance_w_sf_FB6]
HCAHeartST11702010_TTGTTTGTGTAAATTC-1          [meanscell_abundance_w_sf_aCM4]
Length: 11312, dtype: object

In [52]:
# Flag spots whose maximum abundance is shared by more than one cell type.
# Each entry of max_columns is the list of column names attaining the row maximum, so a
# tie shows up as a list with more than one element. Counting repeated names inside that
# list cannot detect a tie, because each column name normally appears in it only once.
# Result: the tied column names for tied spots, an empty list for every other spot.
columns_with_multiple_max = max_columns.apply(lambda tied: tied if len(tied) > 1 else [])

In [53]:
columns_with_multiple_max 

spot_id
HCAHeartST11702008_AAACAAGTATCTCCCA-1    []
HCAHeartST11702008_AAACACCAATAACTGC-1    []
HCAHeartST11702008_AAACAGAGCGACTCCT-1    []
HCAHeartST11702008_AAACAGCTTTCAGAAG-1    []
HCAHeartST11702008_AAACAGGGTCTATATT-1    []
                                         ..
HCAHeartST11702010_TTGTTTCACATCCAGG-1    []
HCAHeartST11702010_TTGTTTCATTAGTCTA-1    []
HCAHeartST11702010_TTGTTTCCATACAACT-1    []
HCAHeartST11702010_TTGTTTGTATTACACG-1    []
HCAHeartST11702010_TTGTTTGTGTAAATTC-1    []
Length: 11312, dtype: object

In [54]:
# Keep the per-spot tie lists next to the other spot metadata for inspection.
# NOTE(review): this column holds Python lists — confirm it is removed before the object is written out.
adata.obs['columns_with_multiple_max'] = columns_with_multiple_max

In [55]:
# Keep only the spots whose tie list is non-empty (an empty list is falsy).
non_empty_columns = columns_with_multiple_max[columns_with_multiple_max.apply(bool)]

In [56]:
len(non_empty_columns )

0

In [57]:
# Reduce each spot to a single label: take the first column name attaining the maximum and
# remove the 'meanscell_abundance_w_sf_' text from it; spots without any maximum column get ''.
# Note: this rebinds max_columns from a Series of lists to a plain list of strings, so
# re-running this cell on its own would index into strings instead of lists.
max_columns = [
    '' if len(cols) == 0 else cols[0].replace('meanscell_abundance_w_sf_', '')
    for cols in max_columns
]

In [58]:
# Attach the per-spot dominant label to obs. max_columns is a plain list here, so values are
# assigned by position; it was built row by row from adata.obsm['means_cell_abundance_w_sf'].
adata.obs['highest_cell_Density_columns'] = max_columns

In [59]:
adata.obs['highest_cell_Density_columns']

spot_id
HCAHeartST11702008_AAACAAGTATCTCCCA-1          aCM4
HCAHeartST11702008_AAACACCAATAACTGC-1    EC4_immune
HCAHeartST11702008_AAACAGAGCGACTCCT-1           FB4
HCAHeartST11702008_AAACAGCTTTCAGAAG-1          aCM4
HCAHeartST11702008_AAACAGGGTCTATATT-1          aCM2
                                            ...    
HCAHeartST11702010_TTGTTTCACATCCAGG-1        EC8_ln
HCAHeartST11702010_TTGTTTCATTAGTCTA-1          aCM4
HCAHeartST11702010_TTGTTTCCATACAACT-1           FB3
HCAHeartST11702010_TTGTTTGTATTACACG-1           FB6
HCAHeartST11702010_TTGTTTGTGTAAATTC-1          aCM4
Name: highest_cell_Density_columns, Length: 11312, dtype: object

In [60]:
list(adata.obs['highest_cell_Density_columns'].unique())

['aCM4',
 'EC4_immune',
 'FB4',
 'aCM2',
 'CD8+T_cytox',
 'ILC',
 'FB3',
 'Neut',
 'Adip1',
 'FB1',
 'aCM3',
 'DC',
 'NC1',
 'SAN_P_cell',
 'NC2',
 'EC10_CMC-like',
 'EC7_atria',
 'SMC2_art',
 'EC2_cap',
 'B',
 'PC2_atria',
 'EC5_art',
 'FB5',
 'Mast',
 'FB2',
 'FB6',
 'SMC1_basic',
 'EC1_cap',
 'NK_CD56hi',
 'aCM1',
 'EC3_cap',
 'EC8_ln',
 'PC3_str',
 'NK_CD16hi',
 'EC6_ven',
 'LYVE1+TIMD4+MP',
 'CD4+T_naive',
 'MoMP',
 'Adip2',
 'LYVE1+IGF1+MP',
 'CD4+T_act',
 'LYVE1+MP_cycling',
 'MAIT-like',
 'CD14+Mo',
 'CD16+Mo',
 'CD8+T_te',
 'CD8+T_trans',
 'CD8+T_em']

In [61]:
# Seed the ontology column with the dominant cell-type labels; a later cell translates
# these labels into ontology term IDs via `mapping`.
adata.obs['cell_type_ontology_term_id'] = adata.obs['highest_cell_Density_columns']

In [62]:
# create a dictionary of cell type and ontology term

In [63]:
# Lookup table from author cell-type / cell-state labels to Cell Ontology (CL) term IDs.
# Several fine-grained states intentionally share one term (e.g. aCM1-aCM5 -> CL:0002129,
# FB1-FB6 -> CL:0002548). The table also lists labels that do not occur among this object's
# dominant labels (e.g. 'vCM1', 'AVN_P_cell', 'unclassified').
# NOTE(review): labels absent from this table become NaN when passed through Series.map.
mapping= {'aCM1': 'CL:0002129',
 'aCM2': 'CL:0002129',
 'aCM3': 'CL:0002129',
 'aCM4': 'CL:0002129',
 'aCM5': 'CL:0002129',
 'Adip1': 'CL:0000136',
 'Adip2': 'CL:0000136',
 'Adip3': 'CL:0000136',
 'Adip4': 'CL:0000136',
 'Adipocyte': 'CL:0000136',
 'Atrial Cardiomyocyte': 'CL:0002129',
 'AVN_bundle_cell': 'CL:0010005',
 'AVN_P_cell': 'CL:1000477',
 'B': 'CL:0000236',
 'B_plasma': 'CL:0000786',
 'CD14+Mo': 'CL:0001054',
 'CD16+Mo': 'CL:0002396',
 'CD4+T_act': 'CL:0000896',
 'CD4+T_naive': 'CL:0000895',
 'CD4+T_Th2': 'CL:0000546',
 'CD8+T_cytox': 'CL:0000794',
 'CD8+T_em': 'CL:0000913',
 'CD8+T_te': 'CL:0000625',
 'CD8+T_trans': 'CL:0000625',
 'DC': 'CL:0001056',
 'EC1_cap': 'CL:0002144',
 'EC10_CMC-like': 'CL:0000115',
 'EC2_cap': 'CL:0002144',
 'EC3_cap': 'CL:0002144',
 'EC4_immune': 'CL:0000115',
 'EC5_art': 'CL:1000413',
 'EC6_ven': 'CL:0002543',
 'EC7_atria': 'CL:0002350',
 'EC7_endocardial': 'CL:0002350',
 'EC8_ln': 'CL:0002138',
 'Endothelial cell': 'CL:0000115',
 'FB1': 'CL:0002548',
 'FB2': 'CL:0002548',
 'FB3': 'CL:0002548',
 'FB4': 'CL:0002548',
 'FB4_activated': 'CL:0002548',
 'FB5': 'CL:0002548',
 'FB6': 'CL:0002548',
 'Fibroblast': 'CL:0000057',
 'ILC': 'CL:0001065',
 'Lymphatic Endothelial cell': 'CL:0002138',
 'Lymphoid': 'CL:0000542',
 'LYVE1+IGF1+MP': 'CL:0000235',
 'LYVE1+MP_cycling': 'CL:0000235',
 'LYVE1+TIMD4+MP': 'CL:0000235',
 'MAIT-like': 'CL:0000940',
 'Mast': 'CL:0000097',
 'Mast cell': 'CL:0000097',
 'Meso': 'CL:0000077',
 'Mesothelial cell': 'CL:0000077',
 'MoMP': 'CL:0000576',
 'Mural cell': 'CL:0008034',
 'Myeloid': 'CL:0000763',
 'NC1': 'CL:0000125',
 'NC1_glial': 'CL:0000125',
 'NC2': 'CL:0000125',
 'NC2_glial_NGF+': 'CL:0000125',
 'Neural cell': 'CL:0002319',
 'Neut': 'CL:0000775',
 'NK_CD16hi': 'CL:0000939',
 'NK_CD56hi': 'CL:0000938',
 'PC1_vent': 'CL:0000669',
 'PC2_atria': 'CL:0000669',
 'PC3_str': 'CL:0000669',
 'PC4_CMC-like': 'CL:0000669',
 'SAN_P_cell': 'CL:1000477',
 'SMC1_basic': 'CL:0000192',
 'SMC2_art': 'CL:0002591',
 'T/NK_cycling': 'CL:0000814',
 'vCM1': 'CL:0002131',
 'vCM3_stressed': 'CL:0002131',
 'vCM4': 'CL:0002131',
 'Ventricular Cardiomyocyte': 'CL:0002131',
 'unclassified': 'CL:0000003'}

In [64]:
# add the cell_type_ontology_term_id column

In [65]:
# Translate each spot's dominant cell-type label into its Cell Ontology (CL) term ID.
# The labels are read from 'highest_cell_Density_columns' rather than from the target
# column itself, so the cell can be re-run safely: mapping already-translated CL IDs a
# second time would turn every value into NaN.
cell_type_terms = adata.obs['highest_cell_Density_columns'].map(mapping)

# Series.map yields NaN for labels that are missing from `mapping`; fail loudly instead of
# letting missing ontology terms slip silently into the curated object.
unmapped_labels = sorted(
    adata.obs.loc[cell_type_terms.isna(), 'highest_cell_Density_columns'].astype(str).unique()
)
if unmapped_labels:
    raise ValueError(f"No cell type ontology term defined for labels: {unmapped_labels}")

adata.obs['cell_type_ontology_term_id'] = cell_type_terms

In [66]:
# change datatype of the column

In [67]:
# Store the CL term IDs as a pandas categorical instead of plain strings.
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [68]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,facility,cell_or_nuclei,modality,kit_10x,flushed,region_cell2loc,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],EC4_immune,CL:0000115
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB4,CL:0002548
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM2,CL:0002129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],EC8_ln,CL:0002138
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB3,CL:0002548
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,Sanger,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB6,CL:0002548


#### **donor_id**

In [69]:
#identify the column in adata.obs which provides donor information

In [70]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
       'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac',
       'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication',
       'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age',
       'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x',
       'flushed', 'region_cell2loc', 'assay_ontology_term_id',
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id'],
      dtype='object')

In [71]:
list(adata.obs['donor'].unique())

['HOPA3']

In [72]:
list(adata.obs['combinedID'].unique())

['na']

In [73]:
# add the donor_id column

In [74]:
# Use the 'donor' column as donor_id; 'combinedID' only contains 'na' (see the unique values listed above).
adata.obs['donor_id'] = adata.obs['donor']

In [75]:
# change datatype of the column

In [76]:
# Ensure donor_id is stored with pandas categorical dtype.
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [77]:
# view unique values of donor_id column

In [78]:
list(adata.obs['donor_id'].unique())

['HOPA3']

In [79]:
#view obs

In [80]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,cell_or_nuclei,modality,kit_10x,flushed,region_cell2loc,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],EC4_immune,CL:0000115,HOPA3
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB4,CL:0002548,HOPA3
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM2,CL:0002129,HOPA3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],EC8_ln,CL:0002138,HOPA3
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB3,CL:0002548,HOPA3
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,na,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB6,CL:0002548,HOPA3


In [81]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
       'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac',
       'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication',
       'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age',
       'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x',
       'flushed', 'region_cell2loc', 'assay_ontology_term_id',
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id', 'donor_id'],
      dtype='object')

In [82]:
list(adata.obs['annotation_JC'].unique())

['myocardium',
 'endocardium',
 'node',
 'fat_subendocardial',
 'fat_subepicardial',
 'vessel',
 'nerve',
 'fibrosis_subendocardial',
 'other_haemorrhage',
 'epicardium']

#### **development_stage_ontology_term_id**

In [83]:
# identify the column in adata which corresponds to age

In [84]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
       'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac',
       'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication',
       'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age',
       'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x',
       'flushed', 'region_cell2loc', 'assay_ontology_term_id',
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id', 'donor_id'],
      dtype='object')

In [85]:
list(adata.obs['age'].unique())

['nan']

In [86]:
# The source 'age' column holds only 'nan', so the age bracket is filled in by hand for every spot.
# NOTE(review): the origin of '45-50' is not shown in this notebook — confirm against the donor metadata.
adata.obs['age'] = '45-50'

In [87]:
# Supplementary metadata table; its 'age' and 'development_stage_ontology_term_id' columns
# are used below to build the age -> development-stage lookup.
# NOTE(review): absolute, user-specific path — the notebook only re-runs where this file exists.
suppl_info = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/heart_cell_atlas/suppl_info_heart.csv')

In [88]:
# create a dictionary for age and development stage ontology term id

In [89]:
# Row-wise lookup: age value -> development_stage_ontology_term_id, taken from the supplementary table.
# If an age appears on several rows, the last row wins.
mapping = {
    age_value: stage_term
    for age_value, stage_term in zip(
        suppl_info['age'],
        suppl_info['development_stage_ontology_term_id'],
    )
}

In [90]:
# add the development_stage_ontology_term_id column

In [91]:
# Translate each spot's age value into its development-stage ontology term using `mapping`.
adata.obs['development_stage_ontology_term_id'] = adata.obs['age'].map(mapping)

# Series.map() returns NaN for any age that is missing from the lookup; stop here rather than
# carrying an empty ontology term into the curated object.
unmapped_ages = list(
    adata.obs.loc[adata.obs['development_stage_ontology_term_id'].isna(), 'age'].unique()
)
if unmapped_ages:
    raise ValueError(
        f"No development_stage_ontology_term_id found for age value(s): {unmapped_ages}"
    )

In [92]:
# change datatype of the column

In [93]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [94]:
# view unique values of development_stage_ontology_term_id column

In [95]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000239']

In [96]:
# view adata.obs

In [97]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,modality,kit_10x,flushed,region_cell2loc,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],EC4_immune,CL:0000115,HOPA3,HsapDv:0000239
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB4,CL:0002548,HOPA3,HsapDv:0000239
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM2,CL:0002129,HOPA3,HsapDv:0000239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],EC8_ln,CL:0002138,HOPA3,HsapDv:0000239
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB3,CL:0002548,HOPA3,HsapDv:0000239
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,Visium-FFPE,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB6,CL:0002548,HOPA3,HsapDv:0000239


In [98]:
list(adata.obs['highest_cell_Density_columns'].unique())

['aCM4',
 'EC4_immune',
 'FB4',
 'aCM2',
 'CD8+T_cytox',
 'ILC',
 'FB3',
 'Neut',
 'Adip1',
 'FB1',
 'aCM3',
 'DC',
 'NC1',
 'SAN_P_cell',
 'NC2',
 'EC10_CMC-like',
 'EC7_atria',
 'SMC2_art',
 'EC2_cap',
 'B',
 'PC2_atria',
 'EC5_art',
 'FB5',
 'Mast',
 'FB2',
 'FB6',
 'SMC1_basic',
 'EC1_cap',
 'NK_CD56hi',
 'aCM1',
 'EC3_cap',
 'EC8_ln',
 'PC3_str',
 'NK_CD16hi',
 'EC6_ven',
 'LYVE1+TIMD4+MP',
 'CD4+T_naive',
 'MoMP',
 'Adip2',
 'LYVE1+IGF1+MP',
 'CD4+T_act',
 'LYVE1+MP_cycling',
 'MAIT-like',
 'CD14+Mo',
 'CD16+Mo',
 'CD8+T_te',
 'CD8+T_trans',
 'CD8+T_em']

#### **disease_ontology_term_id**

In [99]:
# Assign 'normal' (PATO:0000461) to every spot, since all donors are healthy

In [100]:
# add the disease_ontology_term_id column

In [101]:
# Same disease term for every spot; a scalar is broadcast down the whole column.
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [102]:
#change data type of column

In [103]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [104]:
# view obs

In [105]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,kit_10x,flushed,region_cell2loc,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],EC4_immune,CL:0000115,HOPA3,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB4,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM2,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],EC8_ln,CL:0002138,HOPA3,HsapDv:0000239,PATO:0000461
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB3,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,Visium-v1-FFPE,,SAN-FFPE,EFO:0010961,[],FB6,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461


#### **is_primary_data**

In [106]:
# Flag every spot as non-primary data (constant False for the whole column).
# NOTE(review): the reason this dataset is not primary is not stated in the notebook — confirm.
adata.obs['is_primary_data'] = False

In [107]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,flushed,region_cell2loc,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,,SAN-FFPE,EFO:0010961,[],EC4_immune,CL:0000115,HOPA3,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,,SAN-FFPE,EFO:0010961,[],FB4,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,,SAN-FFPE,EFO:0010961,[],aCM2,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,,SAN-FFPE,EFO:0010961,[],EC8_ln,CL:0002138,HOPA3,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,,SAN-FFPE,EFO:0010961,[],FB3,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,,SAN-FFPE,EFO:0010961,[],FB6,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False


In [108]:
#change data type of column

In [109]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

#### **organism_ontology_term_id**

In [110]:
# assign organism id 

In [111]:
# Single organism term (NCBITaxon:9606) for every spot, broadcast from a scalar.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [112]:
#change data type of column

In [113]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [114]:
# view obs

In [115]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,region_cell2loc,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,SAN-FFPE,EFO:0010961,[],EC4_immune,CL:0000115,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,SAN-FFPE,EFO:0010961,[],FB4,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,SAN-FFPE,EFO:0010961,[],aCM2,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,SAN-FFPE,EFO:0010961,[],EC8_ln,CL:0002138,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,SAN-FFPE,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,SAN-FFPE,EFO:0010961,[],FB3,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,SAN-FFPE,EFO:0010961,[],FB6,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [116]:
# add self_reported_ethnicity_ontology_term_id column

In [117]:
# One ethnicity term for the whole dataset (single donor), broadcast from a scalar.
# NOTE(review): the source of HANCESTRO:0005 for this donor is not shown here — confirm.
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0005'

In [118]:
# change data type

In [119]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [120]:
# view obs

In [121]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,assay_ontology_term_id,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,EFO:0010961,[],EC4_immune,CL:0000115,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,EFO:0010961,[],FB4,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,EFO:0010961,[],aCM2,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,EFO:0010961,[],EC8_ln,CL:0002138,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,EFO:0010961,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,EFO:0010961,[],FB3,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,EFO:0010961,[],FB6,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005


#### **sex_ontology_term_id**

In [122]:
# identify the column in adata.obs which corresponds to sex

In [123]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
       'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac',
       'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication',
       'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age',
       'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x',
       'flushed', 'region_cell2loc', 'assay_ontology_term_id',
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id', 'donor_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [124]:
# list the unique values 

In [125]:
list(adata.obs['gender'].unique())

['nan']

In [126]:
# 'gender' is 'nan' for every spot, so no lookup dictionary is built; assign PATO:0000383 (female) to all spots

In [127]:
# The 'gender' column holds only 'nan', so the sex term is set directly for every spot.
# NOTE(review): the donor sex behind PATO:0000383 is not derivable from this notebook — confirm.
adata.obs['sex_ontology_term_id'] = 'PATO:0000383'

In [128]:
# change data type

In [129]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [130]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,columns_with_multiple_max,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,[],EC4_immune,CL:0000115,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,[],FB4,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,[],aCM2,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,[],EC8_ln,CL:0002138,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,[],aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,[],FB3,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,[],FB6,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383


#### **suspension_type**

In [131]:
# Visium is a spatial assay with no cell/nucleus suspension, so suspension_type is 'na'

In [132]:
# Constant 'na' for every spot, broadcast from a scalar.
adata.obs['suspension_type'] = 'na'

In [133]:
# change data type

In [134]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [135]:
# view obs

In [136]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,highest_cell_Density_columns,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,EC4_immune,CL:0000115,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,FB4,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,aCM2,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,EC8_ln,CL:0002138,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,aCM4,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,FB3,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,FB6,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na


#### **tissue_ontology_term_id**

In [137]:
# identify the column in adata.obs which corresponds to tissue

In [138]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
       'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac',
       'n_counts', 'n_genes', 'annotation_JC', 'sangerID', 'Publication',
       'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age',
       'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x',
       'flushed', 'region_cell2loc', 'assay_ontology_term_id',
       'columns_with_multiple_max', 'highest_cell_Density_columns',
       'cell_type_ontology_term_id', 'donor_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type'],
      dtype='object')

In [139]:
# list unique values

In [140]:
list(adata.obs['annotation_JC'].unique())

['myocardium',
 'endocardium',
 'node',
 'fat_subendocardial',
 'fat_subepicardial',
 'vessel',
 'nerve',
 'fibrosis_subendocardial',
 'other_haemorrhage',
 'epicardium']

In [141]:
# create a dictionary with tissue and corresponding tissue ontology term id

In [142]:
# Heart region label -> UBERON tissue term.
# 'SAN' and 'SAN-FFPE' deliberately resolve to the same term.
mapping = dict([
    ('AX', 'UBERON:0002098'),
    ('LV', 'UBERON:0002084'),
    ('RV', 'UBERON:0002080'),
    ('LA', 'UBERON:0002079'),
    ('SP', 'UBERON:0002094'),
    ('RA', 'UBERON:0002078'),
    ('SAN', 'UBERON:0002351'),
    ('AVN', 'UBERON:0002352'),
    ('SAN-FFPE', 'UBERON:0002351'),
])

In [143]:
# add 'tissue_ontology_term_id' column

In [144]:
# Tissue terms are looked up from the 'region' column (not 'annotation_JC', which is only
# inspected above) using the region -> UBERON `mapping`.
adata.obs['tissue_ontology_term_id'] = adata.obs['region'].map(mapping)

# Series.map() returns NaN for any region label missing from `mapping`; stop here rather than
# carrying an empty tissue term into the curated object.
unmapped_regions = list(
    adata.obs.loc[adata.obs['tissue_ontology_term_id'].isna(), 'region'].unique()
)
if unmapped_regions:
    raise ValueError(
        f"No tissue_ontology_term_id found for region value(s): {unmapped_regions}"
    )

In [145]:
# change data type of column

In [146]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [147]:
#list the unique values in 'tissue_ontology_term_id' column

In [148]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002351']

In [149]:
# view obs

In [150]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,50,102,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489558,36.857241,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACACCAATAACTGC-1,1,59,19,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018743,25.606395,...,CL:0000115,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,14,94,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582278,...,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,43,9,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,47,13,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013049,28.706999,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,58,42,HCAHeartST11702010,2707,7.903966,4686.0,8.452548,16.282544,22.108408,...,CL:0002138,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,60,30,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294299,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,45,27,HCAHeartST11702010,2435,7.798113,4555.0,8.424200,19.626784,26.498353,...,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,73,41,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,...,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351


In [151]:
# obs columns that are removed before the object is written out.
# They are deleted one at a time, in the order listed.
obs_columns_to_drop = [
    'donor',
    'gender',
    'highest_cell_Density_columns',
    'columns_with_multiple_max',
    'array_row',
    'array_col',
    'modality',
    'kit_10x',
    'age',
    'region',
    'cell_or_nuclei',
    'region_cell2loc',
    'sangerID',
    'combinedID',
]
for obs_column in obs_columns_to_drop:
    del adata.obs[obs_column]

In [152]:
adata.obs.columns

Index(['in_tissue', 'sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts',
       'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes',
       'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes',
       'pct_counts_in_top_500_genes', 'mt_frac', 'n_counts', 'n_genes',
       'annotation_JC', 'Publication', 'donor_type', 'region_finest',
       'facility', 'flushed', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'donor_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [153]:
adata.obsm

AxisArrays with keys: MT, means_cell_abundance_w_sf, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, spatial, stds_cell_abundance_w_sf

In [154]:
adata.obsm.keys()

KeysView(AxisArrays with keys: MT, means_cell_abundance_w_sf, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, spatial, stds_cell_abundance_w_sf)

In [155]:
# Overwrite the obsm entry with whatever its `.values` attribute returns
# (presumably a pandas DataFrame being reduced to its underlying array — confirm).
adata.obsm['means_cell_abundance_w_sf']= adata.obsm['means_cell_abundance_w_sf'].values

In [156]:
# Overwrite the obsm entry with whatever its `.values` attribute returns
# (presumably a pandas DataFrame being reduced to its underlying array — confirm).
adata.obsm['stds_cell_abundance_w_sf'] = adata.obsm['stds_cell_abundance_w_sf'].values

In [157]:
# Overwrite the obsm entry with whatever its `.values` attribute returns
# (presumably a pandas DataFrame being reduced to its underlying array — confirm).
adata.obsm['q95_cell_abundance_w_sf']= adata.obsm['q95_cell_abundance_w_sf'].values

In [158]:
# Overwrite the obsm entry with whatever its `.values` attribute returns
# (presumably a pandas DataFrame being reduced to its underlying array — confirm).
adata.obsm['q05_cell_abundance_w_sf']= adata.obsm['q05_cell_abundance_w_sf'].values

In [159]:
# Store each existing obsm entry a second time under an 'X_'-prefixed key.
# Pairs are (existing key, new key) and are processed in the order listed.
obsm_key_pairs = (
    ('means_cell_abundance_w_sf', 'X_means_cell_abundance_w_sf'),
    ('q05_cell_abundance_w_sf', 'X_q05_cell_abundance_w_sf'),
    ('q95_cell_abundance_w_sf', 'X_q95_cell_abundance_w_sf'),
    ('stds_cell_abundance_w_sf', 'X_stds_cell_abundance_w_sf'),
    ('spatial', 'X_spatial'),
)
for existing_key, prefixed_key in obsm_key_pairs:
    adata.obsm[prefixed_key] = adata.obsm[existing_key]

In [160]:
adata.obsm

AxisArrays with keys: MT, means_cell_abundance_w_sf, q05_cell_abundance_w_sf, q95_cell_abundance_w_sf, spatial, stds_cell_abundance_w_sf, X_means_cell_abundance_w_sf, X_q05_cell_abundance_w_sf, X_q95_cell_abundance_w_sf, X_stds_cell_abundance_w_sf, X_spatial

In [161]:
# Remove 'MT' and the un-prefixed obsm keys, leaving only the 'X_' entries.
# Keys are deleted one at a time, in the order listed.
obsm_keys_to_drop = (
    'MT',
    'means_cell_abundance_w_sf',
    'q05_cell_abundance_w_sf',
    'q95_cell_abundance_w_sf',
    'stds_cell_abundance_w_sf',
    'spatial',
)
for obsm_key in obsm_keys_to_drop:
    del adata.obsm[obsm_key]

In [162]:
adata.obsm

AxisArrays with keys: X_means_cell_abundance_w_sf, X_q05_cell_abundance_w_sf, X_q95_cell_abundance_w_sf, X_stds_cell_abundance_w_sf, X_spatial

#### **uns (Dataset Metadata)**

In [163]:
adata.uns

OverloadedDict, wrapping:
	{'spatial': {'HCAHeartST11702008': {'images': {'hires': array([[[0.94509804, 0.9254902 , 0.92941177],
        [0.94509804, 0.92156863, 0.92941177],
        [0.94509804, 0.92156863, 0.92941177],
        ...,
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ]],

       [[0.9490196 , 0.9254902 , 0.92941177],
        [0.94509804, 0.9254902 , 0.92941177],
        [0.94509804, 0.92156863, 0.92941177],
        ...,
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ]],

       [[0.9490196 , 0.9254902 , 0.92941177],
        [0.9490196 , 0.9254902 , 0.92941177],
        [0.94509804, 0.9254902 , 0.92941177],
        ...,
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ],
        [0.9372549 , 0.91764706, 0.9254902 ]],

       ...,

       [[0.94509804, 0.9254902 

In [164]:
# Free-text caption stored under uns['image_caption'].
# NOTE(review): the section thickness and stain stated here are not derivable
# from the object itself — confirm against the publication.
adata.uns['image_caption'] = 'Shown here is an image of 10 μm thick slice of sinoatrial node region of the adult human heart stained with H&E'

In [165]:
# Dataset title stored under uns['title'].
adata.uns['title'] = 'Visium spatial - FFPE'

In [166]:
# Name the obsm key to use as the default embedding; 'X_spatial' is the
# 'X_'-prefixed copy of obsm['spatial'] created earlier in this notebook.
adata.uns['default_embedding'] = 'X_spatial'

In [167]:
adata.uns.keys()

dict_keys(['spatial', 'image_caption', 'title', 'default_embedding'])

### **Final checks and adjustments**

In [168]:
adata

AnnData object with n_obs × n_vars = 11312 × 17912
    obs: 'in_tissue', 'sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'mt_frac', 'n_counts', 'n_genes', 'annotation_JC', 'Publication', 'donor_type', 'region_finest', 'facility', 'flushed', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'donor_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'
    var: 'feature_is_filtered'
    uns: 'spatial', 'image_caption', 'title', 'default_embedding'
    obsm: 'X_means_cell_abundance_w_sf', 'X_q05_cell_abundance_w_sf', 'X_q95_cell_abundance_w_sf', 'X_stds_cell_abundance_w_sf', 'X_spatial'

In [169]:
adata.obs.dtypes

in_tissue                                      int64
sample                                      category
n_genes_by_counts                              int32
log1p_n_genes_by_counts                      float64
total_counts                                 float32
log1p_total_counts                           float32
pct_counts_in_top_50_genes                   float64
pct_counts_in_top_100_genes                  float64
pct_counts_in_top_200_genes                  float64
pct_counts_in_top_500_genes                  float64
mt_frac                                      float64
n_counts                                     float32
n_genes                                        int64
annotation_JC                               category
Publication                                 category
donor_type                                  category
region_finest                               category
facility                                    category
flushed                                     ca

In [170]:
# Downcast 64-bit numeric columns of adata.var to 32-bit, printing each change.
# The dtype snapshot is taken once, before any column is converted, so both
# passes work from the original dtypes (floats first, then ints).
var_dtypes = adata.var.dtypes
var_float64_cols = var_dtypes[var_dtypes == 'float64'].index
var_int64_cols = var_dtypes[var_dtypes == 'int64'].index

for col in var_float64_cols:
    adata.var[col] = adata.var[col].astype('float32')
    print(f"changed {col} from float64 to float32")

for col in var_int64_cols:
    adata.var[col] = adata.var[col].astype('int32')
    print(f"changed {col} from int64 to int32")

In [171]:
# Normalise adata.obs dtypes, printing each change:
#   float64 -> float32, int64 -> int32, object -> category.
# The dtype snapshot is taken once, before any column is converted, so all
# three passes work from the original dtypes and run in the order above.
obs_dtypes = adata.obs.dtypes
obs_float64_cols = obs_dtypes[obs_dtypes == 'float64'].index
obs_int64_cols = obs_dtypes[obs_dtypes == 'int64'].index
obs_object_cols = obs_dtypes[obs_dtypes == 'object'].index

for col in obs_float64_cols:
    adata.obs[col] = adata.obs[col].astype('float32')
    print(f"changed {col} from float64 to float32")

for col in obs_int64_cols:
    adata.obs[col] = adata.obs[col].astype('int32')
    print(f"changed {col} from int64 to int32")

for col in obs_object_cols:
    adata.obs[col] = adata.obs[col].astype('category')
    print(f"changed {col} from object to category")

changed log1p_n_genes_by_counts from float64 to float32
changed pct_counts_in_top_50_genes from float64 to float32
changed pct_counts_in_top_100_genes from float64 to float32
changed pct_counts_in_top_200_genes from float64 to float32
changed pct_counts_in_top_500_genes from float64 to float32
changed mt_frac from float64 to float32
changed in_tissue from int64 to int32
changed n_genes from int64 to int32


In [172]:
adata.obs

Unnamed: 0_level_0,in_tissue,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,...,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489559,36.857239,44.354671,56.679218,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACACCAATAACTGC-1,1,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018744,25.606394,34.812569,52.398014,...,CL:0000115,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582279,36.106087,54.430378,...,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,40.575867,53.647167,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013048,28.706999,35.863979,49.505733,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,HCAHeartST11702010,2707,7.903965,4686.0,8.452548,16.282543,22.108408,30.793854,46.670937,...,CL:0002138,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294300,38.444126,53.323730,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,HCAHeartST11702010,2435,7.798112,4555.0,8.424200,19.626783,26.498354,35.675083,52.030735,...,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,39.300583,64.279770,...,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351


In [173]:
adata.obs.columns

Index(['in_tissue', 'sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts',
       'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes',
       'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes',
       'pct_counts_in_top_500_genes', 'mt_frac', 'n_counts', 'n_genes',
       'annotation_JC', 'Publication', 'donor_type', 'region_finest',
       'facility', 'flushed', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'donor_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

In [174]:
list(adata.obs['Publication'].unique())

['8regions']

In [175]:
adata.var

Unnamed: 0,feature_is_filtered
ENSG00000187634,False
ENSG00000188976,False
ENSG00000187961,False
ENSG00000187583,False
ENSG00000187642,False
...,...
ENSG00000160298,False
ENSG00000160299,False
ENSG00000160305,False
ENSG00000160307,False


In [176]:
adata.obs

Unnamed: 0_level_0,in_tissue,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,...,cell_type_ontology_term_id,donor_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST11702008_AAACAAGTATCTCCCA-1,1,HCAHeartST11702008,4557,8.424639,14605.0,9.589188,30.489559,36.857239,44.354671,56.679218,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACACCAATAACTGC-1,1,HCAHeartST11702008,2134,7.666222,3628.0,8.196712,19.018744,25.606394,34.812569,52.398014,...,CL:0000115,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGAGCGACTCCT-1,1,HCAHeartST11702008,2004,7.603399,3318.0,8.107419,19.228451,26.582279,36.106087,54.430378,...,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGCTTTCAGAAG-1,1,HCAHeartST11702008,4028,8.301273,10940.0,9.300272,26.919561,33.016453,40.575867,53.647167,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702008_AAACAGGGTCTATATT-1,1,HCAHeartST11702008,4049,8.306472,10116.0,9.221972,23.013048,28.706999,35.863979,49.505733,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST11702010_TTGTTTCACATCCAGG-1,1,HCAHeartST11702010,2707,7.903965,4686.0,8.452548,16.282543,22.108408,30.793854,46.670937,...,CL:0002138,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCATTAGTCTA-1,1,HCAHeartST11702010,2518,7.831617,4859.0,8.488793,23.893805,30.294300,38.444126,53.323730,...,CL:0002129,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTCCATACAACT-1,1,HCAHeartST11702010,2435,7.798112,4555.0,8.424200,19.626783,26.498354,35.675083,52.030735,...,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351
HCAHeartST11702010_TTGTTTGTATTACACG-1,1,HCAHeartST11702010,929,6.835185,1201.0,7.091742,18.318068,26.644463,39.300583,64.279770,...,CL:0002548,HOPA3,HsapDv:0000239,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,na,UBERON:0002351


In [177]:
adata.obs.columns

Index(['in_tissue', 'sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts',
       'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes',
       'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes',
       'pct_counts_in_top_500_genes', 'mt_frac', 'n_counts', 'n_genes',
       'annotation_JC', 'Publication', 'donor_type', 'region_finest',
       'facility', 'flushed', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'donor_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

In [178]:
#check the format of expression matrix

In [179]:
adata.X

<11312x17912 sparse matrix of type '<class 'numpy.float32'>'
	with 36167086 stored elements in Compressed Sparse Row format>

In [180]:
#write the curated object to final_objects folder

In [181]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/final_objects/visium-FFPE.h5ad', compression = 'gzip')