### **Curating heart_global.h5ad**

Article: Spatially resolved multiomics of human cardiac niches

DOI: https://doi.org/10.1038/s41586-023-06311-1 

Data Source : https://www.heartcellatlas.org

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/data/heart_global.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 704296 × 32732
    obs: 'sangerID', 'donor', 'donor_type', 'region', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'scrublet_score'
    var: 'gene_name_scRNA-0-original', 'gene_name_snRNA-1-original', 'gene_name_multiome-2-original', 'gene_id'
    uns: 'age_colors', 'cell_or_nuclei_colors', 'cell_state_colors', 'cell_type_colors', 'donor_colors', 'donor_type_colors', 'facility_colors', 'flushed_colors', 'gender_colors', 'kit_10x_colors', 'leiden', 'log1p', 'modality_colors', 'neighbors', 'original_or_new_colors', 'region_colors', 'region_finest_colors', 'scANVI_predictions_colors', 'umap'
    obsm: 'X_umap'
    obsp: 'connectivities', 'distances'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<704296x32732 sparse matrix of type '<class 'numpy.float32'>'
	with 1049989447 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 32659)	0.87692964
  (0, 32637)	0.87692964
  (0, 32590)	0.87692964
  (0, 32582)	0.87692964
  (0, 32575)	1.3368461
  (0, 32564)	0.87692964
  (0, 32553)	0.87692964
  (0, 32520)	0.87692964
  (0, 32508)	2.0816321
  (0, 32496)	0.87692964
  (0, 32492)	0.87692964
  (0, 32474)	0.87692964
  (0, 32464)	0.87692964
  (0, 32449)	0.87692964
  (0, 32445)	1.6506809
  (0, 32419)	0.87692964
  (0, 32405)	0.87692964
  (0, 32355)	0.87692964
  (0, 32339)	2.503734
  (0, 32338)	0.87692964
  (0, 32331)	1.3368461
  (0, 32329)	1.6506809
  (0, 32328)	1.3368461
  (0, 32277)	0.87692964
  (0, 32211)	0.87692964
  :	:
  (704295, 1800)	3.0560234
  (704295, 1783)	3.0560234
  (704295, 1683)	3.0560234
  (704295, 1557)	3.0560234
  (704295, 1545)	3.0560234
  (704295, 1426)	3.0560234
  (704295, 1390)	3.0560234
  (704295, 1370)	3.7253518
  (704295, 1315)	3.0560234
  (704295, 1295)	3.0560234
  (704295, 1288)	3.0560234
  (704295, 1167)	3.0560234
  (704295, 1000)	3.0560234
  (704295, 996)	3.0560234
  (704295, 884)	3.0560234

##### **Raw counts matrix**

In [11]:
# If X has normalized counts, check for the raw counts matrix.

In [12]:
# check whether raw counts are present in adata.raw

In [13]:
adata.raw

In [14]:
# print(adata.raw.X)

In [15]:
#Raw count matrix is not present in adata.raw. Here the raw counts are provided in a separate object, load the raw counts matrix

In [16]:
araw = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/Raw_counts/RNA_adult-8reg_full_raw_cellstate-annotated.h5ad')

In [17]:
araw

AnnData object with n_obs × n_vars = 704296 × 32732
    obs: 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'batch_key', 'leiden_scVI', 'cell_type', 'cell_state_HCAv1', 'cell_state_scNym', 'cell_state_scNym_confidence', 'cell_state', 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'n_counts', '_scvi_batch', '_scvi_labels', 'clus20', 'doublet_cls', 'original_or_new', 'batch', 'scANVI_predictions', 'leiden_scArches'
    var: 'gene_name-new', 'gene_name_scRNA-0-original', 'gene_name_snRNA-1-original', 'gene_name_multiome-2-original'
    uns: 'age_colors', 'cell_or_nuclei_colors', 'cell_state_colors', 'cell_type_colors', 'donor_colors', 'donor_t

In [18]:
# Check whether adata and araw has same dimensions.

In [19]:
araw.X

<704296x32732 sparse matrix of type '<class 'numpy.float32'>'
	with 1049989447 stored elements in Compressed Sparse Row format>

In [20]:
# print raw matrix

In [21]:
print(araw.X)

  (0, 32659)	1.0
  (0, 32637)	1.0
  (0, 32590)	1.0
  (0, 32582)	1.0
  (0, 32575)	2.0
  (0, 32564)	1.0
  (0, 32553)	1.0
  (0, 32520)	1.0
  (0, 32508)	5.0
  (0, 32496)	1.0
  (0, 32492)	1.0
  (0, 32474)	1.0
  (0, 32464)	1.0
  (0, 32449)	1.0
  (0, 32445)	3.0
  (0, 32419)	1.0
  (0, 32405)	1.0
  (0, 32355)	1.0
  (0, 32339)	8.0
  (0, 32338)	1.0
  (0, 32331)	2.0
  (0, 32329)	3.0
  (0, 32328)	2.0
  (0, 32277)	1.0
  (0, 32211)	1.0
  :	:
  (704295, 1800)	1.0
  (704295, 1783)	1.0
  (704295, 1683)	1.0
  (704295, 1557)	1.0
  (704295, 1545)	1.0
  (704295, 1426)	1.0
  (704295, 1390)	1.0
  (704295, 1370)	2.0
  (704295, 1315)	1.0
  (704295, 1295)	1.0
  (704295, 1288)	1.0
  (704295, 1167)	1.0
  (704295, 1000)	1.0
  (704295, 996)	1.0
  (704295, 884)	1.0
  (704295, 704)	2.0
  (704295, 649)	2.0
  (704295, 636)	1.0
  (704295, 602)	1.0
  (704295, 587)	1.0
  (704295, 446)	2.0
  (704295, 445)	1.0
  (704295, 250)	1.0
  (704295, 154)	1.0
  (704295, 91)	1.0


##### **Variables(var)**

In [22]:
# View the var of anndata and raw object

In [23]:
adata.var

Unnamed: 0_level_0,gene_name_scRNA-0-original,gene_name_snRNA-1-original,gene_name_multiome-2-original,gene_id
gene_name-new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,ENSG00000243485
FAM138A,FAM138A,FAM138A,FAM138A,ENSG00000237613
OR4F5,OR4F5,OR4F5,OR4F5,ENSG00000186092
AL627309.1,AL627309.1,AL627309.1,AL627309.1,ENSG00000238009
AL627309.3,AL627309.3,AL627309.3,AL627309.3,ENSG00000239945
...,...,...,...,...
AC233755.2,AC233755.2,AC233755.2,AC233755.2,ENSG00000277856
AC233755.1,AC233755.1,AC233755.1,AC233755.1,ENSG00000275063
AC240274.1,AC240274.1,AC240274.1,AC240274.1,ENSG00000271254
AC213203.2,AC213203.1,AC213203.1,AC213203.2,ENSG00000277475


In [24]:
araw.var

Unnamed: 0,gene_name-new,gene_name_scRNA-0-original,gene_name_snRNA-1-original,gene_name_multiome-2-original
ENSG00000243485,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG
ENSG00000237613,FAM138A,FAM138A,FAM138A,FAM138A
ENSG00000186092,OR4F5,OR4F5,OR4F5,OR4F5
ENSG00000238009,AL627309.1,AL627309.1,AL627309.1,AL627309.1
ENSG00000239945,AL627309.3,AL627309.3,AL627309.3,AL627309.3
...,...,...,...,...
ENSG00000277856,AC233755.2,AC233755.2,AC233755.2,AC233755.2
ENSG00000275063,AC233755.1,AC233755.1,AC233755.1,AC233755.1
ENSG00000271254,AC240274.1,AC240274.1,AC240274.1,AC240274.1
ENSG00000277475,AC213203.2,AC213203.1,AC213203.1,AC213203.2


In [25]:
# If ensembl ids are not in the index column and is present in another column, set the ensembl ids column as the index column.

In [26]:
adata.var['gene_symbols'] = adata.var_names

In [27]:
adata.var

Unnamed: 0_level_0,gene_name_scRNA-0-original,gene_name_snRNA-1-original,gene_name_multiome-2-original,gene_id,gene_symbols
gene_name-new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,ENSG00000243485,MIR1302-2HG
FAM138A,FAM138A,FAM138A,FAM138A,ENSG00000237613,FAM138A
OR4F5,OR4F5,OR4F5,OR4F5,ENSG00000186092,OR4F5
AL627309.1,AL627309.1,AL627309.1,AL627309.1,ENSG00000238009,AL627309.1
AL627309.3,AL627309.3,AL627309.3,AL627309.3,ENSG00000239945,AL627309.3
...,...,...,...,...,...
AC233755.2,AC233755.2,AC233755.2,AC233755.2,ENSG00000277856,AC233755.2
AC233755.1,AC233755.1,AC233755.1,AC233755.1,ENSG00000275063,AC233755.1
AC240274.1,AC240274.1,AC240274.1,AC240274.1,ENSG00000271254,AC240274.1
AC213203.2,AC213203.1,AC213203.1,AC213203.2,ENSG00000277475,AC213203.2


In [28]:
adata.var_names = adata.var['gene_id']

In [29]:
adata.var

Unnamed: 0_level_0,gene_name_scRNA-0-original,gene_name_snRNA-1-original,gene_name_multiome-2-original,gene_id,gene_symbols
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,ENSG00000243485,MIR1302-2HG
ENSG00000237613,FAM138A,FAM138A,FAM138A,ENSG00000237613,FAM138A
ENSG00000186092,OR4F5,OR4F5,OR4F5,ENSG00000186092,OR4F5
ENSG00000238009,AL627309.1,AL627309.1,AL627309.1,ENSG00000238009,AL627309.1
ENSG00000239945,AL627309.3,AL627309.3,AL627309.3,ENSG00000239945,AL627309.3
...,...,...,...,...,...
ENSG00000277856,AC233755.2,AC233755.2,AC233755.2,ENSG00000277856,AC233755.2
ENSG00000275063,AC233755.1,AC233755.1,AC233755.1,ENSG00000275063,AC233755.1
ENSG00000271254,AC240274.1,AC240274.1,AC240274.1,ENSG00000271254,AC240274.1
ENSG00000277475,AC213203.1,AC213203.1,AC213203.2,ENSG00000277475,AC213203.2


In [30]:
# Load the approved genes file.

In [31]:
approved_genes = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/gene_info/genes_approved.csv')

In [32]:
#Create a dictionary from the approved genes file 

In [33]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [34]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [35]:
len(genedict)

116184

In [36]:
#Filter out the genes which are not in the approved genes file.

In [37]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [38]:
len(var_to_keep_adata)

32664

In [39]:
len(var_to_keep_araw)

32664

In [40]:
adata.var

Unnamed: 0_level_0,gene_name_scRNA-0-original,gene_name_snRNA-1-original,gene_name_multiome-2-original,gene_id,gene_symbols
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,ENSG00000243485,MIR1302-2HG
ENSG00000237613,FAM138A,FAM138A,FAM138A,ENSG00000237613,FAM138A
ENSG00000186092,OR4F5,OR4F5,OR4F5,ENSG00000186092,OR4F5
ENSG00000238009,AL627309.1,AL627309.1,AL627309.1,ENSG00000238009,AL627309.1
ENSG00000239945,AL627309.3,AL627309.3,AL627309.3,ENSG00000239945,AL627309.3
...,...,...,...,...,...
ENSG00000277856,AC233755.2,AC233755.2,AC233755.2,ENSG00000277856,AC233755.2
ENSG00000275063,AC233755.1,AC233755.1,AC233755.1,ENSG00000275063,AC233755.1
ENSG00000271254,AC240274.1,AC240274.1,AC240274.1,ENSG00000271254,AC240274.1
ENSG00000277475,AC213203.1,AC213203.1,AC213203.2,ENSG00000277475,AC213203.2


In [41]:
araw.var

Unnamed: 0,gene_name-new,gene_name_scRNA-0-original,gene_name_snRNA-1-original,gene_name_multiome-2-original
ENSG00000243485,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG
ENSG00000237613,FAM138A,FAM138A,FAM138A,FAM138A
ENSG00000186092,OR4F5,OR4F5,OR4F5,OR4F5
ENSG00000238009,AL627309.1,AL627309.1,AL627309.1,AL627309.1
ENSG00000239945,AL627309.3,AL627309.3,AL627309.3,AL627309.3
...,...,...,...,...
ENSG00000277856,AC233755.2,AC233755.2,AC233755.2,AC233755.2
ENSG00000275063,AC233755.1,AC233755.1,AC233755.1,AC233755.1
ENSG00000271254,AC240274.1,AC240274.1,AC240274.1,AC240274.1
ENSG00000277475,AC213203.2,AC213203.1,AC213203.1,AC213203.2


In [42]:
# Modify the anndata object by filtering out the filtered genes.

In [43]:
adata = adata[:, var_to_keep_adata].copy()
araw = araw[:, var_to_keep_araw].copy()

In [44]:
adata.var

Unnamed: 0_level_0,gene_name_scRNA-0-original,gene_name_snRNA-1-original,gene_name_multiome-2-original,gene_id,gene_symbols
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,ENSG00000243485,MIR1302-2HG
ENSG00000237613,FAM138A,FAM138A,FAM138A,ENSG00000237613,FAM138A
ENSG00000186092,OR4F5,OR4F5,OR4F5,ENSG00000186092,OR4F5
ENSG00000238009,AL627309.1,AL627309.1,AL627309.1,ENSG00000238009,AL627309.1
ENSG00000239945,AL627309.3,AL627309.3,AL627309.3,ENSG00000239945,AL627309.3
...,...,...,...,...,...
ENSG00000277856,AC233755.2,AC233755.2,AC233755.2,ENSG00000277856,AC233755.2
ENSG00000275063,AC233755.1,AC233755.1,AC233755.1,ENSG00000275063,AC233755.1
ENSG00000271254,AC240274.1,AC240274.1,AC240274.1,ENSG00000271254,AC240274.1
ENSG00000277475,AC213203.1,AC213203.1,AC213203.2,ENSG00000277475,AC213203.2


In [45]:
araw.var

Unnamed: 0,gene_name-new,gene_name_scRNA-0-original,gene_name_snRNA-1-original,gene_name_multiome-2-original
ENSG00000243485,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG
ENSG00000237613,FAM138A,FAM138A,FAM138A,FAM138A
ENSG00000186092,OR4F5,OR4F5,OR4F5,OR4F5
ENSG00000238009,AL627309.1,AL627309.1,AL627309.1,AL627309.1
ENSG00000239945,AL627309.3,AL627309.3,AL627309.3,AL627309.3
...,...,...,...,...
ENSG00000277856,AC233755.2,AC233755.2,AC233755.2,AC233755.2
ENSG00000275063,AC233755.1,AC233755.1,AC233755.1,AC233755.1
ENSG00000271254,AC240274.1,AC240274.1,AC240274.1,AC240274.1
ENSG00000277475,AC213203.2,AC213203.1,AC213203.1,AC213203.2


feature is filtered

In [46]:
#Since feature was not filtered out in the normalized matrix (X), assign false

In [47]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [48]:
#View var

In [49]:
adata.var

Unnamed: 0_level_0,gene_name_scRNA-0-original,gene_name_snRNA-1-original,gene_name_multiome-2-original,gene_id,gene_symbols,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000243485,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,ENSG00000243485,MIR1302-2HG,False
ENSG00000237613,FAM138A,FAM138A,FAM138A,ENSG00000237613,FAM138A,False
ENSG00000186092,OR4F5,OR4F5,OR4F5,ENSG00000186092,OR4F5,False
ENSG00000238009,AL627309.1,AL627309.1,AL627309.1,ENSG00000238009,AL627309.1,False
ENSG00000239945,AL627309.3,AL627309.3,AL627309.3,ENSG00000239945,AL627309.3,False
...,...,...,...,...,...,...
ENSG00000277856,AC233755.2,AC233755.2,AC233755.2,ENSG00000277856,AC233755.2,False
ENSG00000275063,AC233755.1,AC233755.1,AC233755.1,ENSG00000275063,AC233755.1,False
ENSG00000271254,AC240274.1,AC240274.1,AC240274.1,ENSG00000271254,AC240274.1,False
ENSG00000277475,AC213203.1,AC213203.1,AC213203.2,ENSG00000277475,AC213203.2,False


In [50]:
araw.var

Unnamed: 0,gene_name-new,gene_name_scRNA-0-original,gene_name_snRNA-1-original,gene_name_multiome-2-original
ENSG00000243485,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG
ENSG00000237613,FAM138A,FAM138A,FAM138A,FAM138A
ENSG00000186092,OR4F5,OR4F5,OR4F5,OR4F5
ENSG00000238009,AL627309.1,AL627309.1,AL627309.1,AL627309.1
ENSG00000239945,AL627309.3,AL627309.3,AL627309.3,AL627309.3
...,...,...,...,...
ENSG00000277856,AC233755.2,AC233755.2,AC233755.2,AC233755.2
ENSG00000275063,AC233755.1,AC233755.1,AC233755.1,AC233755.1
ENSG00000271254,AC240274.1,AC240274.1,AC240274.1,AC240274.1
ENSG00000277475,AC213203.2,AC213203.1,AC213203.1,AC213203.2


In [51]:
del araw.var['gene_name-new']
del araw.var['gene_name_scRNA-0-original']
del araw.var['gene_name_snRNA-1-original']
del araw.var['gene_name_multiome-2-original']

In [52]:
del adata.var['gene_name_scRNA-0-original']
del adata.var['gene_name_snRNA-1-original']
del adata.var['gene_name_multiome-2-original']
del adata.var['gene_id']
del adata.var['gene_symbols']

In [53]:
# View var

In [54]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
gene_id,Unnamed: 1_level_1
ENSG00000243485,False
ENSG00000237613,False
ENSG00000186092,False
ENSG00000238009,False
ENSG00000239945,False
...,...
ENSG00000277856,False
ENSG00000275063,False
ENSG00000271254,False
ENSG00000277475,False


In [55]:
araw.var

ENSG00000243485
ENSG00000237613
ENSG00000186092
ENSG00000238009
ENSG00000239945
...
ENSG00000277856
ENSG00000275063
ENSG00000271254
ENSG00000277475
ENSG00000268674


#### **obs (Cell metadata)**

In [56]:
#view obs

In [57]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,cell_type,cell_state,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo,scrublet_score
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,Endothelial cell,EC5_art,2470,2470,7127.0,0.0,0.000000,469.0,6.580609,0.228571
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,Mural cell,SMC2_art,2057,2057,6094.0,1.0,0.016410,334.0,5.480801,0.134615
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,Endothelial cell,EC5_art,2432,2432,6254.0,0.0,0.000000,326.0,5.212664,0.228571
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,Mural cell,PC2_atria,1821,1821,5864.0,0.0,0.000000,597.0,10.180764,0.228571
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,Endothelial cell,EC5_art,2332,2332,4922.0,0.0,0.000000,246.0,4.997968,0.172414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,Fibroblast,FB3,366,366,506.0,3.0,0.592885,2.0,0.395257,0.141722
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,Lymphoid,CD8+T_em,433,433,504.0,24.0,4.761905,2.0,0.396825,0.244000
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,Fibroblast,FB4_activated,361,361,501.0,12.0,2.395210,1.0,0.199601,0.057737
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,Myeloid,MoMP,430,430,507.0,19.0,3.747534,2.0,0.394477,0.133080


In [58]:
# view the column names in obs

In [59]:
adata.obs.columns

Index(['sangerID', 'donor', 'donor_type', 'region', 'age', 'gender',
       'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed',
       'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score'],
      dtype='object')

#### **assay_ontology_term_id**

In [60]:
# identify the column in adata which corresponds to assay

In [61]:
list(adata.obs['kit_10x'].unique())

['3prime-v2', '3prime-v3', 'Multiome-v1']

In [62]:
# create a dictionary with assay and corresponding assay ontology term id

In [63]:
mapping= {'3prime-v2':'EFO:0009899', '3prime-v3':'EFO:0009922', 'Multiome-v1' : 'EFO:0030059'}

In [64]:
# add the assay_ontology_term_id column

In [65]:
adata.obs['assay_ontology_term_id'] = adata.obs['kit_10x'].map(mapping)

In [66]:
# change datatype of the column

In [67]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [68]:
# view adata.obs

In [69]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,cell_state,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo,scrublet_score,assay_ontology_term_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,EC5_art,2470,2470,7127.0,0.0,0.000000,469.0,6.580609,0.228571,EFO:0009899
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,SMC2_art,2057,2057,6094.0,1.0,0.016410,334.0,5.480801,0.134615,EFO:0009899
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,EC5_art,2432,2432,6254.0,0.0,0.000000,326.0,5.212664,0.228571,EFO:0009899
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,PC2_atria,1821,1821,5864.0,0.0,0.000000,597.0,10.180764,0.228571,EFO:0009899
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,EC5_art,2332,2332,4922.0,0.0,0.000000,246.0,4.997968,0.172414,EFO:0009899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,FB3,366,366,506.0,3.0,0.592885,2.0,0.395257,0.141722,EFO:0030059
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,CD8+T_em,433,433,504.0,24.0,4.761905,2.0,0.396825,0.244000,EFO:0030059
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,FB4_activated,361,361,501.0,12.0,2.395210,1.0,0.199601,0.057737,EFO:0030059
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,MoMP,430,430,507.0,19.0,3.747534,2.0,0.394477,0.133080,EFO:0030059


#### **cell_type_ontology_term_id**

In [70]:
#identify the column in adata.obs related. to cell type annotation

In [71]:
adata.obs.columns

Index(['sangerID', 'donor', 'donor_type', 'region', 'age', 'gender',
       'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed',
       'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id'],
      dtype='object')

In [72]:
list(adata.obs['cell_type'].unique())

['Endothelial cell',
 'Mural cell',
 'Myeloid',
 'Fibroblast',
 'Lymphoid',
 'Neural cell',
 'Ventricular Cardiomyocyte',
 'Lymphatic Endothelial cell',
 'Mesothelial cell',
 'Atrial Cardiomyocyte',
 'Mast cell',
 'Adipocyte']

In [73]:
# create a dictionary of cell type and ontology term

In [74]:
mapping= {'aCM1': 'CL:0002129',
 'aCM2': 'CL:0002129',
 'aCM3': 'CL:0002129',
 'aCM4': 'CL:0002129',
 'aCM5': 'CL:0002129',
 'Adip1': 'CL:0000136',
 'Adip2': 'CL:0000136',
 'Adip3': 'CL:0000136',
 'Adip4': 'CL:0000136',
 'Adipocyte': 'CL:0000136',
 'Atrial Cardiomyocyte': 'CL:0002129',
 'AVN_bundle_cell': 'CL:0010005',
 'AVN_P_cell': 'CL:1000477',
 'B': 'CL:0000236',
 'B_plasma': 'CL:0000786',
 'CD14+Mo': 'CL:0001054',
 'CD16+Mo': 'CL:0002396',
 'CD4+T_act': 'CL:0000896',
 'CD4+T_naive': 'CL:0000895',
 'CD4+T_Th2': 'CL:0000546',
 'CD8+T_cytox': 'CL:0000794',
 'CD8+T_em': 'CL:0000913',
 'CD8+T_te': 'CL:0000625',
 'CD8+T_trans': 'CL:0000625',
 'DC': 'CL:0001056',
 'EC1_cap': 'CL:0002144',
 'EC10_CMC-like': 'CL:0000115',
 'EC2_cap': 'CL:0002144',
 'EC3_cap': 'CL:0002144',
 'EC4_immune': 'CL:0000115',
 'EC5_art': 'CL:1000413',
 'EC6_ven': 'CL:0002543',
 'EC7_atria': 'CL:0002350',
 'EC7_endocardial': 'CL:0002350',
 'EC8_ln': 'CL:0002138',
 'Endothelial cell': 'CL:0000115',
 'FB1': 'CL:0002548',
 'FB2': 'CL:0002548',
 'FB3': 'CL:0002548',
 'FB4': 'CL:0002548',
 'FB4_activated': 'CL:0002548',
 'FB5': 'CL:0002548',
 'FB6': 'CL:0002548',
 'Fibroblast': 'CL:0000057',
 'ILC': 'CL:0001065',
 'Lymphatic Endothelial cell': 'CL:0002138',
 'Lymphoid': 'CL:0000542',
 'LYVE1+IGF1+MP': 'CL:0000235',
 'LYVE1+MP_cycling': 'CL:0000235',
 'LYVE1+TIMD4+MP': 'CL:0000235',
 'MAIT-like': 'CL:0000940',
 'Mast': 'CL:0000097',
 'Mast cell': 'CL:0000097',
 'Meso': 'CL:0000077',
 'Mesothelial cell': 'CL:0000077',
 'MoMP': 'CL:0000576',
 'Mural cell': 'CL:0008034',
 'Myeloid': 'CL:0000763',
 'NC1': 'CL:0000125',
 'NC1_glial': 'CL:0000125',
 'NC2': 'CL:0000125',
 'NC2_glial_NGF+': 'CL:0000125',
 'Neural cell': 'CL:0002319',
 'Neut': 'CL:0000775',
 'NK_CD16hi': 'CL:0000939',
 'NK_CD56hi': 'CL:0000938',
 'PC1_vent': 'CL:0000669',
 'PC2_atria': 'CL:0000669',
 'PC3_str': 'CL:0000669',
 'PC4_CMC-like': 'CL:0000669',
 'SAN_P_cell': 'CL:1000477',
 'SMC1_basic': 'CL:0000192',
 'SMC2_art': 'CL:0002591',
 'T/NK_cycling': 'CL:0000814',
 'vCM1': 'CL:0002131',
 'vCM3_stressed': 'CL:0002131',
 'vCM4': 'CL:0002131',
 'Ventricular Cardiomyocyte': 'CL:0002131',
 'unclassified': 'CL:0000003'}

In [75]:
# add the cell_type_ontology_term_id column

In [76]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type'].map(mapping)

In [77]:
# change datatype of the column

In [78]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [79]:
# view adata.obs

In [80]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo,scrublet_score,assay_ontology_term_id,cell_type_ontology_term_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,2470,2470,7127.0,0.0,0.000000,469.0,6.580609,0.228571,EFO:0009899,CL:0000115
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,2057,2057,6094.0,1.0,0.016410,334.0,5.480801,0.134615,EFO:0009899,CL:0008034
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,2432,2432,6254.0,0.0,0.000000,326.0,5.212664,0.228571,EFO:0009899,CL:0000115
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,1821,1821,5864.0,0.0,0.000000,597.0,10.180764,0.228571,EFO:0009899,CL:0008034
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,2332,2332,4922.0,0.0,0.000000,246.0,4.997968,0.172414,EFO:0009899,CL:0000115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,366,366,506.0,3.0,0.592885,2.0,0.395257,0.141722,EFO:0030059,CL:0000057
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,433,433,504.0,24.0,4.761905,2.0,0.396825,0.244000,EFO:0030059,CL:0000542
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,361,361,501.0,12.0,2.395210,1.0,0.199601,0.057737,EFO:0030059,CL:0000057
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,430,430,507.0,19.0,3.747534,2.0,0.394477,0.133080,EFO:0030059,CL:0000763


#### **development_stage_ontology_term_id**

In [81]:
# identify the column in adata which corresponds to age

In [82]:
adata.obs.columns

Index(['sangerID', 'donor', 'donor_type', 'region', 'age', 'gender',
       'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed',
       'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id',
       'cell_type_ontology_term_id'],
      dtype='object')

In [83]:
list(adata.obs['age'].unique())

['50-55', '55-60', '70-75', '65-70', '60-65', '40-45', '45-50', 'nan', '20-25']

In [84]:
# modify the age of donors based on supplementary info

In [85]:
adata.obs['age'] = np.where(adata.obs['donor'] == 'D6' , '70-75', adata.obs['age'])
adata.obs['age'] = np.where(adata.obs['donor'] == 'AH1' , '45-50', adata.obs['age'])
adata.obs['age'] = np.where(adata.obs['donor'] == 'A61' , '70-75', adata.obs['age'])

In [86]:
#adata.obs.loc[adata.obs['donor'] == 'D6', 'age'] = '70-75'
#adata.obs.loc[adata.obs['donor'] == 'AH1', 'age'] = '45-50'
#adata.obs.loc[adata.obs['donor'] == 'A61', 'age'] = '70-75'

In [87]:
# view the modified age

In [88]:
age_value = adata.obs.loc[adata.obs['donor'] == 'D6', 'age'].values[0]

In [89]:
age_value

'70-75'

In [90]:
# Get unique values of age column

In [91]:
list(adata.obs['age'].unique())

['50-55', '55-60', '70-75', '65-70', '60-65', '40-45', '45-50', '20-25']

In [92]:
# create a dictionary for age and development stage ontology term id

In [93]:
mapping= {'50-55':'HsapDv:0000240', 
          '55-60':'HsapDv:0000240', 
          '70-75':'HsapDv:0000242', 
          '65-70':'HsapDv:0000241', 
          '60-65':'HsapDv:0000241',
          '40-45':'HsapDv:0000239', 
          '45-50':'HsapDv:0000239', 
          '20-25':'HsapDv:0000237'}

In [94]:
# add the development_stage_ontology_term_id column

In [95]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['age'].map(mapping)

In [96]:
# change datatype of the column

In [97]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [98]:
# view unique values of development_stage_ontology_term_id column

In [99]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000240',
 'HsapDv:0000242',
 'HsapDv:0000241',
 'HsapDv:0000239',
 'HsapDv:0000237']

In [100]:
# view adata.obs

In [101]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo,scrublet_score,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,2470,7127.0,0.0,0.000000,469.0,6.580609,0.228571,EFO:0009899,CL:0000115,HsapDv:0000240
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,2057,6094.0,1.0,0.016410,334.0,5.480801,0.134615,EFO:0009899,CL:0008034,HsapDv:0000240
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,2432,6254.0,0.0,0.000000,326.0,5.212664,0.228571,EFO:0009899,CL:0000115,HsapDv:0000240
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,1821,5864.0,0.0,0.000000,597.0,10.180764,0.228571,EFO:0009899,CL:0008034,HsapDv:0000240
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,2332,4922.0,0.0,0.000000,246.0,4.997968,0.172414,EFO:0009899,CL:0000115,HsapDv:0000240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,366,506.0,3.0,0.592885,2.0,0.395257,0.141722,EFO:0030059,CL:0000057,HsapDv:0000239
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,433,504.0,24.0,4.761905,2.0,0.396825,0.244000,EFO:0030059,CL:0000542,HsapDv:0000239
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,361,501.0,12.0,2.395210,1.0,0.199601,0.057737,EFO:0030059,CL:0000057,HsapDv:0000239
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,430,507.0,19.0,3.747534,2.0,0.394477,0.133080,EFO:0030059,CL:0000763,HsapDv:0000239


#### **donor_id**

In [102]:
#identify the column in adata.obs which provides donor information

In [103]:
adata.obs.columns

Index(['sangerID', 'donor', 'donor_type', 'region', 'age', 'gender',
       'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed',
       'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [104]:
list(adata.obs['donor'].unique())

['D1',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D11',
 'D2',
 'H5',
 'H6',
 'H3',
 'H2',
 'H7',
 'H4',
 'D8',
 'AH1',
 'A61',
 'AV10',
 'AV14',
 'AV3',
 'AV13',
 'AH2']

In [105]:
# add the donor_id column

In [106]:
adata.obs['donor_id'] = adata.obs['donor']

In [107]:
# change datatype of the column

In [108]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [109]:
# view unique values of donor_id column

In [110]:
list(adata.obs['donor_id'].unique())

['D1',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D11',
 'D2',
 'H5',
 'H6',
 'H3',
 'H2',
 'H7',
 'H4',
 'D8',
 'AH1',
 'A61',
 'AV10',
 'AV14',
 'AV3',
 'AV13',
 'AH2']

In [111]:
#view obs

In [112]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo,scrublet_score,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,7127.0,0.0,0.000000,469.0,6.580609,0.228571,EFO:0009899,CL:0000115,HsapDv:0000240,D1
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,6094.0,1.0,0.016410,334.0,5.480801,0.134615,EFO:0009899,CL:0008034,HsapDv:0000240,D1
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,6254.0,0.0,0.000000,326.0,5.212664,0.228571,EFO:0009899,CL:0000115,HsapDv:0000240,D1
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,5864.0,0.0,0.000000,597.0,10.180764,0.228571,EFO:0009899,CL:0008034,HsapDv:0000240,D1
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,4922.0,0.0,0.000000,246.0,4.997968,0.172414,EFO:0009899,CL:0000115,HsapDv:0000240,D1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,506.0,3.0,0.592885,2.0,0.395257,0.141722,EFO:0030059,CL:0000057,HsapDv:0000239,AH2
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,504.0,24.0,4.761905,2.0,0.396825,0.244000,EFO:0030059,CL:0000542,HsapDv:0000239,AH2
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,501.0,12.0,2.395210,1.0,0.199601,0.057737,EFO:0030059,CL:0000057,HsapDv:0000239,AH2
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,507.0,19.0,3.747534,2.0,0.394477,0.133080,EFO:0030059,CL:0000763,HsapDv:0000239,AH2


#### **disease_ontology_term_id**

In [113]:
# Assign normal since all are healthy patients

In [114]:
# add the disease_ontology_term_id column

In [115]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461']* len(adata.obs)

In [116]:
# change datatype of the column

In [117]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [118]:
# view obs

In [119]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo,scrublet_score,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,0.0,0.000000,469.0,6.580609,0.228571,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,1.0,0.016410,334.0,5.480801,0.134615,EFO:0009899,CL:0008034,HsapDv:0000240,D1,PATO:0000461
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,0.0,0.000000,326.0,5.212664,0.228571,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,0.0,0.000000,597.0,10.180764,0.228571,EFO:0009899,CL:0008034,HsapDv:0000240,D1,PATO:0000461
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,0.0,0.000000,246.0,4.997968,0.172414,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,3.0,0.592885,2.0,0.395257,0.141722,EFO:0030059,CL:0000057,HsapDv:0000239,AH2,PATO:0000461
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,24.0,4.761905,2.0,0.396825,0.244000,EFO:0030059,CL:0000542,HsapDv:0000239,AH2,PATO:0000461
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,12.0,2.395210,1.0,0.199601,0.057737,EFO:0030059,CL:0000057,HsapDv:0000239,AH2,PATO:0000461
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,19.0,3.747534,2.0,0.394477,0.133080,EFO:0030059,CL:0000763,HsapDv:0000239,AH2,PATO:0000461


#### **is_primary_data**

In [120]:
# load the heart cell anndata object which is already in the cellxgene portal

In [121]:
adata2 = sc.read_h5ad('/home/jovyan/CXG_DATASETS_PORTAL/heart_cell_atlas/suppl_info/local.h5ad')

In [122]:
# view columns in obs

In [123]:
adata.obs.columns

Index(['sangerID', 'donor', 'donor_type', 'region', 'age', 'gender',
       'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed',
       'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id'],
      dtype='object')

In [124]:
adata2.obs.columns

Index(['NRP', 'cell_source', 'donor_id', 'n_counts', 'n_genes', 'percent_mito',
       'percent_ribo', 'sample', 'scrublet_score', 'source', 'type',
       'cell_states', 'Used', 'disease_ontology_term_id',
       'assay_ontology_term_id', 'cell_type_original',
       'tissue_ontology_term_id', 'development_stage_ontology_term_id',
       'cell_type_ontology_term_id', 'suspension_type',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id', 'cell_type', 'assay',
       'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity',
       'development_stage'],
      dtype='object')

In [125]:
#view cells

In [126]:
adata.obs_names

Index(['HCAHeart7606896_GATGAGGCACGGCTAC', 'HCAHeart7606896_CGCTTCACATTTGCCC',
       'HCAHeart7606896_GTTAAGCAGAGACTAT', 'HCAHeart7606896_TCGCGTTGTAAGAGGA',
       'HCAHeart7606896_GCTGCGAGTGTTGGGA', 'HCAHeart7606896_AAGACCTGTACTTGAC',
       'HCAHeart7606896_CTTACCGCACCGAATT', 'HCAHeart7606896_TCTATTGTCGTGACAT',
       'HCAHeart7606896_GACTGCGGTAACGTTC', 'HCAHeart7606896_CTGGTCTAGGTAAACT',
       ...
       'HCAHeartST13180619_HCAHeartST13177116_TACAGCTAGTCATTAG-1',
       'HCAHeartST13180619_HCAHeartST13177116_GAGTGATCAGCAAGTG-1',
       'HCAHeartST13180619_HCAHeartST13177116_GCTCACAAGGCGCACT-1',
       'HCAHeartST13180619_HCAHeartST13177116_GAGTCATTCTTGGACG-1',
       'HCAHeartST13180619_HCAHeartST13177116_GTACTAATCTTACTCG-1',
       'HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1',
       'HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1',
       'HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1',
       'HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAA

In [127]:
adata2.obs_names

Index(['AAACCCAAGAACGCGT-1-H0015_apex', 'AAACCCAAGCAAACAT-1-H0015_apex',
       'AAACCCAAGCTACTGT-1-H0015_apex', 'AAACCCAGTACCGCGT-1-H0015_apex',
       'AAACCCATCAAACCCA-1-H0015_apex', 'AAACCCATCGCAACAT-1-H0015_apex',
       'AAACCCATCTGGTCAA-1-H0015_apex', 'AAACGAAAGATAGGGA-1-H0015_apex',
       'AAACGAAAGCTCAGAG-1-H0015_apex', 'AAACGAAAGGCTAACG-1-H0015_apex',
       ...
       'TTTGTTGCACCCTAAA-1-HCAHeart8102862',
       'TTTGTTGCACCCTGTT-1-HCAHeart8102862',
       'TTTGTTGCACTGAGGA-1-HCAHeart8102862',
       'TTTGTTGCAGCGGTTC-1-HCAHeart8102862',
       'TTTGTTGTCATACAGC-1-HCAHeart8102862',
       'TTTGTTGTCATACGGT-1-HCAHeart8102862',
       'TTTGTTGTCCTACCAC-1-HCAHeart8102862',
       'TTTGTTGTCGACGCTG-1-HCAHeart8102862',
       'TTTGTTGTCGGCTGAC-1-HCAHeart8102862',
       'TTTGTTGTCGGTGTTA-1-HCAHeart8102862'],
      dtype='object', length=486134)

In [128]:
# copy cells to barcode column

In [129]:
adata.obs['barcode'] = adata.obs_names

In [130]:
adata2.obs['barcode'] = adata2.obs_names

In [131]:
adata.obs['barcode']

barcode
HCAHeart7606896_GATGAGGCACGGCTAC                                             HCAHeart7606896_GATGAGGCACGGCTAC
HCAHeart7606896_CGCTTCACATTTGCCC                                             HCAHeart7606896_CGCTTCACATTTGCCC
HCAHeart7606896_GTTAAGCAGAGACTAT                                             HCAHeart7606896_GTTAAGCAGAGACTAT
HCAHeart7606896_TCGCGTTGTAAGAGGA                                             HCAHeart7606896_TCGCGTTGTAAGAGGA
HCAHeart7606896_GCTGCGAGTGTTGGGA                                             HCAHeart7606896_GCTGCGAGTGTTGGGA
                                                                                  ...                        
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1    HCAHeartST13180619_HCAHeartST13177116_CTTGAATC...
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1    HCAHeartST13180619_HCAHeartST13177116_AAAGCTTG...
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1    HCAHeartST13180619_HCAHeartST13177116_AATCATCC...
HC

In [132]:
adata2.obs['barcode']

AAACCCAAGAACGCGT-1-H0015_apex              AAACCCAAGAACGCGT-1-H0015_apex
AAACCCAAGCAAACAT-1-H0015_apex              AAACCCAAGCAAACAT-1-H0015_apex
AAACCCAAGCTACTGT-1-H0015_apex              AAACCCAAGCTACTGT-1-H0015_apex
AAACCCAGTACCGCGT-1-H0015_apex              AAACCCAGTACCGCGT-1-H0015_apex
AAACCCATCAAACCCA-1-H0015_apex              AAACCCATCAAACCCA-1-H0015_apex
                                                     ...                
TTTGTTGTCATACGGT-1-HCAHeart8102862    TTTGTTGTCATACGGT-1-HCAHeart8102862
TTTGTTGTCCTACCAC-1-HCAHeart8102862    TTTGTTGTCCTACCAC-1-HCAHeart8102862
TTTGTTGTCGACGCTG-1-HCAHeart8102862    TTTGTTGTCGACGCTG-1-HCAHeart8102862
TTTGTTGTCGGCTGAC-1-HCAHeart8102862    TTTGTTGTCGGCTGAC-1-HCAHeart8102862
TTTGTTGTCGGTGTTA-1-HCAHeart8102862    TTTGTTGTCGGTGTTA-1-HCAHeart8102862
Name: barcode, Length: 486134, dtype: object

In [133]:
# Rename columns in adata2 such that it is similar to adata

In [134]:
def rearrange_barcode(barcode):
    parts = barcode.split('-')
    if len(parts) == 3:
        return f'{parts[2]}_{parts[0]}-{parts[1]}'

In [135]:
adata2.obs['barcode'] = adata2.obs['barcode'].apply(rearrange_barcode)


In [136]:
# View adata2 barcode column

In [137]:
adata2.obs['barcode'] 

AAACCCAAGAACGCGT-1-H0015_apex              H0015_apex_AAACCCAAGAACGCGT-1
AAACCCAAGCAAACAT-1-H0015_apex              H0015_apex_AAACCCAAGCAAACAT-1
AAACCCAAGCTACTGT-1-H0015_apex              H0015_apex_AAACCCAAGCTACTGT-1
AAACCCAGTACCGCGT-1-H0015_apex              H0015_apex_AAACCCAGTACCGCGT-1
AAACCCATCAAACCCA-1-H0015_apex              H0015_apex_AAACCCATCAAACCCA-1
                                                     ...                
TTTGTTGTCATACGGT-1-HCAHeart8102862    HCAHeart8102862_TTTGTTGTCATACGGT-1
TTTGTTGTCCTACCAC-1-HCAHeart8102862    HCAHeart8102862_TTTGTTGTCCTACCAC-1
TTTGTTGTCGACGCTG-1-HCAHeart8102862    HCAHeart8102862_TTTGTTGTCGACGCTG-1
TTTGTTGTCGGCTGAC-1-HCAHeart8102862    HCAHeart8102862_TTTGTTGTCGGCTGAC-1
TTTGTTGTCGGTGTTA-1-HCAHeart8102862    HCAHeart8102862_TTTGTTGTCGGTGTTA-1
Name: barcode, Length: 486134, dtype: object

In [138]:
# identify similar cells and assign 'is_primary_data value'

In [139]:
def split_and_update_barcode(row):
    parts = row['barcode'].split('_')
    if len(parts) == 3:
        part1, part2, part3 = parts
        if part1 in row['sangerID']:
            return f"{part1}_{part3}"
        elif part2 in row['sangerID']:
            return f"{part2}_{part3}"
    return row['barcode']

In [140]:
adata.obs['barcode'] = adata.obs.apply(split_and_update_barcode, axis=1)

In [141]:
adata.obs['barcode'] = adata.obs['barcode'].str.replace('-1', '')

In [142]:
adata2.obs['barcode'] = adata2.obs['barcode'].str.replace('-1', '')

In [143]:
barcode_set_adata2 = set(adata2.obs['barcode'])

In [144]:
barcode_set_adata2 

{'HCAHeart7664653_CGGACGTGTCTAGCGC',
 'HCAHeart7888925_GCGACCAAGTTAAGTG',
 'HCAHeart8102862_CATGCAATCTCCGCAT',
 'HCAHeart7844002_ACCGTAAAGAATCTCC',
 'HCAHeart8287127_GTCTTTACATCCGAGC',
 'HCAHeart7843999_TCTCTAATCACAACGT',
 'HCAHeart7702881_AGCTTGACATCTCGCT',
 'H0025_apex_CTTCCTTGTGCAATAA',
 'HCAHeart7757637_CGACTTCTCATCATTC',
 'HCAHeart7702873_GGATTACGTCATTAGC',
 'HCAHeart7833855_TACGGTAGTCAGAGGT',
 'HCAHeart8102861_GTCGTAAGTCCGACGT',
 'H0015_apex_TAAGCCAGTATGTCCA',
 'H0015_RV_AAGCGTTCATAGGAGC',
 'H0020_RV_TTTCGATTCCGATAGT',
 'H0037_Apex_TTGCATTCAATCAAGA',
 'H0015_septum_AAGTACCTCTGGCCTT',
 'H0035_RA_TCCAGAAGTTGCACGC',
 'H0035_RV_GCTGAATGTACGATGG',
 'H0026_RA_GTCGAATCATAACAGA',
 'HCAHeart7888928_TCAGCAATCGTTTATC',
 'HCAHeart7656538_CATCGAACAGGTGGAT',
 'HCAHeart7844000_GCGACCAAGACCTTTG',
 'HCAHeart7985086_TCTTTCCAGGCATGGT',
 'HCAHeart7844001_CATATGGGTGAAATCA',
 'HCAHeart8102862_ACCCTCATCCCTGGTT',
 'H0015_septum_AGATGAATCGCCTATC',
 'H0035_RV_GTGTTAGCAATACGCT',
 'H0026_LA_AACGGGAGTGACCTGC

In [145]:
adata.obs['is_primary_data'] = ~adata.obs['barcode'].isin(barcode_set_adata2)

In [146]:
adata.obs['is_primary_data']

barcode
HCAHeart7606896_GATGAGGCACGGCTAC                            False
HCAHeart7606896_CGCTTCACATTTGCCC                            False
HCAHeart7606896_GTTAAGCAGAGACTAT                            False
HCAHeart7606896_TCGCGTTGTAAGAGGA                            False
HCAHeart7606896_GCTGCGAGTGTTGGGA                            False
                                                            ...  
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1     True
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1     True
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1     True
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1     True
HCAHeartST13180619_HCAHeartST13177116_GCAAGTCGTACGGGTT-1     True
Name: is_primary_data, Length: 704296, dtype: bool

In [147]:
list(adata.obs['is_primary_data'].unique())

[False, True]

In [148]:
# view 'is_primary_data'

In [149]:
adata.obs['is_primary_data'] 

barcode
HCAHeart7606896_GATGAGGCACGGCTAC                            False
HCAHeart7606896_CGCTTCACATTTGCCC                            False
HCAHeart7606896_GTTAAGCAGAGACTAT                            False
HCAHeart7606896_TCGCGTTGTAAGAGGA                            False
HCAHeart7606896_GCTGCGAGTGTTGGGA                            False
                                                            ...  
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1     True
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1     True
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1     True
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1     True
HCAHeartST13180619_HCAHeartST13177116_GCAAGTCGTACGGGTT-1     True
Name: is_primary_data, Length: 704296, dtype: bool

In [150]:
# view 'is_primary_data' unique values

In [151]:
list(adata.obs['is_primary_data'].unique())

[False, True]

In [152]:
#change data type of column

In [153]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [154]:
# view obs

In [155]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,total_counts_ribo,pct_counts_ribo,scrublet_score,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,barcode,is_primary_data
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,469.0,6.580609,0.228571,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GATGAGGCACGGCTAC,False
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,334.0,5.480801,0.134615,EFO:0009899,CL:0008034,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_CGCTTCACATTTGCCC,False
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,326.0,5.212664,0.228571,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GTTAAGCAGAGACTAT,False
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,597.0,10.180764,0.228571,EFO:0009899,CL:0008034,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_TCGCGTTGTAAGAGGA,False
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,246.0,4.997968,0.172414,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GCTGCGAGTGTTGGGA,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,2.0,0.395257,0.141722,EFO:0030059,CL:0000057,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_CTTGAATCAAACGGGC,True
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,2.0,0.396825,0.244000,EFO:0030059,CL:0000542,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AAAGCTTGTAGGATCC,True
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,1.0,0.199601,0.057737,EFO:0030059,CL:0000057,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AATCATCCATAATGAG,True
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,2.0,0.394477,0.133080,EFO:0030059,CL:0000763,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AGGCTAGCAGAAATGC,True


In [156]:
adata.obs[['barcode','sangerID','is_primary_data']].to_csv('/home/jovyan/adata_bar-sam-prim.csv', index= False)

In [157]:
adata2.obs[['barcode','sample']].to_csv('/home/jovyan/adata2_bar-sam-prim.csv', index= False)

#### **organism_ontology_term_id**

In [158]:
# assign organism id 

In [159]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [160]:
#change data type of column

In [161]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [162]:
# view obs

In [163]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,pct_counts_ribo,scrublet_score,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,barcode,is_primary_data,organism_ontology_term_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,6.580609,0.228571,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GATGAGGCACGGCTAC,False,NCBITaxon:9606
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,5.480801,0.134615,EFO:0009899,CL:0008034,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_CGCTTCACATTTGCCC,False,NCBITaxon:9606
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,5.212664,0.228571,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GTTAAGCAGAGACTAT,False,NCBITaxon:9606
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,10.180764,0.228571,EFO:0009899,CL:0008034,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_TCGCGTTGTAAGAGGA,False,NCBITaxon:9606
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,4.997968,0.172414,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GCTGCGAGTGTTGGGA,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,0.395257,0.141722,EFO:0030059,CL:0000057,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_CTTGAATCAAACGGGC,True,NCBITaxon:9606
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,0.396825,0.244000,EFO:0030059,CL:0000542,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AAAGCTTGTAGGATCC,True,NCBITaxon:9606
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,0.199601,0.057737,EFO:0030059,CL:0000057,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AATCATCCATAATGAG,True,NCBITaxon:9606
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,0.394477,0.133080,EFO:0030059,CL:0000763,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AGGCTAGCAGAAATGC,True,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [164]:
# create a dictionary of donor and ethinic_origin

In [165]:
mapping= {'D1': 'Caucasian',
 'D2': 'Caucasian',
 'D3': 'Caucasian',
 'D4': 'Caucasian',
 'D5': 'Caucasian',
 'D6': 'Caucasian',
 'D7': 'Caucasian',
 'D11': 'Caucasian',
 'H2': 'Caucasian',
 'H3': 'Asian',
 'H4': 'Caucasian',
 'H5': 'Caucasian',
 'H6': 'Caucasian',
 'H7': 'Caucasian',
 'A61': 'Caucasian',
 'AH1': 'Caucasian',
 'AH1-A61': 'unknown',
 'AH2': 'South Asian',
 'AV10': 'Caucasian',
 'AV13': 'Caucasian',
 'AV14': 'Caucasian',
 'AV3': 'Caucasian',
 'D8': 'Caucasian',
 'AH5': 'Caucasian',
 'AH6': 'Caucasian',
 'AV1': 'Caucasian'}

In [166]:
# add ethinic_origin column

In [167]:
adata.obs['ethinic_origin'] = adata.obs['donor'].map(mapping)

In [168]:
mapping= {'Caucasian': 'HANCESTRO:0005',
 'Asian': 'HANCESTRO:0008',
 'unknown': 'unknown',
 'South Asian': 'HANCESTRO:0006'}

In [169]:
# add self_reported_ethnicity_ontology_term_id column

In [170]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['ethinic_origin'].map(mapping)

In [171]:
# change data type

In [172]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [173]:
# view obs

In [174]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,barcode,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GATGAGGCACGGCTAC,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,EFO:0009899,CL:0008034,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_CGCTTCACATTTGCCC,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GTTAAGCAGAGACTAT,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,EFO:0009899,CL:0008034,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_TCGCGTTGTAAGAGGA,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,EFO:0009899,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GCTGCGAGTGTTGGGA,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,EFO:0030059,CL:0000057,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_CTTGAATCAAACGGGC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,EFO:0030059,CL:0000542,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AAAGCTTGTAGGATCC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,EFO:0030059,CL:0000057,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AATCATCCATAATGAG,True,NCBITaxon:9606,South Asian,HANCESTRO:0006
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,EFO:0030059,CL:0000763,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AGGCTAGCAGAAATGC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006


#### **sex_ontology_term_id**

In [175]:
# identify the column in adata.obs which corresponds to sex

In [176]:
adata.obs.columns

Index(['sangerID', 'donor', 'donor_type', 'region', 'age', 'gender',
       'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed',
       'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'barcode', 'is_primary_data',
       'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [177]:
# list the unique values 

In [178]:
list(adata.obs['gender'].unique())

['Female', 'Male']

In [179]:
# create a dictionary of sex and sex ontology term id

In [180]:
mapping= {'Female': 'PATO:0000383', 'Male': 'PATO:0000384'}

In [181]:
# add sex_ontology_term_id column

In [182]:
adata.obs['sex_ontology_term_id'] = adata.obs['gender'].map(mapping)

In [183]:
# change data type

In [184]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [185]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,barcode,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GATGAGGCACGGCTAC,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,CL:0008034,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_CGCTTCACATTTGCCC,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GTTAAGCAGAGACTAT,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,CL:0008034,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_TCGCGTTGTAAGAGGA,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,CL:0000115,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GCTGCGAGTGTTGGGA,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,CL:0000057,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_CTTGAATCAAACGGGC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,CL:0000542,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AAAGCTTGTAGGATCC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,CL:0000057,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AATCATCCATAATGAG,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,CL:0000763,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AGGCTAGCAGAAATGC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384


#### **suspension_type**

In [186]:
# identify the column in adata.obs which corresponds to suspension type

In [187]:
adata.obs.columns

Index(['sangerID', 'donor', 'donor_type', 'region', 'age', 'gender',
       'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed',
       'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'barcode', 'is_primary_data',
       'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id'],
      dtype='object')

In [188]:
# list the unique values in the column

In [189]:
list(adata.obs['cell_or_nuclei'].unique())

['Cell', 'Nuclei']

In [190]:
# create a mapping dictionary

In [191]:
mapping= {'Cell':'cell', 'Nuclei':'nucleus'}

In [192]:
# add 'suspension_type' column

In [193]:
adata.obs['suspension_type'] = adata.obs['cell_or_nuclei'].map(mapping)

In [194]:
# change data type of column

In [195]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [196]:
# view obs

In [197]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,barcode,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GATGAGGCACGGCTAC,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_CGCTTCACATTTGCCC,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GTTAAGCAGAGACTAT,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_TCGCGTTGTAAGAGGA,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,HsapDv:0000240,D1,PATO:0000461,HCAHeart7606896_GCTGCGAGTGTTGGGA,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_CTTGAATCAAACGGGC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AAAGCTTGTAGGATCC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AATCATCCATAATGAG,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,HsapDv:0000239,AH2,PATO:0000461,HCAHeartST13180619_AGGCTAGCAGAAATGC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus


#### **tissue_ontology_term_id**

In [198]:
# identify the column in adata.obs which corresponds to tissue

In [199]:
adata.obs.columns

Index(['sangerID', 'donor', 'donor_type', 'region', 'age', 'gender',
       'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed',
       'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'barcode', 'is_primary_data',
       'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type'],
      dtype='object')

In [200]:
list(adata.obs['region'].unique())

['AX', 'LV', 'RV', 'LA', 'SP', 'RA', 'SAN', 'AVN']

In [201]:
# create a dictionary with tissue and corresponding tissue ontology term id

In [202]:
mapping= {'AX':'UBERON:0002098', 
          'LV':'UBERON:0002084', 
          'RV':'UBERON:0002080', 
          'LA':'UBERON:0002079', 
          'SP':'UBERON:0002094', 
          'RA':'UBERON:0002078', 
          'SAN':'UBERON:0002351', 
          'AVN':'UBERON:0002352'}

In [203]:
# add 'tissue_ontology_term_id' column

In [204]:
adata.obs['tissue_ontology_term_id'] = adata.obs['region'].map(mapping)

In [205]:
# change data type of column

In [206]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [207]:
#list the unique values in 'tissue_ontology_term_id' column

In [208]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002098',
 'UBERON:0002084',
 'UBERON:0002080',
 'UBERON:0002079',
 'UBERON:0002094',
 'UBERON:0002078',
 'UBERON:0002351',
 'UBERON:0002352']

In [209]:
# view obs

In [210]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,donor_id,disease_ontology_term_id,barcode,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,D1,PATO:0000461,HCAHeart7606896_GATGAGGCACGGCTAC,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,D1,PATO:0000461,HCAHeart7606896_CGCTTCACATTTGCCC,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,D1,PATO:0000461,HCAHeart7606896_GTTAAGCAGAGACTAT,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,D1,PATO:0000461,HCAHeart7606896_TCGCGTTGTAAGAGGA,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,D1,PATO:0000461,HCAHeart7606896_GCTGCGAGTGTTGGGA,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,AH2,PATO:0000461,HCAHeartST13180619_CTTGAATCAAACGGGC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,AH2,PATO:0000461,HCAHeartST13180619_AAAGCTTGTAGGATCC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,AH2,PATO:0000461,HCAHeartST13180619_AATCATCCATAATGAG,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,AH2,PATO:0000461,HCAHeartST13180619_AGGCTAGCAGAAATGC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352


#### **obsm (Embeddings)**

In [211]:
# view obsm

In [212]:
# check whether all columns are prefixed with X

In [213]:
adata.obsm

AxisArrays with keys: X_umap

#### **uns (Dataset Metadata)**

In [214]:
# View

In [215]:
adata.uns

OverloadedDict, wrapping:
	{'age_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61', '#17becf'], dtype=object), 'cell_or_nuclei_colors': array(['#1f77b4', '#ff7f0e'], dtype=object), 'cell_state_colors': array(['#FFFF00', '#1CE6FF', '#FF34FF', '#FF4A46', '#008941', '#006FA6',
       '#A30059', '#FFDBE5', '#7A4900', '#0000A6', '#63FFAC', '#B79762',
       '#004D43', '#8FB0FF', '#997D87', '#5A0007', '#809693', '#6A3A4C',
       '#1B4400', '#4FC601', '#3B5DFF', '#4A3B53', '#FF2F80', '#61615A',
       '#BA0900', '#6B7900', '#00C2A0', '#FFAA92', '#FF90C9', '#B903AA',
       '#D16100', '#DDEFFF', '#000035', '#7B4F4B', '#A1C299', '#300018',
       '#0AA6D8', '#013349', '#00846F', '#372101', '#FFB500', '#C2FFED',
       '#A079BF', '#CC0744', '#C0B9B2', '#C2FF99', '#001E09', '#00489C',
       '#6F0062', '#0CBD66', '#EEC3FF', '#456D75', '#B77B68', '#7A87A1',
       '#788D66', '#885578', '#FAD09F', '#FF8A9A', '#D157A0', '#BEC459',
       '

In [216]:
adata.uns.keys

<bound method OverloadedDict.keys of OverloadedDict, wrapping:
	{'age_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61', '#17becf'], dtype=object), 'cell_or_nuclei_colors': array(['#1f77b4', '#ff7f0e'], dtype=object), 'cell_state_colors': array(['#FFFF00', '#1CE6FF', '#FF34FF', '#FF4A46', '#008941', '#006FA6',
       '#A30059', '#FFDBE5', '#7A4900', '#0000A6', '#63FFAC', '#B79762',
       '#004D43', '#8FB0FF', '#997D87', '#5A0007', '#809693', '#6A3A4C',
       '#1B4400', '#4FC601', '#3B5DFF', '#4A3B53', '#FF2F80', '#61615A',
       '#BA0900', '#6B7900', '#00C2A0', '#FFAA92', '#FF90C9', '#B903AA',
       '#D16100', '#DDEFFF', '#000035', '#7B4F4B', '#A1C299', '#300018',
       '#0AA6D8', '#013349', '#00846F', '#372101', '#FFB500', '#C2FFED',
       '#A079BF', '#CC0744', '#C0B9B2', '#C2FF99', '#001E09', '#00489C',
       '#6F0062', '#0CBD66', '#EEC3FF', '#456D75', '#B77B68', '#7A87A1',
       '#788D66', '#885578', '#FAD09F', '#FF

In [217]:
# Give a title for the dataset

In [218]:
adata.uns['title'] = 'Combined single cell and single nuclei RNA-Seq data - Heart Global'

In [219]:
# Set the default embedding

In [220]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [221]:
# view anndata object

In [222]:
adata

AnnData object with n_obs × n_vars = 704296 × 32664
    obs: 'sangerID', 'donor', 'donor_type', 'region', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'barcode', 'is_primary_data', 'organism_ontology_term_id', 'ethinic_origin', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'
    var: 'feature_is_filtered'
    uns: 'age_colors', 'cell_or_nuclei_colors', 'cell_state_colors', 'cell_type_colors', 'donor_colors', 'donor_type_colors', 'facility_colors', 'flushed_colors', 'gender_colors', 'kit_10x_colors', 'leiden', 'log1p', 'modality_colors', 'neighbors', 'original_or_new_colors', 'region_colors

In [223]:
# view obs and var data types

In [224]:
adata.obs.dtypes

sangerID                                    category
donor                                       category
donor_type                                  category
region                                      category
age                                           object
gender                                      category
facility                                    category
cell_or_nuclei                              category
modality                                    category
kit_10x                                     category
flushed                                     category
cell_type                                   category
cell_state                                  category
n_genes                                        int64
n_genes_by_counts                              int32
total_counts                                 float32
total_counts_mt                              float32
pct_counts_mt                                float32
total_counts_ribo                            f

In [225]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [226]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed scrublet_score from float64 to float32
changed n_genes from int64 to int32
changed age from object to category
changed barcode from object to category
changed ethinic_origin from object to category


In [227]:
# view obs

In [228]:
adata.obs

Unnamed: 0_level_0,sangerID,donor,donor_type,region,age,gender,facility,cell_or_nuclei,modality,kit_10x,...,donor_id,disease_ontology_term_id,barcode,is_primary_data,organism_ontology_term_id,ethinic_origin,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,D1,PATO:0000461,HCAHeart7606896_GATGAGGCACGGCTAC,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,D1,PATO:0000461,HCAHeart7606896_CGCTTCACATTTGCCC,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,D1,PATO:0000461,HCAHeart7606896_GTTAAGCAGAGACTAT,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,D1,PATO:0000461,HCAHeart7606896_TCGCGTTGTAAGAGGA,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,D1,DBD,AX,50-55,Female,Sanger,Cell,scRNA,3prime-v2,...,D1,PATO:0000461,HCAHeart7606896_GCTGCGAGTGTTGGGA,False,NCBITaxon:9606,Caucasian,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,AH2,PATO:0000461,HCAHeartST13180619_CTTGAATCAAACGGGC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,AH2,PATO:0000461,HCAHeartST13180619_AAAGCTTGTAGGATCC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,AH2,PATO:0000461,HCAHeartST13180619_AATCATCCATAATGAG,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,AH2,DCD,AVN,40-45,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,...,AH2,PATO:0000461,HCAHeartST13180619_AGGCTAGCAGAAATGC,True,NCBITaxon:9606,South Asian,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352


In [229]:
adata.obs.columns

Index(['sangerID', 'donor', 'donor_type', 'region', 'age', 'gender',
       'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed',
       'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'barcode', 'is_primary_data',
       'organism_ontology_term_id', 'ethinic_origin',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

In [230]:
# delete unwanted columns in obs

In [231]:
del adata.obs['cell_type']
del adata.obs['donor']
del adata.obs['cell_or_nuclei']
del adata.obs['kit_10x']
del adata.obs['gender']
del adata.obs['ethinic_origin']
del adata.obs['modality']
del adata.obs['barcode']

In [232]:
# view obs

In [233]:
adata.obs

Unnamed: 0_level_0,sangerID,donor_type,region,age,facility,flushed,cell_state,n_genes,n_genes_by_counts,total_counts,...,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_GATGAGGCACGGCTAC,HCAHeart7606896,DBD,AX,50-55,Sanger,no,EC5_art,2470,2470,7127.0,...,CL:0000115,HsapDv:0000240,D1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_CGCTTCACATTTGCCC,HCAHeart7606896,DBD,AX,50-55,Sanger,no,SMC2_art,2057,2057,6094.0,...,CL:0008034,HsapDv:0000240,D1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_GTTAAGCAGAGACTAT,HCAHeart7606896,DBD,AX,50-55,Sanger,no,EC5_art,2432,2432,6254.0,...,CL:0000115,HsapDv:0000240,D1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_TCGCGTTGTAAGAGGA,HCAHeart7606896,DBD,AX,50-55,Sanger,no,PC2_atria,1821,1821,5864.0,...,CL:0008034,HsapDv:0000240,D1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
HCAHeart7606896_GCTGCGAGTGTTGGGA,HCAHeart7606896,DBD,AX,50-55,Sanger,no,EC5_art,2332,2332,4922.0,...,CL:0000115,HsapDv:0000240,D1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,cell,UBERON:0002098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeartST13180619_HCAHeartST13177116_CTTGAATCAAACGGGC-1,HCAHeartST13180619,DCD,AVN,40-45,Sanger,yes,FB3,366,366,506.0,...,CL:0000057,HsapDv:0000239,AH2,PATO:0000461,True,NCBITaxon:9606,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352
HCAHeartST13180619_HCAHeartST13177116_AAAGCTTGTAGGATCC-1,HCAHeartST13180619,DCD,AVN,40-45,Sanger,yes,CD8+T_em,433,433,504.0,...,CL:0000542,HsapDv:0000239,AH2,PATO:0000461,True,NCBITaxon:9606,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352
HCAHeartST13180619_HCAHeartST13177116_AATCATCCATAATGAG-1,HCAHeartST13180619,DCD,AVN,40-45,Sanger,yes,FB4_activated,361,361,501.0,...,CL:0000057,HsapDv:0000239,AH2,PATO:0000461,True,NCBITaxon:9606,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352
HCAHeartST13180619_HCAHeartST13177116_AGGCTAGCAGAAATGC-1,HCAHeartST13180619,DCD,AVN,40-45,Sanger,yes,MoMP,430,430,507.0,...,CL:0000763,HsapDv:0000239,AH2,PATO:0000461,True,NCBITaxon:9606,HANCESTRO:0006,PATO:0000384,nucleus,UBERON:0002352


In [234]:
adata.obs.columns

Index(['sangerID', 'donor_type', 'region', 'age', 'facility', 'flushed',
       'cell_state', 'n_genes', 'n_genes_by_counts', 'total_counts',
       'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

In [235]:
# view var

In [236]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
gene_id,Unnamed: 1_level_1
ENSG00000243485,False
ENSG00000237613,False
ENSG00000186092,False
ENSG00000238009,False
ENSG00000239945,False
...,...
ENSG00000277856,False
ENSG00000275063,False
ENSG00000271254,False
ENSG00000277475,False


In [237]:
#view uns

In [238]:
adata.uns

OverloadedDict, wrapping:
	{'age_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61', '#17becf'], dtype=object), 'cell_or_nuclei_colors': array(['#1f77b4', '#ff7f0e'], dtype=object), 'cell_state_colors': array(['#FFFF00', '#1CE6FF', '#FF34FF', '#FF4A46', '#008941', '#006FA6',
       '#A30059', '#FFDBE5', '#7A4900', '#0000A6', '#63FFAC', '#B79762',
       '#004D43', '#8FB0FF', '#997D87', '#5A0007', '#809693', '#6A3A4C',
       '#1B4400', '#4FC601', '#3B5DFF', '#4A3B53', '#FF2F80', '#61615A',
       '#BA0900', '#6B7900', '#00C2A0', '#FFAA92', '#FF90C9', '#B903AA',
       '#D16100', '#DDEFFF', '#000035', '#7B4F4B', '#A1C299', '#300018',
       '#0AA6D8', '#013349', '#00846F', '#372101', '#FFB500', '#C2FFED',
       '#A079BF', '#CC0744', '#C0B9B2', '#C2FF99', '#001E09', '#00489C',
       '#6F0062', '#0CBD66', '#EEC3FF', '#456D75', '#B77B68', '#7A87A1',
       '#788D66', '#885578', '#FAD09F', '#FF8A9A', '#D157A0', '#BEC459',
       '

In [239]:
list(adata.uns.keys())

['age_colors',
 'cell_or_nuclei_colors',
 'cell_state_colors',
 'cell_type_colors',
 'donor_colors',
 'donor_type_colors',
 'facility_colors',
 'flushed_colors',
 'gender_colors',
 'kit_10x_colors',
 'leiden',
 'log1p',
 'modality_colors',
 'neighbors',
 'original_or_new_colors',
 'region_colors',
 'region_finest_colors',
 'scANVI_predictions_colors',
 'umap',
 'title',
 'default_embedding']

In [240]:
adata.obs.columns

Index(['sangerID', 'donor_type', 'region', 'age', 'facility', 'flushed',
       'cell_state', 'n_genes', 'n_genes_by_counts', 'total_counts',
       'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'scrublet_score', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

In [241]:
# change column names based on obs columns

In [242]:
adata.uns['donor_id_colors']= adata.uns['donor_colors']

In [243]:
# Remove unwanted columns in uns

In [244]:
del adata.uns['cell_or_nuclei_colors']
del adata.uns['scANVI_predictions_colors']
del adata.uns['donor_colors']
del adata.uns['region_finest_colors']
del adata.uns['original_or_new_colors']
del adata.uns['kit_10x_colors']
del adata.uns['modality_colors']
del adata.uns['gender_colors']

In [245]:
#check the format of expression matrix

In [246]:
adata.X

<704296x32664 sparse matrix of type '<class 'numpy.float32'>'
	with 1049679525 stored elements in Compressed Sparse Row format>

In [247]:
araw.X

<704296x32664 sparse matrix of type '<class 'numpy.float32'>'
	with 1049679525 stored elements in Compressed Sparse Row format>

In [248]:
#Copy raw counts to adata.raw

In [249]:
adata.raw = araw

In [250]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/heart_cell_atlas/final_objects/heart_global.h5ad', compression = 'gzip')