### **Curating Heart.h5ad**

Article: Automatic cell-type harmonization and integration across Human Cell Atlas datasets

DOI: https://doi.org/10.1016/j.cell.2023.11.026

Data Source : https://www.celltypist.org/organs

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Data/Heart.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 931012 × 49568
    obs: 'Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type', 'assay', 'Original_annotation', 'CellHint_harmonised_group', 'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue'
    var: 'exist_in_Tucker2020', 'exist_in_Kuppe2022', 'exist_in_Koenig2022', 'exist_in_Litvinukova2020'
    uns: 'schema_version', 'title'
    obsm: 'X_umap'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<931012x49568 sparse matrix of type '<class 'numpy.float32'>'
	with 1129151034 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 30703)	1.1868877
  (0, 43447)	1.1868877
  (0, 29205)	1.7144705
  (0, 45583)	1.1868877
  (0, 14151)	1.1868877
  (0, 32623)	2.0580392
  (0, 42577)	1.1868877
  (0, 30921)	1.1868877
  (0, 14269)	1.1868877
  (0, 29302)	3.0676715
  (0, 27783)	1.1868877
  (0, 48053)	1.1868877
  (0, 31616)	1.1868877
  (0, 22334)	1.7144705
  (0, 43453)	1.7144705
  (0, 12467)	1.7144705
  (0, 8533)	1.1868877
  (0, 14193)	1.7144705
  (0, 31042)	1.1868877
  (0, 47909)	1.1868877
  (0, 22342)	1.7144705
  (0, 14740)	1.1868877
  (0, 45055)	1.1868877
  (0, 45047)	1.1868877
  (0, 15466)	1.7144705
  :	:
  (931011, 13008)	1.4304695
  (931011, 27271)	1.4304695
  (931011, 24031)	1.9962397
  (931011, 45762)	1.4304695
  (931011, 21018)	1.4304695
  (931011, 20762)	1.4304695
  (931011, 43000)	1.9962397
  (931011, 44997)	1.9962397
  (931011, 5703)	1.4304695
  (931011, 42957)	2.6190474
  (931011, 45574)	1.4304695
  (931011, 43708)	1.4304695
  (931011, 18277)	1.4304695
  (931011, 19514)	1.4304695
  (931011, 28689)	1.9962397
 

##### **Raw counts matrix**

In [11]:
print(adata.raw.X)

  (0, 30703)	1.0
  (0, 43447)	1.0
  (0, 29205)	2.0
  (0, 45583)	1.0
  (0, 14151)	1.0
  (0, 32623)	3.0
  (0, 42577)	1.0
  (0, 30921)	1.0
  (0, 14269)	1.0
  (0, 29302)	9.0
  (0, 27783)	1.0
  (0, 48053)	1.0
  (0, 31616)	1.0
  (0, 22334)	2.0
  (0, 43453)	2.0
  (0, 12467)	2.0
  (0, 8533)	1.0
  (0, 14193)	2.0
  (0, 31042)	1.0
  (0, 47909)	1.0
  (0, 22342)	2.0
  (0, 14740)	1.0
  (0, 45055)	1.0
  (0, 45047)	1.0
  (0, 15466)	2.0
  :	:
  (931011, 13008)	1.0
  (931011, 27271)	1.0
  (931011, 24031)	2.0
  (931011, 45762)	1.0
  (931011, 21018)	1.0
  (931011, 20762)	1.0
  (931011, 43000)	2.0
  (931011, 44997)	2.0
  (931011, 5703)	1.0
  (931011, 42957)	4.0
  (931011, 45574)	1.0
  (931011, 43708)	1.0
  (931011, 18277)	1.0
  (931011, 19514)	1.0
  (931011, 28689)	2.0
  (931011, 15603)	1.0
  (931011, 15908)	1.0
  (931011, 20142)	1.0
  (931011, 28019)	1.0
  (931011, 15433)	3.0
  (931011, 48902)	1.0
  (931011, 22442)	1.0
  (931011, 49429)	1.0
  (931011, 47595)	1.0
  (931011, 32043)	1.0


In [12]:
adata.raw.var

Unnamed: 0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020
A1BG,True,True,True,True
A1BG-AS1,True,True,True,True
A1CF,True,True,True,True
A2M,True,True,True,True
A2M-AS1,True,True,True,True
...,...,...,...,...
bP-21264C1.2,True,True,True,True
bP-2171C21.3,True,True,True,True
bP-2189O9.3,True,False,True,False
bP-2189O9.5,False,True,False,False


In [13]:
araw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)

##### **Variables(var)**

In [14]:
# View the var of anndata and raw object

In [15]:
adata.var

Unnamed: 0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020
A1BG,True,True,True,True
A1BG-AS1,True,True,True,True
A1CF,True,True,True,True
A2M,True,True,True,True
A2M-AS1,True,True,True,True
...,...,...,...,...
bP-21264C1.2,True,True,True,True
bP-2171C21.3,True,True,True,True
bP-2189O9.3,True,False,True,False
bP-2189O9.5,False,True,False,False


In [16]:
ensembl_data = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/symbol2ID.csv')

In [17]:
ensembl_dict = dict(zip(ensembl_data['gene_symbol'], ensembl_data['gene_id']))

In [18]:
ensembl_dict

{'MT-TF': 'ENSG00000210049',
 'MT-RNR1': 'ENSG00000211459',
 'MT-TV': 'ENSG00000210077',
 'MT-RNR2': 'ENSG00000210082',
 'MT-TL1': 'ENSG00000209082',
 'MT-ND1': 'ENSG00000198888',
 'MT-TI': 'ENSG00000210100',
 'MT-TQ': 'ENSG00000210107',
 'MT-TM': 'ENSG00000210112',
 'MT-ND2': 'ENSG00000198763',
 'MT-TW': 'ENSG00000210117',
 'MT-TA': 'ENSG00000210127',
 'MT-TN': 'ENSG00000210135',
 'MT-TC': 'ENSG00000210140',
 'MT-TY': 'ENSG00000210144',
 'MT-CO1': 'ENSG00000198804',
 'MT-TS1': 'ENSG00000210151',
 'MT-TD': 'ENSG00000210154',
 'MT-CO2': 'ENSG00000198712',
 'MT-TK': 'ENSG00000210156',
 'MT-ATP8': 'ENSG00000228253',
 'MT-ATP6': 'ENSG00000198899',
 'MT-CO3': 'ENSG00000198938',
 'MT-TG': 'ENSG00000210164',
 'MT-ND3': 'ENSG00000198840',
 'MT-TR': 'ENSG00000210174',
 'MT-ND4L': 'ENSG00000212907',
 'MT-ND4': 'ENSG00000198886',
 'MT-TH': 'ENSG00000210176',
 'MT-TS2': 'ENSG00000210184',
 'MT-TL2': 'ENSG00000210191',
 'MT-ND5': 'ENSG00000198786',
 'MT-ND6': 'ENSG00000198695',
 'MT-TE': 'ENSG00000

In [19]:
adata.var['gene_id'] = adata.var_names.map(ensembl_dict)
araw.var['gene_id'] = araw.var_names.map(ensembl_dict)


In [20]:
adata.var

Unnamed: 0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020,gene_id
A1BG,True,True,True,True,ENSG00000121410
A1BG-AS1,True,True,True,True,ENSG00000268895
A1CF,True,True,True,True,ENSG00000148584
A2M,True,True,True,True,ENSG00000175899
A2M-AS1,True,True,True,True,ENSG00000245105
...,...,...,...,...,...
bP-21264C1.2,True,True,True,True,
bP-2171C21.3,True,True,True,True,
bP-2189O9.3,True,False,True,False,
bP-2189O9.5,False,True,False,False,


In [21]:
araw.var

Unnamed: 0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020,gene_id
A1BG,True,True,True,True,ENSG00000121410
A1BG-AS1,True,True,True,True,ENSG00000268895
A1CF,True,True,True,True,ENSG00000148584
A2M,True,True,True,True,ENSG00000175899
A2M-AS1,True,True,True,True,ENSG00000245105
...,...,...,...,...,...
bP-21264C1.2,True,True,True,True,
bP-2171C21.3,True,True,True,True,
bP-2189O9.3,True,False,True,False,
bP-2189O9.5,False,True,False,False,


In [22]:
nan_count = adata.var['gene_id'].isna().sum()
print("Number of NaN values in adata.obs['gene_id']: ", nan_count)

Number of NaN values in adata.obs['gene_id']:  12512


In [23]:
adata.var['gene_name'] = adata.var.index
araw.var['gene_name'] = araw.var.index

In [24]:
adata.var.index = adata.var['gene_id'] 
araw.var.index = araw.var['gene_id']

In [25]:
# Load the approved genes file.

In [26]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [27]:
#Create a dictionary from the approved genes file 

In [28]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [29]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [30]:
len(genedict)

119799

In [31]:
#Filter out the genes which are not in the approved genes file.

In [32]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [33]:
len(var_to_keep_adata)

33719

In [34]:
len(var_to_keep_araw)

33719

In [35]:
adata.var

Unnamed: 0_level_0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
,True,True,True,True,,bP-21264C1.2
,True,True,True,True,,bP-2171C21.3
,True,False,True,False,,bP-2189O9.3
,False,True,False,False,,bP-2189O9.5


In [36]:
araw.var

Unnamed: 0_level_0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
,True,True,True,True,,bP-21264C1.2
,True,True,True,True,,bP-2171C21.3
,True,False,True,False,,bP-2189O9.3
,False,True,False,False,,bP-2189O9.5


In [37]:
# Modify the anndata object by filtering out the filtered genes.

In [38]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [39]:
adata.var

Unnamed: 0_level_0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000203995,True,True,True,True,ENSG00000203995,ZYG11A
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,True,True,True,True,ENSG00000036549,ZZZ3


In [40]:
# View var

In [41]:
araw.var

Unnamed: 0_level_0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000203995,True,True,True,True,ENSG00000203995,ZYG11A
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,True,True,True,True,ENSG00000036549,ZZZ3


In [42]:
import scanpy as sc

# Assuming 'adata' is your AnnData object

# Identify duplicate variable names
duplicate_var_mask = adata.var_names.duplicated(keep='first')

# Invert the mask to keep unique variable names
unique_var_mask = ~duplicate_var_mask

# Update AnnData object with unique variable names and data
adata = adata[:, unique_var_mask].copy()

# Ensure variable names are unique
print("All variable names are unique:", adata.var_names.is_unique)

All variable names are unique: True


In [43]:
import scanpy as sc

# Assuming 'adata' is your AnnData object

# Identify duplicate variable names
duplicate_var_mask = araw.var_names.duplicated(keep='first')

# Invert the mask to keep unique variable names
unique_var_mask = ~duplicate_var_mask

# Update AnnData object with unique variable names and data
araw = araw[:, unique_var_mask].copy()

# Ensure variable names are unique
print("All variable names are unique:", araw.var_names.is_unique)

All variable names are unique: True


feature is filtered

In [44]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [45]:
adata.var

Unnamed: 0_level_0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020,gene_id,gene_name,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG,False
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1,False
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF,False
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M,False
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1,False
...,...,...,...,...,...,...,...
ENSG00000203995,True,True,True,True,ENSG00000203995,ZYG11A,False
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B,False
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1,False
ENSG00000036549,True,True,True,True,ENSG00000036549,ZZZ3,False


In [46]:
araw.var

Unnamed: 0_level_0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000203995,True,True,True,True,ENSG00000203995,ZYG11A
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,True,True,True,True,ENSG00000036549,ZZZ3


In [47]:
del adata.var['gene_id']
del araw.var['gene_id']
del adata.var['gene_name']
del araw.var['gene_name']

#### **obs (Cell metadata)**

In [48]:
#view obs

In [49]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,Homo sapiens,normal,heart
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,Homo sapiens,normal,heart


In [50]:
# view the column names in obs

In [51]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue'],
      dtype='object')

#### **assay_ontology_term_id**

In [52]:
list(adata.obs['assay'].unique())

["10x 3' v2", "10x 3' v3", "10x 5' v1"]

In [53]:
adata.obs['barcodes'] = adata.obs_names

In [54]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [55]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [56]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [57]:
adata.obs['assays'] = adata.obs['barcodes'].map(mapping)

In [58]:
list(adata.obs['assays'].unique())

['3pv2_5pv1_5pv2+3pv3',
 '3pv2_5pv1_5pv2',
 '3pv2_5pv1_5pv2+multiome',
 nan,
 '3pv3',
 '3pv3+multiome']

In [59]:
import pandas as pd
import scanpy as sc

# Suppose 'adata' is your AnnData object already loaded.

# Step 1: Check unique values in 'assay'
unique_assays = adata.obs['assay'].unique()
print("Unique values in 'assay':", unique_assays)

# Step 2: Display unique 'assays' for each unique 'assay'
for assay in unique_assays:
    unique_assays_for_group = adata.obs.loc[adata.obs['assay'] == assay, 'assays'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)


Unique values in 'assay': ['10x 3' v2', '10x 3' v3', '10x 5' v1']
Categories (3, object): ['10x 3' v2', '10x 3' v3', '10x 5' v1']
Unique 'assays' for assay 10x 3' v2: ['3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay 10x 3' v3: ['3pv3' '3pv2_5pv1_5pv2+3pv3' '3pv3+multiome' nan]
Unique 'assays' for assay 10x 5' v1: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]


In [60]:
mapping ={"10x 5' v2":'EFO:0009900', "10x 5' v1":'EFO:0011025', "10x 3' v3":'EFO:0009922', "10x 3' transcription profiling":'EFO:0009899',"10x 3' v2":'EFO:0009899'}

In [61]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay'].map(mapping)

In [62]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCATACGGT,3pv3,EFO:0009922
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCCTACCAC,3pv3,EFO:0009922
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGACGCTG,3pv3,EFO:0009922
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922


In [63]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [64]:
# view adata.obs

In [65]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCATACGGT,3pv3,EFO:0009922
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCCTACCAC,3pv3,EFO:0009922
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGACGCTG,3pv3,EFO:0009922
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922


#### **cell_type_ontology_term_id**

In [66]:
#identify the column in adata.obs related. to cell type annotation

In [67]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id'],
      dtype='object')

In [68]:
list(adata.obs['cell_type'].unique())

['regular atrial cardiac myocyte',
 'fat cell',
 'fibroblast',
 'vascular associated smooth muscle cell',
 'myeloid cell',
 'cardiac muscle cell',
 'capillary endothelial cell',
 'endothelial cell',
 'smooth muscle cell',
 'pericyte',
 'endothelial cell of lymphatic vessel',
 'endothelial cell of artery',
 'vein endothelial cell',
 'regular ventricular cardiac myocyte',
 'lymphocyte',
 'neuron',
 'natural killer cell',
 'mast cell',
 'mesothelial cell']

In [69]:

df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/cell_typist_annotation.csv')



In [70]:
mapping = df.set_index('Cell_type')['Cell_ontology_ID'].to_dict()

In [71]:
mapping

{'central memory CD4-positive, alpha-beta T cell': 'CL:0000904',
 'CD16-positive, CD56-dim natural killer cell, human': 'CL:0000939',
 'effector memory CD4-positive, alpha-beta T cell': 'CL:0000905',
 'central memory CD8-positive, alpha-beta T cell': 'CL:0000907',
 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated': 'CL:0001062',
 'classical monocyte': 'CL:0000860',
 'class switched memory B cell': 'CL:0000972',
 'mucosal invariant T cell': 'CL:0000940',
 'naive B cell': 'CL:0000788',
 'effector memory CD8-positive, alpha-beta T cell': 'CL:0000913',
 'unswitched memory B cell': 'CL:0000970',
 'non-classical monocyte': 'CL:0000875',
 'CD16-negative, CD56-bright natural killer cell, human': 'CL:0000938',
 'gamma-delta T cell': 'CL:0000798',
 'regulatory T cell': 'CL:0000815',
 'conventional dendritic cell': 'CL:0000990',
 'plasma cell': 'CL:0000786',
 'memory B cell': 'CL:0000787',
 'effector memory CD4-positive, alpha-beta T cell, terminally differentiated': 'C

In [72]:
# add the cell_type_ontology_term_id column

In [73]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type'].map(mapping)

In [74]:
# change datatype of the column

In [75]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [76]:
# view adata.obs

In [77]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413


#### **development_stage_ontology_term_id**

In [78]:
# identify the column in adata which corresponds to age

In [79]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id'],
      dtype='object')

In [80]:
list(adata.obs['development_stage'].unique())

['51-year-old human stage',
 '54-year-old human stage',
 '39-year-old human stage',
 '59-year-old human stage',
 '60-year-old human stage',
 '47-year-old human stage',
 '52-year-old human stage',
 '44-year-old human stage',
 '55-year-old human stage',
 '61-year-old human stage',
 '68-year-old human stage',
 '62-year-old human stage',
 '48-year-old human stage',
 '65-year-old human stage',
 '27-year-old human stage',
 '46-year-old human stage',
 '75-year-old human stage',
 '63-year-old human stage',
 '71-year-old human stage',
 '73-year-old human stage',
 '20-year-old human stage',
 '35-year-old human stage',
 '66-year-old human stage',
 '21-year-old human stage',
 'sixth decade human stage',
 'fifth decade human stage',
 'seventh decade human stage',
 'eighth decade human stage']

In [81]:
age = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/age.csv')

In [82]:
age_dict = pd.Series(age['development_stage_ontology_term_id'].values, index=age['age']).to_dict()

In [83]:
age_dict

{'29-year-old human stage': 'HsapDv:0000123',
 '58-year-old human stage': 'HsapDv:0000152',
 '35-year-old human stage': 'HsapDv:0000129',
 '33-year-old human stage': 'HsapDv:0000127',
 '45-year-old human stage': 'HsapDv:0000139',
 '37-year-old human stage': 'HsapDv:0000131',
 '69-year-old human stage': 'HsapDv:0000163',
 '55-year-old human stage': 'HsapDv:0000149',
 '71-year-old human stage': 'HsapDv:0000165',
 '26-year-old human stage': 'HsapDv:0000120',
 '53-year-old human stage': 'HsapDv:0000147',
 '49-year-old human stage': 'HsapDv:0000143',
 '46-year-old human stage': 'HsapDv:0000140',
 '34-year-old human stage': 'HsapDv:0000128',
 '27-year-old human stage': 'HsapDv:0000121',
 '28-year-old human stage': 'HsapDv:0000122',
 '30-year-old human stage': 'HsapDv:0000124',
 'seventh decade human stage': 'HsapDv:0000241',
 'sixth decade human stage': 'HsapDv:0000240',
 '59-year-old human stage': 'HsapDv:0000153',
 '39-year-old human stage': 'HsapDv:0000133',
 '22-year-old human stage': 'H

In [84]:
donor_ids = ["A29", "390C", "A26 (386C)", "A26", "A32 (411C)", "A32", "A34 (417C)", 
             "417C", "356C", "A32 (411C)", "A26 (386C)", "284C", "368C", "296C", 
             "A33 (414C)", "A30 (398B)", "417c", "454C", "A32", "A37", "A40", "A44", 
             "A47", "640C", "390C", "A29", "390c", "302C", "302c", "390C", "390c","411C","A34","386C"]

# Convert `adata.obs['donor_id']` to a set for faster lookup
present_donors_set = set(adata.obs['donor_id'])

# Check which donors are present in `adata.obs['donor_id']`
present_donors = [donor for donor in donor_ids if donor in present_donors_set]

# Display the donors that are present
print("Donors present in adata.obs['donor_id']: ", present_donors)

Donors present in adata.obs['donor_id']:  []


In [85]:
donors_to_replace = ['A29', 'A26', 'A32', '417C','302c', '390c','417c','386C','390C','411C',]
if any(donor in adata.obs['donor_id'].values for donor in donors_to_replace):
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A29', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A26', 'A26 (386C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A32', 'A32 (411C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('417C', 'A34 (417C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('302c', '302C')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390c', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('417c', 'A34 (417C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390c', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390C', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('386C', 'A26 (386C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('411C', 'A32 (411C)')

In [86]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage'].map(age_dict)

In [87]:
update_dict = {
'D5' :'HsapDv:0000241',
'A29 (390C)' :'HsapDv:0000161',
'A37':'HsapDv:0000149',
'A34 (417C)':'HsapDv:0000094',
'356C':'HsapDv:0000239',
'A32 (411C)':'HsapDv:0000123',
'A26 (386C)':'HsapDv:0000169',
'284C':'HsapDv:0000240',
'368C':'HsapDv:0000240',
'296C':'HsapDv:0000238',
'A33 (414C)':'HsapDv:0000090',
'A30 (398B)':'HsapDv:0000090',
'417c':'HsapDv:0000094',
'A32':'HsapDv:0000123',
'A37':'HsapDv:0000153',
'A40':'HsapDv:0000158',
'A44':'HsapDv:0000160',
'A47':'HsapDv:0000152',
'640C':'HsapDv:0000242'}

# Update adata.obs['development_stage_ontology_term_id'] based on the update dictionary
adata.obs['development_stage_ontology_term_id'] = adata.obs.apply(
    lambda row: update_dict[row['donor_id']] if row['donor_id'] in update_dict else row['development_stage_ontology_term_id'], 
    axis=1
)

In [88]:
obs_df = adata.obs[['development_stage_ontology_term_id', 'donor_id']]

# Dropping duplicates to get unique pairs
unique_pairs = obs_df.drop_duplicates()

# Converting to a list of tuples if needed
unique_pairs_list = list(unique_pairs.itertuples(index=False, name=None))

unique_pairs_list

[('HsapDv:0000145', '1600'),
 ('HsapDv:0000148', '1666'),
 ('HsapDv:0000133', '1681'),
 ('HsapDv:0000153', '1702'),
 ('HsapDv:0000154', '1708'),
 ('HsapDv:0000141', '1723'),
 ('HsapDv:0000146', '1221'),
 ('HsapDv:0000138', 'P1'),
 ('HsapDv:0000149', 'P7'),
 ('HsapDv:0000138', 'P8'),
 ('HsapDv:0000155', 'P17'),
 ('HsapDv:0000162', 'H_ZC-11-292'),
 ('HsapDv:0000156', 'TWCM-10-68'),
 ('HsapDv:0000154', 'TWCM-11-41'),
 ('HsapDv:0000142', 'TWCM-11-42'),
 ('HsapDv:0000159', 'TWCM-11-74'),
 ('HsapDv:0000121', 'TWCM-11-78'),
 ('HsapDv:0000138', 'TWCM-11-82'),
 ('HsapDv:0000140', 'TWCM-11-103'),
 ('HsapDv:0000169', 'TWCM-11-104'),
 ('HsapDv:0000157', 'TWCM-11-192'),
 ('HsapDv:0000133', 'TWCM-11-256'),
 ('HsapDv:0000165', 'TWCM-11-264'),
 ('HsapDv:0000155', 'TWCM-13-1'),
 ('HsapDv:0000167', 'TWCM-13-36'),
 ('HsapDv:0000140', 'TWCM-13-80'),
 ('HsapDv:0000114', 'TWCM-13-96'),
 ('HsapDv:0000154', 'TWCM-13-101'),
 ('HsapDv:0000142', 'TWCM-13-104'),
 ('HsapDv:0000142', 'TWCM-13-132'),
 ('HsapDv:00001

In [89]:
# change datatype of the column

In [90]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [91]:
# view unique values of development_stage_ontology_term_id column

In [92]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000145',
 'HsapDv:0000148',
 'HsapDv:0000133',
 'HsapDv:0000153',
 'HsapDv:0000154',
 'HsapDv:0000141',
 'HsapDv:0000146',
 'HsapDv:0000138',
 'HsapDv:0000149',
 'HsapDv:0000155',
 'HsapDv:0000162',
 'HsapDv:0000156',
 'HsapDv:0000142',
 'HsapDv:0000159',
 'HsapDv:0000121',
 'HsapDv:0000140',
 'HsapDv:0000169',
 'HsapDv:0000157',
 'HsapDv:0000165',
 'HsapDv:0000167',
 'HsapDv:0000114',
 'HsapDv:0000129',
 'HsapDv:0000160',
 'HsapDv:0000115',
 'HsapDv:0000240',
 'HsapDv:0000239',
 'HsapDv:0000241',
 'HsapDv:0000242']

In [93]:
# view adata.obs

In [94]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129,HsapDv:0000145
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115,HsapDv:0000241
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413,HsapDv:0000241


#### **donor_id**

In [95]:
#identify the column in adata.obs which provides donor information

In [96]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [97]:
# add the donor_id column

In [98]:
adata.obs['donor_id'] = adata.obs['donor_id']

In [99]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [100]:
# change datatype of the column

In [101]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [102]:
# view unique values of donor_id column

In [103]:
list(adata.obs['donor_id'].unique())

['1600',
 '1666',
 '1681',
 '1702',
 '1708',
 '1723',
 '1221',
 'P1',
 'P7',
 'P8',
 'P17',
 'H_ZC-11-292',
 'TWCM-10-68',
 'TWCM-11-41',
 'TWCM-11-42',
 'TWCM-11-74',
 'TWCM-11-78',
 'TWCM-11-82',
 'TWCM-11-103',
 'TWCM-11-104',
 'TWCM-11-192',
 'TWCM-11-256',
 'TWCM-11-264',
 'TWCM-13-1',
 'TWCM-13-36',
 'TWCM-13-80',
 'TWCM-13-96',
 'TWCM-13-101',
 'TWCM-13-104',
 'TWCM-13-132',
 'TWCM-13-152',
 'TWCM-13-168',
 'TWCM-13-192',
 'TWCM-14-173',
 'HDCM5',
 'HDCM7',
 'H5',
 'H6',
 'H3',
 'H2',
 'H7',
 'H4',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D11']

In [104]:
#view obs

In [105]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129,HsapDv:0000145
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115,HsapDv:0000241
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413,HsapDv:0000241


In [106]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [107]:
list(adata.obs['disease'].unique())

['normal']

In [108]:
adata.obs['disease_ontology_term_id']= ['PATO:0000461'] * len(adata.obs)

In [109]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461


In [110]:
# change datatype of the column

In [111]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [112]:
# view obs

In [113]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461


#### **is_primary_data**

In [114]:
#change data type of column

In [115]:
list(adata.obs['Dataset'].unique())

['Tucker et al. 2020',
 'Kuppe et al. 2022',
 'Koenig et al. 2022',
 'Litvinukova et al. 2020']

In [116]:
mapping = {'Aizarani et al. 2019':False,
'Andrews et al. 2022':False,
'Guilliams et al. 2022':False,
'MacParland et al. 2018':False,
'HCA Kidney 2022':False,
'Lake et al. 2021':False,
'Muto et al. 2021':False,
'Stewart et al. 2019':False,
'Dominguez Conde et al. 2022':False,
'Elmentaite et al. 2021':False,
'James et al. 2020':False,
'Szabo et al. 2019':False,
'He et al. 2020':False,
'Micheli et al. 2020':True,
'Perez et al. 2022':True,
'Ren et al. 2021':False,
'Stephenson et al. 2021':False,
'Yoshida et al. 2021':False,
'Madissoon et al. 2020':False,
'Tabula Sapiens 2022':False,
'Ayhan et al. 2021':False,
'Franjic et al. 2022':True,
'Siletti et al. 2022':False,
'Tran et al. 2021':True,
'Burclaff et al. 2022':False,
'Smillie et al. 2019':False,
'Roy et al. 2021':True,
'Adams et al. 2020':False,
'Madissoon et al. 2022':False,
'Travaglini et al. 2020':False,
'Koenig et al. 2022':True,
'Kuppe et al. 2022':False,
'Litvinukova et al. 2020':False,
'Tucker et al. 2020':True,
'Fasolino et al. 2022':False,
'Muraro et al. 2016':False,
'Tosti et al. 2021':True,
'Tritschler et al. 2022':False}

In [117]:
adata.obs['is_primary_data']= adata.obs['Dataset'].map(mapping)

In [118]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [119]:
# view obs

In [120]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,Homo sapiens,normal,heart,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461,False
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,Homo sapiens,normal,heart,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461,False


#### **organism_ontology_term_id**

In [121]:
# assign organism id 

In [122]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [123]:
#change data type of column

In [124]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [125]:
# view obs

In [126]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,normal,heart,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,normal,heart,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,normal,heart,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,normal,heart,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,normal,heart,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,normal,heart,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,...,normal,heart,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,normal,heart,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,...,normal,heart,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [127]:
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/previous_datasets/heart/metadata_heart.csv')
mapping = dict(zip(df['cells'], df['ethnicity']))
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs_names.map(mapping)

In [128]:
# change data type

In [129]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [130]:
# view obs

In [131]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,heart,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,heart,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,heart,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,heart,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,heart,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,heart,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,...,heart,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,heart,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,...,heart,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005


In [132]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [133]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

[nan, 'HANCESTRO:0005', 'HANCESTRO:0008']

In [134]:
import numpy as np

adata.obs['self_reported_ethnicity_ontology_term_id'] = np.where(adata.obs['self_reported_ethnicity_ontology_term_id'].isna(), 'unknown', adata.obs['self_reported_ethnicity_ontology_term_id'])


In [135]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

['unknown', 'HANCESTRO:0005', 'HANCESTRO:0008']

#### **sex_ontology_term_id**

In [136]:
# identify the column in adata.obs which corresponds to sex

In [137]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [138]:
list(adata.obs['sex'].unique())

['female', 'male']

In [139]:
# list the unique values 

In [140]:
mapping= {'female': 'PATO:0000383', 'male': 'PATO:0000384', 'unknown':'unknown'}

In [141]:
# add sex_ontology_term_id column

In [142]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex'].map(mapping)

In [143]:
# change data type

In [144]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [145]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,...,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,...,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383


#### **suspension_type**

In [146]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,...,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,...,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383


In [147]:
list(adata.obs['suspension_type'].unique())

['nucleus', 'cell']

In [148]:
adata.obs['suspension_type'] = adata.obs['suspension_type']

In [149]:
# change data type of column

In [150]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [151]:
# view obs

In [152]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,TACTTACTCCTCTAGC,3pv2_5pv1_5pv2+3pv3,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,GGCGTGTAGGAGTTGC,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,ACGCCGACATCGGTTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,ACCCACTGTCGGCACT,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,CCTTTCTTCCGCGGTA,3pv2_5pv1_5pv2,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,TTTGTTGTCATACGGT,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,...,TTTGTTGTCCTACCAC,3pv3,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,TTTGTTGTCGACGCTG,3pv3,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,...,TTTGTTGTCGGCTGAC,3pv3,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383


#### **tissue_type**

In [153]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [154]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [155]:
# identify the column in adata.obs which corresponds to tissue

In [156]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type'],
      dtype='object')

In [157]:
list(adata.obs['tissue'].unique())

['heart']

In [158]:
mapping= {'blood' : 'UBERON:0000178',
         'bone marrow':'UBERON:0002371',
          'heart':'UBERON:0000948',
          'intestine':'UBERON:0000160',
          'kidney':'UBERON:0002113',
          'hippocampal formation':'UBERON:0002421',
          'liver':'UBERON:0002107',
          'lung':'UBERON:0002048',
          'lymph node':'UBERON:0000029',
          'pancreas':'UBERON:0001264',
          'skeletal muscle organ':'UBERON:0014892',
          'spleen':'UBERON:0002106'}

In [159]:
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/previous_datasets/heart/metadata_heart.csv')
mapping = dict(zip(df['cells'], df['tissue_ontology_term_id']))
adata.obs['tissue_ontology_term_id'] = adata.obs_names.map(mapping)

In [160]:
# add 'tissue_ontology_term_id' column

In [161]:
# change data type of column

In [162]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [163]:
#list the unique values in 'tissue_ontology_term_id' column

In [164]:
list(adata.obs['tissue_ontology_term_id'].unique())

[nan,
 'UBERON:0002084',
 'UBERON:0002098',
 'UBERON:0002079',
 'UBERON:0002078',
 'UBERON:0002080',
 'UBERON:0002094']

In [165]:
adata.obs['tissue_ontology_term_id'] = np.where(adata.obs['tissue_ontology_term_id'].isna(), 'UBERON:0000948', adata.obs['tissue_ontology_term_id'])

In [166]:
list(adata.obs['Dataset'].unique())

['Tucker et al. 2020',
 'Kuppe et al. 2022',
 'Koenig et al. 2022',
 'Litvinukova et al. 2020']

In [167]:
if 'Dataset' in adata.obs.columns:
    # Assign 'tissue_ontology_term_id' for rows where 'Dataset' equals 'Koenig et al. 2022'
    adata.obs.loc[adata.obs['Dataset'] == 'Koenig et al. 2022', 'tissue_ontology_term_id'] = 'UBERON:0002084'
else:
    print("The 'Dataset' column does not exist in adata.obs")

In [168]:
if 'Dataset' in adata.obs.columns:
    # Assign 'tissue_ontology_term_id' for rows where 'Dataset' equals 'Koenig et al. 2022'
    adata.obs.loc[adata.obs['Dataset'] == 'Tucker et al. 2020', 'tissue_ontology_term_id'] = 'UBERON:0001133'
else:
    print("The 'Dataset' column does not exist in adata.obs")

In [169]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0001133',
 'UBERON:0002084',
 'UBERON:0002098',
 'UBERON:0002079',
 'UBERON:0002078',
 'UBERON:0002080',
 'UBERON:0002094']

In [170]:
# view obs

In [171]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,...,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,...,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098


In [172]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [173]:
# view obsm

In [174]:
# check whether all columns are prefixed with X

In [175]:
adata.obsm

AxisArrays with keys: X_umap

#### **uns (Dataset Metadata)**

In [176]:
# View

In [177]:
adata.uns

{'schema_version': '3.0.0', 'title': 'Adult human heart'}

In [178]:
adata.uns.keys

<function dict.keys>

In [179]:
# Give a title for the dataset

In [180]:
adata.uns['title'] = 'Heart'

In [181]:
# Set the default embedding

In [182]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [183]:
# view anndata object

In [184]:
adata

AnnData object with n_obs × n_vars = 931012 × 31180
    obs: 'Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type', 'assay', 'Original_annotation', 'CellHint_harmonised_group', 'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue', 'barcodes', 'assays', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'
    var: 'exist_in_Tucker2020', 'exist_in_Kuppe2022', 'exist_in_Koenig2022', 'exist_in_Litvinukova2020', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'default_embedding'
    obsm: 'X_umap'

In [185]:
# view obs and var data types

In [186]:
adata.obs.dtypes

Dataset                                     category
donor_id                                    category
development_stage                           category
sex                                         category
suspension_type                             category
assay                                       category
Original_annotation                         category
CellHint_harmonised_group                   category
cell_type                                   category
Curated_annotation                          category
organism                                    category
disease                                     category
tissue                                      category
barcodes                                      object
assays                                        object
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    ca

In [187]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [188]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed barcodes from object to category
changed assays from object to category
changed self_reported_ethnicity_ontology_term_id from object to category
changed tissue_ontology_term_id from object to category


In [189]:
# view obs

In [190]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,51-year-old human stage,female,nucleus,10x 3' v2,03. Atrial Cardiomyocyte,Group41,regular atrial cardiac myocyte,Atrial cardiomyocytes,...,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,endothelial cell,Immune-related endothelial cells,...,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC1_cap,Group38,capillary endothelial cell,Capillary endothelial cells,...,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,seventh decade human stage,female,cell,10x 3' v3,EC5_art,Group38,endothelial cell of artery,Arterial endothelial cells,...,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098


In [191]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [192]:
# delete unwanted columns in obs

In [193]:
del adata.obs['tissue']
del adata.obs['organism']
del adata.obs['disease']
del adata.obs['assay']
del adata.obs['sex']
del adata.obs['development_stage']
del adata.obs['assays']
del adata.obs['barcodes']
del adata.uns['schema_version']
del adata.obs['cell_type']

In [194]:
# view obs

In [195]:
adata.obs

Unnamed: 0,Dataset,donor_id,suspension_type,Original_annotation,CellHint_harmonised_group,Curated_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,nucleus,03. Atrial Cardiomyocyte,Group41,Atrial cardiomyocytes,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,nucleus,03. Atrial Cardiomyocyte,Group41,Atrial cardiomyocytes,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,nucleus,03. Atrial Cardiomyocyte,Group41,Atrial cardiomyocytes,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,nucleus,03. Atrial Cardiomyocyte,Group41,Atrial cardiomyocytes,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,nucleus,03. Atrial Cardiomyocyte,Group41,Atrial cardiomyocytes,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,cell,EC1_cap,Group38,Capillary endothelial cells,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,cell,EC1_cap,Group38,Immune-related endothelial cells,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,cell,EC1_cap,Group38,Capillary endothelial cells,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,cell,EC5_art,Group38,Arterial endothelial cells,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098


In [196]:
# view var

In [197]:
adata.var

Unnamed: 0_level_0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000121410,True,True,True,True,False
ENSG00000268895,True,True,True,True,False
ENSG00000148584,True,True,True,True,False
ENSG00000175899,True,True,True,True,False
ENSG00000245105,True,True,True,True,False
...,...,...,...,...,...
ENSG00000203995,True,True,True,True,False
ENSG00000162378,True,True,True,True,False
ENSG00000074755,True,True,True,True,False
ENSG00000036549,True,True,True,True,False


In [198]:
araw.var

Unnamed: 0_level_0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000121410,True,True,True,True
ENSG00000268895,True,True,True,True
ENSG00000148584,True,True,True,True
ENSG00000175899,True,True,True,True
ENSG00000245105,True,True,True,True
...,...,...,...,...
ENSG00000203995,True,True,True,True
ENSG00000162378,True,True,True,True
ENSG00000074755,True,True,True,True
ENSG00000036549,True,True,True,True


In [199]:
#view uns

In [200]:
adata.uns

{'title': 'Heart', 'default_embedding': 'X_umap'}

In [201]:
list(adata.uns.keys())

['title', 'default_embedding']

In [202]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'suspension_type', 'Original_annotation',
       'CellHint_harmonised_group', 'Curated_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [203]:
# Remove unwanted columns in uns

In [204]:
#check the format of expression matrix

In [205]:
adata.X

<931012x31180 sparse matrix of type '<class 'numpy.float32'>'
	with 1041207772 stored elements in Compressed Sparse Row format>

In [206]:
araw.X

<931012x31180 sparse matrix of type '<class 'numpy.float32'>'
	with 1041207772 stored elements in Compressed Sparse Row format>

In [207]:
#Copy raw counts to adata.raw

In [208]:
del adata.raw

In [209]:
adata.raw = araw

In [210]:
obs_dtype = adata.obs.dtypes

In [211]:
obs_dtype

Dataset                                     category
donor_id                                    category
suspension_type                             category
Original_annotation                         category
CellHint_harmonised_group                   category
Curated_annotation                          category
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    category
is_primary_data                                 bool
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
tissue_type                                 category
tissue_ontology_term_id                     category
dtype: object

In [212]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Final_objects/to_upload/Heart.h5ad', compression = 'gzip')

In [213]:
adata.obs

Unnamed: 0,Dataset,donor_id,suspension_type,Original_annotation,CellHint_harmonised_group,Curated_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
LA_1600_1_TACTTACTCCTCTAGC-1,Tucker et al. 2020,1600,nucleus,03. Atrial Cardiomyocyte,Group41,Atrial cardiomyocytes,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_GGCGTGTAGGAGTTGC-1,Tucker et al. 2020,1600,nucleus,03. Atrial Cardiomyocyte,Group41,Atrial cardiomyocytes,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_ACGCCGACATCGGTTA-1,Tucker et al. 2020,1600,nucleus,03. Atrial Cardiomyocyte,Group41,Atrial cardiomyocytes,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_ACCCACTGTCGGCACT-1,Tucker et al. 2020,1600,nucleus,03. Atrial Cardiomyocyte,Group41,Atrial cardiomyocytes,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
LA_1600_1_CCTTTCTTCCGCGGTA-1,Tucker et al. 2020,1600,nucleus,03. Atrial Cardiomyocyte,Group41,Atrial cardiomyocytes,EFO:0009899,CL:0002129,HsapDv:0000145,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATACGGT-1-HCAHeart8102862,Litvinukova et al. 2020,D11,cell,EC1_cap,Group38,Capillary endothelial cells,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCCTACCAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,cell,EC1_cap,Group38,Immune-related endothelial cells,EFO:0009922,CL:0000115,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCGACGCTG-1-HCAHeart8102862,Litvinukova et al. 2020,D11,cell,EC1_cap,Group38,Capillary endothelial cells,EFO:0009922,CL:0002144,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098
TTTGTTGTCGGCTGAC-1-HCAHeart8102862,Litvinukova et al. 2020,D11,cell,EC5_art,Group38,Arterial endothelial cells,EFO:0009922,CL:1000413,HsapDv:0000241,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0002098


In [214]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'suspension_type', 'Original_annotation',
       'CellHint_harmonised_group', 'Curated_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [215]:
adata.raw.var

Unnamed: 0_level_0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000121410,True,True,True,True
ENSG00000268895,True,True,True,True
ENSG00000148584,True,True,True,True
ENSG00000175899,True,True,True,True
ENSG00000245105,True,True,True,True
...,...,...,...,...
ENSG00000203995,True,True,True,True
ENSG00000162378,True,True,True,True
ENSG00000074755,True,True,True,True
ENSG00000036549,True,True,True,True


In [216]:
adata.var

Unnamed: 0_level_0,exist_in_Tucker2020,exist_in_Kuppe2022,exist_in_Koenig2022,exist_in_Litvinukova2020,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000121410,True,True,True,True,False
ENSG00000268895,True,True,True,True,False
ENSG00000148584,True,True,True,True,False
ENSG00000175899,True,True,True,True,False
ENSG00000245105,True,True,True,True,False
...,...,...,...,...,...
ENSG00000203995,True,True,True,True,False
ENSG00000162378,True,True,True,True,False
ENSG00000074755,True,True,True,True,False
ENSG00000036549,True,True,True,True,False


In [217]:
adata.raw.X

<931012x31180 sparse matrix of type '<class 'numpy.float32'>'
	with 1041207772 stored elements in Compressed Sparse Row format>

In [218]:
list(adata.obs['is_primary_data'].unique())

[True, False]