### **Curating Blood.h5ad**

Article: Automatic cell-type harmonization and integration across Human Cell Atlas datasets

DOI: https://doi.org/10.1016/j.cell.2023.11.026

Data Source : https://www.celltypist.org/organs

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
# Read the Blood.h5ad AnnData object from disk.
blood_h5ad_path = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Data/Blood.h5ad'
adata = sc.read_h5ad(blood_h5ad_path)

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 335916 × 47332
    obs: 'Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type', 'assay', 'Original_annotation', 'CellHint_harmonised_group', 'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue'
    var: 'exist_in_Ren2021', 'exist_in_DominguezConde2022', 'exist_in_Stephenson2021', 'exist_in_Yoshida2021'
    uns: 'schema_version', 'title'
    obsm: 'X_umap'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<335916x47332 sparse matrix of type '<class 'numpy.float32'>'
	with 508549453 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether it holds normalized or raw counts: floating-point (non-integer) values indicate normalized counts, whole-number values indicate raw counts.

In [10]:
print(adata.X)

  (0, 47325)	1.1308198
  (0, 47312)	1.1308198
  (0, 47306)	1.1308198
  (0, 47278)	1.1308198
  (0, 47264)	1.1308198
  (0, 47258)	1.1308198
  (0, 47255)	1.1308198
  (0, 47240)	1.6479644
  (0, 47234)	1.1308198
  (0, 47191)	1.1308198
  (0, 47110)	1.9871325
  (0, 47103)	1.1308198
  (0, 47073)	1.1308198
  (0, 47071)	1.1308198
  (0, 47058)	1.1308198
  (0, 47040)	1.1308198
  (0, 46962)	1.1308198
  (0, 46938)	1.9871325
  (0, 46929)	1.1308198
  (0, 46902)	1.1308198
  (0, 46897)	1.1308198
  (0, 46884)	1.1308198
  (0, 46874)	1.1308198
  (0, 46864)	1.1308198
  (0, 46853)	1.1308198
  :	:
  (335915, 40802)	2.1436036
  (335915, 41478)	2.1436036
  (335915, 25231)	2.7763467
  (335915, 25273)	2.7763467
  (335915, 18090)	2.7763467
  (335915, 40930)	2.7763467
  (335915, 46241)	2.1436036
  (335915, 22136)	2.1436036
  (335915, 46238)	2.1436036
  (335915, 21037)	2.1436036
  (335915, 15567)	2.1436036
  (335915, 20563)	2.1436036
  (335915, 23944)	2.1436036
  (335915, 41644)	2.7763467
  (335915, 30047)	2.1436036

##### **Raw counts matrix**

In [11]:
print(adata.raw.X)

  (0, 47325)	1.0
  (0, 47312)	1.0
  (0, 47306)	1.0
  (0, 47278)	1.0
  (0, 47264)	1.0
  (0, 47258)	1.0
  (0, 47255)	1.0
  (0, 47240)	2.0
  (0, 47234)	1.0
  (0, 47191)	1.0
  (0, 47110)	3.0
  (0, 47103)	1.0
  (0, 47073)	1.0
  (0, 47071)	1.0
  (0, 47058)	1.0
  (0, 47040)	1.0
  (0, 46962)	1.0
  (0, 46938)	3.0
  (0, 46929)	1.0
  (0, 46902)	1.0
  (0, 46897)	1.0
  (0, 46884)	1.0
  (0, 46874)	1.0
  (0, 46864)	1.0
  (0, 46853)	1.0
  :	:
  (335915, 40802)	1.0
  (335915, 41478)	1.0
  (335915, 25231)	2.0
  (335915, 25273)	2.0
  (335915, 18090)	2.0
  (335915, 40930)	2.0
  (335915, 46241)	1.0
  (335915, 22136)	1.0
  (335915, 46238)	1.0
  (335915, 21037)	1.0
  (335915, 15567)	1.0
  (335915, 20563)	1.0
  (335915, 23944)	1.0
  (335915, 41644)	2.0
  (335915, 30047)	1.0
  (335915, 41045)	1.0
  (335915, 42908)	1.0
  (335915, 40813)	5.0
  (335915, 46474)	1.0
  (335915, 44405)	1.0
  (335915, 30055)	1.0
  (335915, 41947)	1.0
  (335915, 21181)	1.0
  (335915, 34183)	1.0
  (335915, 40825)	2.0


In [12]:
adata.raw.var

Unnamed: 0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021
A1BG,True,True,True,True
A1BG-AS1,True,True,True,True
A1CF,True,True,True,True
A2M,True,True,True,True
A2M-AS1,True,True,True,True
...,...,...,...,...
ZZZ3,False,False,False,True
bP-21264C1.2,False,False,False,True
bP-2171C21.3,False,False,False,True
hsa-mir-1253,False,True,False,True


In [13]:
# Build a separate AnnData for the raw counts so it can be curated next to `adata`:
# the matrix and var table come from adata.raw, the obs table comes from adata.
araw = ad.AnnData(
    X=adata.raw.X,
    var=adata.raw.var,
    obs=adata.obs,
)

##### **Variables(var)**

In [14]:
# View the var of anndata and raw object

In [15]:
adata.var

Unnamed: 0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021
A1BG,True,True,True,True
A1BG-AS1,True,True,True,True
A1CF,True,True,True,True
A2M,True,True,True,True
A2M-AS1,True,True,True,True
...,...,...,...,...
ZZZ3,False,False,False,True
bP-21264C1.2,False,False,False,True
bP-2171C21.3,False,False,False,True
hsa-mir-1253,False,True,False,True


In [16]:
# Lookup table with 'gene_symbol' and 'gene_id' columns (used in the next cell).
symbol_to_id_csv = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/symbol2ID.csv'
ensembl_data = pd.read_csv(symbol_to_id_csv)

In [17]:
# gene_symbol -> gene_id dictionary; if a symbol is listed more than once the last row wins.
ensembl_dict = ensembl_data.set_index('gene_symbol')['gene_id'].to_dict()

In [18]:
ensembl_dict

{'MT-TF': 'ENSG00000210049',
 'MT-RNR1': 'ENSG00000211459',
 'MT-TV': 'ENSG00000210077',
 'MT-RNR2': 'ENSG00000210082',
 'MT-TL1': 'ENSG00000209082',
 'MT-ND1': 'ENSG00000198888',
 'MT-TI': 'ENSG00000210100',
 'MT-TQ': 'ENSG00000210107',
 'MT-TM': 'ENSG00000210112',
 'MT-ND2': 'ENSG00000198763',
 'MT-TW': 'ENSG00000210117',
 'MT-TA': 'ENSG00000210127',
 'MT-TN': 'ENSG00000210135',
 'MT-TC': 'ENSG00000210140',
 'MT-TY': 'ENSG00000210144',
 'MT-CO1': 'ENSG00000198804',
 'MT-TS1': 'ENSG00000210151',
 'MT-TD': 'ENSG00000210154',
 'MT-CO2': 'ENSG00000198712',
 'MT-TK': 'ENSG00000210156',
 'MT-ATP8': 'ENSG00000228253',
 'MT-ATP6': 'ENSG00000198899',
 'MT-CO3': 'ENSG00000198938',
 'MT-TG': 'ENSG00000210164',
 'MT-ND3': 'ENSG00000198840',
 'MT-TR': 'ENSG00000210174',
 'MT-ND4L': 'ENSG00000212907',
 'MT-ND4': 'ENSG00000198886',
 'MT-TH': 'ENSG00000210176',
 'MT-TS2': 'ENSG00000210184',
 'MT-TL2': 'ENSG00000210191',
 'MT-ND5': 'ENSG00000198786',
 'MT-ND6': 'ENSG00000198695',
 'MT-TE': 'ENSG00000

In [19]:
# Attach a gene_id to every gene symbol in both objects;
# symbols that are missing from ensembl_dict end up as NaN.
for annotated in (adata, araw):
    annotated.var['gene_id'] = annotated.var_names.map(ensembl_dict)


In [20]:
adata.var

Unnamed: 0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id
A1BG,True,True,True,True,ENSG00000121410
A1BG-AS1,True,True,True,True,ENSG00000268895
A1CF,True,True,True,True,ENSG00000148584
A2M,True,True,True,True,ENSG00000175899
A2M-AS1,True,True,True,True,ENSG00000245105
...,...,...,...,...,...
ZZZ3,False,False,False,True,ENSG00000036549
bP-21264C1.2,False,False,False,True,
bP-2171C21.3,False,False,False,True,
hsa-mir-1253,False,True,False,True,ENSG00000272920


In [21]:
araw.var

Unnamed: 0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id
A1BG,True,True,True,True,ENSG00000121410
A1BG-AS1,True,True,True,True,ENSG00000268895
A1CF,True,True,True,True,ENSG00000148584
A2M,True,True,True,True,ENSG00000175899
A2M-AS1,True,True,True,True,ENSG00000245105
...,...,...,...,...,...
ZZZ3,False,False,False,True,ENSG00000036549
bP-21264C1.2,False,False,False,True,
bP-2171C21.3,False,False,False,True,
hsa-mir-1253,False,True,False,True,ENSG00000272920


In [22]:
# Count the genes whose symbol had no entry in ensembl_dict (gene_id is NaN).
nan_count = adata.var['gene_id'].isna().sum()
# gene_id is a column of adata.var, not adata.obs, so the message names the right table.
print("Number of NaN values in adata.var['gene_id']: ", nan_count)

Number of NaN values in adata.obs['gene_id']:  13428


In [23]:
# Preserve the current var index (the gene symbols) in a gene_name column
# before the index is replaced in the next cell.
for annotated in (adata, araw):
    annotated.var['gene_name'] = annotated.var.index

In [24]:
# Use the gene_id column as the var index of both objects.
# Genes without a gene_id get a NaN index entry here.
for annotated in (adata, araw):
    annotated.var.index = annotated.var['gene_id']

In [25]:
# Load the approved genes file.

In [26]:
# Table of approved genes; its 'feature_id' column is used in the following cells.
approved_genes_csv = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv'
approved_genes = pd.read_csv(approved_genes_csv)

In [27]:
#Create a dictionary from the approved genes file 

In [28]:
# Membership lookup: every approved feature_id maps to the placeholder value 1.
genedict = dict.fromkeys(approved_genes.feature_id, 1)

In [29]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [30]:
len(genedict)

119799

In [31]:
#Filter out the genes which are not in the approved genes file.

In [32]:
# Collect, in var order, the var names that are present in the approved-gene lookup.
var_to_keep_adata = list(filter(genedict.__contains__, adata.var_names))
var_to_keep_araw = list(filter(genedict.__contains__, araw.var_names))

In [33]:
len(var_to_keep_adata)

30922

In [34]:
len(var_to_keep_araw)

30922

In [35]:
adata.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3
,False,False,False,True,,bP-21264C1.2
,False,False,False,True,,bP-2171C21.3
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253


In [36]:
araw.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3
,False,False,False,True,,bP-21264C1.2
,False,False,False,True,,bP-2171C21.3
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253


In [37]:
# Subset both AnnData objects to the approved genes identified above.

In [38]:
# Boolean column masks: True where the var index entry is one of the approved IDs.
approved_in_adata = adata.var.index.isin(var_to_keep_adata)
approved_in_araw = araw.var.index.isin(var_to_keep_araw)

# Column-subset both objects (no .copy() here, exactly as before).
adata = adata[:, approved_in_adata]
araw = araw[:, approved_in_araw]

In [39]:
adata.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253


# Remove the per-study 'exist_in_*' flags and the helper gene_id / gene_name
# columns from the var tables of both adata and araw.
# NOTE(review): this cell has no execution count, and the var tables displayed by
# later cells still list these columns — confirm when (or whether) it is meant to run.
del adata.var['exist_in_Ren2021']
del adata.var['exist_in_DominguezConde2022']
del adata.var['exist_in_Stephenson2021']
del adata.var['exist_in_Yoshida2021']
del adata.var['gene_name']
del araw.var['gene_id']
del araw.var['exist_in_Ren2021']
del araw.var['exist_in_DominguezConde2022']
del araw.var['exist_in_Stephenson2021']
del araw.var['exist_in_Yoshida2021']
del araw.var['gene_name']
del adata.var['gene_id']

In [40]:
# Number of var index entries that repeat an earlier entry:
# total entries minus distinct entries.
var_index = adata.var.index
duplicate_count = var_index.size - var_index.nunique(dropna=False)
print("Number of duplicate values:", duplicate_count)

Number of duplicate values: 446


In [41]:
adata.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253


In [42]:
# Drop repeated var names from adata, keeping the first occurrence of each.
# (scanpy is already imported in the imports cell and is not used here, so the
# redundant re-import has been removed.)

# True for every var name that repeats an earlier one
duplicate_var_mask = adata.var_names.duplicated(keep='first')

# True for the entries to keep
unique_var_mask = ~duplicate_var_mask

# Subset the columns; .copy() makes the result an independent AnnData rather than a view
adata = adata[:, unique_var_mask].copy()

# Confirm that no repeated var names remain
print("All variable names are unique:", adata.var_names.is_unique)

All variable names are unique: True


In [43]:
# Drop repeated var names from araw (the raw-counts object), keeping the first
# occurrence of each.
# (scanpy is already imported in the imports cell and is not used here, so the
# redundant re-import has been removed.)

# True for every var name that repeats an earlier one
duplicate_var_mask = araw.var_names.duplicated(keep='first')

# True for the entries to keep
unique_var_mask = ~duplicate_var_mask

# Subset the columns; .copy() makes the result an independent AnnData rather than a view
araw = araw[:, unique_var_mask].copy()

# Confirm that no repeated var names remain
print("All variable names are unique:", araw.var_names.is_unique)

All variable names are unique: True


In [44]:
len(adata.var_names)

30476

In [45]:
adata.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253


In [46]:
araw.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253


feature is filtered

In [47]:
# Flag every gene as not filtered; the scalar is broadcast to one False per var row.
adata.var['feature_is_filtered'] = False

In [48]:
adata.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG,False
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1,False
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF,False
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M,False
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1,False
...,...,...,...,...,...,...,...
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B,False
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1,False
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3,False
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253,False


In [49]:
araw.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253


#### **obs (Cell metadata)**

In [50]:
#view obs

In [51]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,Homo sapiens,normal,blood
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,Homo sapiens,normal,blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,Homo sapiens,normal,blood
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,Homo sapiens,normal,blood
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood


In [52]:
# view the column names in obs

In [53]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue'],
      dtype='object')

#### **assay_ontology_term_id**

In [54]:
list(adata.obs['assay'].unique())

["10x 5' v2", "10x 5' v1", "10x 3' v3", "10x 3' transcription profiling"]

In [55]:
# Start the barcodes column from the full cell identifiers (the obs index).
adata.obs['barcodes'] = adata.obs.index

In [56]:
# Keep only the first run of 10-16 A/T/G/C characters from each identifier,
# discarding sample prefixes and numeric suffixes.
barcode_pattern = r'([ATGC]{10,16})'
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(barcode_pattern, expand=False)

In [57]:
# Barcode table with 'barcode' and 'assay' columns (used in the next cell).
barcode_table_csv = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv'
assay_info = pd.read_csv(barcode_table_csv)

In [58]:
# barcode -> assay label dictionary; if a barcode is listed more than once the last row wins.
mapping = assay_info.set_index('barcode')['assay'].to_dict()

In [59]:
# Look up each extracted barcode in the barcode table;
# barcodes that are not in the table become NaN.
cell_barcodes = adata.obs['barcodes']
adata.obs['assays'] = cell_barcodes.map(mapping)

In [60]:
list(adata.obs['assays'].unique())

['3pv2_5pv1_5pv2',
 '3pv2_5pv1_5pv2+3pv3',
 '3pv2_5pv1_5pv2+multiome',
 nan,
 '3pv3',
 '3pv3+multiome']

In [61]:
# Cross-check the declared 'assay' labels against the barcode-derived 'assays' column.
# (pandas and scanpy are already imported in the imports cell; the redundant
# re-imports have been removed from this cell.)

# Step 1: Check unique values in 'assay'
unique_assays = adata.obs['assay'].unique()
print("Unique values in 'assay':", unique_assays)

# Step 2: Display unique 'assays' for each unique 'assay'
for assay in unique_assays:
    # barcode-derived labels seen among the cells declared with this assay
    unique_assays_for_group = adata.obs.loc[adata.obs['assay'] == assay, 'assays'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)


Unique values in 'assay': ['10x 5' v2', '10x 5' v1', '10x 3' v3', '10x 3' transcription profiling']
Categories (4, object): ['10x 3' transcription profiling', '10x 3' v3', '10x 5' v1', '10x 5' v2']
Unique 'assays' for assay 10x 5' v2: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay 10x 5' v1: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay 10x 3' v3: ['3pv3' '3pv2_5pv1_5pv2+3pv3' '3pv3+multiome']
Unique 'assays' for assay 10x 3' transcription profiling: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]


In [62]:
# Declared assay label -> EFO ontology term ID.
# NOTE(review): EFO:0009899 is presumably the 10x 3' v2 term, chosen because the
# barcode check above resolves the "10x 3' transcription profiling" cells to the
# '3pv2_5pv1_5pv2' barcode group — confirm against EFO.
mapping = {
    "10x 5' v2": 'EFO:0009900',
    "10x 5' v1": 'EFO:0011025',
    "10x 3' v3": 'EFO:0009922',
    "10x 3' transcription profiling": 'EFO:0009899',
}

In [63]:
# Translate each cell's declared assay label into its EFO term ID.
declared_assay = adata.obs['assay']
adata.obs['assay_ontology_term_id'] = declared_assay.map(mapping)

In [64]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,Homo sapiens,normal,blood,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,Homo sapiens,normal,blood,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,Homo sapiens,normal,blood,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,Homo sapiens,normal,blood,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025


In [65]:
# Store the assay term IDs with the pandas 'category' dtype.
assay_terms = adata.obs['assay_ontology_term_id']
adata.obs['assay_ontology_term_id'] = assay_terms.astype('category')

In [66]:
# view adata.obs

In [67]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,Homo sapiens,normal,blood,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,Homo sapiens,normal,blood,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,Homo sapiens,normal,blood,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,Homo sapiens,normal,blood,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025


#### **cell_type_ontology_term_id**

In [68]:
# Identify the column in adata.obs that holds the cell type annotation

In [69]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id'],
      dtype='object')

In [70]:
list(adata.obs['cell_type'].unique())

['central memory CD4-positive, alpha-beta T cell',
 'CD16-positive, CD56-dim natural killer cell, human',
 'effector memory CD4-positive, alpha-beta T cell',
 'central memory CD8-positive, alpha-beta T cell',
 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated',
 'classical monocyte',
 'class switched memory B cell',
 'mucosal invariant T cell',
 'naive B cell',
 'effector memory CD8-positive, alpha-beta T cell',
 'unswitched memory B cell',
 'non-classical monocyte',
 'CD16-negative, CD56-bright natural killer cell, human',
 'gamma-delta T cell',
 'regulatory T cell',
 'conventional dendritic cell',
 'plasma cell',
 'memory B cell',
 'effector memory CD4-positive, alpha-beta T cell, terminally differentiated',
 'megakaryocyte',
 'cell',
 'plasmacytoid dendritic cell',
 'plasmablast',
 'hematopoietic multipotent progenitor cell']

In [71]:

# Annotation table with 'Cell_type' and 'Cell_ontology_ID' columns (used in the next cell).
annotation_csv = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/cell_typist_annotation.csv'
df = pd.read_csv(annotation_csv)



In [72]:
# Cell_type -> Cell_ontology_ID dictionary built from the annotation table.
# NOTE: the following cell assigns a hand-written dictionary to the same name
# `mapping`, so this value is replaced before it is applied to adata.obs.
mapping = dict(zip(df['Cell_type'], df['Cell_ontology_ID']))

In [73]:
# cell_type label -> Cell Ontology (CL) term ID, one entry per label found in
# adata.obs['cell_type'].
# NOTE(review): 'cell' is mapped to CL:0000738, which presumably is the 'leukocyte'
# term rather than the root 'cell' term — confirm this is the intended curation choice.
mapping = {
    'central memory CD4-positive, alpha-beta T cell': 'CL:0000904',
    'CD16-positive, CD56-dim natural killer cell, human': 'CL:0000939',
    'effector memory CD4-positive, alpha-beta T cell': 'CL:0000905',
    'central memory CD8-positive, alpha-beta T cell': 'CL:0000907',
    'effector memory CD8-positive, alpha-beta T cell, terminally differentiated': 'CL:0001062',
    'classical monocyte': 'CL:0000860',
    'class switched memory B cell': 'CL:0000972',
    'mucosal invariant T cell': 'CL:0000940',
    'naive B cell': 'CL:0000788',
    'effector memory CD8-positive, alpha-beta T cell': 'CL:0000913',
    'unswitched memory B cell': 'CL:0000970',
    'non-classical monocyte': 'CL:0000875',
    'CD16-negative, CD56-bright natural killer cell, human': 'CL:0000938',
    'gamma-delta T cell': 'CL:0000798',
    'regulatory T cell': 'CL:0000815',
    'conventional dendritic cell': 'CL:0000990',
    'plasma cell': 'CL:0000786',
    'memory B cell': 'CL:0000787',
    'effector memory CD4-positive, alpha-beta T cell, terminally differentiated': 'CL:0001087',
    'megakaryocyte': 'CL:0000556',
    'cell': 'CL:0000738',
    'plasmacytoid dendritic cell': 'CL:0000784',
    'plasmablast': 'CL:0000980',
    'hematopoietic multipotent progenitor cell': 'CL:0000837',
}

In [74]:
# add the cell_type_ontology_term_id column

In [75]:
# Translate each cell's cell_type label into its CL term ID.
cell_type_labels = adata.obs['cell_type']
adata.obs['cell_type_ontology_term_id'] = cell_type_labels.map(mapping)

In [76]:
# change datatype of the column

In [77]:
# Store the cell type term IDs with the pandas 'category' dtype.
cell_type_terms = adata.obs['cell_type_ontology_term_id']
adata.obs['cell_type_ontology_term_id'] = cell_type_terms.astype('category')

In [78]:
# view adata.obs

In [79]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,Homo sapiens,normal,blood,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,Homo sapiens,normal,blood,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,Homo sapiens,normal,blood,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,Homo sapiens,normal,blood,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904


In [80]:
# Sanity check on the cell-type mapping: collect every row of adata.obs whose
# 'cell_type_ontology_term_id' is NaN. An empty result means no row is missing an id.
nan_annotation_rows = adata.obs[adata.obs['cell_type_ontology_term_id'].isna()]

# Show the cell_type label next to the missing id so unmapped labels are easy to spot
print(nan_annotation_rows[['cell_type', 'cell_type_ontology_term_id']])


Empty DataFrame
Columns: [cell_type, cell_type_ontology_term_id]
Index: []


#### **development_stage_ontology_term_id**

In [81]:
# identify the column in adata which corresponds to age

In [82]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id'],
      dtype='object')

In [83]:
list(adata.obs['development_stage'].unique())

['44-year-old human stage',
 '29-year-old human stage',
 '58-year-old human stage',
 '35-year-old human stage',
 '33-year-old human stage',
 '45-year-old human stage',
 '37-year-old human stage',
 '69-year-old human stage',
 '55-year-old human stage',
 '71-year-old human stage',
 '26-year-old human stage',
 '53-year-old human stage',
 '49-year-old human stage',
 '46-year-old human stage',
 '34-year-old human stage',
 '27-year-old human stage',
 '28-year-old human stage',
 '30-year-old human stage',
 'seventh decade human stage',
 'sixth decade human stage',
 '59-year-old human stage',
 '39-year-old human stage',
 '22-year-old human stage',
 '51-year-old human stage',
 '32-year-old human stage',
 '38-year-old human stage',
 '57-year-old human stage',
 '40-year-old human stage',
 '64-year-old human stage',
 '62-year-old human stage',
 '73-year-old human stage',
 '70-year-old human stage',
 '21-year-old human stage',
 '63-year-old human stage',
 'human adult stage']

In [84]:
# Load the lookup table that pairs each development-stage label (column 'age')
# with its 'development_stage_ontology_term_id'; both columns are read in the next cell.
age = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/age.csv')

In [85]:
# Build the label -> ontology term id lookup by pairing the two CSV columns row by row
# (if a label is repeated, the last row wins).
age_dict = dict(zip(age['age'], age['development_stage_ontology_term_id']))

In [86]:
age_dict

{'29-year-old human stage': 'HsapDv:0000123',
 '58-year-old human stage': 'HsapDv:0000152',
 '35-year-old human stage': 'HsapDv:0000129',
 '33-year-old human stage': 'HsapDv:0000127',
 '45-year-old human stage': 'HsapDv:0000139',
 '37-year-old human stage': 'HsapDv:0000131',
 '69-year-old human stage': 'HsapDv:0000163',
 '55-year-old human stage': 'HsapDv:0000149',
 '71-year-old human stage': 'HsapDv:0000165',
 '26-year-old human stage': 'HsapDv:0000120',
 '53-year-old human stage': 'HsapDv:0000147',
 '49-year-old human stage': 'HsapDv:0000143',
 '46-year-old human stage': 'HsapDv:0000140',
 '34-year-old human stage': 'HsapDv:0000128',
 '27-year-old human stage': 'HsapDv:0000121',
 '28-year-old human stage': 'HsapDv:0000122',
 '30-year-old human stage': 'HsapDv:0000124',
 'seventh decade human stage': 'HsapDv:0000241',
 'sixth decade human stage': 'HsapDv:0000240',
 '59-year-old human stage': 'HsapDv:0000153',
 '39-year-old human stage': 'HsapDv:0000133',
 '22-year-old human stage': 'H

In [87]:
# Candidate donor labels (including spelling variants and repeats) to look for in this dataset
donor_ids = [
    "A29", "390C", "A26 (386C)", "A26", "A32 (411C)", "A32", "A34 (417C)",
    "417C", "356C", "A32 (411C)", "A26 (386C)", "284C", "368C", "296C",
    "A33 (414C)", "A30 (398B)", "417c", "454C", "A32", "A37", "A40", "A44",
    "A47", "640C", "390C", "A29", "390c", "302C", "302c", "390C", "390c",
    "411C", "A34", "386C",
]

# Collect the donor ids observed in the data once, so each membership test is a set lookup
present_donors_set = set(adata.obs['donor_id'])

# Keep, in list order, every candidate label that occurs in adata.obs['donor_id']
present_donors = list(filter(lambda label: label in present_donors_set, donor_ids))

# Report the candidates that were found
print("Donors present in adata.obs['donor_id']: ", present_donors)

Donors present in adata.obs['donor_id']:  []


In [88]:
# Donor-id spelling variants and the canonical label each one is rewritten to.
# No canonical label is itself a key, so the replacements do not interact and
# their order does not matter.
donor_id_aliases = {
    'A29': 'A29 (390C)',
    'A26': 'A26 (386C)',
    'A32': 'A32 (411C)',
    '417C': 'A34 (417C)',
    '302c': '302C',
    '390c': 'A29 (390C)',
    '417c': 'A34 (417C)',
    '386C': 'A26 (386C)',
    '390C': 'A29 (390C)',
    '411C': 'A32 (411C)',
}
donors_to_replace = list(donor_id_aliases)

# Only rewrite the column when at least one variant actually occurs in this dataset
if any(donor in adata.obs['donor_id'].values for donor in donors_to_replace):
    # Series.replace with scalar arguments matches whole values, never substrings
    for alias, canonical in donor_id_aliases.items():
        adata.obs['donor_id'] = adata.obs['donor_id'].replace(alias, canonical)

In [89]:
# Translate each development_stage label into its ontology term id via age_dict;
# labels that are not keys of age_dict come back as NaN.
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage'].map(age_dict)

In [90]:
# Per-donor development-stage term ids. For the donors listed here the value
# below replaces whatever was derived from the development_stage label.
update_dict = {
    'D5': 'HsapDv:0000241',
    'A29 (390C)': 'HsapDv:0000161',
    # NOTE(review): 'A37' was listed twice in the original literal
    # ('HsapDv:0000149', then 'HsapDv:0000153'). Python keeps the last value of a
    # duplicated key, so 'HsapDv:0000153' is what was effectively applied and is
    # retained here -- confirm which term is correct for this donor.
    'A37': 'HsapDv:0000153',
    'A34 (417C)': 'HsapDv:0000094',
    '356C': 'HsapDv:0000239',
    'A32 (411C)': 'HsapDv:0000123',
    'A26 (386C)': 'HsapDv:0000169',
    '284C': 'HsapDv:0000240',
    '368C': 'HsapDv:0000240',
    '296C': 'HsapDv:0000238',
    'A33 (414C)': 'HsapDv:0000090',
    'A30 (398B)': 'HsapDv:0000090',
    '417c': 'HsapDv:0000094',
    'A32': 'HsapDv:0000123',
    'A40': 'HsapDv:0000158',
    'A44': 'HsapDv:0000160',
    'A47': 'HsapDv:0000152',
    '640C': 'HsapDv:0000242',
}

# Vectorised override (replaces a row-wise DataFrame.apply over every cell):
# look each donor up once; donors without an entry yield NaN and keep their current value.
donor_stage_overrides = adata.obs['donor_id'].astype(object).map(update_dict)

# Combine positionally on plain arrays so the result does not depend on index alignment
adata.obs['development_stage_ontology_term_id'] = np.where(
    donor_stage_overrides.notna().to_numpy(),
    donor_stage_overrides.to_numpy(),
    adata.obs['development_stage_ontology_term_id'].astype(object).to_numpy(),
)

In [91]:
# change datatype of the column

In [92]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [93]:
# view unique values of development_stage_ontology_term_id column

In [94]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000138',
 'HsapDv:0000123',
 'HsapDv:0000152',
 'HsapDv:0000129',
 'HsapDv:0000127',
 'HsapDv:0000139',
 'HsapDv:0000131',
 'HsapDv:0000163',
 'HsapDv:0000149',
 'HsapDv:0000165',
 'HsapDv:0000120',
 'HsapDv:0000147',
 'HsapDv:0000143',
 'HsapDv:0000140',
 'HsapDv:0000128',
 'HsapDv:0000121',
 'HsapDv:0000122',
 'HsapDv:0000124',
 'HsapDv:0000241',
 'HsapDv:0000240',
 'HsapDv:0000153',
 'HsapDv:0000133',
 'HsapDv:0000116',
 'HsapDv:0000145',
 'HsapDv:0000126',
 'HsapDv:0000132',
 'HsapDv:0000151',
 'HsapDv:0000134',
 'HsapDv:0000158',
 'HsapDv:0000156',
 'HsapDv:0000167',
 'HsapDv:0000164',
 'HsapDv:0000115',
 'HsapDv:0000157',
 'HsapDv:0000087']

In [95]:
# view adata.obs

In [96]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,Homo sapiens,normal,blood,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939,HsapDv:0000138
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905,HsapDv:0000138
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,Homo sapiens,normal,blood,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907,HsapDv:0000138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905,HsapDv:0000087
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,Homo sapiens,normal,blood,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860,HsapDv:0000087
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,Homo sapiens,normal,blood,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784,HsapDv:0000087
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904,HsapDv:0000087


#### **donor_id**

In [97]:
#identify the column in adata.obs which provides donor information

In [98]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [99]:
# add the donor_id column

In [100]:
# No-op self-assignment: 'donor_id' is already a column of adata.obs (see the column
# listing above), so nothing has to be derived or renamed for this field.
adata.obs['donor_id'] = adata.obs['donor_id']

In [101]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [102]:
# change datatype of the column

In [103]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [104]:
# view unique values of donor_id column

In [105]:
list(adata.obs['donor_id'].unique())

['P-HC008',
 'P-HC009',
 'P-HC010',
 'P-HC011',
 'P-HC012',
 'P-HC013',
 'P-HC014',
 'P-HC015',
 'P-HC016',
 'P-HC017',
 'P-HC019',
 'P-HC018',
 'P-HC020',
 'P-HC001',
 'P-HC002',
 'P-HC003',
 'P-HC004',
 'P-HC005',
 'P-HC006',
 'P-HC007',
 'P-HC021',
 'P-HC022',
 'P-HC023',
 'P-HC024',
 'P-HC025',
 'A36',
 'A35',
 '621B',
 '637C',
 'D503',
 'D496',
 'newcastle65',
 'MH8919226',
 'MH8919333',
 'MH8919332',
 'MH8919227',
 'MH8919283',
 'MH8919178',
 'MH8919177',
 'MH8919176',
 'MH8919179',
 'newcastle74',
 'MH8919282',
 'CV0904',
 'CV0902',
 'CV0911',
 'CV0929',
 'CV0915',
 'CV0917',
 'CV0939',
 'CV0926',
 'CV0934',
 'CV0940',
 'CV0944',
 'AN5',
 'AN3',
 'AN6',
 'AN9',
 'AN14',
 'AN11',
 'AN12',
 'AN13',
 'AN2',
 'AN7',
 'AN1']

In [106]:
#view obs

In [107]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,Homo sapiens,normal,blood,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939,HsapDv:0000138
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905,HsapDv:0000138
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,Homo sapiens,normal,blood,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907,HsapDv:0000138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905,HsapDv:0000087
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,Homo sapiens,normal,blood,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860,HsapDv:0000087
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,Homo sapiens,normal,blood,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784,HsapDv:0000087
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904,HsapDv:0000087


In [108]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [109]:
list(adata.obs['disease'].unique())

['normal']

In [110]:
# 'normal' is the only disease label in this object, so the same PATO term is
# broadcast to every row.
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [111]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,Homo sapiens,normal,blood,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,Homo sapiens,normal,blood,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,Homo sapiens,normal,blood,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,Homo sapiens,normal,blood,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461


In [112]:
# change datatype of the column

In [113]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [114]:
# view obs

In [115]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,Homo sapiens,normal,blood,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,Homo sapiens,normal,blood,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,Homo sapiens,normal,blood,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,Homo sapiens,normal,blood,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461


#### **is_primary_data**

In [116]:
# list the source datasets present in this object

In [117]:
list(adata.obs['Dataset'].unique())

['Ren et al. 2021',
 'Dominguez Conde et al. 2022',
 'Stephenson et al. 2021',
 'Yoshida et al. 2021']

In [118]:
# Dataset label -> is_primary_data flag, applied to adata.obs['Dataset'] in the next cell.
# The table lists more datasets than the four present in this object; a label that is
# missing from it maps to NaN.
mapping = {'Aizarani et al. 2019':False,
'Andrews et al. 2022':False,
'Guilliams et al. 2022':False,
'MacParland et al. 2018':False,
'HCA Kidney 2022':False,
'Lake et al. 2021':False,
'Muto et al. 2021':False,
'Stewart et al. 2019':False,
'Dominguez Conde et al. 2022':False,
'Elmentaite et al. 2021':False,
'James et al. 2020':False,
'Szabo et al. 2019':False,
'He et al. 2020':False,
'Micheli et al. 2020':True,
'Perez et al. 2022':True,
'Ren et al. 2021':False,
'Stephenson et al. 2021':False,
'Yoshida et al. 2021':False,
'Madissoon et al. 2020':False,
'Tabula Sapiens 2022':False,
'Ayhan et al. 2021':False,
'Franjic et al. 2022':True,
'Siletti et al. 2022':False,
'Tran et al. 2021':True,
'Burclaff et al. 2022':False,
'Smillie et al. 2019':False,
'Roy et al. 2021':True,
'Adams et al. 2020':False,
'Madissoon et al. 2022':False,
'Travaglini et al. 2020':False,
'Koenig et al. 2022':True,
'Kuppe et al. 2022':False,
'Litvinukova et al. 2020':False,
'Tucker et al. 2020':True,
'Fasolino et al. 2022':False,
'Muraro et al. 2016':False,
'Tosti et al. 2021':True,
'Tritschler et al. 2022':False}

In [119]:
adata.obs['is_primary_data']= adata.obs['Dataset'].map(mapping)

In [120]:
# A Dataset label that is not a key of `mapping` is NaN after .map(), and NaN would
# silently cast to True under astype('bool'). Stop with a clear message instead of
# mislabelling those cells as primary data.
unmapped_datasets = adata.obs.loc[adata.obs['is_primary_data'].isna(), 'Dataset'].unique()
if len(unmapped_datasets) > 0:
    raise ValueError(
        f"No is_primary_data value defined for Dataset(s): {list(unmapped_datasets)}"
    )

# Every row now holds True/False, so the cast to a plain boolean column is lossless
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [121]:
# view obs

In [122]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,Homo sapiens,normal,blood,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461,False
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461,False
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,Homo sapiens,normal,blood,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,Homo sapiens,normal,blood,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461,False
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,Homo sapiens,normal,blood,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461,False
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,Homo sapiens,normal,blood,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461,False
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,Homo sapiens,normal,blood,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461,False


In [123]:
list(adata.obs['is_primary_data'].unique())

[False]

#### **organism_ontology_term_id**

In [124]:
# assign organism id 

In [125]:
# Broadcast the single NCBITaxon term to every row of adata.obs
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [126]:
#change data type of column

In [127]:
# Re-store the organism term column as a pandas categorical.
organism_terms = adata.obs['organism_ontology_term_id']
adata.obs['organism_ontology_term_id'] = organism_terms.astype('category')

In [128]:
# view obs

In [129]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,normal,blood,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,normal,blood,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,...,normal,blood,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,normal,blood,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,...,normal,blood,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,normal,blood,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,...,normal,blood,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,...,normal,blood,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,normal,blood,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [130]:
# Load the per-cell metadata table for the blood dataset.
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/previous_datasets/blood/metadata_blood.csv')
# Lookup keyed by the 'cells' column with the 'ethnicity' column as value
# (if a key repeats, the last row wins).
mapping = df.set_index('cells')['ethnicity'].to_dict()

In [131]:
# Look up each cell's ethnicity term by its obs name in `mapping`.
ethnicity_terms = adata.obs_names.map(mapping)
# Index.map leaves NaN for any obs name that is not a key of `mapping`;
# stop here rather than silently writing an incomplete column.
n_unmapped_ethnicity = int(ethnicity_terms.isna().sum())
if n_unmapped_ethnicity:
    raise ValueError(
        f"{n_unmapped_ethnicity} cells have no 'ethnicity' entry in the metadata mapping"
    )
adata.obs['self_reported_ethnicity_ontology_term_id'] = ethnicity_terms

In [132]:
# change data type

In [133]:
# Re-store the ethnicity term column as a pandas categorical.
ethnicity_column = adata.obs['self_reported_ethnicity_ontology_term_id']
adata.obs['self_reported_ethnicity_ontology_term_id'] = ethnicity_column.astype('category')

In [134]:
# view obs

In [135]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,blood,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,blood,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,...,blood,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,blood,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,...,blood,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,blood,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,...,blood,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,...,blood,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,blood,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005


In [136]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [137]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

['unknown', 'HANCESTRO:0005', 'HANCESTRO:0010', 'HANCESTRO:0009']

#### **sex_ontology_term_id**

In [138]:
# identify the column in adata.obs which corresponds to sex

In [139]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [140]:
list(adata.obs['sex'].unique())

['male', 'female']

In [141]:
# list the unique values 

In [142]:
# Lookup from the sex label to its PATO ontology term; 'unknown' maps to itself.
mapping = dict(
    female='PATO:0000383',
    male='PATO:0000384',
    unknown='unknown',
)

In [143]:
# add sex_ontology_term_id column

In [144]:
# Translate each cell's 'sex' label into its ontology term via `mapping`.
sex_terms = adata.obs['sex'].map(mapping)
# Series.map yields NaN for any label that is not a key of `mapping`;
# fail loudly instead of writing an incomplete sex_ontology_term_id column.
if sex_terms.isna().any():
    unmapped_sex_labels = sorted(adata.obs.loc[sex_terms.isna(), 'sex'].astype(str).unique())
    raise ValueError(f"sex labels without an ontology mapping: {unmapped_sex_labels}")
adata.obs['sex_ontology_term_id'] = sex_terms

In [145]:
# change data type

In [146]:
# Re-store the sex term column as a pandas categorical.
sex_term_column = adata.obs['sex_ontology_term_id']
adata.obs['sex_ontology_term_id'] = sex_term_column.astype('category')

In [147]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,...,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,...,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,...,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,...,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384


#### **suspension_type**

In [148]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,...,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,...,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,...,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,...,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384


In [149]:
list(adata.obs['suspension_type'].unique())

['cell']

In [150]:
# Overwrite suspension_type with the single value 'cell' for every row (scalar is broadcast).
adata.obs['suspension_type'] = 'cell'

In [151]:
# change data type of column

In [152]:
# Re-store suspension_type as a pandas categorical.
suspension_column = adata.obs['suspension_type']
adata.obs['suspension_type'] = suspension_column.astype('category')

In [153]:
# view obs

In [154]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,AAACCTGCAAAGGAAG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,AAACCTGCACCCTATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,...,AAACCTGCACGTCAGC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,AAACCTGCAGGATCGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,...,AAACCTGCATCGGACC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,TTTACTGAGCTAAGAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,...,TTTCCTCTCCAATGGT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,...,TTTCCTCTCTGGGCCA,3pv2_5pv1_5pv2+3pv3,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,TTTGTCAAGATGAGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384


#### **tissue_type**

In [155]:
# Label every row with tissue_type 'tissue' (scalar is broadcast to all rows).
adata.obs['tissue_type'] = 'tissue'

In [156]:
# Re-store tissue_type as a pandas categorical.
tissue_type_column = adata.obs['tissue_type']
adata.obs['tissue_type'] = tissue_type_column.astype('category')

#### **tissue_ontology_term_id**

In [157]:
# identify the column in adata.obs which corresponds to tissue

In [158]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type'],
      dtype='object')

In [159]:
list(adata.obs['tissue'].unique())

['blood']

In [160]:
# Tissue label -> UBERON ontology term.
# NOTE(review): the tissue_ontology_term_id assignment in the next cell reads its
# terms from the metadata CSV rather than from this dict - confirm it is still needed.
tissue_dict = {
    'blood': 'UBERON:0000178',
    'bone marrow': 'UBERON:0002371',
    'heart': 'UBERON:0000948',
    'intestine': 'UBERON:0000160',
    'kidney': 'UBERON:0002113',
    'hippocampal formation': 'UBERON:0002421',
    'liver': 'UBERON:0002107',
    'lung': 'UBERON:0002048',
    'lymph node': 'UBERON:0000029',
    'pancreas': 'UBERON:0001264',
    'skeletal muscle organ': 'UBERON:0014892',
    'spleen': 'UBERON:0002106',
}

In [161]:
# Load the per-cell metadata table and build a lookup from the 'cells' column
# to the 'tissue_ontology_term_id' column.
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/previous_datasets/blood/metadata_blood.csv')
mapping = dict(zip(df['cells'], df['tissue_ontology_term_id']))
# Look up each cell's tissue term by its obs name.
tissue_terms = adata.obs_names.map(mapping)
# Index.map leaves NaN for any obs name that is not a key of `mapping`;
# stop here rather than silently writing an incomplete column.
n_unmapped_tissue = int(tissue_terms.isna().sum())
if n_unmapped_tissue:
    raise ValueError(
        f"{n_unmapped_tissue} cells have no 'tissue_ontology_term_id' entry in the metadata mapping"
    )
adata.obs['tissue_ontology_term_id'] = tissue_terms

In [162]:
# add 'tissue_ontology_term_id' column

In [163]:
# change data type of column

In [164]:
# Re-store the tissue term column as a pandas categorical.
tissue_term_column = adata.obs['tissue_ontology_term_id']
adata.obs['tissue_ontology_term_id'] = tissue_term_column.astype('category')

In [165]:
#list the unique values in 'tissue_ontology_term_id' column

In [166]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0000178']

In [167]:
#adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].replace('unknown', 'UBERON:0000178')

In [168]:
# view obs

In [169]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,...,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,...,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,...,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,...,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178


In [170]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [171]:
# view obsm

In [172]:
# check whether all obsm keys are prefixed with 'X_'

In [173]:
adata.obsm

AxisArrays with keys: X_umap

#### **uns (Dataset Metadata)**

In [174]:
# view uns

In [175]:
adata.uns

{'schema_version': '3.0.0', 'title': 'Adult human blood'}

In [176]:
# Call keys() so the cell lists the uns keys; the bare attribute only displays the method object.
adata.uns.keys()

<function dict.keys>

In [177]:
# Give a title for the dataset

In [178]:
# Set (or overwrite) the dataset title stored in uns.
adata.uns.update({'title': 'Blood'})

In [179]:
# Set the default embedding

In [180]:
# Record 'X_umap' (the only key present in adata.obsm) as the default embedding.
adata.uns.update({'default_embedding': 'X_umap'})

### **Final check**

In [181]:
# view anndata object

In [182]:
adata

AnnData object with n_obs × n_vars = 335916 × 30476
    obs: 'Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type', 'assay', 'Original_annotation', 'CellHint_harmonised_group', 'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue', 'barcodes', 'assays', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'
    var: 'exist_in_Ren2021', 'exist_in_DominguezConde2022', 'exist_in_Stephenson2021', 'exist_in_Yoshida2021', 'gene_id', 'gene_name', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'default_embedding'
    obsm: 'X_umap'

In [183]:
# view obs and var data types

In [184]:
adata.obs.dtypes

Dataset                                     category
donor_id                                    category
development_stage                           category
sex                                         category
suspension_type                             category
assay                                       category
Original_annotation                         category
CellHint_harmonised_group                   category
cell_type                                   category
Curated_annotation                          category
organism                                    category
disease                                     category
tissue                                      category
barcodes                                      object
assays                                        object
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    ca

In [185]:
# Downcast 64-bit numeric columns of var to 32-bit.
# The dtype table is captured once, before any column is converted.
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
var_is_float64 = dty['dtype'] == 'float64'
var_is_int64 = dty['dtype'] == 'int64'
for c in dty.loc[var_is_float64].index:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty.loc[var_is_int64].index:
    adata.var[c] = adata.var[c].astype('int32')
    print(f"changed {c} from int64 to int32")

In [186]:
# Normalise obs column dtypes: float64 -> float32, int64 -> int32, object -> category.
# The dtype table is captured once, before any column is converted.
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
obs_is_float64 = dty['dtype'] == 'float64'
obs_is_int64 = dty['dtype'] == 'int64'
obs_is_object = dty['dtype'] == 'object'
for c in dty.loc[obs_is_float64].index:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty.loc[obs_is_int64].index:
    adata.obs[c] = adata.obs[c].astype('int32')
    print(f"changed {c} from int64 to int32")
for c in dty.loc[obs_is_object].index:
    adata.obs[c] = adata.obs[c].astype('category')
    print(f"changed {c} from object to category")

changed barcodes from object to category
changed assays from object to category


In [187]:
# view obs

In [188]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c01-LEF1,Group40,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c05-FOS,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,NK_c01-FCGR3A,Group48,"CD16-positive, CD56-dim natural killer cell, h...",CD16+ NK cells,...,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD4_c02-AQP3,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,44-year-old human stage,male,cell,10x 5' v2,T_CD8_c01-LEF1,Group42,"central memory CD8-positive, alpha-beta T cell",Tcm/Naive cytotoxic T cells,...,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 helper,Group20,"effector memory CD4-positive, alpha-beta T cell",Tem/Effector helper T cells,...,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,Monocyte CD16,Group49,classical monocyte,Classical monocytes,...,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,pDC,Group37,plasmacytoid dendritic cell,pDC,...,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,human adult stage,male,cell,10x 5' v1,T CD4 naive,Group19,"central memory CD4-positive, alpha-beta T cell",Tcm/Naive helper T cells,...,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178


In [189]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [190]:
# delete unwanted columns in obs

In [191]:
# Drop the free-text obs columns that now have *_ontology_term_id counterparts,
# together with the helper columns 'assays' and 'barcodes'.
# errors='ignore' keeps this cell re-runnable: a plain `del adata.obs[...]`
# raises KeyError once the column has already been removed.
obs_columns_to_drop = [
    'tissue',
    'organism',
    'disease',
    'assay',
    'sex',
    'development_stage',
    'assays',
    'barcodes',
    'cell_type',
]
adata.obs = adata.obs.drop(columns=obs_columns_to_drop, errors='ignore')

# 'schema_version' is a key of uns, not an obs column; pop() with a default
# is a no-op when the key is already gone.
adata.uns.pop('schema_version', None)

In [192]:
# view obs

In [193]:
adata.obs

Unnamed: 0,Dataset,donor_id,suspension_type,Original_annotation,CellHint_harmonised_group,Curated_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,cell,T_CD4_c01-LEF1,Group40,Tcm/Naive helper T cells,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,cell,T_CD4_c05-FOS,Group19,Tcm/Naive helper T cells,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,cell,NK_c01-FCGR3A,Group48,CD16+ NK cells,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,cell,T_CD4_c02-AQP3,Group20,Tem/Effector helper T cells,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,cell,T_CD8_c01-LEF1,Group42,Tcm/Naive cytotoxic T cells,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,cell,T CD4 helper,Group20,Tem/Effector helper T cells,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,cell,Monocyte CD16,Group49,Classical monocytes,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,cell,pDC,Group37,pDC,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,cell,T CD4 naive,Group19,Tcm/Naive helper T cells,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178


In [194]:
# view var

In [195]:
adata.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG,False
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1,False
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF,False
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M,False
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1,False
...,...,...,...,...,...,...,...
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B,False
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1,False
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3,False
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253,False


In [196]:
# View var of araw, the AnnData object that is assigned to adata.raw later in this notebook.
araw.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253


In [197]:
#view uns

In [198]:
adata.uns

{'title': 'Blood', 'default_embedding': 'X_umap'}

In [199]:
# Show the keys currently stored in uns as a plain list.
list(adata.uns)

['title', 'default_embedding']

In [200]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'suspension_type', 'Original_annotation',
       'CellHint_harmonised_group', 'Curated_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [201]:
# Unwanted keys in uns: 'schema_version' was already removed above together with the obs columns; nothing else is removed here.

In [202]:
# Check the format of the expression matrices (adata.X and araw.X)

In [203]:
adata.X

<335916x30476 sparse matrix of type '<class 'numpy.float32'>'
	with 450091156 stored elements in Compressed Sparse Row format>

In [204]:
araw.X

<335916x30476 sparse matrix of type '<class 'numpy.float32'>'
	with 450091156 stored elements in Compressed Sparse Row format>

In [205]:
# Copy raw counts to adata.raw: delete any existing adata.raw, then assign araw to it

In [206]:
del adata.raw

In [207]:
adata.raw = araw

In [208]:
# Capture the dtype of every obs column so it can be inspected in the next cell.
obs_dtype = adata.obs.dtypes

In [209]:
obs_dtype

Dataset                                     category
donor_id                                    category
suspension_type                             category
Original_annotation                         category
CellHint_harmonised_group                   category
Curated_annotation                          category
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    category
is_primary_data                                 bool
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
tissue_type                                 category
tissue_ontology_term_id                     category
dtype: object

In [210]:
# Save the curated object to the upload directory as a gzip-compressed .h5ad file.
adata.write_h5ad(
    '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Final_objects/to_upload/Blood.h5ad',
    compression='gzip',
)

In [211]:
adata.obs

Unnamed: 0,Dataset,donor_id,suspension_type,Original_annotation,CellHint_harmonised_group,Curated_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
AAACCTGCAAAGGAAG-30,Ren et al. 2021,P-HC008,cell,T_CD4_c01-LEF1,Group40,Tcm/Naive helper T cells,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCACCCTATC-30,Ren et al. 2021,P-HC008,cell,T_CD4_c05-FOS,Group19,Tcm/Naive helper T cells,EFO:0009900,CL:0000904,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCACGTCAGC-30,Ren et al. 2021,P-HC008,cell,NK_c01-FCGR3A,Group48,CD16+ NK cells,EFO:0009900,CL:0000939,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCAGGATCGA-30,Ren et al. 2021,P-HC008,cell,T_CD4_c02-AQP3,Group20,Tem/Effector helper T cells,EFO:0009900,CL:0000905,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
AAACCTGCATCGGACC-30,Ren et al. 2021,P-HC008,cell,T_CD8_c01-LEF1,Group42,Tcm/Naive cytotoxic T cells,EFO:0009900,CL:0000907,HsapDv:0000138,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM10202407-CV001_KM10202419_TTTACTGAGCTAAGAT-1,Yoshida et al. 2021,AN1,cell,T CD4 helper,Group20,Tem/Effector helper T cells,EFO:0011025,CL:0000905,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCCAATGGT-1,Yoshida et al. 2021,AN1,cell,Monocyte CD16,Group49,Classical monocytes,EFO:0011025,CL:0000860,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTCCTCTCTGGGCCA-1,Yoshida et al. 2021,AN1,cell,pDC,Group37,pDC,EFO:0011025,CL:0000784,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178
CV001_KM10202407-CV001_KM10202419_TTTGTCAAGATGAGAG-1,Yoshida et al. 2021,AN1,cell,T CD4 naive,Group19,Tcm/Naive helper T cells,EFO:0011025,CL:0000904,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0000178


In [212]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'suspension_type', 'Original_annotation',
       'CellHint_harmonised_group', 'Curated_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [213]:
adata.raw.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253


In [214]:
adata.var

Unnamed: 0_level_0,exist_in_Ren2021,exist_in_DominguezConde2022,exist_in_Stephenson2021,exist_in_Yoshida2021,gene_id,gene_name,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG,False
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1,False
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF,False
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M,False
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1,False
...,...,...,...,...,...,...,...
ENSG00000162378,True,True,True,True,ENSG00000162378,ZYG11B,False
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1,False
ENSG00000036549,False,False,False,True,ENSG00000036549,ZZZ3,False
ENSG00000272920,False,True,False,True,ENSG00000272920,hsa-mir-1253,False


In [215]:
adata.raw.X

<335916x30476 sparse matrix of type '<class 'numpy.float32'>'
	with 450091156 stored elements in Compressed Sparse Row format>

In [216]:
# List the distinct values of is_primary_data across all cells.
# NOTE(review): this check runs after the object was already written to disk in an
# earlier cell; consider moving validation checks ahead of the write.
list(adata.obs['is_primary_data'].unique())

[False]