### **Curating Liver.h5ad**

Article: Automatic cell-type harmonization and integration across Human Cell Atlas datasets

DOI: https://doi.org/10.1016/j.cell.2023.11.026

Data Source : https://www.celltypist.org/organs

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Data/Liver.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 259678 × 47951
    obs: 'Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type', 'assay', 'Original_annotation', 'CellHint_harmonised_group', 'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue'
    var: 'exist_in_Andrews2022', 'exist_in_MacParland2018', 'exist_in_Aizarani2019', 'exist_in_Guilliams2022'
    uns: 'schema_version', 'title'
    obsm: 'X_umap'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<259678x47951 sparse matrix of type '<class 'numpy.float32'>'
	with 382514083 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 9919)	3.1324384
  (0, 15855)	3.8035388
  (0, 42396)	3.1324384
  (0, 40784)	3.1324384
  (0, 5502)	3.1324384
  (0, 5800)	3.1324384
  (0, 42372)	3.1324384
  (0, 20817)	3.1324384
  (0, 40837)	3.8035388
  (0, 25563)	4.2015457
  (0, 40794)	4.2015457
  (0, 25597)	3.1324384
  (0, 13214)	3.1324384
  (0, 15660)	3.1324384
  (0, 45934)	3.1324384
  (0, 4877)	3.1324384
  (0, 5660)	3.1324384
  (0, 40852)	3.8035388
  (0, 23781)	4.2015457
  (0, 12945)	3.1324384
  (0, 44888)	3.1324384
  (0, 27825)	3.1324384
  (0, 23775)	5.9237394
  (0, 40790)	4.2015457
  (0, 23771)	5.730157
  :	:
  (259677, 15512)	1.6239954
  (259677, 20984)	2.2133865
  (259677, 44358)	1.6239954
  (259677, 3855)	2.5817277
  (259677, 3856)	1.6239954
  (259677, 17430)	1.6239954
  (259677, 26288)	2.2133865
  (259677, 43355)	1.6239954
  (259677, 43598)	2.2133865
  (259677, 46519)	1.6239954
  (259677, 5638)	2.2133865
  (259677, 44803)	1.6239954
  (259677, 12574)	1.6239954
  (259677, 15857)	1.6239954
  (259677, 46024)	1.6239954
  (25967

##### **Raw counts matrix**

In [11]:
print(adata.raw.X)

  (0, 9919)	1.0
  (0, 15855)	2.0
  (0, 42396)	1.0
  (0, 40784)	1.0
  (0, 5502)	1.0
  (0, 5800)	1.0
  (0, 42372)	1.0
  (0, 20817)	1.0
  (0, 40837)	2.0
  (0, 25563)	3.0
  (0, 40794)	3.0
  (0, 25597)	1.0
  (0, 13214)	1.0
  (0, 15660)	1.0
  (0, 45934)	1.0
  (0, 4877)	1.0
  (0, 5660)	1.0
  (0, 40852)	2.0
  (0, 23781)	3.0
  (0, 12945)	1.0
  (0, 44888)	1.0
  (0, 27825)	1.0
  (0, 23775)	17.0
  (0, 40790)	3.0
  (0, 23771)	14.0
  :	:
  (259677, 15512)	1.0
  (259677, 20984)	2.0
  (259677, 44358)	1.0
  (259677, 3855)	3.0
  (259677, 3856)	1.0
  (259677, 17430)	1.0
  (259677, 26288)	2.0
  (259677, 43355)	1.0
  (259677, 43598)	2.0
  (259677, 46519)	1.0
  (259677, 5638)	2.0
  (259677, 44803)	1.0
  (259677, 12574)	1.0
  (259677, 15857)	1.0
  (259677, 46024)	1.0
  (259677, 17129)	1.0
  (259677, 46419)	4.0
  (259677, 26638)	1.0
  (259677, 23853)	1.0
  (259677, 8781)	1.0
  (259677, 28144)	1.0
  (259677, 12900)	1.0
  (259677, 6907)	1.0
  (259677, 14635)	2.0
  (259677, 3406)	1.0


In [12]:
adata.layers['counts']= adata.raw.X

In [13]:
# Access the 'counts' layer
counts = adata.layers['counts']

# Create a boolean mask for non-zero elements
non_zero_mask = counts != 0

# Convert non-zero values to integers
counts[non_zero_mask] = counts[non_zero_mask].round().astype(int)

# Update the 'counts' layer
adata.layers['counts'] = counts


In [14]:
araw = ad.AnnData(X=adata.layers['counts'].copy(), obs=adata.obs.copy(), var=adata.raw.var.copy())

In [15]:
adata.raw.X

<259678x47951 sparse matrix of type '<class 'numpy.float32'>'
	with 382514083 stored elements in Compressed Sparse Row format>

In [16]:
adata.raw.var

Unnamed: 0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022
5S_rRNA,False,False,True,True
5_8S_rRNA,False,False,True,False
7SK,False,False,True,True
7SK.1,False,False,False,True
7SK.2,False,False,False,True
...,...,...,...,...
snoZ278,False,False,True,False
snoZ40,False,False,True,False
snoZ6,False,False,True,False
snosnR66,False,False,True,False


In [17]:
#araw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)

##### **Variables(var)**

In [18]:
# View the var of anndata and raw object

In [19]:
adata.var

Unnamed: 0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022
5S_rRNA,False,False,True,True
5_8S_rRNA,False,False,True,False
7SK,False,False,True,True
7SK.1,False,False,False,True
7SK.2,False,False,False,True
...,...,...,...,...
snoZ278,False,False,True,False
snoZ40,False,False,True,False
snoZ6,False,False,True,False
snosnR66,False,False,True,False


In [20]:
ensembl_data = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/symbol2ID.csv')

In [21]:
ensembl_dict = dict(zip(ensembl_data['gene_symbol'], ensembl_data['gene_id']))

In [22]:
ensembl_dict

{'MT-TF': 'ENSG00000210049',
 'MT-RNR1': 'ENSG00000211459',
 'MT-TV': 'ENSG00000210077',
 'MT-RNR2': 'ENSG00000210082',
 'MT-TL1': 'ENSG00000209082',
 'MT-ND1': 'ENSG00000198888',
 'MT-TI': 'ENSG00000210100',
 'MT-TQ': 'ENSG00000210107',
 'MT-TM': 'ENSG00000210112',
 'MT-ND2': 'ENSG00000198763',
 'MT-TW': 'ENSG00000210117',
 'MT-TA': 'ENSG00000210127',
 'MT-TN': 'ENSG00000210135',
 'MT-TC': 'ENSG00000210140',
 'MT-TY': 'ENSG00000210144',
 'MT-CO1': 'ENSG00000198804',
 'MT-TS1': 'ENSG00000210151',
 'MT-TD': 'ENSG00000210154',
 'MT-CO2': 'ENSG00000198712',
 'MT-TK': 'ENSG00000210156',
 'MT-ATP8': 'ENSG00000228253',
 'MT-ATP6': 'ENSG00000198899',
 'MT-CO3': 'ENSG00000198938',
 'MT-TG': 'ENSG00000210164',
 'MT-ND3': 'ENSG00000198840',
 'MT-TR': 'ENSG00000210174',
 'MT-ND4L': 'ENSG00000212907',
 'MT-ND4': 'ENSG00000198886',
 'MT-TH': 'ENSG00000210176',
 'MT-TS2': 'ENSG00000210184',
 'MT-TL2': 'ENSG00000210191',
 'MT-ND5': 'ENSG00000198786',
 'MT-ND6': 'ENSG00000198695',
 'MT-TE': 'ENSG00000

In [23]:
adata.var['gene_id'] = adata.var_names.map(ensembl_dict)
araw.var['gene_id'] = araw.var_names.map(ensembl_dict)


In [24]:
adata.var

Unnamed: 0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022,gene_id
5S_rRNA,False,False,True,True,ENSG00000278457
5_8S_rRNA,False,False,True,False,ENSG00000278294
7SK,False,False,True,True,ENSG00000274262
7SK.1,False,False,False,True,
7SK.2,False,False,False,True,
...,...,...,...,...,...
snoZ278,False,False,True,False,ENSG00000252868
snoZ40,False,False,True,False,ENSG00000201410
snoZ6,False,False,True,False,ENSG00000266692
snosnR66,False,False,True,False,ENSG00000212397


In [25]:
araw.var

Unnamed: 0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022,gene_id
5S_rRNA,False,False,True,True,ENSG00000278457
5_8S_rRNA,False,False,True,False,ENSG00000278294
7SK,False,False,True,True,ENSG00000274262
7SK.1,False,False,False,True,
7SK.2,False,False,False,True,
...,...,...,...,...,...
snoZ278,False,False,True,False,ENSG00000252868
snoZ40,False,False,True,False,ENSG00000201410
snoZ6,False,False,True,False,ENSG00000266692
snosnR66,False,False,True,False,ENSG00000212397


In [26]:
nan_count = adata.var['gene_id'].isna().sum()
print("Number of NaN values in adata.obs['gene_id']: ", nan_count)

Number of NaN values in adata.obs['gene_id']:  3158


In [27]:
adata.var['gene_name'] = adata.var.index
araw.var['gene_name'] = araw.var.index

In [28]:
adata.var.index = adata.var['gene_id'] 
araw.var.index = araw.var['gene_id']

In [29]:
# Load the approved genes file.

In [30]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [31]:
#Create a dictionary from the approved genes file 

In [32]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [33]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [34]:
len(genedict)

119799

In [35]:
#Filter out the genes which are not in the approved genes file.

In [36]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [37]:
len(var_to_keep_adata)

39891

In [38]:
len(var_to_keep_araw)

39891

In [39]:
adata.var

Unnamed: 0_level_0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000278457,False,False,True,True,ENSG00000278457,5S_rRNA
ENSG00000278294,False,False,True,False,ENSG00000278294,5_8S_rRNA
ENSG00000274262,False,False,True,True,ENSG00000274262,7SK
,False,False,False,True,,7SK.1
,False,False,False,True,,7SK.2
...,...,...,...,...,...,...
ENSG00000252868,False,False,True,False,ENSG00000252868,snoZ278
ENSG00000201410,False,False,True,False,ENSG00000201410,snoZ40
ENSG00000266692,False,False,True,False,ENSG00000266692,snoZ6
ENSG00000212397,False,False,True,False,ENSG00000212397,snosnR66


In [40]:
araw.var

Unnamed: 0_level_0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000278457,False,False,True,True,ENSG00000278457,5S_rRNA
ENSG00000278294,False,False,True,False,ENSG00000278294,5_8S_rRNA
ENSG00000274262,False,False,True,True,ENSG00000274262,7SK
,False,False,False,True,,7SK.1
,False,False,False,True,,7SK.2
...,...,...,...,...,...,...
ENSG00000252868,False,False,True,False,ENSG00000252868,snoZ278
ENSG00000201410,False,False,True,False,ENSG00000201410,snoZ40
ENSG00000266692,False,False,True,False,ENSG00000266692,snoZ6
ENSG00000212397,False,False,True,False,ENSG00000212397,snosnR66


In [41]:
# Modify the anndata object by filtering out the filtered genes.

In [42]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [43]:
adata.var

Unnamed: 0_level_0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000278457,False,False,True,True,ENSG00000278457,5S_rRNA
ENSG00000278294,False,False,True,False,ENSG00000278294,5_8S_rRNA
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
...,...,...,...,...,...,...
ENSG00000252945,False,False,True,False,ENSG00000252945,snoU83B
ENSG00000252868,False,False,True,False,ENSG00000252868,snoZ278
ENSG00000201410,False,False,True,False,ENSG00000201410,snoZ40
ENSG00000266692,False,False,True,False,ENSG00000266692,snoZ6


In [44]:
# View var

In [45]:
araw.var

Unnamed: 0_level_0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000278457,False,False,True,True,ENSG00000278457,5S_rRNA
ENSG00000278294,False,False,True,False,ENSG00000278294,5_8S_rRNA
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
...,...,...,...,...,...,...
ENSG00000252945,False,False,True,False,ENSG00000252945,snoU83B
ENSG00000252868,False,False,True,False,ENSG00000252868,snoZ278
ENSG00000201410,False,False,True,False,ENSG00000201410,snoZ40
ENSG00000266692,False,False,True,False,ENSG00000266692,snoZ6


In [46]:
import scanpy as sc

# Assuming 'adata' is your AnnData object

# Identify duplicate variable names
duplicate_var_mask = adata.var_names.duplicated(keep='first')

# Invert the mask to keep unique variable names
unique_var_mask = ~duplicate_var_mask

# Update AnnData object with unique variable names and data
adata = adata[:, unique_var_mask].copy()

# Ensure variable names are unique
print("All variable names are unique:", adata.var_names.is_unique)

All variable names are unique: True


In [47]:
import scanpy as sc

# Assuming 'adata' is your AnnData object

# Identify duplicate variable names
duplicate_var_mask = araw.var_names.duplicated(keep='first')

# Invert the mask to keep unique variable names
unique_var_mask = ~duplicate_var_mask

# Update AnnData object with unique variable names and data
araw = araw[:, unique_var_mask].copy()

# Ensure variable names are unique
print("All variable names are unique:", araw.var_names.is_unique)

All variable names are unique: True


feature is filtered

In [48]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [49]:
adata.var

Unnamed: 0_level_0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022,gene_id,gene_name,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000278457,False,False,True,True,ENSG00000278457,5S_rRNA,False
ENSG00000278294,False,False,True,False,ENSG00000278294,5_8S_rRNA,False
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG,False
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1,False
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF,False
...,...,...,...,...,...,...,...
ENSG00000252945,False,False,True,False,ENSG00000252945,snoU83B,False
ENSG00000252868,False,False,True,False,ENSG00000252868,snoZ278,False
ENSG00000201410,False,False,True,False,ENSG00000201410,snoZ40,False
ENSG00000266692,False,False,True,False,ENSG00000266692,snoZ6,False


In [50]:
araw.var

Unnamed: 0_level_0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000278457,False,False,True,True,ENSG00000278457,5S_rRNA
ENSG00000278294,False,False,True,False,ENSG00000278294,5_8S_rRNA
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
...,...,...,...,...,...,...
ENSG00000252945,False,False,True,False,ENSG00000252945,snoU83B
ENSG00000252868,False,False,True,False,ENSG00000252868,snoZ278
ENSG00000201410,False,False,True,False,ENSG00000201410,snoZ40
ENSG00000266692,False,False,True,False,ENSG00000266692,snoZ6


In [51]:
del adata.var['gene_id']
del araw.var['gene_id']
del adata.var['gene_name']
del araw.var['gene_name']

#### **obs (Cell metadata)**

In [52]:
#view obs

In [53]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,Homo sapiens,normal,liver
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,Homo sapiens,normal,liver
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,Homo sapiens,normal,liver
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,Homo sapiens,normal,liver
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,Homo sapiens,normal,liver
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver


In [54]:
# view the column names in obs

In [55]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue'],
      dtype='object')

#### **assay_ontology_term_id**

In [56]:
list(adata.obs['assay'].unique())

["10x 3' v2", "10x 3' v3", 'CEL-seq2']

In [57]:
adata.obs['barcodes'] = adata.obs_names

In [58]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [59]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [60]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [61]:
adata.obs['assays'] = adata.obs['barcodes'].map(mapping)

In [62]:
list(adata.obs['assays'].unique())

['3pv2_5pv1_5pv2',
 '3pv2_5pv1_5pv2+multiome',
 '3pv2_5pv1_5pv2+3pv3',
 '3pv3',
 '3pv3+multiome',
 nan]

In [63]:
import pandas as pd
import scanpy as sc

# Suppose 'adata' is your AnnData object already loaded.

# Step 1: Check unique values in 'assay'
unique_assays = adata.obs['assay'].unique()
print("Unique values in 'assay':", unique_assays)

# Step 2: Display unique 'assays' for each unique 'assay'
for assay in unique_assays:
    unique_assays_for_group = adata.obs.loc[adata.obs['assay'] == assay, 'assays'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)


Unique values in 'assay': ['10x 3' v2', '10x 3' v3', 'CEL-seq2']
Categories (3, object): ['10x 3' v2', '10x 3' v3', 'CEL-seq2']
Unique 'assays' for assay 10x 3' v2: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+multiome' '3pv2_5pv1_5pv2+3pv3' nan]
Unique 'assays' for assay 10x 3' v3: ['3pv3' '3pv2_5pv1_5pv2+3pv3' '3pv3+multiome' nan]
Unique 'assays' for assay CEL-seq2: [nan]


In [64]:
mapping ={"10x 5' v2":'EFO:0009900', "10x 5' v1":'EFO:0011025', "10x 3' v3":'EFO:0009922', "10x 3' transcription profiling":'EFO:0009899','Smart-seq2':'EFO:0008931',"10x 3' v2":'EFO:0009899',"10x 5' transcription profiling":'EFO:0030004','CEL-seq2':'EFO:0010010'}

In [65]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay'].map(mapping)

In [66]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,Homo sapiens,normal,liver,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,Homo sapiens,normal,liver,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAAGAGAGGTA,3pv3,EFO:0009922
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAGTATCGAAA,3pv3,EFO:0009922
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCGATTCGATAACC,3pv3,EFO:0009922


In [67]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [68]:
# view adata.obs

In [69]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,Homo sapiens,normal,liver,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,Homo sapiens,normal,liver,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAAGAGAGGTA,3pv3,EFO:0009922
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAGTATCGAAA,3pv3,EFO:0009922
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCGATTCGATAACC,3pv3,EFO:0009922


#### **cell_type_ontology_term_id**

In [70]:
#identify the column in adata.obs related. to cell type annotation

In [71]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id'],
      dtype='object')

In [72]:
list(adata.obs['cell_type'].unique())

['cell',
 'endothelial cell of pericentral hepatic sinusoid',
 'midzonal region hepatocyte',
 'inflammatory macrophage',
 'centrilobular region hepatocyte',
 'periportal region hepatocyte',
 'endothelial cell of periportal hepatic sinusoid',
 'macrophage',
 'classical monocyte',
 'cholangiocyte',
 'effector memory CD8-positive, alpha-beta T cell',
 'hepatic stellate cell',
 'plasma cell',
 'CD16-negative, CD56-bright natural killer cell, human',
 'mucosal invariant T cell',
 'CD16-positive, CD56-dim natural killer cell, human',
 'B cell',
 'conventional dendritic cell',
 'non-classical monocyte',
 'natural killer cell',
 'neutrophil',
 'CD4-positive, alpha-beta T cell',
 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated',
 'dendritic cell, human',
 'plasmacytoid dendritic cell',
 'fibroblast']

In [73]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/cell_typist_annotation.csv')


In [74]:
mapping = df.set_index('Cell_type')['Cell_ontology_ID'].to_dict()

In [75]:
mapping={'central memory CD4-positive, alpha-beta T cell': 'CL:0000904',
 'CD16-positive, CD56-dim natural killer cell, human': 'CL:0000939',
 'effector memory CD4-positive, alpha-beta T cell': 'CL:0000905',
 'central memory CD8-positive, alpha-beta T cell': 'CL:0000907',
 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated': 'CL:0001062',
 'classical monocyte': 'CL:0000860',
 'class switched memory B cell': 'CL:0000972',
 'mucosal invariant T cell': 'CL:0000940',
 'naive B cell': 'CL:0000788',
 'effector memory CD8-positive, alpha-beta T cell': 'CL:0000913',
 'unswitched memory B cell': 'CL:0000970',
 'non-classical monocyte': 'CL:0000875',
 'CD16-negative, CD56-bright natural killer cell, human': 'CL:0000938',
 'gamma-delta T cell': 'CL:0000798',
 'regulatory T cell': 'CL:0000815',
 'conventional dendritic cell': 'CL:0000990',
 'plasma cell': 'CL:0000786',
 'memory B cell': 'CL:0000787',
 'effector memory CD4-positive, alpha-beta T cell, terminally differentiated': 'CL:0001087',
 'megakaryocyte': 'CL:0000556',
 'Cycling immune mix': 'CL:0000738',
 'plasmacytoid dendritic cell': 'CL:0000784',
 'plasmablast': 'CL:0000980',
 'hematopoietic multipotent progenitor cell': 'CL:0000837',
 'neutrophil progenitor cell': 'CL:0000834',
 'megakaryocyte progenitor cell': 'CL:0000553',
 'erythroid lineage cell': 'CL:0000764',
 'common myeloid progenitor': 'CL:0000049',
 'megakaryocyte-erythroid progenitor cell': 'CL:0000050',
 'early lymphoid progenitor': 'CL:0000936',
 'erythrocyte': 'CL:0000232',
 'granulocyte monocyte progenitor cell': 'CL:0000557',
 'promyelocyte': 'CL:0000836',
 'neutrophil': 'CL:0000775',
 'Cycling T&NK': 'CL:0000814',
 'common dendritic progenitor': 'CL:0001029',
 'thymocyte': 'CL:0000893',
 'T follicular helper cell': 'CL:0002038',
 'mast cell': 'CL:0000097',
 'pro-B cell': 'CL:0000826',
 'small pre-B-II cell': 'CL:0000954',
 'plasmacytoid dendritic cell, human': 'CL:0001058',
 'regular atrial cardiac myocyte': 'CL:0002129',
 'fat cell': 'CL:0000136',
 'fibroblast': 'CL:0000057',
 'vascular associated smooth muscle cell': 'CL:0000359',
 'myeloid cell': 'CL:0000763',
 'cardiac muscle cell': 'CL:0000746',
 'capillary endothelial cell': 'CL:0002144',
 'endothelial cell': 'CL:0000115',
 'smooth muscle cell': 'CL:0000192',
 'pericyte': 'CL:0000669',
 'endothelial cell of lymphatic vessel': 'CL:0002138',
 'endothelial cell of artery': 'CL:1000413',
 'vein endothelial cell': 'CL:0002543',
 'regular ventricular cardiac myocyte': 'CL:0002131',
 'lymphocyte': 'CL:0000542',
 'neuron': 'CL:0000540',
 'natural killer cell': 'CL:0000623',
 'mesothelial cell': 'CL:0000077',
 'hippocampal granule cell': 'CL:0001033',
 'hippocampal pyramidal neuron': 'CL:1001571',
 'astrocyte of the hippocampus': 'CL:0002604',
 'oligodendrocyte': 'CL:0000128',
 'hippocampal interneuron': 'CL:1001569',
 'mature microglial cell': 'CL:0002629',
 'oligodendrocyte precursor cell': 'CL:0002453',
 'smooth muscle cell of the brain vasculature': 'CL:0002590',
 'vascular leptomeningeal cell': 'CL:4023051',
 'macrophage': 'CL:0000235',
 'T cell': 'CL:0000084',
 'ependymal cell': 'CL:0000065',
 'transit amplifying cell': 'CL:0009010',
 'enterocyte': 'CL:0000584',
 'tuft cell of colon': 'CL:0009041',
 'goblet cell': 'CL:0000160',
 'BEST4+ intestinal epithelial cell, human': 'CL:4030026',
 'paneth cell': 'CL:0000510',
 'enterocyte of epithelium of large intestine': 'CL:0002071',
 'enteroendocrine cell': 'CL:0000164',
 'stem cell': 'CL:0000034',
 'stromal cell': 'CL:0000499',
 'myofibroblast cell': 'CL:0000186',
 'glial cell': 'CL:0000125',
 'colon macrophage': 'CL:0009008',
 'follicular B cell': 'CL:0000843',
 'innate lymphoid cell': 'CL:0001065',
 'T-helper 17 cell': 'CL:0000899',
 'germinal center B cell': 'CL:0000844',
 'monocyte': 'CL:0000576',
 'T-helper 1 cell': 'CL:0000545',
 'B cell': 'CL:0000236',
 'kidney distal convoluted tubule epithelial cell': 'CL:1000849',
 'epithelial cell of proximal tubule': 'CL:0002306',
 'kidney connecting tubule epithelial cell': 'CL:1000768',
 'kidney loop of Henle thick ascending limb epithelial cell': 'CL:1001106',
 'kidney collecting duct intercalated cell': 'CL:1001432',
 'peritubular capillary endothelial cell': 'CL:1001033',
 'kidney interstitial fibroblast': 'CL:1000692',
 'kidney collecting duct principal cell': 'CL:1001431',
 'glomerular capillary endothelial cell': 'CL:1001005',
 'podocyte': 'CL:0000653',
 'parietal epithelial cell': 'CL:1000452',
 'vasa recta cell': 'CL:1001036',
 'kidney arterial blood vessel cell': 'CL:1000891',
 'kidney inner medulla collecting duct epithelial cell': 'CL:1000547',
 'renal medullary fibroblast': 'CL:4030022',
 'kidney loop of Henle thin descending limb epithelial cell': 'CL:1001111',
 'mural cell': 'CL:0008034',
 'dendritic cell, human': 'CL:0001056',
 'kidney loop of Henle thin ascending limb epithelial cell': 'CL:1001107',
 'Immune mix': 'CL:0000738',
 'endothelial cell of pericentral hepatic sinusoid': 'CL:0019022',
 'midzonal region hepatocyte': 'CL:0019028',
 'inflammatory macrophage': 'CL:0000863',
 'centrilobular region hepatocyte': 'CL:0019029',
 'periportal region hepatocyte': 'CL:0019026',
 'endothelial cell of periportal hepatic sinusoid': 'CL:0019021',
 'cholangiocyte': 'CL:1000488',
 'hepatic stellate cell': 'CL:0000632',
 'CD4-positive, alpha-beta T cell': 'CL:0000624',
 'alveolar capillary type 2 endothelial cell': 'CL:4028003',
 'alveolar macrophage': 'CL:0000583',
 'smooth muscle cell of the pulmonary artery': 'CL:0002591',
 'club cell': 'CL:0000158',
 'type I pneumocyte': 'CL:0002062',
 'type II pneumocyte': 'CL:0002063',
 'basal cell': 'CL:0000646',
 'fibroblast of lung': 'CL:0002553',
 'lung ciliated cell': 'CL:1000271',
 'alveolar type 1 fibroblast cell': 'CL:4028004',
 'ionocyte': 'CL:0005006',
 'muscle cell': 'CL:0000187',
 'activated CD4-positive, alpha-beta T cell': 'CL:0000896',
 'pancreatic A cell': 'CL:0000171',
 'pancreatic D cell': 'CL:0000173',
 'type B pancreatic cell': 'CL:0000169',
 'pancreatic ductal cell': 'CL:0002079',
 'pancreatic PP cell': 'CL:0002275',
 'pancreatic stellate cell': 'CL:0002410',
 'pancreatic acinar cell': 'CL:0002064',
 'skeletal muscle satellite stem cell': 'CL:0008011',
 'cell of skeletal muscle': 'CL:0000188',
 'fast muscle cell': 'CL:0000190',
 'mesenchymal stem cell': 'CL:0000134',
 'slow muscle cell': 'CL:0000189',
 'Lymphoid/Macrophage': 'CL:0000867',
 'tendon cell': 'CL:0000388',
 'cell':'CL:0000738'
        }

In [76]:
# add the cell_type_ontology_term_id column

In [77]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type'].map(mapping)

In [78]:
# change datatype of the column

In [79]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [80]:
# view adata.obs

In [81]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,Homo sapiens,normal,liver,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,Homo sapiens,normal,liver,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057


#### **development_stage_ontology_term_id**

In [82]:
# identify the column in adata which corresponds to age

In [83]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id'],
      dtype='object')

In [84]:
list(adata.obs['development_stage'].unique())

['human adult stage',
 '44-year-old human stage',
 '65-year-old human stage',
 '41-year-old human stage',
 '21-year-old human stage',
 '26-year-old human stage',
 '77-year-old human stage',
 '52-year-old human stage',
 '45-year-old human stage',
 '69-year-old human stage',
 '64-year-old human stage',
 '55-year-old human stage',
 '70-year-old human stage',
 '34-year-old human stage',
 '53-year-old human stage',
 '72-year-old human stage',
 '46-year-old human stage',
 '73-year-old human stage',
 '75-year-old human stage',
 '66-year-old human stage',
 '28-year-old human stage',
 '49-year-old human stage',
 '67-year-old human stage',
 '58-year-old human stage',
 '68-year-old human stage']

In [85]:
age = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/age.csv')

In [86]:
age_dict = pd.Series(age['development_stage_ontology_term_id'].values, index=age['age']).to_dict()

In [87]:
age_dict

{'29-year-old human stage': 'HsapDv:0000123',
 '58-year-old human stage': 'HsapDv:0000152',
 '35-year-old human stage': 'HsapDv:0000129',
 '33-year-old human stage': 'HsapDv:0000127',
 '45-year-old human stage': 'HsapDv:0000139',
 '37-year-old human stage': 'HsapDv:0000131',
 '69-year-old human stage': 'HsapDv:0000163',
 '55-year-old human stage': 'HsapDv:0000149',
 '71-year-old human stage': 'HsapDv:0000165',
 '26-year-old human stage': 'HsapDv:0000120',
 '53-year-old human stage': 'HsapDv:0000147',
 '49-year-old human stage': 'HsapDv:0000143',
 '46-year-old human stage': 'HsapDv:0000140',
 '34-year-old human stage': 'HsapDv:0000128',
 '27-year-old human stage': 'HsapDv:0000121',
 '28-year-old human stage': 'HsapDv:0000122',
 '30-year-old human stage': 'HsapDv:0000124',
 'seventh decade human stage': 'HsapDv:0000241',
 'sixth decade human stage': 'HsapDv:0000240',
 '59-year-old human stage': 'HsapDv:0000153',
 '39-year-old human stage': 'HsapDv:0000133',
 '22-year-old human stage': 'H

In [88]:
donor_ids = ["A29", "390C", "A26 (386C)", "A26", "A32 (411C)", "A32", "A34 (417C)", 
             "417C", "356C", "A32 (411C)", "A26 (386C)", "284C", "368C", "296C", 
             "A33 (414C)", "A30 (398B)", "417c", "454C", "A32", "A37", "A40", "A44", 
             "A47", "640C", "390C", "A29", "390c", "302C", "302c", "390C", "390c","411C","A34","386C"]

# Convert `adata.obs['donor_id']` to a set for faster lookup
present_donors_set = set(adata.obs['donor_id'])

# Check which donors are present in `adata.obs['donor_id']`
present_donors = [donor for donor in donor_ids if donor in present_donors_set]

# Display the donors that are present
print("Donors present in adata.obs['donor_id']: ", present_donors)

Donors present in adata.obs['donor_id']:  []


In [89]:
donors_to_replace = ['A29', 'A26', 'A32', '417C','302c', '390c','417c','386C','390C','411C',]
if any(donor in adata.obs['donor_id'].values for donor in donors_to_replace):
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A29', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A26', 'A26 (386C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A32', 'A32 (411C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('417C', 'A34 (417C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('302c', '302C')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390c', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('417c', 'A34 (417C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390c', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390C', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('386C', 'A26 (386C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('411C', 'A32 (411C)')

In [90]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage'].map(age_dict)

In [91]:
update_dict = {
'D5' :'HsapDv:0000241',
'A29 (390C)' :'HsapDv:0000161',
'A37':'HsapDv:0000149',
'A34 (417C)':'HsapDv:0000094',
'356C':'HsapDv:0000239',
'A32 (411C)':'HsapDv:0000123',
'A26 (386C)':'HsapDv:0000169',
'284C':'HsapDv:0000240',
'368C':'HsapDv:0000240',
'296C':'HsapDv:0000238',
'A33 (414C)':'HsapDv:0000090',
'A30 (398B)':'HsapDv:0000090',
'417c':'HsapDv:0000094',
'A32':'HsapDv:0000123',
'A37':'HsapDv:0000153',
'A40':'HsapDv:0000158',
'A44':'HsapDv:0000160',
'A47':'HsapDv:0000152',
'640C':'HsapDv:0000242'}

# Update adata.obs['development_stage_ontology_term_id'] based on the update dictionary
adata.obs['development_stage_ontology_term_id'] = adata.obs.apply(
    lambda row: update_dict[row['donor_id']] if row['donor_id'] in update_dict else row['development_stage_ontology_term_id'], 
    axis=1
)

In [92]:
# change datatype of the column

In [93]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [94]:
# view unique values of development_stage_ontology_term_id column

In [95]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000087',
 'HsapDv:0000138',
 'HsapDv:0000159',
 'HsapDv:0000135',
 'HsapDv:0000115',
 'HsapDv:0000120',
 'HsapDv:0000171',
 'HsapDv:0000146',
 'HsapDv:0000139',
 'HsapDv:0000163',
 'HsapDv:0000158',
 'HsapDv:0000149',
 'HsapDv:0000164',
 'HsapDv:0000128',
 'HsapDv:0000147',
 'HsapDv:0000166',
 'HsapDv:0000140',
 'HsapDv:0000167',
 'HsapDv:0000169',
 'HsapDv:0000160',
 'HsapDv:0000122',
 'HsapDv:0000143',
 'HsapDv:0000161',
 'HsapDv:0000152',
 'HsapDv:0000162']

In [96]:
# view adata.obs

In [97]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,Homo sapiens,normal,liver,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022,HsapDv:0000087
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,Homo sapiens,normal,liver,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028,HsapDv:0000087
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162


#### **donor_id**

In [98]:
#identify the column in adata.obs which provides donor information

In [99]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [100]:
# add the donor_id column

In [101]:
adata.obs['donor_id'] = adata.obs['donor_id']

In [102]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [103]:
# change datatype of the column

In [104]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [105]:
# view unique values of donor_id column

In [106]:
list(adata.obs['donor_id'].unique())

['C41',
 'C58',
 'C70',
 'C72',
 'P1TLH',
 'P2TLH',
 'P3TLH',
 'P4TLH',
 'P5TLH',
 '325',
 '304',
 '308',
 '310',
 '315',
 'BP1',
 '301',
 '309',
 '311',
 'H02',
 'H04',
 'H06',
 'H07',
 'H10',
 'H11',
 'H13',
 'H14',
 'H16',
 'H18',
 'H21',
 'H22',
 'H23',
 'H25',
 'H33',
 'H30',
 'H37',
 'H38']

In [107]:
#view obs

In [108]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,Homo sapiens,normal,liver,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022,HsapDv:0000087
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,Homo sapiens,normal,liver,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028,HsapDv:0000087
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162


In [109]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [110]:
list(adata.obs['disease'].unique())

['normal']

In [111]:
adata.obs['disease_ontology_term_id']= ['PATO:0000461'] * len(adata.obs)

In [112]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,Homo sapiens,normal,liver,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,Homo sapiens,normal,liver,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461


In [113]:
# change datatype of the column

In [114]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [115]:
# view obs

In [116]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,Homo sapiens,normal,liver,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,Homo sapiens,normal,liver,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461


#### **is_primary_data**

In [117]:
#change data type of column

In [118]:
list(adata.obs['Dataset'].unique())

['Andrews et al. 2022',
 'MacParland et al. 2018',
 'Aizarani et al. 2019',
 'Guilliams et al. 2022']

In [119]:
mapping = {'Aizarani et al. 2019':False,
'Andrews et al. 2022':False,
'Guilliams et al. 2022':False,
'MacParland et al. 2018':False,
'HCA Kidney 2022':False,
'Lake et al. 2021':False,
'Muto et al. 2021':False,
'Stewart et al. 2019':False,
'Dominguez Conde et al. 2022':False,
'Elmentaite et al. 2021':False,
'James et al. 2020':False,
'Szabo et al. 2019':False,
'He et al. 2020':False,
'Micheli et al. 2020':True,
'Perez et al. 2022':True,
'Ren et al. 2021':False,
'Stephenson et al. 2021':False,
'Yoshida et al. 2021':False,
'Madissoon et al. 2020':False,
'Tabula Sapiens 2022':False,
'Ayhan et al. 2021':False,
'Franjic et al. 2022':True,
'Siletti et al. 2022':False,
'Tran et al. 2021':True,
'Burclaff et al. 2022':False,
'Smillie et al. 2019':False,
'Roy et al. 2021':True,
'Adams et al. 2020':False,
'Madissoon et al. 2022':False,
'Travaglini et al. 2020':False,
'Koenig et al. 2022':True,
'Kuppe et al. 2022':False,
'Litvinukova et al. 2020':False,
'Tucker et al. 2020':True,
'Fasolino et al. 2022':False,
'Muraro et al. 2016':False,
'Tosti et al. 2021':True,
'Tritschler et al. 2022':False}

In [120]:
adata.obs['is_primary_data']= adata.obs['Dataset'].map(mapping)

In [121]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [122]:
# view obs

In [123]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,Homo sapiens,normal,liver,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461,False
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,Homo sapiens,normal,liver,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461,False
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,Homo sapiens,normal,liver,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,Homo sapiens,normal,liver,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False


#### **organism_ontology_term_id**

In [124]:
# assign organism id 

In [125]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [126]:
#change data type of column

In [127]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [128]:
# view obs

In [129]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,...,normal,liver,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,...,normal,liver,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,...,normal,liver,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,...,normal,liver,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,...,normal,liver,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,normal,liver,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,normal,liver,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,normal,liver,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,normal,liver,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [130]:
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/previous_datasets/liver/metadata_liver.csv')
mapping = dict(zip(df['cells'], df['ethnicity']))

In [131]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs_names.map(mapping)

In [132]:
# change data type

In [133]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [134]:
# view obs

In [135]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,...,liver,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,...,liver,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,...,liver,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,...,liver,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,...,liver,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,liver,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,liver,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,liver,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,liver,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown


In [136]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

['unknown', nan]

In [137]:
adata.obs['self_reported_ethnicity_ontology_term_id'].fillna('unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['self_reported_ethnicity_ontology_term_id'].fillna('unknown', inplace=True)


In [138]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

#### **sex_ontology_term_id**

In [139]:
# identify the column in adata.obs which corresponds to sex

In [140]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [141]:
list(adata.obs['sex'].unique())

['female', 'male']

In [142]:
# list the unique values 

In [143]:
mapping= {'female': 'PATO:0000383', 'male': 'PATO:0000384', 'unknown':'unknown'}

In [144]:
# add sex_ontology_term_id column

In [145]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex'].map(mapping)

In [146]:
# change data type

In [147]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [148]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,...,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,...,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,...,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,...,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,...,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383


#### **suspension_type**

In [149]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,...,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,...,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,...,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,...,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,...,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383


In [150]:
list(adata.obs['suspension_type'].unique())

['cell', 'nucleus']

In [151]:
adata.obs['suspension_type'] = adata.obs['suspension_type']

In [152]:
# change data type of column

In [153]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [154]:
# view obs

In [155]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,...,AAACCTGAGCCTTGAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,...,AAACCTGAGGTCATCT,3pv2_5pv1_5pv2,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,...,AAACCTGAGTCGCCGT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,...,AAACCTGAGTGGAGTC,3pv2_5pv1_5pv2,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,...,AAACCTGAGTTAACGA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTACCAAGAGAGGTA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTACCAGTATCGAAA,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTCCTCGTGAGGCAT,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,TTTCGATTCGATAACC,3pv3,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383


#### **tissue_type**

In [156]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [157]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [158]:
# identify the column in adata.obs which corresponds to tissue

In [159]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type'],
      dtype='object')

In [160]:
list(adata.obs['tissue'].unique())

['liver']

In [161]:
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/previous_datasets/liver/metadata_liver.csv')
mapping = dict(zip(df['cells'], df['tissue_ontology_term_id']))

In [162]:
adata.obs['tissue_ontology_term_id'] = adata.obs_names.map(mapping)

In [163]:
# add 'tissue_ontology_term_id' column

In [164]:
# change data type of column

In [165]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [166]:
#list the unique values in 'tissue_ontology_term_id' column

In [167]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0001117', nan, 'UBERON:0002107']

In [168]:
adata.obs['tissue_ontology_term_id'].fillna('UBERON:0002107', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['tissue_ontology_term_id'].fillna('UBERON:0002107', inplace=True)


In [169]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0001117', 'UBERON:0002107']

In [170]:
# view obs

In [171]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,...,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,...,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,...,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,...,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,...,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107


In [172]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [173]:
# view obsm

In [174]:
# check whether all columns are prefixed with X

In [175]:
adata.obsm

AxisArrays with keys: X_umap

#### **uns (Dataset Metadata)**

In [176]:
# View

In [177]:
adata.uns

{'schema_version': '3.0.0', 'title': 'Adult human liver'}

In [178]:
adata.uns.keys

<function dict.keys>

In [179]:
# Give a title for the dataset

In [180]:
adata.uns['title'] = 'Liver'

In [181]:
# Set the default embedding

In [182]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [183]:
# view anndata object

In [184]:
adata

AnnData object with n_obs × n_vars = 259678 × 36368
    obs: 'Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type', 'assay', 'Original_annotation', 'CellHint_harmonised_group', 'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue', 'barcodes', 'assays', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'
    var: 'exist_in_Andrews2022', 'exist_in_MacParland2018', 'exist_in_Aizarani2019', 'exist_in_Guilliams2022', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'default_embedding'
    obsm: 'X_umap'
    layers: 'counts'

In [185]:
# view obs and var data types

In [186]:
adata.obs.dtypes

Dataset                                     category
donor_id                                    category
development_stage                           category
sex                                         category
suspension_type                             category
assay                                       category
Original_annotation                         category
CellHint_harmonised_group                   category
cell_type                                   category
Curated_annotation                          category
organism                                    category
disease                                     category
tissue                                      category
barcodes                                      object
assays                                        object
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    ca

In [187]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [188]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed barcodes from object to category
changed assays from object to category


In [189]:
# view obs

In [190]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,cell,Immune mix,...,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,cvLSECs,Group27,endothelial cell of pericentral hepatic sinusoid,Central venous LSECs,...,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,Bcells,Group1,cell,Immune mix,...,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,InterHep,Group24,midzonal region hepatocyte,Interzonal hepatocytes,...,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,human adult stage,female,cell,10x 3' v2,NKTcell,Group23,cell,Immune mix,...,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,68-year-old human stage,female,cell,10x 3' v3,Fibroblasts,Group35,fibroblast,Fibroblasts,...,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107


In [191]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [192]:
# delete unwanted columns in obs

In [193]:
del adata.obs['tissue']
del adata.obs['organism']
del adata.obs['disease']
del adata.obs['assay']
del adata.obs['sex']
del adata.obs['development_stage']
del adata.obs['assays']
del adata.obs['barcodes']
del adata.uns['schema_version']
del adata.obs['cell_type']

In [194]:
# view obs

In [195]:
adata.obs

Unnamed: 0,Dataset,donor_id,suspension_type,Original_annotation,CellHint_harmonised_group,Curated_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,cell,InterHep,Group24,Immune mix,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,cell,cvLSECs,Group27,Central venous LSECs,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,cell,Bcells,Group1,Immune mix,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,cell,InterHep,Group24,Interzonal hepatocytes,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,cell,NKTcell,Group23,Immune mix,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,cell,Fibroblasts,Group35,Fibroblasts,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,cell,Fibroblasts,Group35,Fibroblasts,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,cell,Fibroblasts,Group35,Fibroblasts,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,cell,Fibroblasts,Group35,Fibroblasts,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107


In [196]:
# view var

In [197]:
adata.var

Unnamed: 0_level_0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000278457,False,False,True,True,False
ENSG00000278294,False,False,True,False,False
ENSG00000121410,True,True,True,True,False
ENSG00000268895,True,True,True,True,False
ENSG00000148584,True,True,True,True,False
...,...,...,...,...,...
ENSG00000252945,False,False,True,False,False
ENSG00000252868,False,False,True,False,False
ENSG00000201410,False,False,True,False,False
ENSG00000266692,False,False,True,False,False


In [198]:
araw.var

Unnamed: 0_level_0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000278457,False,False,True,True
ENSG00000278294,False,False,True,False
ENSG00000121410,True,True,True,True
ENSG00000268895,True,True,True,True
ENSG00000148584,True,True,True,True
...,...,...,...,...
ENSG00000252945,False,False,True,False
ENSG00000252868,False,False,True,False
ENSG00000201410,False,False,True,False
ENSG00000266692,False,False,True,False


In [199]:
#view uns

In [200]:
adata.uns

{'title': 'Liver', 'default_embedding': 'X_umap'}

In [201]:
list(adata.uns.keys())

['title', 'default_embedding']

In [202]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'suspension_type', 'Original_annotation',
       'CellHint_harmonised_group', 'Curated_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [203]:
# Remove unwanted columns in uns

In [204]:
#check the format of expression matrix

In [205]:
adata.X

<259678x36368 sparse matrix of type '<class 'numpy.float32'>'
	with 347575650 stored elements in Compressed Sparse Row format>

In [206]:
araw.X

<259678x36368 sparse matrix of type '<class 'numpy.float32'>'
	with 347575650 stored elements in Compressed Sparse Row format>

In [207]:
#Copy raw counts to adata.raw

In [208]:
del adata.raw
del adata.layers['counts']

In [209]:
adata.raw = araw

In [210]:
obs_dtype = adata.obs.dtypes

In [211]:
obs_dtype

Dataset                                     category
donor_id                                    category
suspension_type                             category
Original_annotation                         category
CellHint_harmonised_group                   category
Curated_annotation                          category
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    category
is_primary_data                                 bool
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
tissue_type                                 category
tissue_ontology_term_id                     category
dtype: object

In [212]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Final_objects/to_upload/Liver.h5ad', compression = 'gzip')

In [213]:
adata.obs

Unnamed: 0,Dataset,donor_id,suspension_type,Original_annotation,CellHint_harmonised_group,Curated_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
C41_AAACCTGAGCCTTGAT,Andrews et al. 2022,C41,cell,InterHep,Group24,Immune mix,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGGTCATCT,Andrews et al. 2022,C41,cell,cvLSECs,Group27,Central venous LSECs,EFO:0009899,CL:0019022,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTCGCCGT,Andrews et al. 2022,C41,cell,Bcells,Group1,Immune mix,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTGGAGTC,Andrews et al. 2022,C41,cell,InterHep,Group24,Interzonal hepatocytes,EFO:0009899,CL:0019028,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
C41_AAACCTGAGTTAACGA,Andrews et al. 2022,C41,cell,NKTcell,Group23,Immune mix,EFO:0009899,CL:0000738,HsapDv:0000087,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0001117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,Guilliams et al. 2022,H38,cell,Fibroblasts,Group35,Fibroblasts,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTACCAGTATCGAAA-41,Guilliams et al. 2022,H38,cell,Fibroblasts,Group35,Fibroblasts,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTCCTCGTGAGGCAT-41,Guilliams et al. 2022,H38,cell,Fibroblasts,Group35,Fibroblasts,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107
TTTCGATTCGATAACC-41,Guilliams et al. 2022,H38,cell,Fibroblasts,Group35,Fibroblasts,EFO:0009922,CL:0000057,HsapDv:0000162,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000383,tissue,UBERON:0002107


In [214]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'suspension_type', 'Original_annotation',
       'CellHint_harmonised_group', 'Curated_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [215]:
adata.raw.var

Unnamed: 0_level_0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000278457,False,False,True,True
ENSG00000278294,False,False,True,False
ENSG00000121410,True,True,True,True
ENSG00000268895,True,True,True,True
ENSG00000148584,True,True,True,True
...,...,...,...,...
ENSG00000252945,False,False,True,False
ENSG00000252868,False,False,True,False
ENSG00000201410,False,False,True,False
ENSG00000266692,False,False,True,False


In [216]:
adata.var

Unnamed: 0_level_0,exist_in_Andrews2022,exist_in_MacParland2018,exist_in_Aizarani2019,exist_in_Guilliams2022,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000278457,False,False,True,True,False
ENSG00000278294,False,False,True,False,False
ENSG00000121410,True,True,True,True,False
ENSG00000268895,True,True,True,True,False
ENSG00000148584,True,True,True,True,False
...,...,...,...,...,...
ENSG00000252945,False,False,True,False,False
ENSG00000252868,False,False,True,False,False
ENSG00000201410,False,False,True,False,False
ENSG00000266692,False,False,True,False,False


In [217]:
adata.raw.X

<259678x36368 sparse matrix of type '<class 'numpy.float32'>'
	with 347575650 stored elements in Compressed Sparse Row format>