### **Curating Kidney.h5ad**

Article: Automatic cell-type harmonization and integration across Human Cell Atlas datasets

DOI: https://doi.org/10.1016/j.cell.2023.11.026

Data Source : https://www.celltypist.org/organs

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Data/Kidney.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 194504 × 43718
    obs: 'Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type', 'assay', 'Original_annotation', 'CellHint_harmonised_group', 'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue'
    var: 'exist_in_Lake2021', 'exist_in_Muto2021', 'exist_in_Stewart2019', 'exist_in_HCAkidney2022'
    uns: 'schema_version', 'title'
    obsm: 'X_umap'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<194504x43718 sparse matrix of type '<class 'numpy.float32'>'
	with 336676698 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 67)	2.9891067
  (0, 191)	2.9891067
  (0, 2162)	2.9891067
  (0, 2171)	3.6567655
  (0, 2231)	2.9891067
  (0, 2289)	2.9891067
  (0, 2372)	2.9891067
  (0, 2495)	2.9891067
  (0, 2508)	2.9891067
  (0, 2510)	2.9891067
  (0, 2544)	2.9891067
  (0, 2641)	2.9891067
  (0, 2652)	2.9891067
  (0, 2709)	2.9891067
  (0, 2747)	2.9891067
  (0, 2806)	2.9891067
  (0, 2816)	2.9891067
  (0, 2821)	3.6567655
  (0, 2822)	2.9891067
  (0, 2906)	2.9891067
  (0, 2948)	2.9891067
  (0, 3235)	2.9891067
  (0, 3405)	2.9891067
  (0, 3411)	2.9891067
  (0, 3429)	3.6567655
  :	:
  (194503, 42899)	1.5041163
  (194503, 42938)	1.5041163
  (194503, 42950)	1.5041163
  (194503, 42997)	1.5041163
  (194503, 43029)	1.5041163
  (194503, 43101)	1.5041163
  (194503, 43114)	1.5041163
  (194503, 43117)	1.5041163
  (194503, 43270)	1.5041163
  (194503, 43290)	1.5041163
  (194503, 43295)	1.5041163
  (194503, 43312)	1.5041163
  (194503, 43330)	1.5041163
  (194503, 43396)	1.5041163
  (194503, 43420)	1.5041163
  (194503, 43422)	1.5041163

##### **Raw counts matrix**

In [11]:
print(adata.raw.X)

  (0, 67)	1.0
  (0, 191)	1.0
  (0, 2162)	1.0
  (0, 2171)	2.0
  (0, 2231)	1.0
  (0, 2289)	1.0
  (0, 2372)	1.0
  (0, 2495)	1.0
  (0, 2508)	1.0
  (0, 2510)	1.0
  (0, 2544)	1.0
  (0, 2641)	1.0
  (0, 2652)	1.0
  (0, 2709)	1.0
  (0, 2747)	1.0
  (0, 2806)	1.0
  (0, 2816)	1.0
  (0, 2821)	2.0
  (0, 2822)	1.0
  (0, 2906)	1.0
  (0, 2948)	1.0
  (0, 3235)	1.0
  (0, 3405)	1.0
  (0, 3411)	1.0
  (0, 3429)	2.0
  :	:
  (194503, 42899)	1.0
  (194503, 42938)	1.0
  (194503, 42950)	1.0
  (194503, 42997)	1.0
  (194503, 43029)	1.0
  (194503, 43101)	1.0
  (194503, 43114)	1.0
  (194503, 43117)	1.0
  (194503, 43270)	1.0
  (194503, 43290)	1.0
  (194503, 43295)	1.0
  (194503, 43312)	1.0
  (194503, 43330)	1.0
  (194503, 43396)	1.0
  (194503, 43420)	1.0
  (194503, 43422)	1.0
  (194503, 43465)	1.0
  (194503, 43467)	1.0
  (194503, 43482)	1.0
  (194503, 43521)	1.0
  (194503, 43625)	1.0
  (194503, 43626)	1.0
  (194503, 43665)	1.0
  (194503, 43684)	2.0
  (194503, 43705)	2.0


In [12]:
adata.raw.var

Unnamed: 0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022
A1BG,True,True,True,True
A1BG-AS1,True,True,True,True
A1CF,True,True,True,True
A2M,True,True,True,True
A2M-AS1,True,True,True,True
...,...,...,...,...
bP-2189O9.2,True,False,False,False
bP-2189O9.3,False,False,True,False
bP-2189O9.5,False,False,False,True
hsa-mir-1253,False,True,True,True


In [13]:
araw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)

##### **Variables(var)**

In [14]:
# View the var of anndata and raw object

In [15]:
adata.var

Unnamed: 0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022
A1BG,True,True,True,True
A1BG-AS1,True,True,True,True
A1CF,True,True,True,True
A2M,True,True,True,True
A2M-AS1,True,True,True,True
...,...,...,...,...
bP-2189O9.2,True,False,False,False
bP-2189O9.3,False,False,True,False
bP-2189O9.5,False,False,False,True
hsa-mir-1253,False,True,True,True


In [16]:
ensembl_data = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/symbol2ID.csv')

In [17]:
ensembl_dict = dict(zip(ensembl_data['gene_symbol'], ensembl_data['gene_id']))

In [18]:
ensembl_dict

{'MT-TF': 'ENSG00000210049',
 'MT-RNR1': 'ENSG00000211459',
 'MT-TV': 'ENSG00000210077',
 'MT-RNR2': 'ENSG00000210082',
 'MT-TL1': 'ENSG00000209082',
 'MT-ND1': 'ENSG00000198888',
 'MT-TI': 'ENSG00000210100',
 'MT-TQ': 'ENSG00000210107',
 'MT-TM': 'ENSG00000210112',
 'MT-ND2': 'ENSG00000198763',
 'MT-TW': 'ENSG00000210117',
 'MT-TA': 'ENSG00000210127',
 'MT-TN': 'ENSG00000210135',
 'MT-TC': 'ENSG00000210140',
 'MT-TY': 'ENSG00000210144',
 'MT-CO1': 'ENSG00000198804',
 'MT-TS1': 'ENSG00000210151',
 'MT-TD': 'ENSG00000210154',
 'MT-CO2': 'ENSG00000198712',
 'MT-TK': 'ENSG00000210156',
 'MT-ATP8': 'ENSG00000228253',
 'MT-ATP6': 'ENSG00000198899',
 'MT-CO3': 'ENSG00000198938',
 'MT-TG': 'ENSG00000210164',
 'MT-ND3': 'ENSG00000198840',
 'MT-TR': 'ENSG00000210174',
 'MT-ND4L': 'ENSG00000212907',
 'MT-ND4': 'ENSG00000198886',
 'MT-TH': 'ENSG00000210176',
 'MT-TS2': 'ENSG00000210184',
 'MT-TL2': 'ENSG00000210191',
 'MT-ND5': 'ENSG00000198786',
 'MT-ND6': 'ENSG00000198695',
 'MT-TE': 'ENSG00000

In [19]:
adata.var['gene_id'] = adata.var_names.map(ensembl_dict)
araw.var['gene_id'] = araw.var_names.map(ensembl_dict)


In [20]:
adata.var

Unnamed: 0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022,gene_id
A1BG,True,True,True,True,ENSG00000121410
A1BG-AS1,True,True,True,True,ENSG00000268895
A1CF,True,True,True,True,ENSG00000148584
A2M,True,True,True,True,ENSG00000175899
A2M-AS1,True,True,True,True,ENSG00000245105
...,...,...,...,...,...
bP-2189O9.2,True,False,False,False,
bP-2189O9.3,False,False,True,False,
bP-2189O9.5,False,False,False,True,
hsa-mir-1253,False,True,True,True,ENSG00000272920


In [21]:
araw.var

Unnamed: 0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022,gene_id
A1BG,True,True,True,True,ENSG00000121410
A1BG-AS1,True,True,True,True,ENSG00000268895
A1CF,True,True,True,True,ENSG00000148584
A2M,True,True,True,True,ENSG00000175899
A2M-AS1,True,True,True,True,ENSG00000245105
...,...,...,...,...,...
bP-2189O9.2,True,False,False,False,
bP-2189O9.3,False,False,True,False,
bP-2189O9.5,False,False,False,True,
hsa-mir-1253,False,True,True,True,ENSG00000272920


In [22]:
nan_count = adata.var['gene_id'].isna().sum()
print("Number of NaN values in adata.obs['gene_id']: ", nan_count)

Number of NaN values in adata.obs['gene_id']:  4951


In [23]:
adata.var['gene_name'] = adata.var.index
araw.var['gene_name'] = araw.var.index

In [24]:
adata.var.index = adata.var['gene_id'] 
araw.var.index = araw.var['gene_id']

In [25]:
# Load the approved genes file.

In [26]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [27]:
#Create a dictionary from the approved genes file 

In [28]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [29]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [30]:
len(genedict)

119799

In [31]:
#Filter out the genes which are not in the approved genes file.

In [32]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [33]:
len(var_to_keep_adata)

35931

In [34]:
len(var_to_keep_araw)

35931

In [35]:
adata.var

Unnamed: 0_level_0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
,True,False,False,False,,bP-2189O9.2
,False,False,True,False,,bP-2189O9.3
,False,False,False,True,,bP-2189O9.5
ENSG00000272920,False,True,True,True,ENSG00000272920,hsa-mir-1253


In [36]:
araw.var

Unnamed: 0_level_0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
,True,False,False,False,,bP-2189O9.2
,False,False,True,False,,bP-2189O9.3
,False,False,False,True,,bP-2189O9.5
ENSG00000272920,False,True,True,True,ENSG00000272920,hsa-mir-1253


In [37]:
# Modify the anndata object by filtering out the filtered genes.

In [38]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [39]:
adata.var

Unnamed: 0_level_0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,True,True,True,True,ENSG00000036549,ZZZ3
ENSG00000270533,True,False,False,False,ENSG00000270533,bP-21201H5.1
ENSG00000272920,False,True,True,True,ENSG00000272920,hsa-mir-1253


In [40]:
# View var

In [41]:
araw.var

Unnamed: 0_level_0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,True,True,True,True,ENSG00000036549,ZZZ3
ENSG00000270533,True,False,False,False,ENSG00000270533,bP-21201H5.1
ENSG00000272920,False,True,True,True,ENSG00000272920,hsa-mir-1253


In [42]:
import scanpy as sc

# Assuming 'adata' is your AnnData object

# Identify duplicate variable names
duplicate_var_mask = adata.var_names.duplicated(keep='first')

# Invert the mask to keep unique variable names
unique_var_mask = ~duplicate_var_mask

# Update AnnData object with unique variable names and data
adata = adata[:, unique_var_mask].copy()

# Ensure variable names are unique
print("All variable names are unique:", adata.var_names.is_unique)

All variable names are unique: True


In [43]:
import scanpy as sc

# Assuming 'adata' is your AnnData object

# Identify duplicate variable names
duplicate_var_mask = araw.var_names.duplicated(keep='first')

# Invert the mask to keep unique variable names
unique_var_mask = ~duplicate_var_mask

# Update AnnData object with unique variable names and data
araw = araw[:, unique_var_mask].copy()

# Ensure variable names are unique
print("All variable names are unique:", araw.var_names.is_unique)

All variable names are unique: True


feature is filtered

In [44]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [45]:
adata.var

Unnamed: 0_level_0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022,gene_id,gene_name,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG,False
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1,False
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF,False
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M,False
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1,False
...,...,...,...,...,...,...,...
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1,False
ENSG00000036549,True,True,True,True,ENSG00000036549,ZZZ3,False
ENSG00000270533,True,False,False,False,ENSG00000270533,bP-21201H5.1,False
ENSG00000272920,False,True,True,True,ENSG00000272920,hsa-mir-1253,False


In [46]:
araw.var

Unnamed: 0_level_0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,True,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000074755,True,True,True,True,ENSG00000074755,ZZEF1
ENSG00000036549,True,True,True,True,ENSG00000036549,ZZZ3
ENSG00000270533,True,False,False,False,ENSG00000270533,bP-21201H5.1
ENSG00000272920,False,True,True,True,ENSG00000272920,hsa-mir-1253


In [47]:
del adata.var['gene_id']
del araw.var['gene_id']
del adata.var['gene_name']
del araw.var['gene_name']

#### **obs (Cell metadata)**

In [48]:
#view obs

In [49]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,Homo sapiens,normal,kidney
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,Homo sapiens,normal,kidney
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,Homo sapiens,normal,kidney
...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,Homo sapiens,normal,kidney
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,Homo sapiens,normal,kidney
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,Homo sapiens,normal,kidney
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,Homo sapiens,normal,kidney


In [50]:
# view the column names in obs

In [51]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue'],
      dtype='object')

#### **assay_ontology_term_id**

In [52]:
list(adata.obs['assay'].unique())

["10x 3' v3", "10x 5' v1", "10x 3' v2"]

In [53]:
adata.obs['barcodes'] = adata.obs_names

In [54]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [55]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [56]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [57]:
adata.obs['assays'] = adata.obs['barcodes'].map(mapping)

In [58]:
list(adata.obs['assays'].unique())

['3pv3',
 '3pv2_5pv1_5pv2+3pv3',
 '3pv3+multiome',
 nan,
 '3pv2_5pv1_5pv2',
 '3pv2_5pv1_5pv2+multiome']

In [59]:
import pandas as pd
import scanpy as sc

# Suppose 'adata' is your AnnData object already loaded.

# Step 1: Check unique values in 'assay'
unique_assays = adata.obs['assay'].unique()
print("Unique values in 'assay':", unique_assays)

# Step 2: Display unique 'assays' for each unique 'assay'
for assay in unique_assays:
    unique_assays_for_group = adata.obs.loc[adata.obs['assay'] == assay, 'assays'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)


Unique values in 'assay': ['10x 3' v3', '10x 5' v1', '10x 3' v2']
Categories (3, object): ['10x 3' v2', '10x 3' v3', '10x 5' v1']
Unique 'assays' for assay 10x 3' v3: ['3pv3' '3pv2_5pv1_5pv2+3pv3' '3pv3+multiome' nan]
Unique 'assays' for assay 10x 5' v1: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay 10x 3' v2: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]


In [60]:
mapping ={"10x 5' v2":'EFO:0009900', "10x 5' v1":'EFO:0011025', "10x 3' v3":'EFO:0009922', "10x 3' transcription profiling":'EFO:0009899',"10x 3' v1":'EFO:0009901',"10x 3' v2":'EFO:0009899'}

In [61]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay'].map(mapping)

In [62]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,Homo sapiens,normal,kidney,AAACCCAAGTAGTCAA,3pv3,EFO:0009922
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,Homo sapiens,normal,kidney,AAACCCACAACTGAAA,3pv3,EFO:0009922
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACACGTTGGC,3pv3,EFO:0009922
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACATTGCTTT,3pv3,EFO:0009922
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,Homo sapiens,normal,kidney,AAACCCATCACTTATC,3pv3,EFO:0009922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,Homo sapiens,normal,kidney,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,Homo sapiens,normal,kidney,TTTGGTTGTAGTCACT,3pv3,EFO:0009922
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,Homo sapiens,normal,kidney,TTTGGTTGTCGGAACA,3pv3,EFO:0009922
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,Homo sapiens,normal,kidney,TTTGTTGTCACAGAGG,3pv3,EFO:0009922


In [63]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [64]:
# view adata.obs

In [65]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,Homo sapiens,normal,kidney,AAACCCAAGTAGTCAA,3pv3,EFO:0009922
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,Homo sapiens,normal,kidney,AAACCCACAACTGAAA,3pv3,EFO:0009922
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACACGTTGGC,3pv3,EFO:0009922
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACATTGCTTT,3pv3,EFO:0009922
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,Homo sapiens,normal,kidney,AAACCCATCACTTATC,3pv3,EFO:0009922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,Homo sapiens,normal,kidney,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,Homo sapiens,normal,kidney,TTTGGTTGTAGTCACT,3pv3,EFO:0009922
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,Homo sapiens,normal,kidney,TTTGGTTGTCGGAACA,3pv3,EFO:0009922
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,Homo sapiens,normal,kidney,TTTGTTGTCACAGAGG,3pv3,EFO:0009922


#### **cell_type_ontology_term_id**

In [66]:
#identify the column in adata.obs related. to cell type annotation

In [67]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id'],
      dtype='object')

In [68]:
list(adata.obs['cell_type'].unique())

['myeloid cell',
 'kidney distal convoluted tubule epithelial cell',
 'epithelial cell of proximal tubule',
 'lymphocyte',
 'kidney connecting tubule epithelial cell',
 'kidney loop of Henle thick ascending limb epithelial cell',
 'kidney collecting duct intercalated cell',
 'peritubular capillary endothelial cell',
 'kidney interstitial fibroblast',
 'kidney collecting duct principal cell',
 'glomerular capillary endothelial cell',
 'podocyte',
 'parietal epithelial cell',
 'vasa recta cell',
 'kidney arterial blood vessel cell',
 'kidney inner medulla collecting duct epithelial cell',
 'endothelial cell of lymphatic vessel',
 'renal medullary fibroblast',
 'kidney loop of Henle thin descending limb epithelial cell',
 'mural cell',
 'dendritic cell, human',
 'vascular associated smooth muscle cell',
 'kidney loop of Henle thin ascending limb epithelial cell',
 'conventional dendritic cell',
 'natural killer cell',
 'B cell',
 'T cell',
 'classical monocyte',
 'non-classical monocyte']

In [69]:

df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/cell_typist_annotation.csv')


In [70]:
mapping = df.set_index('Cell_type')['Cell_ontology_ID'].to_dict()

In [71]:
mapping

{'central memory CD4-positive, alpha-beta T cell': 'CL:0000904',
 'CD16-positive, CD56-dim natural killer cell, human': 'CL:0000939',
 'effector memory CD4-positive, alpha-beta T cell': 'CL:0000905',
 'central memory CD8-positive, alpha-beta T cell': 'CL:0000907',
 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated': 'CL:0001062',
 'classical monocyte': 'CL:0000860',
 'class switched memory B cell': 'CL:0000972',
 'mucosal invariant T cell': 'CL:0000940',
 'naive B cell': 'CL:0000788',
 'effector memory CD8-positive, alpha-beta T cell': 'CL:0000913',
 'unswitched memory B cell': 'CL:0000970',
 'non-classical monocyte': 'CL:0000875',
 'CD16-negative, CD56-bright natural killer cell, human': 'CL:0000938',
 'gamma-delta T cell': 'CL:0000798',
 'regulatory T cell': 'CL:0000815',
 'conventional dendritic cell': 'CL:0000990',
 'plasma cell': 'CL:0000786',
 'memory B cell': 'CL:0000787',
 'effector memory CD4-positive, alpha-beta T cell, terminally differentiated': 'C

In [72]:
# add the cell_type_ontology_term_id column

In [73]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type'].map(mapping)

In [74]:
# change datatype of the column

In [75]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [76]:
# view adata.obs

In [77]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,Homo sapiens,normal,kidney,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,Homo sapiens,normal,kidney,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,Homo sapiens,normal,kidney,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,Homo sapiens,normal,kidney,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,Homo sapiens,normal,kidney,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,Homo sapiens,normal,kidney,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,Homo sapiens,normal,kidney,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306


#### **development_stage_ontology_term_id**

In [78]:
# identify the column in adata which corresponds to age

In [79]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id'],
      dtype='object')

In [80]:
list(adata.obs['development_stage'].unique())

['eighth decade human stage',
 'seventh decade human stage',
 'sixth decade human stage',
 'fifth decade human stage',
 'third decade human stage',
 'fourth decade human stage',
 '54-year-old human stage',
 '62-year-old human stage',
 '61-year-old human stage',
 '50-year-old human stage',
 '52-year-old human stage',
 '72-year-old human stage',
 '67-year-old human stage',
 '63-year-old human stage',
 '49-year-old human stage',
 '53-year-old human stage',
 '64-year-old human stage',
 '44-year-old human stage',
 '70-year-old human stage',
 '69-year-old human stage',
 '59-year-old human stage',
 '68-year-old human stage',
 '74-year-old human stage',
 '60-year-old human stage',
 '71-year-old human stage',
 '46-year-old human stage',
 '43-year-old human stage',
 '58-year-old human stage',
 '57-year-old human stage']

In [81]:
age = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/age.csv')

In [82]:
age_dict = pd.Series(age['development_stage_ontology_term_id'].values, index=age['age']).to_dict()

In [83]:
age_dict

{'29-year-old human stage': 'HsapDv:0000123',
 '58-year-old human stage': 'HsapDv:0000152',
 '35-year-old human stage': 'HsapDv:0000129',
 '33-year-old human stage': 'HsapDv:0000127',
 '45-year-old human stage': 'HsapDv:0000139',
 '37-year-old human stage': 'HsapDv:0000131',
 '69-year-old human stage': 'HsapDv:0000163',
 '55-year-old human stage': 'HsapDv:0000149',
 '71-year-old human stage': 'HsapDv:0000165',
 '26-year-old human stage': 'HsapDv:0000120',
 '53-year-old human stage': 'HsapDv:0000147',
 '49-year-old human stage': 'HsapDv:0000143',
 '46-year-old human stage': 'HsapDv:0000140',
 '34-year-old human stage': 'HsapDv:0000128',
 '27-year-old human stage': 'HsapDv:0000121',
 '28-year-old human stage': 'HsapDv:0000122',
 '30-year-old human stage': 'HsapDv:0000124',
 'seventh decade human stage': 'HsapDv:0000241',
 'sixth decade human stage': 'HsapDv:0000240',
 '59-year-old human stage': 'HsapDv:0000153',
 '39-year-old human stage': 'HsapDv:0000133',
 '22-year-old human stage': 'H

In [84]:
donor_ids = ["A29", "390C", "A26 (386C)", "A26", "A32 (411C)", "A32", "A34 (417C)", 
             "417C", "356C", "A32 (411C)", "A26 (386C)", "284C", "368C", "296C", 
             "A33 (414C)", "A30 (398B)", "417c", "454C", "A32", "A37", "A40", "A44", 
             "A47", "640C", "390C", "A29", "390c", "302C", "302c", "390C", "390c","411C","A34","386C"]

# Convert `adata.obs['donor_id']` to a set for faster lookup
present_donors_set = set(adata.obs['donor_id'])

# Check which donors are present in `adata.obs['donor_id']`
present_donors = [donor for donor in donor_ids if donor in present_donors_set]

# Display the donors that are present
print("Donors present in adata.obs['donor_id']: ", present_donors)

Donors present in adata.obs['donor_id']:  []


In [85]:
donors_to_replace = ['A29', 'A26', 'A32', '417C','302c', '390c','417c','386C','390C','411C',]
if any(donor in adata.obs['donor_id'].values for donor in donors_to_replace):
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A29', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A26', 'A26 (386C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A32', 'A32 (411C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('417C', 'A34 (417C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('302c', '302C')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390c', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('417c', 'A34 (417C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390c', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390C', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('386C', 'A26 (386C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('411C', 'A32 (411C)')

In [86]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage'].map(age_dict)

In [87]:
update_dict = {
'D5' :'HsapDv:0000241',
'A29 (390C)' :'HsapDv:0000161',
'A37':'HsapDv:0000149',
'A34 (417C)':'HsapDv:0000094',
'356C':'HsapDv:0000239',
'A32 (411C)':'HsapDv:0000123',
'A26 (386C)':'HsapDv:0000169',
'284C':'HsapDv:0000240',
'368C':'HsapDv:0000240',
'296C':'HsapDv:0000238',
'A33 (414C)':'HsapDv:0000090',
'A30 (398B)':'HsapDv:0000090',
'417c':'HsapDv:0000094',
'A32':'HsapDv:0000123',
'A37':'HsapDv:0000153',
'A40':'HsapDv:0000158',
'A44':'HsapDv:0000160',
'A47':'HsapDv:0000152',
'640C':'HsapDv:0000242'}

# Update adata.obs['development_stage_ontology_term_id'] based on the update dictionary
adata.obs['development_stage_ontology_term_id'] = adata.obs.apply(
    lambda row: update_dict[row['donor_id']] if row['donor_id'] in update_dict else row['development_stage_ontology_term_id'], 
    axis=1
)

In [88]:
# change datatype of the column

In [89]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [90]:
# view unique values of development_stage_ontology_term_id column

In [91]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000242',
 'HsapDv:0000241',
 'HsapDv:0000240',
 'HsapDv:0000239',
 'HsapDv:0000237',
 'HsapDv:0000238',
 'HsapDv:0000148',
 'HsapDv:0000156',
 'HsapDv:0000155',
 'HsapDv:0000144',
 'HsapDv:0000146',
 'HsapDv:0000166',
 'HsapDv:0000161',
 'HsapDv:0000157',
 'HsapDv:0000143',
 'HsapDv:0000147',
 'HsapDv:0000158',
 'HsapDv:0000138',
 'HsapDv:0000164',
 'HsapDv:0000163',
 'HsapDv:0000153',
 'HsapDv:0000162',
 'HsapDv:0000168',
 'HsapDv:0000154',
 'HsapDv:0000165',
 'HsapDv:0000140',
 'HsapDv:0000137',
 'HsapDv:0000152',
 'HsapDv:0000151']

In [92]:
# view adata.obs

In [93]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,Homo sapiens,normal,kidney,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763,HsapDv:0000242
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,Homo sapiens,normal,kidney,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849,HsapDv:0000242
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,Homo sapiens,normal,kidney,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,Homo sapiens,normal,kidney,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432,HsapDv:0000151
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,Homo sapiens,normal,kidney,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431,HsapDv:0000151
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,Homo sapiens,normal,kidney,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005,HsapDv:0000151
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,Homo sapiens,normal,kidney,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306,HsapDv:0000151


#### **donor_id**

In [94]:
#identify the column in adata.obs which provides donor information

In [95]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [96]:
# add the donor_id column

In [97]:
adata.obs['donor_id'] = adata.obs['donor_id']

In [98]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [99]:
# change datatype of the column

In [100]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [101]:
# view unique values of donor_id column

In [102]:
list(adata.obs['donor_id'].unique())

['3535',
 '18-162',
 '18-142',
 '18-312',
 '3499',
 '3504',
 '3593',
 '3613',
 'KRP446',
 'KRP460',
 'KRP461',
 'KRP462',
 'PRE018-1',
 'PRE019',
 'PRE027',
 'PRE038',
 'PRE055-1',
 'PRE062-1',
 'PRE98sc',
 'Sample1153-EO1',
 'Sample1153-EO2',
 'Sample1153-EO3',
 'Sample1157-EO1',
 'Sample1157-EO2',
 'Sample1157-EO3',
 'Sample1158-EO1',
 'Sample1158-EO2',
 'Sample1158-EO3',
 'Sample1162-EO1',
 'Sample1162-EO2',
 'control_1',
 'control_2',
 'control_3',
 'healthy_4',
 'healthy_5',
 'TxK4',
 'RCC1',
 'RCC2',
 'RCC3',
 'VHL_RCC',
 'TxK1',
 'TxK3',
 'TxK2',
 'PapRCC',
 '57-ref',
 '56-ref',
 'HCA_55_ref',
 'HCA_54_ref',
 'HCA_51_ref',
 'HCA_29_ref',
 'PRECISE_023_N',
 '20-688_N',
 '20-687_N',
 '20-668_N',
 '20-649_N',
 'PRECISE165536-000-003N',
 'PRECISE-943776',
 'PRECISE-945420',
 '21_020',
 '21_019',
 '21_016',
 '21_015']

In [103]:
#view obs

In [104]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,Homo sapiens,normal,kidney,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763,HsapDv:0000242
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,Homo sapiens,normal,kidney,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849,HsapDv:0000242
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,Homo sapiens,normal,kidney,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,Homo sapiens,normal,kidney,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432,HsapDv:0000151
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,Homo sapiens,normal,kidney,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431,HsapDv:0000151
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,Homo sapiens,normal,kidney,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005,HsapDv:0000151
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,Homo sapiens,normal,kidney,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306,HsapDv:0000151


In [105]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [106]:
list(adata.obs['disease'].unique())

['normal']

In [107]:
adata.obs['disease_ontology_term_id']= ['PATO:0000461'] * len(adata.obs)

In [108]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,Homo sapiens,normal,kidney,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,Homo sapiens,normal,kidney,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,Homo sapiens,normal,kidney,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,Homo sapiens,normal,kidney,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,Homo sapiens,normal,kidney,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,Homo sapiens,normal,kidney,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,Homo sapiens,normal,kidney,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461


In [109]:
# change datatype of the column

In [110]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [111]:
# view obs

In [112]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,Homo sapiens,normal,kidney,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,Homo sapiens,normal,kidney,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,Homo sapiens,normal,kidney,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,Homo sapiens,normal,kidney,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,Homo sapiens,normal,kidney,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,Homo sapiens,normal,kidney,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,Homo sapiens,normal,kidney,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461


#### **is_primary_data**

In [113]:
#change data type of column

In [114]:
list(adata.obs['Dataset'].unique())

['Lake et al. 2021',
 'Muto et al. 2021',
 'Stewart et al. 2019',
 'HCA Kidney 2022']

In [115]:
mapping = {'Aizarani et al. 2019':False,
'Andrews et al. 2022':False,
'Guilliams et al. 2022':False,
'MacParland et al. 2018':False,
'HCA Kidney 2022':False,
'Lake et al. 2021':False,
'Muto et al. 2021':False,
'Stewart et al. 2019':False,
'Dominguez Conde et al. 2022':False,
'Elmentaite et al. 2021':False,
'James et al. 2020':False,
'Szabo et al. 2019':False,
'He et al. 2020':False,
'Micheli et al. 2020':True,
'Perez et al. 2022':True,
'Ren et al. 2021':False,
'Stephenson et al. 2021':False,
'Yoshida et al. 2021':False,
'Madissoon et al. 2020':False,
'Tabula Sapiens 2022':False,
'Ayhan et al. 2021':False,
'Franjic et al. 2022':True,
'Siletti et al. 2022':False,
'Tran et al. 2021':True,
'Burclaff et al. 2022':False,
'Smillie et al. 2019':False,
'Roy et al. 2021':True,
'Adams et al. 2020':False,
'Madissoon et al. 2022':False,
'Travaglini et al. 2020':False,
'Koenig et al. 2022':True,
'Kuppe et al. 2022':False,
'Litvinukova et al. 2020':False,
'Tucker et al. 2020':True,
'Fasolino et al. 2022':False,
'Muraro et al. 2016':False,
'Tosti et al. 2021':True,
'Tritschler et al. 2022':False}

In [116]:
adata.obs['is_primary_data']= adata.obs['Dataset'].map(mapping)

In [117]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [118]:
# view obs

In [119]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,Homo sapiens,normal,kidney,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461,False
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,Homo sapiens,normal,kidney,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461,False
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,Homo sapiens,normal,kidney,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,Homo sapiens,normal,kidney,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,Homo sapiens,normal,kidney,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461,False
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,Homo sapiens,normal,kidney,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461,False
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,Homo sapiens,normal,kidney,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461,False
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,Homo sapiens,normal,kidney,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461,False


#### **organism_ontology_term_id**

In [120]:
# assign organism id 

In [121]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [122]:
#change data type of column

In [123]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [124]:
# view obs

In [125]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,...,normal,kidney,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,...,normal,kidney,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,normal,kidney,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,normal,kidney,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,...,normal,kidney,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,...,normal,kidney,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,...,normal,kidney,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,...,normal,kidney,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,...,normal,kidney,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [126]:
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/previous_datasets/kidney/metadata_kidney.csv')
mapping = dict(zip(df['cells'], df['ethnicity']))
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs_names.map(mapping)

In [127]:
df

Unnamed: 0.1,Unnamed: 0,Donor,Study,cells,tissue_ontology_term_id,tissue,ethnicity,age
0,0,control_3,muto_atac.h5ad,AAACGAAAGCGTAGCA-3,UBERON:0001225,cortex of kidney,HANCESTRO:0005,HsapDv:0000155
1,1,control_3,muto_atac.h5ad,AAACGAAAGGTTGTTC-3,UBERON:0001225,cortex of kidney,HANCESTRO:0005,HsapDv:0000155
2,2,control_1,muto_atac.h5ad,AAACGAACAAATTGAG-1,UBERON:0001225,cortex of kidney,HANCESTRO:0005,HsapDv:0000148
3,3,control_3,muto_atac.h5ad,AAACGAACAGCTATAC-3,UBERON:0001225,cortex of kidney,HANCESTRO:0005,HsapDv:0000155
4,4,healthy_5,muto_atac.h5ad,AAACGAAGTCCTTCAC-5,UBERON:0001225,cortex of kidney,HANCESTRO:0005,HsapDv:0000146
...,...,...,...,...,...,...,...,...
467914,467914,F45,stewart_fetal_kidney.h5ad,FCAImmP7579215_F45_KI_45N_TAGTTGGTCGGAATCT-1,UBERON:0002113,kidney,unknown,HsapDv:0000049
467915,467915,F45,stewart_fetal_kidney.h5ad,FCAImmP7579215_F45_KI_45N_TCAGGATCAACGATCT-1,UBERON:0002113,kidney,unknown,HsapDv:0000049
467916,467916,F45,stewart_fetal_kidney.h5ad,FCAImmP7579215_F45_KI_45N_TCGAGGCCATCTATGG-1,UBERON:0002113,kidney,unknown,HsapDv:0000049
467917,467917,F45,stewart_fetal_kidney.h5ad,FCAImmP7579215_F45_KI_45N_TGTCCCATCTTGCAAG-1,UBERON:0002113,kidney,unknown,HsapDv:0000049


In [128]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

['HANCESTRO:0005', 'unknown', 'HANCESTRO:0568', 'HANCESTRO:0014']

In [129]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = np.where(adata.obs['self_reported_ethnicity_ontology_term_id'].isna(), 'unknown', adata.obs['self_reported_ethnicity_ontology_term_id'])

In [130]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

['HANCESTRO:0005', 'unknown', 'HANCESTRO:0568', 'HANCESTRO:0014']

In [131]:
# change data type

In [132]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [133]:
# view obs

In [134]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,...,kidney,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,...,kidney,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,kidney,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,kidney,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,...,kidney,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,...,kidney,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,...,kidney,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,...,kidney,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,...,kidney,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005


In [135]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [136]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

['HANCESTRO:0005', 'unknown', 'HANCESTRO:0568', 'HANCESTRO:0014']

#### **sex_ontology_term_id**

In [137]:
# identify the column in adata.obs which corresponds to sex

In [138]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [139]:
list(adata.obs['sex'].unique())

['male', 'female']

In [140]:
# list the unique values 

In [141]:
mapping= {'female': 'PATO:0000383', 'male': 'PATO:0000384', 'unknown':'unknown'}

In [142]:
# add sex_ontology_term_id column

In [143]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex'].map(mapping)

In [144]:
# change data type

In [145]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [146]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,...,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,...,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,...,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,...,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,...,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,...,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,...,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383


#### **suspension_type**

In [147]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,...,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,...,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,...,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,...,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,...,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,...,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,...,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383


In [148]:
list(adata.obs['suspension_type'].unique())

['nucleus', 'cell']

In [149]:
adata.obs['suspension_type'] = adata.obs['suspension_type']

In [150]:
# change data type of column

In [151]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [152]:
# view obs

In [153]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,...,AAACCCAAGTAGTCAA,3pv3,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,...,AAACCCACAACTGAAA,3pv3,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,AAACCCACACGTTGGC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,AAACCCACATTGCTTT,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,...,AAACCCATCACTTATC,3pv3,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,...,TTTGGAGTCGAAGCAG,3pv3,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,...,TTTGGTTGTAGTCACT,3pv3,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,...,TTTGGTTGTCGGAACA,3pv3,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,...,TTTGTTGTCACAGAGG,3pv3,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383


#### **tissue_type**

In [154]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [155]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [156]:
# identify the column in adata.obs which corresponds to tissue

In [157]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type'],
      dtype='object')

In [158]:
list(adata.obs['tissue'].unique())

['kidney']

In [159]:
mapping= {'blood' : 'UBERON:0000178',
         'bone marrow':'UBERON:0002371',
          'heart':'UBERON:0000948',
          'intestine':'UBERON:0000160',
          'kidney':'UBERON:0002113',
          'hippocampal formation':'UBERON:0002421',
          'liver':'UBERON:0002107',
          'lung':'UBERON:0002048',
          'lymph node':'UBERON:0000029',
          'pancreas':'UBERON:0001264',
          'skeletal muscle organ':'UBERON:0014892',
          'spleen':'UBERON:0002106'}

In [160]:
# add 'tissue_ontology_term_id' column

In [161]:
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/previous_datasets/kidney/metadata_kidney.csv')
mapping = dict(zip(df['cells'], df['tissue_ontology_term_id']))
adata.obs['tissue_ontology_term_id'] = adata.obs_names.map(mapping)

In [162]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0001225',
 'UBERON:0000362',
 'UBERON:0001228',
 'UBERON:0002113',
 'UBERON:0001224']

In [163]:
adata.obs['tissue_ontology_term_id'] = np.where(adata.obs['tissue_ontology_term_id'].isna(), 'UBERON:0002113', adata.obs['tissue_ontology_term_id'])

In [164]:
# change data type of column

In [165]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [166]:
#list the unique values in 'tissue_ontology_term_id' column

In [167]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0001225',
 'UBERON:0000362',
 'UBERON:0001228',
 'UBERON:0002113',
 'UBERON:0001224']

In [168]:
# view obs

In [169]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,...,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,...,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,...,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,...,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,...,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,...,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,...,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225


In [170]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [171]:
# view obsm

In [172]:
# check whether all columns are prefixed with X

In [173]:
adata.obsm

AxisArrays with keys: X_umap

#### **uns (Dataset Metadata)**

In [174]:
# View

In [175]:
adata.uns

{'schema_version': '3.0.0', 'title': 'Adult human kidney'}

In [176]:
adata.uns.keys

<function dict.keys>

In [177]:
# Give a title for the dataset

In [178]:
adata.uns['title'] = 'Kidney'

In [179]:
# Set the default embedding

In [180]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [181]:
# view anndata object

In [182]:
adata

AnnData object with n_obs × n_vars = 194504 × 33418
    obs: 'Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type', 'assay', 'Original_annotation', 'CellHint_harmonised_group', 'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue', 'barcodes', 'assays', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'
    var: 'exist_in_Lake2021', 'exist_in_Muto2021', 'exist_in_Stewart2019', 'exist_in_HCAkidney2022', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'default_embedding'
    obsm: 'X_umap'

In [183]:
# view obs and var data types

In [184]:
adata.obs.dtypes

Dataset                                     category
donor_id                                    category
development_stage                           category
sex                                         category
suspension_type                             category
assay                                       category
Original_annotation                         category
CellHint_harmonised_group                   category
cell_type                                   category
Curated_annotation                          category
organism                                    category
disease                                     category
tissue                                      category
barcodes                                      object
assays                                        object
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    ca

In [185]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [186]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed barcodes from object to category
changed assays from object to category


In [187]:
# view obs

In [188]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,MAC-M2,Group62,myeloid cell,Myeloid,...,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,DCT1,Group59,kidney distal convoluted tubule epithelial cell,DCT,...,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,PT-S1/2,Group52,epithelial cell of proximal tubule,PT-S1/2,...,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,eighth decade human stage,male,nucleus,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,dPT,...,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,CNT-IC-A,Group4,kidney collecting duct intercalated cell,dC-IC-A,...,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,PC,Group41,kidney collecting duct principal cell,PC,...,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,EC-GC,Group11,glomerular capillary endothelial cell,EC-GC,...,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,57-year-old human stage,female,cell,10x 3' v3,dPT,Group52,epithelial cell of proximal tubule,PT,...,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225


In [189]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [190]:
# delete unwanted columns in obs

In [191]:
del adata.obs['tissue']
del adata.obs['organism']
del adata.obs['disease']
del adata.obs['assay']
del adata.obs['sex']
del adata.obs['development_stage']
del adata.obs['assays']
del adata.obs['barcodes']
del adata.uns['schema_version']
del adata.obs['cell_type']

In [192]:
# view obs

In [193]:
adata.obs

Unnamed: 0,Dataset,donor_id,suspension_type,Original_annotation,CellHint_harmonised_group,Curated_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,nucleus,MAC-M2,Group62,Myeloid,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,nucleus,DCT1,Group59,DCT,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,nucleus,PT-S1/2,Group52,PT-S1/2,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,nucleus,PT-S1/2,Group52,PT-S1/2,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,nucleus,dPT,Group52,dPT,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,cell,CNT-IC-A,Group4,dC-IC-A,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,cell,PC,Group41,PC,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,cell,EC-GC,Group11,EC-GC,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,cell,dPT,Group52,PT,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225


In [194]:
# view var

In [195]:
adata.var

Unnamed: 0_level_0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000121410,True,True,True,True,False
ENSG00000268895,True,True,True,True,False
ENSG00000148584,True,True,True,True,False
ENSG00000175899,True,True,True,True,False
ENSG00000245105,True,True,True,True,False
...,...,...,...,...,...
ENSG00000074755,True,True,True,True,False
ENSG00000036549,True,True,True,True,False
ENSG00000270533,True,False,False,False,False
ENSG00000272920,False,True,True,True,False


In [196]:
araw.var

Unnamed: 0_level_0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000121410,True,True,True,True
ENSG00000268895,True,True,True,True
ENSG00000148584,True,True,True,True
ENSG00000175899,True,True,True,True
ENSG00000245105,True,True,True,True
...,...,...,...,...
ENSG00000074755,True,True,True,True
ENSG00000036549,True,True,True,True
ENSG00000270533,True,False,False,False
ENSG00000272920,False,True,True,True


In [197]:
#view uns

In [198]:
adata.uns

{'title': 'Kidney', 'default_embedding': 'X_umap'}

In [199]:
list(adata.uns.keys())

['title', 'default_embedding']

In [200]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'suspension_type', 'Original_annotation',
       'CellHint_harmonised_group', 'Curated_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [201]:
# Remove unwanted columns in uns

In [202]:
#check the format of expression matrix

In [203]:
adata.X

<194504x33418 sparse matrix of type '<class 'numpy.float32'>'
	with 308414812 stored elements in Compressed Sparse Row format>

In [204]:
araw.X

<194504x33418 sparse matrix of type '<class 'numpy.float32'>'
	with 308414812 stored elements in Compressed Sparse Row format>

In [205]:
#Copy raw counts to adata.raw

In [206]:
del adata.raw

In [207]:
adata.raw = araw

In [208]:
obs_dtype = adata.obs.dtypes

In [209]:
obs_dtype

Dataset                                     category
donor_id                                    category
suspension_type                             category
Original_annotation                         category
CellHint_harmonised_group                   category
Curated_annotation                          category
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    category
is_primary_data                                 bool
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
tissue_type                                 category
tissue_ontology_term_id                     category
dtype: object

In [210]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Final_objects/to_upload/Kidney.h5ad', compression = 'gzip')

In [211]:
adata.obs

Unnamed: 0,Dataset,donor_id,suspension_type,Original_annotation,CellHint_harmonised_group,Curated_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
KC33_AAACCCAAGTAGTCAA,Lake et al. 2021,3535,nucleus,MAC-M2,Group62,Myeloid,EFO:0009922,CL:0000763,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACAACTGAAA,Lake et al. 2021,3535,nucleus,DCT1,Group59,DCT,EFO:0009922,CL:1000849,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACACGTTGGC,Lake et al. 2021,3535,nucleus,PT-S1/2,Group52,PT-S1/2,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCACATTGCTTT,Lake et al. 2021,3535,nucleus,PT-S1/2,Group52,PT-S1/2,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
KC33_AAACCCATCACTTATC,Lake et al. 2021,3535,nucleus,dPT,Group52,dPT,EFO:0009922,CL:0002306,HsapDv:0000242,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PilotV221015_TTTGGAGTCGAAGCAG-1,HCA Kidney 2022,21_015,cell,CNT-IC-A,Group4,dC-IC-A,EFO:0009922,CL:1001432,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGGTTGTAGTCACT-1,HCA Kidney 2022,21_015,cell,PC,Group41,PC,EFO:0009922,CL:1001431,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGGTTGTCGGAACA-1,HCA Kidney 2022,21_015,cell,EC-GC,Group11,EC-GC,EFO:0009922,CL:1001005,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225
PilotV221015_TTTGTTGTCACAGAGG-1,HCA Kidney 2022,21_015,cell,dPT,Group52,PT,EFO:0009922,CL:0002306,HsapDv:0000151,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000383,tissue,UBERON:0001225


In [212]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'suspension_type', 'Original_annotation',
       'CellHint_harmonised_group', 'Curated_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [213]:
adata.raw.var

Unnamed: 0_level_0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000121410,True,True,True,True
ENSG00000268895,True,True,True,True
ENSG00000148584,True,True,True,True
ENSG00000175899,True,True,True,True
ENSG00000245105,True,True,True,True
...,...,...,...,...
ENSG00000074755,True,True,True,True
ENSG00000036549,True,True,True,True
ENSG00000270533,True,False,False,False
ENSG00000272920,False,True,True,True


In [214]:
adata.var

Unnamed: 0_level_0,exist_in_Lake2021,exist_in_Muto2021,exist_in_Stewart2019,exist_in_HCAkidney2022,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000121410,True,True,True,True,False
ENSG00000268895,True,True,True,True,False
ENSG00000148584,True,True,True,True,False
ENSG00000175899,True,True,True,True,False
ENSG00000245105,True,True,True,True,False
...,...,...,...,...,...
ENSG00000074755,True,True,True,True,False
ENSG00000036549,True,True,True,True,False
ENSG00000270533,True,False,False,False,False
ENSG00000272920,False,True,True,True,False


In [215]:
adata.raw.X

<194504x33418 sparse matrix of type '<class 'numpy.float32'>'
	with 308414812 stored elements in Compressed Sparse Row format>