### **Curating Intestine.h5ad**

Article: Automatic cell-type harmonization and integration across Human Cell Atlas datasets

DOI: https://doi.org/10.1016/j.cell.2023.11.026

Data Source : https://www.celltypist.org/organs

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Data/Intestine.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 263275 × 38450
    obs: 'Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type', 'assay', 'Original_annotation', 'CellHint_harmonised_group', 'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue'
    var: 'exist_in_Burclaff2022', 'exist_in_Elmentaite2021', 'exist_in_Smillie2019', 'exist_in_James2020'
    uns: 'schema_version', 'title'
    obsm: 'X_umap'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<263275x38450 sparse matrix of type '<class 'numpy.float32'>'
	with 342614884 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 38437)	1.2511581
  (0, 38436)	1.2511581
  (0, 21418)	1.2511581
  (0, 38357)	1.2511581
  (0, 38265)	1.2511581
  (0, 38244)	1.2511581
  (0, 38235)	1.789887
  (0, 38227)	1.2511581
  (0, 38219)	1.2511581
  (0, 38176)	1.2511581
  (0, 38175)	2.1380835
  (0, 38138)	1.2511581
  (0, 38066)	1.2511581
  (0, 38062)	1.2511581
  (0, 38001)	1.2511581
  (0, 37991)	1.2511581
  (0, 37927)	1.2511581
  (0, 37877)	1.2511581
  (0, 37834)	1.2511581
  (0, 37833)	1.2511581
  (0, 37807)	1.789887
  (0, 37790)	1.2511581
  (0, 37767)	1.2511581
  (0, 37766)	1.2511581
  (0, 37765)	1.2511581
  :	:
  (263274, 12236)	0.40344205
  (263274, 5120)	0.40344205
  (263274, 35469)	0.69011104
  (263274, 5871)	0.69011104
  (263274, 34078)	0.40344205
  (263274, 12638)	0.40344205
  (263274, 34086)	0.40344205
  (263274, 31907)	1.9406997
  (263274, 12566)	0.69011104
  (263274, 36894)	0.40344205
  (263274, 8349)	0.40344205
  (263274, 21065)	0.40344205
  (263274, 4618)	0.69011104
  (263274, 33286)	0.40344205
  (263274, 21738)	0.

##### **Raw counts matrix**

In [11]:
print(adata.raw.X)

  (0, 38437)	1.0
  (0, 38436)	1.0
  (0, 21418)	1.0
  (0, 38357)	1.0
  (0, 38265)	1.0
  (0, 38244)	1.0
  (0, 38235)	2.0
  (0, 38227)	1.0
  (0, 38219)	1.0
  (0, 38176)	1.0
  (0, 38175)	3.0
  (0, 38138)	1.0
  (0, 38066)	1.0
  (0, 38062)	1.0
  (0, 38001)	1.0
  (0, 37991)	1.0
  (0, 37927)	1.0
  (0, 37877)	1.0
  (0, 37834)	1.0
  (0, 37833)	1.0
  (0, 37807)	2.0
  (0, 37790)	1.0
  (0, 37767)	1.0
  (0, 37766)	1.0
  (0, 37765)	1.0
  :	:
  (263274, 12236)	1.0
  (263274, 5120)	1.0
  (263274, 35469)	2.0
  (263274, 5871)	2.0
  (263274, 34078)	1.0
  (263274, 12638)	1.0
  (263274, 34086)	1.0
  (263274, 31907)	12.0
  (263274, 12566)	2.0
  (263274, 36894)	1.0
  (263274, 8349)	1.0
  (263274, 21065)	1.0
  (263274, 4618)	2.0
  (263274, 33286)	1.0
  (263274, 21738)	1.0
  (263274, 13970)	1.0
  (263274, 20464)	2.0
  (263274, 37585)	1.0
  (263274, 31919)	4.0
  (263274, 35593)	1.0
  (263274, 22730)	1.0
  (263274, 9756)	1.0
  (263274, 18326)	1.0
  (263274, 5015)	1.0
  (263274, 3347)	3.0


In [12]:
adata.raw.var

Unnamed: 0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020
7SK,False,False,True,False
A1BG,True,True,True,True
A1BG-AS1,True,True,True,True
A1CF,True,True,True,True
A2M,False,True,True,True
...,...,...,...,...
hsa-mir-5571,False,False,True,False
hsa-mir-6080,False,False,True,False
hsa-mir-8072,False,False,True,False
snoU109,False,False,True,False


In [13]:
araw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)

##### **Variables(var)**

In [14]:
# View the var of anndata and raw object

In [15]:
adata.var

Unnamed: 0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020
7SK,False,False,True,False
A1BG,True,True,True,True
A1BG-AS1,True,True,True,True
A1CF,True,True,True,True
A2M,False,True,True,True
...,...,...,...,...
hsa-mir-5571,False,False,True,False
hsa-mir-6080,False,False,True,False
hsa-mir-8072,False,False,True,False
snoU109,False,False,True,False


In [16]:
ensembl_data = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/symbol2ID.csv')

In [17]:
ensembl_dict = dict(zip(ensembl_data['gene_symbol'], ensembl_data['gene_id']))

In [18]:
ensembl_dict

{'MT-TF': 'ENSG00000210049',
 'MT-RNR1': 'ENSG00000211459',
 'MT-TV': 'ENSG00000210077',
 'MT-RNR2': 'ENSG00000210082',
 'MT-TL1': 'ENSG00000209082',
 'MT-ND1': 'ENSG00000198888',
 'MT-TI': 'ENSG00000210100',
 'MT-TQ': 'ENSG00000210107',
 'MT-TM': 'ENSG00000210112',
 'MT-ND2': 'ENSG00000198763',
 'MT-TW': 'ENSG00000210117',
 'MT-TA': 'ENSG00000210127',
 'MT-TN': 'ENSG00000210135',
 'MT-TC': 'ENSG00000210140',
 'MT-TY': 'ENSG00000210144',
 'MT-CO1': 'ENSG00000198804',
 'MT-TS1': 'ENSG00000210151',
 'MT-TD': 'ENSG00000210154',
 'MT-CO2': 'ENSG00000198712',
 'MT-TK': 'ENSG00000210156',
 'MT-ATP8': 'ENSG00000228253',
 'MT-ATP6': 'ENSG00000198899',
 'MT-CO3': 'ENSG00000198938',
 'MT-TG': 'ENSG00000210164',
 'MT-ND3': 'ENSG00000198840',
 'MT-TR': 'ENSG00000210174',
 'MT-ND4L': 'ENSG00000212907',
 'MT-ND4': 'ENSG00000198886',
 'MT-TH': 'ENSG00000210176',
 'MT-TS2': 'ENSG00000210184',
 'MT-TL2': 'ENSG00000210191',
 'MT-ND5': 'ENSG00000198786',
 'MT-ND6': 'ENSG00000198695',
 'MT-TE': 'ENSG00000

In [19]:
adata.var['gene_id'] = adata.var_names.map(ensembl_dict)
araw.var['gene_id'] = araw.var_names.map(ensembl_dict)


In [20]:
adata.var

Unnamed: 0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020,gene_id
7SK,False,False,True,False,ENSG00000274262
A1BG,True,True,True,True,ENSG00000121410
A1BG-AS1,True,True,True,True,ENSG00000268895
A1CF,True,True,True,True,ENSG00000148584
A2M,False,True,True,True,ENSG00000175899
...,...,...,...,...,...
hsa-mir-5571,False,False,True,False,ENSG00000264824
hsa-mir-6080,False,False,True,False,ENSG00000215769
hsa-mir-8072,False,False,True,False,ENSG00000256092
snoU109,False,False,True,False,ENSG00000239197


In [21]:
araw.var

Unnamed: 0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020,gene_id
7SK,False,False,True,False,ENSG00000274262
A1BG,True,True,True,True,ENSG00000121410
A1BG-AS1,True,True,True,True,ENSG00000268895
A1CF,True,True,True,True,ENSG00000148584
A2M,False,True,True,True,ENSG00000175899
...,...,...,...,...,...
hsa-mir-5571,False,False,True,False,ENSG00000264824
hsa-mir-6080,False,False,True,False,ENSG00000215769
hsa-mir-8072,False,False,True,False,ENSG00000256092
snoU109,False,False,True,False,ENSG00000239197


In [22]:
nan_count = adata.var['gene_id'].isna().sum()
print("Number of NaN values in adata.obs['gene_id']: ", nan_count)

Number of NaN values in adata.obs['gene_id']:  2451


In [23]:
adata.var['gene_name'] = adata.var.index
araw.var['gene_name'] = araw.var.index

In [24]:
adata.var.index = adata.var['gene_id'] 
araw.var.index = araw.var['gene_id']

In [25]:
# Load the approved genes file.

In [26]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [27]:
#Create a dictionary from the approved genes file 

In [28]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [29]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [30]:
len(genedict)

119799

In [31]:
#Filter out the genes which are not in the approved genes file.

In [32]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [33]:
len(var_to_keep_adata)

33178

In [34]:
len(var_to_keep_araw)

33178

In [35]:
adata.var

Unnamed: 0_level_0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000274262,False,False,True,False,ENSG00000274262,7SK
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,False,True,True,True,ENSG00000175899,A2M
...,...,...,...,...,...,...
ENSG00000264824,False,False,True,False,ENSG00000264824,hsa-mir-5571
ENSG00000215769,False,False,True,False,ENSG00000215769,hsa-mir-6080
ENSG00000256092,False,False,True,False,ENSG00000256092,hsa-mir-8072
ENSG00000239197,False,False,True,False,ENSG00000239197,snoU109


In [36]:
araw.var

Unnamed: 0_level_0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000274262,False,False,True,False,ENSG00000274262,7SK
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,False,True,True,True,ENSG00000175899,A2M
...,...,...,...,...,...,...
ENSG00000264824,False,False,True,False,ENSG00000264824,hsa-mir-5571
ENSG00000215769,False,False,True,False,ENSG00000215769,hsa-mir-6080
ENSG00000256092,False,False,True,False,ENSG00000256092,hsa-mir-8072
ENSG00000239197,False,False,True,False,ENSG00000239197,snoU109


In [37]:
# Modify the anndata object by filtering out the filtered genes.

In [38]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [39]:
adata.var

Unnamed: 0_level_0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,False,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000265929,False,False,True,False,ENSG00000265929,hsa-mir-5195
ENSG00000264824,False,False,True,False,ENSG00000264824,hsa-mir-5571
ENSG00000215769,False,False,True,False,ENSG00000215769,hsa-mir-6080
ENSG00000256092,False,False,True,False,ENSG00000256092,hsa-mir-8072


In [40]:
# View var

In [41]:
araw.var

Unnamed: 0_level_0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,False,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000265929,False,False,True,False,ENSG00000265929,hsa-mir-5195
ENSG00000264824,False,False,True,False,ENSG00000264824,hsa-mir-5571
ENSG00000215769,False,False,True,False,ENSG00000215769,hsa-mir-6080
ENSG00000256092,False,False,True,False,ENSG00000256092,hsa-mir-8072


In [42]:
import scanpy as sc

# Assuming 'adata' is your AnnData object

# Identify duplicate variable names
duplicate_var_mask = adata.var_names.duplicated(keep='first')

# Invert the mask to keep unique variable names
unique_var_mask = ~duplicate_var_mask

# Update AnnData object with unique variable names and data
adata = adata[:, unique_var_mask].copy()

# Ensure variable names are unique
print("All variable names are unique:", adata.var_names.is_unique)

All variable names are unique: True


In [43]:
import scanpy as sc

# Assuming 'adata' is your AnnData object

# Identify duplicate variable names
duplicate_var_mask = araw.var_names.duplicated(keep='first')

# Invert the mask to keep unique variable names
unique_var_mask = ~duplicate_var_mask

# Update AnnData object with unique variable names and data
araw = araw[:, unique_var_mask].copy()

# Ensure variable names are unique
print("All variable names are unique:", araw.var_names.is_unique)

All variable names are unique: True


feature is filtered

In [44]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [45]:
adata.var

Unnamed: 0_level_0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020,gene_id,gene_name,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG,False
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1,False
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF,False
ENSG00000175899,False,True,True,True,ENSG00000175899,A2M,False
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1,False
...,...,...,...,...,...,...,...
ENSG00000272920,False,True,False,False,ENSG00000272920,hsa-mir-1253,False
ENSG00000265929,False,False,True,False,ENSG00000265929,hsa-mir-5195,False
ENSG00000264824,False,False,True,False,ENSG00000264824,hsa-mir-5571,False
ENSG00000215769,False,False,True,False,ENSG00000215769,hsa-mir-6080,False


In [46]:
araw.var

Unnamed: 0_level_0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,True,True,True,True,ENSG00000121410,A1BG
ENSG00000268895,True,True,True,True,ENSG00000268895,A1BG-AS1
ENSG00000148584,True,True,True,True,ENSG00000148584,A1CF
ENSG00000175899,False,True,True,True,ENSG00000175899,A2M
ENSG00000245105,True,True,True,True,ENSG00000245105,A2M-AS1
...,...,...,...,...,...,...
ENSG00000272920,False,True,False,False,ENSG00000272920,hsa-mir-1253
ENSG00000265929,False,False,True,False,ENSG00000265929,hsa-mir-5195
ENSG00000264824,False,False,True,False,ENSG00000264824,hsa-mir-5571
ENSG00000215769,False,False,True,False,ENSG00000215769,hsa-mir-6080


In [47]:
del adata.var['gene_id']
del araw.var['gene_id']
del adata.var['gene_name']
del araw.var['gene_name']

#### **obs (Cell metadata)**

In [48]:
#view obs

In [49]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,Homo sapiens,normal,intestine
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,Homo sapiens,normal,intestine
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,Homo sapiens,normal,intestine
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,Homo sapiens,normal,intestine
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine


In [50]:
# view the column names in obs

In [51]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue'],
      dtype='object')

#### **assay_ontology_term_id**

In [52]:
list(adata.obs['assay'].unique())

["10x 3' v3", "10x 5' v2", "10x 3' v2", "10x 3' v1"]

In [53]:
adata.obs['barcodes'] = adata.obs_names

In [54]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [55]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [56]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [57]:
adata.obs['assays'] = adata.obs['barcodes'].map(mapping)

In [58]:
list(adata.obs['assays'].unique())

['3pv3',
 '3pv2_5pv1_5pv2+3pv3',
 '3pv3+multiome',
 '3pv2_5pv1_5pv2',
 '3pv2_5pv1_5pv2+multiome',
 nan]

In [59]:
import pandas as pd
import scanpy as sc

# Suppose 'adata' is your AnnData object already loaded.

# Step 1: Check unique values in 'assay'
unique_assays = adata.obs['assay'].unique()
print("Unique values in 'assay':", unique_assays)

# Step 2: Display unique 'assays' for each unique 'assay'
for assay in unique_assays:
    unique_assays_for_group = adata.obs.loc[adata.obs['assay'] == assay, 'assays'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)


Unique values in 'assay': ['10x 3' v3', '10x 5' v2', '10x 3' v2', '10x 3' v1']
Categories (4, object): ['10x 3' v1', '10x 3' v2', '10x 3' v3', '10x 5' v2']
Unique 'assays' for assay 10x 3' v3: ['3pv3' '3pv2_5pv1_5pv2+3pv3' '3pv3+multiome']
Unique 'assays' for assay 10x 5' v2: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay 10x 3' v2: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay 10x 3' v1: [nan]


In [60]:
mapping ={"10x 5' v2":'EFO:0009900', "10x 5' v1":'EFO:0011025', "10x 3' v3":'EFO:0009922', "10x 3' transcription profiling":'EFO:0009899',"10x 3' v1":'EFO:0009901',"10x 3' v2":'EFO:0009899'}

In [61]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay'].map(mapping)

In [62]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGAATCGAGTTTCGAC,3pv3,EFO:0009922
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,Homo sapiens,normal,intestine,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,Homo sapiens,normal,intestine,CCACACTTCTCCTGTG,3pv3,EFO:0009922
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGTCAGAGTACGTTCA,3pv3,EFO:0009922
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TTGTGGATCCGCGGAT,3pv3,EFO:0009922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,Homo sapiens,normal,intestine,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900


In [63]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [64]:
# view adata.obs

In [65]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGAATCGAGTTTCGAC,3pv3,EFO:0009922
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,Homo sapiens,normal,intestine,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,Homo sapiens,normal,intestine,CCACACTTCTCCTGTG,3pv3,EFO:0009922
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGTCAGAGTACGTTCA,3pv3,EFO:0009922
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TTGTGGATCCGCGGAT,3pv3,EFO:0009922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,Homo sapiens,normal,intestine,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900


#### **cell_type_ontology_term_id**

In [66]:
#identify the column in adata.obs related. to cell type annotation

In [67]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id'],
      dtype='object')

In [68]:
list(adata.obs['cell_type'].unique())

['transit amplifying cell',
 'enterocyte',
 'tuft cell of colon',
 'goblet cell',
 'BEST4+ intestinal epithelial cell, human',
 'paneth cell',
 'enterocyte of epithelium of large intestine',
 'enteroendocrine cell',
 'stem cell',
 'plasma cell',
 'stromal cell',
 'myofibroblast cell',
 'vein endothelial cell',
 'pericyte',
 'endothelial cell of lymphatic vessel',
 'endothelial cell of artery',
 'capillary endothelial cell',
 'macrophage',
 'conventional dendritic cell',
 'glial cell',
 'gamma-delta T cell',
 'colon macrophage',
 'follicular B cell',
 'innate lymphoid cell',
 'natural killer cell',
 'mast cell',
 'T-helper 17 cell',
 'germinal center B cell',
 'effector memory CD8-positive, alpha-beta T cell',
 'naive B cell',
 'memory B cell',
 'monocyte',
 'regulatory T cell',
 'central memory CD4-positive, alpha-beta T cell',
 'T-helper 1 cell',
 'T follicular helper cell',
 'B cell',
 'effector memory CD4-positive, alpha-beta T cell']

In [69]:

df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/cell_typist_annotation.csv')



In [70]:
mapping = df.set_index('Cell_type')['Cell_ontology_ID'].to_dict()

In [71]:
mapping

{'central memory CD4-positive, alpha-beta T cell': 'CL:0000904',
 'CD16-positive, CD56-dim natural killer cell, human': 'CL:0000939',
 'effector memory CD4-positive, alpha-beta T cell': 'CL:0000905',
 'central memory CD8-positive, alpha-beta T cell': 'CL:0000907',
 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated': 'CL:0001062',
 'classical monocyte': 'CL:0000860',
 'class switched memory B cell': 'CL:0000972',
 'mucosal invariant T cell': 'CL:0000940',
 'naive B cell': 'CL:0000788',
 'effector memory CD8-positive, alpha-beta T cell': 'CL:0000913',
 'unswitched memory B cell': 'CL:0000970',
 'non-classical monocyte': 'CL:0000875',
 'CD16-negative, CD56-bright natural killer cell, human': 'CL:0000938',
 'gamma-delta T cell': 'CL:0000798',
 'regulatory T cell': 'CL:0000815',
 'conventional dendritic cell': 'CL:0000990',
 'plasma cell': 'CL:0000786',
 'memory B cell': 'CL:0000787',
 'effector memory CD4-positive, alpha-beta T cell, terminally differentiated': 'C

In [72]:
# add the cell_type_ontology_term_id column

In [73]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type'].map(mapping)

In [74]:
# change datatype of the column

In [75]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [76]:
# view adata.obs

In [77]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,Homo sapiens,normal,intestine,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,Homo sapiens,normal,intestine,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,Homo sapiens,normal,intestine,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,417c,eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786


#### **development_stage_ontology_term_id**

In [78]:
# identify the column in adata which corresponds to age

In [79]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id'],
      dtype='object')

In [80]:
list(adata.obs['development_stage'].unique())

['45-year-old human stage',
 '29-year-old human stage',
 '53-year-old human stage',
 'fourth decade human stage',
 'sixth decade human stage',
 'seventh decade human stage',
 'human adult stage',
 'eighth decade human stage',
 'fifth decade human stage']

In [81]:
age = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/age.csv')

In [82]:
age_dict = pd.Series(age['development_stage_ontology_term_id'].values, index=age['age']).to_dict()

In [83]:
age_dict

{'29-year-old human stage': 'HsapDv:0000123',
 '58-year-old human stage': 'HsapDv:0000152',
 '35-year-old human stage': 'HsapDv:0000129',
 '33-year-old human stage': 'HsapDv:0000127',
 '45-year-old human stage': 'HsapDv:0000139',
 '37-year-old human stage': 'HsapDv:0000131',
 '69-year-old human stage': 'HsapDv:0000163',
 '55-year-old human stage': 'HsapDv:0000149',
 '71-year-old human stage': 'HsapDv:0000165',
 '26-year-old human stage': 'HsapDv:0000120',
 '53-year-old human stage': 'HsapDv:0000147',
 '49-year-old human stage': 'HsapDv:0000143',
 '46-year-old human stage': 'HsapDv:0000140',
 '34-year-old human stage': 'HsapDv:0000128',
 '27-year-old human stage': 'HsapDv:0000121',
 '28-year-old human stage': 'HsapDv:0000122',
 '30-year-old human stage': 'HsapDv:0000124',
 'seventh decade human stage': 'HsapDv:0000241',
 'sixth decade human stage': 'HsapDv:0000240',
 '59-year-old human stage': 'HsapDv:0000153',
 '39-year-old human stage': 'HsapDv:0000133',
 '22-year-old human stage': 'H

In [84]:
donor_ids = ["A29", "390C", "A26 (386C)", "A26", "A32 (411C)", "A32", "A34 (417C)", 
             "417C", "356C", "A32 (411C)", "A26 (386C)", "284C", "368C", "296C", 
             "A33 (414C)", "A30 (398B)", "417c", "454C", "A32", "A37", "A40", "A44", 
             "A47", "640C", "390C", "A29", "390c", "302C", "302c", "390C", "390c","411C","A34","386C",'D5','A29 (390C)','A37' ]

# Convert `adata.obs['donor_id']` to a set for faster lookup
present_donors_set = set(adata.obs['donor_id'])

# Check which donors are present in `adata.obs['donor_id']`
present_donors = [donor for donor in donor_ids if donor in present_donors_set]

# Display the donors that are present
print("Donors present in adata.obs['donor_id']: ", present_donors)

Donors present in adata.obs['donor_id']:  ['A26 (386C)', 'A32 (411C)', 'A34 (417C)', 'A32 (411C)', 'A26 (386C)', 'A33 (414C)', 'A30 (398B)', '417c', '390c', '302c', '390c']


In [85]:
donors_to_replace = ['A29', 'A26', 'A32', '417C','302c', '390c','417c','386C','390C','411C',]
if any(donor in adata.obs['donor_id'].values for donor in donors_to_replace):
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A29', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A26', 'A26 (386C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('A32', 'A32 (411C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('417C', 'A34 (417C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('302c', '302C')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390c', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('417c', 'A34 (417C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390c', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('390C', 'A29 (390C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('386C', 'A26 (386C)')
    adata.obs['donor_id'] = adata.obs['donor_id'].replace('411C', 'A32 (411C)')

  adata.obs['donor_id'] = adata.obs['donor_id'].replace('302c', '302C')
  adata.obs['donor_id'] = adata.obs['donor_id'].replace('390c', 'A29 (390C)')
  adata.obs['donor_id'] = adata.obs['donor_id'].replace('417c', 'A34 (417C)')


In [86]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage'].map(age_dict)

In [87]:
obs_df = adata.obs[['development_stage_ontology_term_id', 'donor_id']]

# Dropping duplicates to get unique pairs
unique_pairs = obs_df.drop_duplicates()

# Converting to a list of tuples if needed
unique_pairs_list = list(unique_pairs.itertuples(index=False, name=None))

unique_pairs_list

[('HsapDv:0000139', 'Donor 2'),
 ('HsapDv:0000123', 'Donor 1'),
 ('HsapDv:0000147', 'Donor 3'),
 ('HsapDv:0000238', 'A32 (411C)'),
 ('HsapDv:0000240', 'A34 (417C)'),
 ('HsapDv:0000238', 'A39 (440C)'),
 ('HsapDv:0000240', 'A38 (432C)'),
 ('HsapDv:0000241', 'A26 (386C)'),
 ('HsapDv:0000238', 'A33 (414C)'),
 ('HsapDv:0000238', 'A30 (398B)'),
 ('HsapDv:0000087', 'N10'),
 ('HsapDv:0000087', 'N8'),
 ('HsapDv:0000087', 'N11'),
 ('HsapDv:0000087', 'N13'),
 ('HsapDv:0000087', 'N15'),
 ('HsapDv:0000087', 'N16'),
 ('HsapDv:0000087', 'N17'),
 ('HsapDv:0000087', 'N18'),
 ('HsapDv:0000087', 'N20'),
 ('HsapDv:0000087', 'N21'),
 ('HsapDv:0000087', 'N51'),
 ('HsapDv:0000087', 'N46'),
 ('HsapDv:0000242', 'A29 (390C)'),
 ('HsapDv:0000240', '298c'),
 ('HsapDv:0000241', '290b'),
 ('HsapDv:0000239', '302C'),
 ('HsapDv:0000242', 'A34 (417C)')]

In [88]:
update_dict = {
'D5' :'HsapDv:0000241',
'A29 (390C)' :'HsapDv:0000161',
'A37':'HsapDv:0000149',
'A34 (417C)':'HsapDv:0000094',
'356C':'HsapDv:0000239',
'A32 (411C)':'HsapDv:0000123',
'A26 (386C)':'HsapDv:0000169',
'284C':'HsapDv:0000240',
'368C':'HsapDv:0000240',
'296C':'HsapDv:0000238',
'A33 (414C)':'HsapDv:0000090',
'A30 (398B)':'HsapDv:0000090',
'417c':'HsapDv:0000094',
'A32':'HsapDv:0000123',
'A37':'HsapDv:0000153',
'A40':'HsapDv:0000158',
'A44':'HsapDv:0000160',
'A47':'HsapDv:0000152',
'640C':'HsapDv:0000242'}

# Update adata.obs['development_stage_ontology_term_id'] based on the update dictionary
adata.obs['development_stage_ontology_term_id'] = adata.obs.apply(
    lambda row: update_dict[row['donor_id']] if row['donor_id'] in update_dict else row['development_stage_ontology_term_id'], 
    axis=1
)

In [89]:
# change datatype of the column

In [90]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [91]:
obs_df = adata.obs[['development_stage_ontology_term_id', 'donor_id']]

# Dropping duplicates to get unique pairs
unique_pairs = obs_df.drop_duplicates()

# Converting to a list of tuples if needed
unique_pairs_list = list(unique_pairs.itertuples(index=False, name=None))

unique_pairs_list

[('HsapDv:0000139', 'Donor 2'),
 ('HsapDv:0000123', 'Donor 1'),
 ('HsapDv:0000147', 'Donor 3'),
 ('HsapDv:0000123', 'A32 (411C)'),
 ('HsapDv:0000094', 'A34 (417C)'),
 ('HsapDv:0000238', 'A39 (440C)'),
 ('HsapDv:0000240', 'A38 (432C)'),
 ('HsapDv:0000169', 'A26 (386C)'),
 ('HsapDv:0000090', 'A33 (414C)'),
 ('HsapDv:0000090', 'A30 (398B)'),
 ('HsapDv:0000087', 'N10'),
 ('HsapDv:0000087', 'N8'),
 ('HsapDv:0000087', 'N11'),
 ('HsapDv:0000087', 'N13'),
 ('HsapDv:0000087', 'N15'),
 ('HsapDv:0000087', 'N16'),
 ('HsapDv:0000087', 'N17'),
 ('HsapDv:0000087', 'N18'),
 ('HsapDv:0000087', 'N20'),
 ('HsapDv:0000087', 'N21'),
 ('HsapDv:0000087', 'N51'),
 ('HsapDv:0000087', 'N46'),
 ('HsapDv:0000161', 'A29 (390C)'),
 ('HsapDv:0000240', '298c'),
 ('HsapDv:0000241', '290b'),
 ('HsapDv:0000239', '302C')]

In [92]:
# view unique values of development_stage_ontology_term_id column

In [93]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000139',
 'HsapDv:0000123',
 'HsapDv:0000147',
 'HsapDv:0000094',
 'HsapDv:0000238',
 'HsapDv:0000240',
 'HsapDv:0000169',
 'HsapDv:0000090',
 'HsapDv:0000087',
 'HsapDv:0000161',
 'HsapDv:0000241',
 'HsapDv:0000239']

In [94]:
# view adata.obs

In [95]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,Homo sapiens,normal,intestine,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584,HsapDv:0000139
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,Homo sapiens,normal,intestine,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041,HsapDv:0000123
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010,HsapDv:0000123
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,Homo sapiens,normal,intestine,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235,HsapDv:0000094
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786,HsapDv:0000094


#### **donor_id**

In [96]:
#identify the column in adata.obs which provides donor information

In [97]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [98]:
# add the donor_id column

In [99]:
adata.obs['donor_id'] = adata.obs['donor_id']

In [100]:
list(adata.obs['donor_id'].unique())

['Donor 2',
 'Donor 1',
 'Donor 3',
 'A32 (411C)',
 'A34 (417C)',
 'A39 (440C)',
 'A38 (432C)',
 'A26 (386C)',
 'A33 (414C)',
 'A30 (398B)',
 'N10',
 'N8',
 'N11',
 'N13',
 'N15',
 'N16',
 'N17',
 'N18',
 'N20',
 'N21',
 'N51',
 'N46',
 'A29 (390C)',
 '298c',
 '290b',
 '302C']

In [101]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [102]:
# change datatype of the column

In [103]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [104]:
# view unique values of donor_id column

In [105]:
list(adata.obs['donor_id'].unique())

['Donor 2',
 'Donor 1',
 'Donor 3',
 'A32 (411C)',
 'A34 (417C)',
 'A39 (440C)',
 'A38 (432C)',
 'A26 (386C)',
 'A33 (414C)',
 'A30 (398B)',
 'N10',
 'N8',
 'N11',
 'N13',
 'N15',
 'N16',
 'N17',
 'N18',
 'N20',
 'N21',
 'N51',
 'N46',
 'A29 (390C)',
 '298c',
 '290b',
 '302C']

In [106]:
#view obs

In [107]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,Homo sapiens,normal,intestine,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584,HsapDv:0000139
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,Homo sapiens,normal,intestine,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041,HsapDv:0000123
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010,HsapDv:0000123
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,Homo sapiens,normal,intestine,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235,HsapDv:0000094
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786,HsapDv:0000094


In [108]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [109]:
list(adata.obs['disease'].unique())

['normal']

In [110]:
adata.obs['disease_ontology_term_id']= ['PATO:0000461'] * len(adata.obs)

In [111]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,Homo sapiens,normal,intestine,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,Homo sapiens,normal,intestine,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,Homo sapiens,normal,intestine,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461


In [112]:
# change datatype of the column

In [113]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [114]:
# view obs

In [115]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,Homo sapiens,normal,intestine,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,Homo sapiens,normal,intestine,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,Homo sapiens,normal,intestine,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461


#### **is_primary_data**

In [116]:
#change data type of column

In [117]:
list(adata.obs['Dataset'].unique())

['Burclaff et al. 2022',
 'Elmentaite et al. 2021',
 'Smillie et al. 2019',
 'James et al. 2020']

In [118]:
mapping = {'Aizarani et al. 2019':False,
'Andrews et al. 2022':False,
'Guilliams et al. 2022':False,
'MacParland et al. 2018':False,
'HCA Kidney 2022':False,
'Lake et al. 2021':False,
'Muto et al. 2021':False,
'Stewart et al. 2019':False,
'Dominguez Conde et al. 2022':False,
'Elmentaite et al. 2021':False,
'James et al. 2020':False,
'Szabo et al. 2019':False,
'He et al. 2020':False,
'Micheli et al. 2020':True,
'Perez et al. 2022':True,
'Ren et al. 2021':False,
'Stephenson et al. 2021':False,
'Yoshida et al. 2021':False,
'Madissoon et al. 2020':False,
'Tabula Sapiens 2022':False,
'Ayhan et al. 2021':False,
'Franjic et al. 2022':True,
'Siletti et al. 2022':False,
'Tran et al. 2021':True,
'Burclaff et al. 2022':False,
'Smillie et al. 2019':False,
'Roy et al. 2021':True,
'Adams et al. 2020':False,
'Madissoon et al. 2022':False,
'Travaglini et al. 2020':False,
'Koenig et al. 2022':True,
'Kuppe et al. 2022':False,
'Litvinukova et al. 2020':False,
'Tucker et al. 2020':True,
'Fasolino et al. 2022':False,
'Muraro et al. 2016':False,
'Tosti et al. 2021':True,
'Tritschler et al. 2022':False}

In [119]:
adata.obs['is_primary_data']= adata.obs['Dataset'].map(mapping)

In [120]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [121]:
# view obs

In [122]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,organism,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,Homo sapiens,normal,intestine,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461,False
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,Homo sapiens,normal,intestine,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461,False
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,Homo sapiens,normal,intestine,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461,False
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,Homo sapiens,normal,intestine,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,Homo sapiens,normal,intestine,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461,False
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,Homo sapiens,normal,intestine,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False


#### **organism_ontology_term_id**

In [123]:
# assign organism id 

In [124]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [125]:
#change data type of column

In [126]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [127]:
# view obs

In [128]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,disease,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,normal,intestine,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,...,normal,intestine,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,...,normal,intestine,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,...,normal,intestine,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,normal,intestine,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,...,normal,intestine,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,normal,intestine,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,normal,intestine,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,normal,intestine,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [129]:
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/previous_datasets/intestine/metadata_intestine.csv')
mapping = dict(zip(df['cells'], df['ethnicity']))
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs_names.map(mapping)

In [130]:
# change data type

In [131]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [132]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

['HANCESTRO:0005', 'HANCESTRO:0568', 'unknown', nan]

In [133]:
adata.obs['self_reported_ethnicity_ontology_term_id'].fillna('unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['self_reported_ethnicity_ontology_term_id'].fillna('unknown', inplace=True)


In [134]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

['HANCESTRO:0005', 'HANCESTRO:0568', 'unknown']

In [135]:
# view obs

In [136]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,tissue,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,intestine,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,...,intestine,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,...,intestine,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,...,intestine,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,intestine,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,...,intestine,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,intestine,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,intestine,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,intestine,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown


In [137]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

#### **sex_ontology_term_id**

In [138]:
# identify the column in adata.obs which corresponds to sex

In [139]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [140]:
list(adata.obs['sex'].unique())

['male', 'female']

In [141]:
# list the unique values 

In [142]:
mapping= {'female': 'PATO:0000383', 'male': 'PATO:0000384', 'unknown':'unknown'}

In [143]:
# add sex_ontology_term_id column

In [144]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex'].map(mapping)

In [145]:
# change data type

In [146]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [147]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,...,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,...,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,...,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,...,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384


#### **suspension_type**

In [148]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,...,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,...,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,...,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,...,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384


In [149]:
list(adata.obs['suspension_type'].unique())

['cell']

In [150]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [151]:
# change data type of column

In [152]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [153]:
# view obs

In [154]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,barcodes,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,TGAATCGAGTTTCGAC,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,...,ACCCTCAAGTGTTCCA,3pv3,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,...,CCACACTTCTCCTGTG,3pv3,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,...,TGTCAGAGTACGTTCA,3pv3,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,TTGTGGATCCGCGGAT,3pv3,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,...,TTTATGCTCTCTAAGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,TTTCCTCCAGGACCCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,TTTGCGCGTGGGTCAA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,TTTGGTTTCCACGTGG,3pv2_5pv1_5pv2+3pv3,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384


#### **tissue_type**

In [155]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [156]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [157]:
# identify the column in adata.obs which corresponds to tissue

In [158]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type'],
      dtype='object')

In [159]:
list(adata.obs['tissue'].unique())

['intestine']

In [160]:
mapping= {'blood' : 'UBERON:0000178',
         'bone marrow':'UBERON:0002371',
          'heart':'UBERON:0000948',
          'intestine':'UBERON:0000160',
          'kidney':'UBERON:0002113',
          'hippocampal formation':'UBERON:0002421',
          'liver':'UBERON:0002107',
          'lung':'UBERON:0002048',
          'lymph node':'UBERON:0000029',
          'pancreas':'UBERON:0001264',
          'skeletal muscle organ':'UBERON:0014892',
          'spleen':'UBERON:0002106'}

In [161]:
# add 'tissue_ontology_term_id' column

In [162]:
# change data type of column

In [163]:
df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Suppl_info/previous_datasets/intestine/metadata_intestine.csv')
mapping = dict(zip(df['cells'], df['tissue_ontology_term_id']))
adata.obs['tissue_ontology_term_id'] = adata.obs_names.map(mapping)

In [164]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [165]:
#list the unique values in 'tissue_ontology_term_id' column

In [166]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002115',
 'UBERON:0001156',
 'UBERON:0002116',
 'UBERON:0001157',
 'UBERON:0001158',
 'UBERON:0002114',
 'UBERON:0001159',
 'UBERON:0001154',
 'UBERON:0001153',
 'UBERON:0001052',
 'UBERON:0002108',
 'UBERON:0000059',
 nan]

In [167]:

adata.obs['tissue_ontology_term_id'] = np.where(adata.obs['tissue_ontology_term_id'].isna(), 'UBERON:0000160', adata.obs['tissue_ontology_term_id'])


In [168]:
# view obs

In [169]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,...,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,...,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,...,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001156
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,...,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0001159
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160


In [170]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [171]:
# view obsm

In [172]:
# check whether all columns are prefixed with X

In [173]:
adata.obsm

AxisArrays with keys: X_umap

#### **uns (Dataset Metadata)**

In [174]:
# View

In [175]:
adata.uns

{'schema_version': '3.0.0', 'title': 'Adult human intestine'}

In [176]:
adata.uns.keys

<function dict.keys>

In [177]:
# Give a title for the dataset

In [178]:
adata.uns['title'] = 'Intestine'

In [179]:
# Set the default embedding

In [180]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [181]:
# view anndata object

In [182]:
adata

AnnData object with n_obs × n_vars = 263275 × 31345
    obs: 'Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type', 'assay', 'Original_annotation', 'CellHint_harmonised_group', 'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue', 'barcodes', 'assays', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'
    var: 'exist_in_Burclaff2022', 'exist_in_Elmentaite2021', 'exist_in_Smillie2019', 'exist_in_James2020', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'default_embedding'
    obsm: 'X_umap'

In [183]:
# view obs and var data types

In [184]:
adata.obs.dtypes

Dataset                                     category
donor_id                                    category
development_stage                           category
sex                                         category
suspension_type                             category
assay                                       category
Original_annotation                         category
CellHint_harmonised_group                   category
cell_type                                   category
Curated_annotation                          category
organism                                    category
disease                                     category
tissue                                      category
barcodes                                      object
assays                                        object
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    ca

In [185]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [186]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed barcodes from object to category
changed assays from object to category
changed tissue_ontology_term_id from object to category


In [187]:
# view obs

In [188]:
adata.obs

Unnamed: 0,Dataset,donor_id,development_stage,sex,suspension_type,assay,Original_annotation,CellHint_harmonised_group,cell_type,Curated_annotation,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_AE2,Group60,enterocyte,Enterocytes,...,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,SI_tuft,Group76,tuft cell of colon,Tuft cells,...,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,29-year-old human stage,male,cell,10x 3' v3,C_earlyCC,Group61,transit amplifying cell,TA,...,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001156
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,45-year-old human stage,male,cell,10x 3' v3,SI_earlyAE,Group60,transit amplifying cell,TA,...,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,LYVE1 Macrophage,Group7,macrophage,Macrophages,...,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0001159
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),eighth decade human stage,male,cell,10x 5' v2,B cell IgA Plasma,Group2,plasma cell,Plasma cells,...,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160


In [189]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'development_stage', 'sex', 'suspension_type',
       'assay', 'Original_annotation', 'CellHint_harmonised_group',
       'cell_type', 'Curated_annotation', 'organism', 'disease', 'tissue',
       'barcodes', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [190]:
# delete unwanted columns in obs

In [191]:
del adata.obs['tissue']
del adata.obs['organism']
del adata.obs['disease']
del adata.obs['assay']
del adata.obs['sex']
del adata.obs['development_stage']
del adata.obs['assays']
del adata.obs['barcodes']
del adata.uns['schema_version']
del adata.obs['cell_type']

In [192]:
# view obs

In [193]:
adata.obs

Unnamed: 0,Dataset,donor_id,suspension_type,Original_annotation,CellHint_harmonised_group,Curated_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,cell,SI_earlyAE,Group60,TA,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,cell,SI_AE2,Group60,Enterocytes,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,cell,SI_tuft,Group76,Tuft cells,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,cell,C_earlyCC,Group61,TA,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001156
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,cell,SI_earlyAE,Group60,TA,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),cell,LYVE1 Macrophage,Group7,Macrophages,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),cell,B cell IgA Plasma,Group2,Plasma cells,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),cell,B cell IgA Plasma,Group2,Plasma cells,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0001159
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),cell,B cell IgA Plasma,Group2,Plasma cells,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160


In [194]:
# view var

In [195]:
adata.var

Unnamed: 0_level_0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000121410,True,True,True,True,False
ENSG00000268895,True,True,True,True,False
ENSG00000148584,True,True,True,True,False
ENSG00000175899,False,True,True,True,False
ENSG00000245105,True,True,True,True,False
...,...,...,...,...,...
ENSG00000272920,False,True,False,False,False
ENSG00000265929,False,False,True,False,False
ENSG00000264824,False,False,True,False,False
ENSG00000215769,False,False,True,False,False


In [196]:
araw.var

Unnamed: 0_level_0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000121410,True,True,True,True
ENSG00000268895,True,True,True,True
ENSG00000148584,True,True,True,True
ENSG00000175899,False,True,True,True
ENSG00000245105,True,True,True,True
...,...,...,...,...
ENSG00000272920,False,True,False,False
ENSG00000265929,False,False,True,False
ENSG00000264824,False,False,True,False
ENSG00000215769,False,False,True,False


In [197]:
#view uns

In [198]:
adata.uns

{'title': 'Intestine', 'default_embedding': 'X_umap'}

In [199]:
list(adata.uns.keys())

['title', 'default_embedding']

In [200]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'suspension_type', 'Original_annotation',
       'CellHint_harmonised_group', 'Curated_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [201]:
# Remove unwanted columns in uns

In [202]:
#check the format of expression matrix

In [203]:
adata.X

<263275x31345 sparse matrix of type '<class 'numpy.float32'>'
	with 308410085 stored elements in Compressed Sparse Row format>

In [204]:
araw.X

<263275x31345 sparse matrix of type '<class 'numpy.float32'>'
	with 308410085 stored elements in Compressed Sparse Row format>

In [205]:
#Copy raw counts to adata.raw

In [206]:
del adata.raw

In [207]:
if isinstance(araw, ad.AnnData):
    adata.raw = araw
else:
    print("Error: araw is not an AnnData object.")

In [208]:
adata.raw = araw

In [209]:
obs_dtype = adata.obs.dtypes

In [210]:
obs_dtype

Dataset                                     category
donor_id                                    category
suspension_type                             category
Original_annotation                         category
CellHint_harmonised_group                   category
Curated_annotation                          category
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    category
is_primary_data                                 bool
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
tissue_type                                 category
tissue_ontology_term_id                     category
dtype: object

In [211]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/cell_typist/Final_objects/to_upload/Intestine.h5ad', compression = 'gzip')

In [212]:
adata.obs

Unnamed: 0,Dataset,donor_id,suspension_type,Original_annotation,CellHint_harmonised_group,Curated_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_type,tissue_ontology_term_id
TGAATCGAGTTTCGAC-1-0,Burclaff et al. 2022,Donor 2,cell,SI_earlyAE,Group60,TA,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
ACCCTCAAGTGTTCCA-1-0,Burclaff et al. 2022,Donor 2,cell,SI_AE2,Group60,Enterocytes,EFO:0009922,CL:0000584,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
CCACACTTCTCCTGTG-1-1,Burclaff et al. 2022,Donor 1,cell,SI_tuft,Group76,Tuft cells,EFO:0009922,CL:0009041,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
TGTCAGAGTACGTTCA-1-1,Burclaff et al. 2022,Donor 1,cell,C_earlyCC,Group61,TA,EFO:0009922,CL:0009010,HsapDv:0000123,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0001156
TTGTGGATCCGCGGAT-1-0,Burclaff et al. 2022,Donor 2,cell,SI_earlyAE,Group60,TA,EFO:0009922,CL:0009010,HsapDv:0000139,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,PATO:0000384,tissue,UBERON:0002115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCTCTCTAAGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),cell,LYVE1 Macrophage,Group7,Macrophages,EFO:0009900,CL:0000235,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160
TTTCCTCCAGGACCCT-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),cell,B cell IgA Plasma,Group2,Plasma cells,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160
TTTGCGCGTGGGTCAA-1-Human_colon_16S8001871,James et al. 2020,A34 (417C),cell,B cell IgA Plasma,Group2,Plasma cells,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0001159
TTTGGTTTCCACGTGG-1-Human_colon_16S8001871-1,James et al. 2020,A34 (417C),cell,B cell IgA Plasma,Group2,Plasma cells,EFO:0009900,CL:0000786,HsapDv:0000094,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,tissue,UBERON:0000160


In [213]:
adata.obs.columns

Index(['Dataset', 'donor_id', 'suspension_type', 'Original_annotation',
       'CellHint_harmonised_group', 'Curated_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [214]:
adata.raw.var

Unnamed: 0_level_0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000121410,True,True,True,True
ENSG00000268895,True,True,True,True
ENSG00000148584,True,True,True,True
ENSG00000175899,False,True,True,True
ENSG00000245105,True,True,True,True
...,...,...,...,...
ENSG00000272920,False,True,False,False
ENSG00000265929,False,False,True,False
ENSG00000264824,False,False,True,False
ENSG00000215769,False,False,True,False


In [215]:
adata.var

Unnamed: 0_level_0,exist_in_Burclaff2022,exist_in_Elmentaite2021,exist_in_Smillie2019,exist_in_James2020,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000121410,True,True,True,True,False
ENSG00000268895,True,True,True,True,False
ENSG00000148584,True,True,True,True,False
ENSG00000175899,False,True,True,True,False
ENSG00000245105,True,True,True,True,False
...,...,...,...,...,...
ENSG00000272920,False,True,False,False,False
ENSG00000265929,False,False,True,False,False
ENSG00000264824,False,False,True,False,False
ENSG00000215769,False,False,True,False,False


In [216]:
adata.raw.X

<263275x31345 sparse matrix of type '<class 'numpy.float32'>'
	with 308410085 stored elements in Compressed Sparse Row format>

In [217]:
list(adata.obs['is_primary_data'].unique())

[False]