### **Curating human_limb.h5ad**

Article: A human embryonic limb cell atlas resolved in space and time

DOI: https://doi.org/10.1038/s41586-023-06806-x

Data Source : https://developmental.cellatlas.io/embryonic-limb

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/limb_cell_atlas/Data/New_data/human_limb.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 125955 × 26522
    obs: 'batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden', 'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype'
    var: 'gene_ids', 'feature_types', 'n_cells', 'highly_variableWSSS_THYst8796437', 'highly_variableWSSS_THYst8796442', 'highly_variable5478STDY7935101', 'highly_variable5478STDY7935102', 'highly_variableWSSS_THYst8796439', 'highly_variableWSSS_THYst9384956', 'highly_variable5478STDY7717491', 'highly_variableWSSS_THYst9384953', 'highly_variable5478STDY7717492', 'highly_variableWSSS_THYst9384955', 'highly_variable5386STDY7537944', 'highly_variableWSSS_THYst9384954', 'highly_variableFCAImmP7536759', 'highly_variable5478STDY7980349', 'highly_variableWSSS_THYst8796438', 'highly_variable5478STDY7652318', 'highly_variableWSSS_THYst8796441', 'highly_variableFCAImmP7536758', 'highly_variableWSSS_THYst9384957', 'highly_variable5478STDY7980348', 'h

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<125955x26522 sparse matrix of type '<class 'numpy.float32'>'
	with 298892029 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 20)	1.4683918
  (0, 44)	1.4683918
  (0, 55)	1.4683918
  (0, 100)	1.4683918
  (0, 127)	3.322802
  (0, 194)	1.4683918
  (0, 286)	1.4683918
  (0, 323)	1.4683918
  (0, 329)	1.4683918
  (0, 353)	1.4683918
  (0, 366)	2.0392044
  (0, 390)	2.0392044
  (0, 392)	3.9344435
  (0, 423)	2.0392044
  (0, 439)	2.0392044
  (0, 448)	1.4683918
  (0, 452)	2.0392044
  (0, 482)	1.4683918
  (0, 558)	1.4683918
  (0, 562)	2.0392044
  (0, 586)	1.4683918
  (0, 606)	2.0392044
  (0, 607)	1.4683918
  (0, 626)	1.4683918
  (0, 650)	1.4683918
  :	:
  (125954, 26444)	0.3436403
  (125954, 26450)	0.3436403
  (125954, 26452)	0.80209756
  (125954, 26453)	1.1152585
  (125954, 26456)	0.3436403
  (125954, 26458)	0.80209756
  (125954, 26465)	0.3436403
  (125954, 26473)	0.3436403
  (125954, 26479)	0.3436403
  (125954, 26489)	0.970887
  (125954, 26497)	0.80209756
  (125954, 26498)	2.5242875
  (125954, 26499)	1.6293803
  (125954, 26500)	4.031753
  (125954, 26501)	3.3913152
  (125954, 26502)	1.967262
  (125954, 26503)	3.04896

##### **Raw counts matrix**

In [11]:
araw = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/limb_cell_atlas/Data/Raw_data/human_limb_raw.h5ad')

In [12]:
araw.layers.keys()

KeysView(Layers with keys: counts)

In [13]:
araw = ad.AnnData(X=araw.layers['counts'].copy(), obs=araw.obs.copy(), var=araw.var.copy())

In [14]:
print(araw.X)

  (0, 20)	1.0
  (0, 44)	1.0
  (0, 55)	1.0
  (0, 100)	1.0
  (0, 127)	8.0
  (0, 194)	1.0
  (0, 286)	1.0
  (0, 323)	1.0
  (0, 329)	1.0
  (0, 353)	1.0
  (0, 366)	2.0
  (0, 390)	2.0
  (0, 392)	15.0
  (0, 423)	2.0
  (0, 439)	2.0
  (0, 448)	1.0
  (0, 452)	2.0
  (0, 482)	1.0
  (0, 558)	1.0
  (0, 562)	2.0
  (0, 586)	1.0
  (0, 606)	2.0
  (0, 607)	1.0
  (0, 626)	1.0
  (0, 650)	1.0
  :	:
  (125954, 26444)	1.0
  (125954, 26450)	1.0
  (125954, 26452)	3.0
  (125954, 26453)	5.0
  (125954, 26456)	1.0
  (125954, 26458)	3.0
  (125954, 26465)	1.0
  (125954, 26473)	1.0
  (125954, 26479)	1.0
  (125954, 26489)	4.0
  (125954, 26497)	3.0
  (125954, 26498)	28.0
  (125954, 26499)	10.0
  (125954, 26500)	135.0
  (125954, 26501)	70.0
  (125954, 26502)	15.0
  (125954, 26503)	49.0
  (125954, 26504)	83.0
  (125954, 26505)	40.0
  (125954, 26506)	58.0
  (125954, 26507)	23.0
  (125954, 26508)	19.0
  (125954, 26509)	5.0
  (125954, 26510)	57.0
  (125954, 26519)	1.0


##### **Variables(var)**

In [15]:
# View the var of anndata and raw object

In [16]:
adata.var

Unnamed: 0,gene_ids,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,...,highly_variableFCAImmP7536758,highly_variableWSSS_THYst9384957,highly_variable5478STDY7980348,highly_variable5386STDY7557337,highly_variable5386STDY7557336,highly_variableWSSS_THYst9384958,highly_variableWSSS_THYst8796440,highly_variable5386STDY7557335,highly_variable_n,highly_variable
MIR1302-2HG,ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,False
AL627309.1,ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,False
AL627309.3,ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,False
AL732372.1,ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,False
AC114498.1,ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC007325.2,ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,False
AL354822.1,ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,False
AC004556.1,ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,False
AC240274.1,ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,4,False


In [17]:
araw.var

Unnamed: 0,gene_ids,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,...,Deep_WSSS_THYst8796441,Deep_FCAImmP7536758,Deep_WSSS_THYst9384957,Deep_5478STDY7980348,Deep_5386STDY7557337,Deep_5386STDY7557336,Deep_WSSS_THYst9384958,Deep_WSSS_THYst8796440,Deep_5386STDY7557335,Deep_n
MIR1302-2HG,ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
AL627309.1,ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
AL627309.3,ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
AL732372.1,ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
AC114498.1,ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC007325.2,ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
AL354822.1,ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
AC004556.1,ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
AC240274.1,ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0


In [18]:
adata.var['gene_name'] = adata.var.index
araw.var['gene_name'] = araw.var.index

In [19]:
adata.var.index = adata.var['gene_ids'] 
araw.var.index = araw.var['gene_ids']

In [20]:
# Load the approved genes file.

In [21]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [22]:
#Create a dictionary from the approved genes file 

In [23]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [24]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [25]:
len(genedict)

119799

In [26]:
#Filter out the genes which are not in the approved genes file.

In [27]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [28]:
len(var_to_keep_adata)

26252

In [29]:
len(var_to_keep_araw)

26252

In [30]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,...,highly_variableWSSS_THYst9384957,highly_variable5478STDY7980348,highly_variable5386STDY7557337,highly_variable5386STDY7557336,highly_variableWSSS_THYst9384958,highly_variableWSSS_THYst8796440,highly_variable5386STDY7557335,highly_variable_n,highly_variable,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,MIR1302-2HG
ENSG00000238009,ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL627309.3
ENSG00000236601,ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL732372.1
ENSG00000235146,ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AC114498.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277196,ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AC007325.2
ENSG00000278384,ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL354822.1
ENSG00000276345,ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AC004556.1
ENSG00000271254,ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,4,False,AC240274.1


In [31]:
araw.var

Unnamed: 0_level_0,gene_ids,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,...,Deep_FCAImmP7536758,Deep_WSSS_THYst9384957,Deep_5478STDY7980348,Deep_5386STDY7557337,Deep_5386STDY7557336,Deep_WSSS_THYst9384958,Deep_WSSS_THYst8796440,Deep_5386STDY7557335,Deep_n,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,MIR1302-2HG
ENSG00000238009,ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL627309.3
ENSG00000236601,ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL732372.1
ENSG00000235146,ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC114498.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277196,ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC007325.2
ENSG00000278384,ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL354822.1
ENSG00000276345,ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC004556.1
ENSG00000271254,ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC240274.1


In [32]:
# Modify the anndata object by filtering out the filtered genes.

In [33]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [34]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,...,highly_variableWSSS_THYst9384957,highly_variable5478STDY7980348,highly_variable5386STDY7557337,highly_variable5386STDY7557336,highly_variableWSSS_THYst9384958,highly_variableWSSS_THYst8796440,highly_variable5386STDY7557335,highly_variable_n,highly_variable,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,MIR1302-2HG
ENSG00000238009,ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL627309.3
ENSG00000236601,ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL732372.1
ENSG00000235146,ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AC114498.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277196,ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AC007325.2
ENSG00000278384,ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL354822.1
ENSG00000276345,ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AC004556.1
ENSG00000271254,ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,4,False,AC240274.1


In [35]:
# View var

In [36]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,...,highly_variableWSSS_THYst9384957,highly_variable5478STDY7980348,highly_variable5386STDY7557337,highly_variable5386STDY7557336,highly_variableWSSS_THYst9384958,highly_variableWSSS_THYst8796440,highly_variable5386STDY7557335,highly_variable_n,highly_variable,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,MIR1302-2HG
ENSG00000238009,ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL627309.3
ENSG00000236601,ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL732372.1
ENSG00000235146,ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AC114498.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277196,ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AC007325.2
ENSG00000278384,ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AL354822.1
ENSG00000276345,ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,False,AC004556.1
ENSG00000271254,ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,4,False,AC240274.1


In [37]:
araw.var

Unnamed: 0_level_0,gene_ids,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,...,Deep_FCAImmP7536758,Deep_WSSS_THYst9384957,Deep_5478STDY7980348,Deep_5386STDY7557337,Deep_5386STDY7557336,Deep_WSSS_THYst9384958,Deep_WSSS_THYst8796440,Deep_5386STDY7557335,Deep_n,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,MIR1302-2HG
ENSG00000238009,ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL627309.3
ENSG00000236601,ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL732372.1
ENSG00000235146,ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC114498.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277196,ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC007325.2
ENSG00000278384,ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL354822.1
ENSG00000276345,ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC004556.1
ENSG00000271254,ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC240274.1


feature is filtered

In [38]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

  adata.var['feature_is_filtered'] = [False] * len(adata.var)


In [39]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,...,highly_variable5478STDY7980348,highly_variable5386STDY7557337,highly_variable5386STDY7557336,highly_variableWSSS_THYst9384958,highly_variableWSSS_THYst8796440,highly_variable5386STDY7557335,highly_variable_n,highly_variable,gene_name,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,MIR1302-2HG,False
ENSG00000238009,ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AL627309.1,False
ENSG00000239945,ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AL627309.3,False
ENSG00000236601,ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AL732372.1,False
ENSG00000235146,ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AC114498.1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277196,ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AC007325.2,False
ENSG00000278384,ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AL354822.1,False
ENSG00000276345,ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AC004556.1,False
ENSG00000271254,ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,...,False,False,False,False,False,False,4,False,AC240274.1,False


In [40]:
araw.var

Unnamed: 0_level_0,gene_ids,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,...,Deep_FCAImmP7536758,Deep_WSSS_THYst9384957,Deep_5478STDY7980348,Deep_5386STDY7557337,Deep_5386STDY7557336,Deep_WSSS_THYst9384958,Deep_WSSS_THYst8796440,Deep_5386STDY7557335,Deep_n,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,MIR1302-2HG
ENSG00000238009,ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL627309.3
ENSG00000236601,ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL732372.1
ENSG00000235146,ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC114498.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277196,ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC007325.2
ENSG00000278384,ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL354822.1
ENSG00000276345,ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC004556.1
ENSG00000271254,ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC240274.1


In [41]:
del adata.var['gene_ids']
del araw.var['gene_ids']

In [42]:
araw.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/limb_cell_atlas/Data/Raw_data/human_limb_approved_genes_raw.h5ad', compression = 'gzip')

#### **obs (Cell metadata)**

In [43]:
#view obs

In [44]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,leiden,S_score,G2M_score,phase,leiden_R,celltype
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,-0.117172,-0.204391,G1,2,Mes1
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,-0.104853,-0.170856,G1,40,OCP
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,0.033006,0.277833,G2M,2,Mes1
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,0.106430,0.407822,G2M,2,Mes1
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,26,-0.120746,-0.143064,G1,140,VenousEndo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,20,0.300386,0.415698,G2M,220,DistalMes
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,-0.157143,-0.116372,G1,90,InterZone
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,-0.108790,-0.151803,G1,90,InterZone
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,0.063410,-0.133681,S,00,Mes2


In [45]:
# view the column names in obs

In [46]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden',
       'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype'],
      dtype='object')

In [47]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,leiden,S_score,G2M_score,phase,leiden_R,celltype
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,-0.117172,-0.204391,G1,2,Mes1
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,-0.104853,-0.170856,G1,40,OCP
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,0.033006,0.277833,G2M,2,Mes1
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,0.106430,0.407822,G2M,2,Mes1
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,26,-0.120746,-0.143064,G1,140,VenousEndo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,20,0.300386,0.415698,G2M,220,DistalMes
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,-0.157143,-0.116372,G1,90,InterZone
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,-0.108790,-0.151803,G1,90,InterZone
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,0.063410,-0.133681,S,00,Mes2


#### **assay_ontology_term_id**

In [48]:
adata.obs['barcodes'] = adata.obs_names

In [49]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [50]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [51]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [52]:
adata.obs['assay'] = adata.obs['barcodes'].map(mapping)

In [53]:
list(adata.obs['assay'].unique())

['3pv2_5pv1_5pv2', '3pv2_5pv1_5pv2+3pv3', '3pv2_5pv1_5pv2+multiome', nan]

In [54]:
mapping={'5386STDY7537944':'3v2',
'FCAImmP7536758':'3v2',
'FCAImmP7536759':'3v2',
'5386STDY7557335':'3v2',
'5386STDY7557336':'3v2',
'5386STDY7557337':'3v2',
'5478STDY7652318':'3v2',
'5478STDY7717491':'3v2',
'5478STDY7717492':'3v2',
'5478STDY7935101':'3v2',
'5478STDY7935102':'3v2',
'5478STDY7980348':'3v2',
'5478STDY7980349':'3v2',
'WSSS_THYst8796437':'3v2',
'WSSS_THYst8796438':'3v2',
'WSSS_THYst8796439':'3v2',
'WSSS_THYst8796440':'3v2',
'WSSS_THYst8796441':'3v2',
'WSSS_THYst8796442':'3v2',
'WSSS_THYst9384953':'5v1',
'WSSS_THYst9384954':'5v1',
'WSSS_THYst9384955':'5v1',
'WSSS_THYst9384956':'5v1',
'WSSS_THYst9384957':'5v1',
'WSSS_THYst9384958':'5v1',
'WSSS_THYst9807808':'3v3',
'WSSS_THYst9807809':'3v3',
'WSSS_THYst9807810':'3v3',
'WSSS_THYst9807811':'3v3',
'WSSS_THYst9807812':'3v3',
'WSSS_THYst9807813':'3v3',
'WSSS_THYst9807814':'3v3',
'WSSS_THYst9807815':'3v3',
'WSSS_THYst9807816':'3v3',
'WSSS_THYst9807817':'3v3',
'WSSS_THYst9807818':'3v3',
'WSSS_THYst9807819':'3v3',
'WSSS_THYst9807820':'3v3',
'1_e13_5':'3v2',
'3_e11':'3v2',
'4_e12':'3v2',
'5_e13':'3v2',
'6_e15':'3v2',
'7_e10_5':'3v2',
'8_e15_whole':'3v2',
'9_e15_prox':'3v2',
'10_e15_mid':'3v2',
'11_e15_dist':'3v2',
'12_e13':'3v2',
'13_e14':'3v2',
'GSM4498677':'3v2',
'GSM4498678':'3v2',
'GSM4227224':'3v2',
'GSM4227225':'3v2',
'GSM4227226':'3v2',
'GSM4227227':'3v2'}

In [55]:
adata.obs['assays']  = adata.obs['batch'].map(mapping)

In [56]:
list(adata.obs['assay'].unique())

['3pv2_5pv1_5pv2', '3pv2_5pv1_5pv2+3pv3', '3pv2_5pv1_5pv2+multiome', nan]

In [57]:
list(adata.obs['assays'].unique())

['3v2', '5v1']

In [58]:
import pandas as pd
import scanpy as sc

# Suppose 'adata' is your AnnData object already loaded.

# Step 1: Check unique values in 'assay'
unique_assays = adata.obs['assay'].unique()
print("Unique values in 'assay':", unique_assays)

# Step 2: Display unique 'assays' for each unique 'assay'
for assay in unique_assays:
    unique_assays_for_group = adata.obs.loc[adata.obs['assay'] == assay, 'assays'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)

Unique values in 'assay': ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay 3pv2_5pv1_5pv2: ['3v2' '5v1']
Unique 'assays' for assay 3pv2_5pv1_5pv2+3pv3: ['3v2' '5v1']
Unique 'assays' for assay 3pv2_5pv1_5pv2+multiome: ['3v2' '5v1']
Unique 'assays' for assay nan: []


In [59]:
mapping= {'3v2':'EFO:0009899', '5v1' : 'EFO:0011025'}

In [60]:
adata.obs['assay_ontology_term_id']  = adata.obs['assays'].map(mapping)

In [61]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [62]:
# view adata.obs

In [63]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,leiden,S_score,G2M_score,phase,leiden_R,celltype,barcodes,assay,assays,assay_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,-0.117172,-0.204391,G1,2,Mes1,AAACCTGCACATTTCT,3pv2_5pv1_5pv2,3v2,EFO:0009899
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,-0.104853,-0.170856,G1,40,OCP,AAACCTGCAGGGATTG,3pv2_5pv1_5pv2,3v2,EFO:0009899
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,0.033006,0.277833,G2M,2,Mes1,AAACCTGCATCGGGTC,3pv2_5pv1_5pv2,3v2,EFO:0009899
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,0.106430,0.407822,G2M,2,Mes1,AAACCTGCATGGGACA,3pv2_5pv1_5pv2,3v2,EFO:0009899
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,26,-0.120746,-0.143064,G1,140,VenousEndo,AAACCTGGTGATGCCC,3pv2_5pv1_5pv2,3v2,EFO:0009899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,20,0.300386,0.415698,G2M,220,DistalMes,TTTGTCAAGATAGGAG,3pv2_5pv1_5pv2,5v1,EFO:0011025
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,-0.157143,-0.116372,G1,90,InterZone,TTTGTCACATATGCTG,3pv2_5pv1_5pv2,5v1,EFO:0011025
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,-0.108790,-0.151803,G1,90,InterZone,TTTGTCAGTGATGCCC,3pv2_5pv1_5pv2,5v1,EFO:0011025
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,0.063410,-0.133681,S,00,Mes2,TTTGTCAGTGCACCAC,3pv2_5pv1_5pv2,5v1,EFO:0011025


In [64]:
assays_counts = adata.obs['assay_ontology_term_id'].value_counts()

In [65]:
assays_counts

assay_ontology_term_id
EFO:0009899    101257
EFO:0011025     24698
Name: count, dtype: int64

#### **cell_type_ontology_term_id**

In [66]:
#identify the column in adata.obs related. to cell type annotation

In [67]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden',
       'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype', 'barcodes',
       'assay', 'assays', 'assay_ontology_term_id'],
      dtype='object')

In [68]:
list(adata.obs['celltype'].unique())

['Mes1',
 'OCP',
 'VenousEndo',
 'Mes4',
 'TransMes',
 'PrimErythro2',
 'STMN2+Fibro',
 'PAX3+MyoProg',
 'OsteoB',
 'AER-Basal',
 'MesCond',
 'DistalMes',
 'Periderm',
 'MyoB1',
 'ISL1+Mes',
 'ChondroProg',
 'Mes2',
 'InterZone',
 'ProxMes',
 'RDH10+DistalMes',
 'ArterialEndo',
 'SchwannProg',
 'HOXC5+DermFibroProg',
 'MyoC1',
 'PrimErythro1',
 'MYL3+MyoC',
 'Mes3',
 'Megakaryo',
 'Monocyte',
 'SynapSchwann',
 'PAX7+MyoProg',
 'Pericyte',
 'ADH+Fibro',
 'DermFibro',
 'Macro',
 'MFAP5+Fibro',
 'TenoProg',
 'SMC',
 'Perimysium',
 'SMProg',
 'MyoB2',
 'Schwann',
 'F10+DermFibroProg',
 'NeuralFibro',
 'Teno',
 'PrehyperChon',
 'NK',
 'MyoC2',
 'B',
 'PeriChon',
 'InterMusFibro',
 'Basal',
 'LymphEndo',
 'DC2',
 'LMPP/ELP',
 'Melano',
 'MYH3+MyoC',
 'CMP/GMP',
 'ProlifChon',
 'Mast',
 'Myelocyte',
 'ArtiChon',
 'HyperChon',
 'RestingChon',
 'DefReticulo',
 'DefErythro',
 'Neuronal']

In [69]:
# create a dictionary of cell type and ontology term

In [70]:
mapping= {
'Mes1':'CL:0008019',
'OCP':'CL:0000062',
'VenousEndo':'CL:0002543',
'Mes4':'CL:0008019',
'TransMes':'CL:0008019',
'PrimErythro2':'CL:0002355',
'STMN2+Fibro':'CL:0002551',
'PAX3+MyoProg':'CL:0000515',
'OsteoB':'CL:0000062',
'AER-Basal':'CL:0000646',
'MesCond':'CL:0000138',
'DistalMes':'CL:0008019',
'Periderm':'CL:0000078',
'MyoB1':'CL:0000056',
'ISL1+Mes':'CL:0008019',
'ChondroProg':'CL:0000138',
'Mes2':'CL:0008019',
'InterZone':'CL:0008019',
'ProxMes':'CL:0008019',
'RDH10+DistalMes':'CL:0008019',
'ArterialEndo':'CL:1000413',
'SchwannProg':'CL:0002375',
'HOXC5+DermFibroProg':'CL:0002551',
'MyoC1':'CL:0000187',
'PrimErythro1':'CL:0002355',
'MYL3+MyoC':'CL:0000187',
'Mes3':'CL:0008019',
'Megakaryo':'CL:0000556',
'Monocyte':'CL:0000576',
'SynapSchwann':'CL:0002573',
'PAX7+MyoProg':'CL:0000515',
'Pericyte':'CL:0000669',
'ADH+Fibro':'CL:1001609',
'DermFibro':'CL:0002551',
'Macro':'CL:0000235',
'MFAP5+Fibro':'CL:0000057',
'TenoProg':'CL:0000388',
'SMC':'CL:0000192',
'Perimysium':'CL:0002320',
'SMProg':'CL:0000192',
'MyoB2':'CL:0000056',
'Schwann':'CL:0002573',
'F10+DermFibroProg':'CL:0002551',
'NeuralFibro':'CL:0000057',
'Teno':'CL:0000388',
'PrehyperChon':'CL:0000138',
'NK':'CL:0000623',
'MyoC2':'CL:0000187',
'B':'CL:0000236',
'PeriChon':'CL:0000058',
'InterMusFibro':'CL:1001609',
'Basal':'CL:0000646',
'LymphEndo':'CL:0002138',
'DC2':'CL:0000990',
'LMPP/ELP':'CL:0000936',
'Melano':'CL:0000148',
'MYH3+MyoC':'CL:0000187',
'CMP/GMP':'CL:0000049',
'ProlifChon':'CL:0000138',
'Mast':'CL:0000097',
'Myelocyte':'CL:0002193',
'ArtiChon':'CL:1001607',
'HyperChon':'CL:0000743',
'RestingChon':'CL:0000138',
'DefReticulo':'CL:0000558',
'DefErythro':'CL:0000232',
'Neuronal':'CL:0000540',
'MyoB':'CL:0000056',
'EarlyDistalMes':'CL:0008019',
'Meox2+Mes':'CL:0008019',
'PrimErythro':'CL:0002355',
'Dpt+Fibro':'CL:0002551',
'EarlyProxMes':'CL:0008019',
'Basophil':'CL:0000767',
'ILC':'CL:0001065',
'SupraBasal1':'CL:0000066',
'Placode':'CL:0002483',
'SupraBasal2':'CL:0000066',
'Nail':'CL:4033056',
'PAX3+PAX7+MyoProg':'CL:0000187',
'PAX7+SPON2+MyoProg':'CL:0000187',
'PAX7+NTN5+MyoProg':'CL:0000187'}

In [71]:
# add the cell_type_ontology_term_id column

In [72]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['celltype'].map(mapping)

In [73]:
# change datatype of the column

In [74]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [75]:
# view adata.obs

In [76]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,S_score,G2M_score,phase,leiden_R,celltype,barcodes,assay,assays,assay_ontology_term_id,cell_type_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,-0.117172,-0.204391,G1,2,Mes1,AAACCTGCACATTTCT,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,-0.104853,-0.170856,G1,40,OCP,AAACCTGCAGGGATTG,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0000062
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,0.033006,0.277833,G2M,2,Mes1,AAACCTGCATCGGGTC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,0.106430,0.407822,G2M,2,Mes1,AAACCTGCATGGGACA,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,-0.120746,-0.143064,G1,140,VenousEndo,AAACCTGGTGATGCCC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0002543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,0.300386,0.415698,G2M,220,DistalMes,TTTGTCAAGATAGGAG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,-0.157143,-0.116372,G1,90,InterZone,TTTGTCACATATGCTG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,-0.108790,-0.151803,G1,90,InterZone,TTTGTCAGTGATGCCC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,0.063410,-0.133681,S,00,Mes2,TTTGTCAGTGCACCAC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019


#### **development_stage_ontology_term_id**

In [77]:
# identify the column in adata which corresponds to age

In [78]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden',
       'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype', 'barcodes',
       'assay', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id'],
      dtype='object')

In [79]:
list(adata.obs['adj_stage'].unique())

['Pcw5.1',
 'Pcw8.4',
 'Pcw8.0',
 'Pcw7.2',
 'Pcw6.5',
 'Pcw5.4',
 'Pcw9.3',
 'Pcw9.0',
 'Pcw6.1',
 'Pcw5.6']

In [80]:
mapping= {'Pcw5.1':'HsapDv:0000022',
 'Pcw8.4':'HsapDv:0000030',
 'Pcw8.0':'HsapDv:0000030',
 'Pcw7.2':'HsapDv:0000026',
 'Pcw6.5':'HsapDv:0000025',
 'Pcw5.4':'HsapDv:0000023',
 'Pcw9.3':'HsapDv:0000047',
 'Pcw9.0':'HsapDv:0000046',
 'Pcw6.1':'HsapDv:0000024',
 'Pcw5.6':'HsapDv:0000023'}

In [81]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['adj_stage'].map(mapping)

In [82]:
# change datatype of the column

In [83]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [84]:
# view unique values of development_stage_ontology_term_id column

In [85]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000022',
 'HsapDv:0000030',
 'HsapDv:0000026',
 'HsapDv:0000025',
 'HsapDv:0000023',
 'HsapDv:0000047',
 'HsapDv:0000046',
 'HsapDv:0000024']

In [86]:
# view adata.obs

In [87]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,G2M_score,phase,leiden_R,celltype,barcodes,assay,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,-0.204391,G1,2,Mes1,AAACCTGCACATTTCT,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,-0.170856,G1,40,OCP,AAACCTGCAGGGATTG,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0000062,HsapDv:0000022
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,0.277833,G2M,2,Mes1,AAACCTGCATCGGGTC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,0.407822,G2M,2,Mes1,AAACCTGCATGGGACA,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,-0.143064,G1,140,VenousEndo,AAACCTGGTGATGCCC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0002543,HsapDv:0000022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,0.415698,G2M,220,DistalMes,TTTGTCAAGATAGGAG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,-0.116372,G1,90,InterZone,TTTGTCACATATGCTG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,-0.151803,G1,90,InterZone,TTTGTCAGTGATGCCC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,-0.133681,S,00,Mes2,TTTGTCAGTGCACCAC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023


#### **donor_id**

In [88]:
#identify the column in adata.obs which provides donor information

In [89]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden',
       'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype', 'barcodes',
       'assay', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [90]:
list(adata.obs['batch'].unique())

['5386STDY7537944',
 'FCAImmP7536758',
 'FCAImmP7536759',
 '5386STDY7557335',
 '5386STDY7557336',
 '5386STDY7557337',
 '5478STDY7652318',
 '5478STDY7717491',
 '5478STDY7717492',
 '5478STDY7935101',
 '5478STDY7935102',
 '5478STDY7980348',
 '5478STDY7980349',
 'WSSS_THYst8796437',
 'WSSS_THYst8796438',
 'WSSS_THYst8796439',
 'WSSS_THYst8796440',
 'WSSS_THYst8796441',
 'WSSS_THYst8796442',
 'WSSS_THYst9384953',
 'WSSS_THYst9384954',
 'WSSS_THYst9384955',
 'WSSS_THYst9384956',
 'WSSS_THYst9384957',
 'WSSS_THYst9384958']

In [91]:
# add the donor_id column

In [92]:
adata.obs['donor_id'] = adata.obs['batch']

In [93]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [94]:
# change datatype of the column

In [95]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [96]:
# view unique values of donor_id column

In [97]:
list(adata.obs['donor_id'].unique())

['5386STDY7537944',
 'FCAImmP7536758',
 'FCAImmP7536759',
 '5386STDY7557335',
 '5386STDY7557336',
 '5386STDY7557337',
 '5478STDY7652318',
 '5478STDY7717491',
 '5478STDY7717492',
 '5478STDY7935101',
 '5478STDY7935102',
 '5478STDY7980348',
 '5478STDY7980349',
 'WSSS_THYst8796437',
 'WSSS_THYst8796438',
 'WSSS_THYst8796439',
 'WSSS_THYst8796440',
 'WSSS_THYst8796441',
 'WSSS_THYst8796442',
 'WSSS_THYst9384953',
 'WSSS_THYst9384954',
 'WSSS_THYst9384955',
 'WSSS_THYst9384956',
 'WSSS_THYst9384957',
 'WSSS_THYst9384958']

In [98]:
#view obs

In [99]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,phase,leiden_R,celltype,barcodes,assay,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,G1,2,Mes1,AAACCTGCACATTTCT,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,G1,40,OCP,AAACCTGCAGGGATTG,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0000062,HsapDv:0000022,5386STDY7537944
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,G2M,2,Mes1,AAACCTGCATCGGGTC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,G2M,2,Mes1,AAACCTGCATGGGACA,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,G1,140,VenousEndo,AAACCTGGTGATGCCC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0002543,HsapDv:0000022,5386STDY7537944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,G2M,220,DistalMes,TTTGTCAAGATAGGAG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,G1,90,InterZone,TTTGTCACATATGCTG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,G1,90,InterZone,TTTGTCAGTGATGCCC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,S,00,Mes2,TTTGTCAGTGCACCAC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958


In [100]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden',
       'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype', 'barcodes',
       'assay', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [101]:
adata.obs['disease_ontology_term_id']= ['PATO:0000461'] * len(adata.obs)

In [102]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,leiden_R,celltype,barcodes,assay,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,2,Mes1,AAACCTGCACATTTCT,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,40,OCP,AAACCTGCAGGGATTG,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0000062,HsapDv:0000022,5386STDY7537944,PATO:0000461
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,2,Mes1,AAACCTGCATCGGGTC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,2,Mes1,AAACCTGCATGGGACA,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,140,VenousEndo,AAACCTGGTGATGCCC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0002543,HsapDv:0000022,5386STDY7537944,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,220,DistalMes,TTTGTCAAGATAGGAG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,90,InterZone,TTTGTCACATATGCTG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,90,InterZone,TTTGTCAGTGATGCCC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,00,Mes2,TTTGTCAGTGCACCAC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461


In [103]:
# change datatype of the column

In [104]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [105]:
# view obs

In [106]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,leiden_R,celltype,barcodes,assay,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,2,Mes1,AAACCTGCACATTTCT,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,40,OCP,AAACCTGCAGGGATTG,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0000062,HsapDv:0000022,5386STDY7537944,PATO:0000461
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,2,Mes1,AAACCTGCATCGGGTC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,2,Mes1,AAACCTGCATGGGACA,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,140,VenousEndo,AAACCTGGTGATGCCC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0002543,HsapDv:0000022,5386STDY7537944,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,220,DistalMes,TTTGTCAAGATAGGAG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,90,InterZone,TTTGTCACATATGCTG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,90,InterZone,TTTGTCAGTGATGCCC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,00,Mes2,TTTGTCAGTGCACCAC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461


#### **is_primary_data**

In [107]:
#change data type of column

In [108]:
adata.obs['is_primary_data']= [True] * len(adata.obs)

In [109]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [110]:
# view obs

In [111]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,celltype,barcodes,assay,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,Mes1,AAACCTGCACATTTCT,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,OCP,AAACCTGCAGGGATTG,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0000062,HsapDv:0000022,5386STDY7537944,PATO:0000461,True
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,Mes1,AAACCTGCATCGGGTC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,Mes1,AAACCTGCATGGGACA,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,VenousEndo,AAACCTGGTGATGCCC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0002543,HsapDv:0000022,5386STDY7537944,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,DistalMes,TTTGTCAAGATAGGAG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,InterZone,TTTGTCACATATGCTG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,InterZone,TTTGTCAGTGATGCCC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,Mes2,TTTGTCAGTGCACCAC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True


#### **organism_ontology_term_id**

In [112]:
# assign organism id 

In [113]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [114]:
#change data type of column

In [115]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [116]:
# view obs

In [117]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,barcodes,assay,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,AAACCTGCACATTTCT,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,AAACCTGCAGGGATTG,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0000062,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,AAACCTGCATCGGGTC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,AAACCTGCATGGGACA,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,AAACCTGGTGATGCCC,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0002543,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,TTTGTCAAGATAGGAG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,TTTGTCACATATGCTG,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,TTTGTCAGTGATGCCC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,TTTGTCAGTGCACCAC,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [118]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [119]:
# change data type

In [120]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [121]:
# view obs

In [122]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,assay,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0000062,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0002543,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown


In [123]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden',
       'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype', 'barcodes',
       'assay', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

#### **sex_ontology_term_id**

In [124]:
# identify the column in adata.obs which corresponds to sex

In [125]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden',
       'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype', 'barcodes',
       'assay', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [126]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,assay,assays,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0000062,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,3pv2_5pv1_5pv2,3v2,EFO:0009899,CL:0002543,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,3pv2_5pv1_5pv2,5v1,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown


In [127]:
# list the unique values 

In [128]:
mapping = {'5386STDY7537944':'F',
           '5478STDY7717491':'M',
'5478STDY7717492':'M',
'5478STDY7652318':'M',
'5386STDY7557336':'M',
'5386STDY7557337':'M',
'5386STDY7557335':'F',
'FCAImmP7536758':'F',
'FCAImmP7536759':'F',
'5478STDY7980348':'M',
'5478STDY7980349':'M',
'5478STDY7935101':'F',
'5478STDY7935102':'M',
'WSSS_THYst9384953':'M',
'WSSS_THYst9384954':'M',
'WSSS_THYst9384955':'M',
'WSSS_THYst9384956':'M',
'WSSS_THYst9384957':'M',
'WSSS_THYst9384958':'M',
'WSSS_THYst8796437':'M',
'WSSS_THYst8796438':'M',
'WSSS_THYst8796439':'M',
'WSSS_THYst8796440':'M',
'WSSS_THYst8796441':'M',
'WSSS_THYst8796442':'M',
'WSSS_THYst9383359':'M',
'WSSS_THYst9383360':'M',
'WSSS_THYst9383361':'M',
'WSSS_THYst9383362':'M',
'WSSS_THYst9699523':'F',
'WSSS_THYst9699524':'F',
'WSSS_THYst9699525':'F',
'WSSS_THYst9699526':'unknown'}

In [129]:
adata.obs['sex'] = adata.obs['batch'].map(mapping)

In [130]:
# create a dictionary of sex and sex ontology term id

In [131]:
mapping= {'F': 'PATO:0000383', 'M': 'PATO:0000384', 'unknown':'unknown'}

In [132]:
# add sex_ontology_term_id column

In [133]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex'].map(mapping)

In [134]:
# change data type

In [135]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [136]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,EFO:0009899,CL:0000062,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,EFO:0009899,CL:0002543,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384


#### **suspension_type**

In [137]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,EFO:0009899,CL:0000062,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,EFO:0009899,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,EFO:0009899,CL:0002543,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,EFO:0011025,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384


In [138]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [139]:
# change data type of column

In [140]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [141]:
# view obs

In [142]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id,suspension_type
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,CL:0000062,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,CL:0008019,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,CL:0002543,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,CL:0008019,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell


#### **tissue_type**

In [143]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [144]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [145]:
# identify the column in adata.obs which corresponds to tissue

In [146]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden',
       'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype', 'barcodes',
       'assay', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue_type'],
      dtype='object')

In [147]:
list(adata.obs['tissue'].unique())

['LowerLimb', 'UpperLimb']

In [148]:
# Ensure 'barcode' and 'cell_type_ontology_term_id' are columns in adata.obs
if 'batch' in adata.obs.columns and 'tissue' in adata.obs.columns:
    # Create the dictionary
    mapping = dict(zip(adata.obs['batch'], adata.obs['tissue']))
else:
    raise KeyError("Columns 'barcode' and/or 'cell_type_ontology_term_id' not found in adata.obs")

In [149]:
mapping

{'5386STDY7537944': 'LowerLimb',
 'FCAImmP7536758': 'LowerLimb',
 'FCAImmP7536759': 'LowerLimb',
 '5386STDY7557335': 'LowerLimb',
 '5386STDY7557336': 'LowerLimb',
 '5386STDY7557337': 'LowerLimb',
 '5478STDY7652318': 'LowerLimb',
 '5478STDY7717491': 'LowerLimb',
 '5478STDY7717492': 'LowerLimb',
 '5478STDY7935101': 'LowerLimb',
 '5478STDY7935102': 'LowerLimb',
 '5478STDY7980348': 'LowerLimb',
 '5478STDY7980349': 'LowerLimb',
 'WSSS_THYst8796437': 'UpperLimb',
 'WSSS_THYst8796438': 'UpperLimb',
 'WSSS_THYst8796439': 'UpperLimb',
 'WSSS_THYst8796440': 'LowerLimb',
 'WSSS_THYst8796441': 'UpperLimb',
 'WSSS_THYst8796442': 'LowerLimb',
 'WSSS_THYst9384953': 'UpperLimb',
 'WSSS_THYst9384954': 'UpperLimb',
 'WSSS_THYst9384955': 'UpperLimb',
 'WSSS_THYst9384956': 'LowerLimb',
 'WSSS_THYst9384957': 'LowerLimb',
 'WSSS_THYst9384958': 'LowerLimb'}

In [150]:
mapping= {'LowerLimb':'UBERON:0002103', 'UpperLimb':'UBERON:0002102'}

In [151]:
# add 'tissue_ontology_term_id' column

In [152]:
adata.obs['tissue_ontology_term_id'] =adata.obs['tissue'].map(mapping)

In [153]:
# change data type of column

In [154]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [155]:
#list the unique values in 'tissue_ontology_term_id' column

In [156]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002103', 'UBERON:0002102']

In [157]:
# view obs

In [158]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell,tissue,UBERON:0002103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell,tissue,UBERON:0002103
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell,tissue,UBERON:0002103
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell,tissue,UBERON:0002103
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell,tissue,UBERON:0002103


In [159]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden',
       'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype', 'barcodes',
       'assay', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [160]:
# view obsm

In [161]:
# check whether all columns are prefixed with X

In [162]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

#### **uns (Dataset Metadata)**

In [163]:
# View

In [164]:
adata.uns

{'adj_stage_colors': array(['#DF0029', '#EC870E', '#F1AF00', '#DCD800', '#5BBD2B', '#00B2BF',
        '#7388C1', '#511F90', '#79378B', '#8F006D'], dtype=object),
 'batch_colors': array(['#023fa5', '#7d87b9', '#bec1d4', '#d6bcc0', '#bb7784', '#8e063b',
        '#4a6fe3', '#8595e1', '#b5bbe3', '#e6afb9', '#e07b91', '#d33f6a',
        '#11c638', '#8dd593', '#c6dec7', '#ead3c6', '#f0b98d', '#ef9708',
        '#0fcfc0', '#9cded6', '#d5eae7', '#f3e1eb', '#f6c4e1', '#f79cd4',
        '#7f7f7f'], dtype=object),
 'celltype_colors': array(['#ffff00', '#82D900', '#9AFF02', '#5E005E', '#008941', '#00E3E3',
        '#a30059', '#C6A300', '#0000a6', '#B15BFF', '#b79762', '#005757',
        '#6C3365', '#00AEAE', '#005AB5', '#ff0000', '#00EC00', '#484891',
        '#737300', '#61615a', '#BF0060', '#00FFFF', '#B8B8DC', '#AE00AE',
        '#ddefff', '#000035', '#7b4f4b', '#95CACA', '#006000', '#00BB00',
        '#009100', '#79FF79', '#424200', '#c2ffed', '#a079bf', '#FF44FF',
        '#BB3D00', '#AE0000'

In [165]:
adata.uns.keys

<function dict.keys>

In [166]:
# Give a title for the dataset

In [167]:
adata.uns['title'] = 'human_limb_scRNAseq'

In [168]:
# Set the default embedding

In [169]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [170]:
# view anndata object

In [171]:
adata

AnnData object with n_obs × n_vars = 125955 × 26252
    obs: 'batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden', 'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype', 'barcodes', 'assay', 'assays', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'feature_types', 'n_cells', 'highly_variableWSSS_THYst8796437', 'highly_variableWSSS_THYst8796442', 'highly_variable5478STDY7935101', 'highly_variable5478STDY7935102', 'highly_variableWSSS_THYst8796439', 'highly_variableWSSS_THYst9384956', 'highly_variable5478STDY7717491', 'highly_variableWSSS_THYst9384953', 'highly_variable5478STDY7717492', 'highly_variableWSSS_THYst9384955', 'highly_var

In [172]:
# view obs and var data types

In [173]:
adata.obs.dtypes

batch                                       category
percent_mito                                 float32
n_counts                                     float32
n_genes                                        int64
doublet_scores                               float64
bh_pval                                      float64
region                                      category
tissue                                      category
adj_stage                                   category
adj_sample                                  category
leiden                                      category
S_score                                      float64
G2M_score                                    float64
phase                                       category
leiden_R                                    category
celltype                                    category
barcodes                                      object
assay                                         object
assays                                        

In [174]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

changed n_cells from int64 to int32


In [175]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed doublet_scores from float64 to float32
changed bh_pval from float64 to float32
changed S_score from float64 to float32
changed G2M_score from float64 to float32
changed n_genes from int64 to int32
changed barcodes from object to category
changed assay from object to category
changed assays from object to category
changed sex from object to category


In [176]:
# view obs

In [177]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,tissue,adj_stage,adj_sample,...,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGCAGGGATTG-5386STDY7537944,5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGCATCGGGTC-5386STDY7537944,5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGCATGGGACA-5386STDY7537944,5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGGTGATGCCC-5386STDY7537944,5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,LowerLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,...,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,F,PATO:0000383,cell,tissue,UBERON:0002103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell,tissue,UBERON:0002103
TTTGTCACATATGCTG-WSSS_THYst9384958,WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell,tissue,UBERON:0002103
TTTGTCAGTGATGCCC-WSSS_THYst9384958,WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell,tissue,UBERON:0002103
TTTGTCAGTGCACCAC-WSSS_THYst9384958,WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,LowerLimb,Pcw5.6,Pcw5.6_Proximal_LowerLimb,...,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,cell,tissue,UBERON:0002103


In [178]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'leiden',
       'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype', 'barcodes',
       'assay', 'assays', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [179]:
# delete unwanted columns in obs

In [180]:
del adata.obs['barcodes']
del adata.obs['assay']
del adata.obs['tissue']
del adata.obs['sex']
del adata.obs['assays']
del adata.obs['batch']

In [181]:
# view obs

In [182]:
adata.obs

Unnamed: 0,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,region,adj_stage,adj_sample,leiden,S_score,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGCACATTTCT-5386STDY7537944,0.011364,2992.0,1152,0.286585,0.872782,WholeLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,-0.117172,...,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGCAGGGATTG-5386STDY7537944,0.006805,2939.0,1187,0.194631,0.872782,WholeLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,-0.104853,...,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGCATCGGGTC-5386STDY7537944,0.011667,6857.0,2329,0.098748,0.872782,WholeLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,0.033006,...,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGCATGGGACA-5386STDY7537944,0.009933,10974.0,2967,0.185345,0.872782,WholeLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,12,0.106430,...,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002103
AAACCTGGTGATGCCC-5386STDY7537944,0.015003,6532.0,2083,0.130000,0.927508,WholeLimb,Pcw5.1,Pcw5.1_WholeLimb_LowerLimb,26,-0.120746,...,HsapDv:0000022,5386STDY7537944,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGATAGGAG-WSSS_THYst9384958,0.024296,17616.0,4814,0.118644,0.910325,Proximal,Pcw5.6,Pcw5.6_Proximal_LowerLimb,20,0.300386,...,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002103
TTTGTCACATATGCTG-WSSS_THYst9384958,0.025709,5250.0,2190,0.190476,0.902918,Proximal,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,-0.157143,...,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002103
TTTGTCAGTGATGCCC-WSSS_THYst9384958,0.026241,9183.0,3080,0.215116,0.902918,Proximal,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,-0.108790,...,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002103
TTTGTCAGTGCACCAC-WSSS_THYst9384958,0.036578,12576.0,3849,0.152466,0.910325,Proximal,Pcw5.6,Pcw5.6_Proximal_LowerLimb,2,0.063410,...,HsapDv:0000023,WSSS_THYst9384958,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002103


In [183]:
# view var

In [184]:
adata.var

Unnamed: 0_level_0,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,highly_variableWSSS_THYst9384953,...,highly_variable5478STDY7980348,highly_variable5386STDY7557337,highly_variable5386STDY7557336,highly_variableWSSS_THYst9384958,highly_variableWSSS_THYst8796440,highly_variable5386STDY7557335,highly_variable_n,highly_variable,gene_name,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,MIR1302-2HG,False
ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AL627309.1,False
ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AL627309.3,False
ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AL732372.1,False
ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AC114498.1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AC007325.2,False
ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AL354822.1,False
ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,False,AC004556.1,False
ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,4,False,AC240274.1,False


In [185]:
araw.var

Unnamed: 0_level_0,feature_types,n_cells,highly_variableWSSS_THYst8796437,highly_variableWSSS_THYst8796442,highly_variable5478STDY7935101,highly_variable5478STDY7935102,highly_variableWSSS_THYst8796439,highly_variableWSSS_THYst9384956,highly_variable5478STDY7717491,highly_variableWSSS_THYst9384953,...,Deep_FCAImmP7536758,Deep_WSSS_THYst9384957,Deep_5478STDY7980348,Deep_5386STDY7557337,Deep_5386STDY7557336,Deep_WSSS_THYst9384958,Deep_WSSS_THYst8796440,Deep_5386STDY7557335,Deep_n,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000243485,Gene Expression,41,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,MIR1302-2HG
ENSG00000238009,Gene Expression,139,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL627309.1
ENSG00000239945,Gene Expression,8,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL627309.3
ENSG00000236601,Gene Expression,22,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL732372.1
ENSG00000235146,Gene Expression,8,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC114498.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277196,Gene Expression,35,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC007325.2
ENSG00000278384,Gene Expression,66,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AL354822.1
ENSG00000276345,Gene Expression,1113,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC004556.1
ENSG00000271254,Gene Expression,11664,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,AC240274.1


In [186]:
#view uns

In [187]:
adata.uns

{'adj_stage_colors': array(['#DF0029', '#EC870E', '#F1AF00', '#DCD800', '#5BBD2B', '#00B2BF',
        '#7388C1', '#511F90', '#79378B', '#8F006D'], dtype=object),
 'batch_colors': array(['#023fa5', '#7d87b9', '#bec1d4', '#d6bcc0', '#bb7784', '#8e063b',
        '#4a6fe3', '#8595e1', '#b5bbe3', '#e6afb9', '#e07b91', '#d33f6a',
        '#11c638', '#8dd593', '#c6dec7', '#ead3c6', '#f0b98d', '#ef9708',
        '#0fcfc0', '#9cded6', '#d5eae7', '#f3e1eb', '#f6c4e1', '#f79cd4',
        '#7f7f7f'], dtype=object),
 'celltype_colors': array(['#ffff00', '#82D900', '#9AFF02', '#5E005E', '#008941', '#00E3E3',
        '#a30059', '#C6A300', '#0000a6', '#B15BFF', '#b79762', '#005757',
        '#6C3365', '#00AEAE', '#005AB5', '#ff0000', '#00EC00', '#484891',
        '#737300', '#61615a', '#BF0060', '#00FFFF', '#B8B8DC', '#AE00AE',
        '#ddefff', '#000035', '#7b4f4b', '#95CACA', '#006000', '#00BB00',
        '#009100', '#79FF79', '#424200', '#c2ffed', '#a079bf', '#FF44FF',
        '#BB3D00', '#AE0000'

In [188]:
list(adata.uns.keys())

['adj_stage_colors',
 'batch_colors',
 'celltype_colors',
 'celltype_sizes',
 'leiden',
 'leiden_R_colors',
 'leiden_sizes',
 'log1p',
 'majority_voting_colors',
 'markers_celltype',
 'markers_new_celltype',
 'neighbors',
 'new_celltype_sizes',
 'paga',
 'pca',
 'phase_colors',
 'pre-annotation_colors',
 'rank_genes_groups',
 'region_colors',
 'tissue_colors',
 'umap',
 'title',
 'default_embedding']

In [189]:
del adata.uns['leiden_R_colors']
del adata.uns['majority_voting_colors']
del adata.uns['pre-annotation_colors']
del adata.uns['tissue_colors']
del adata.uns['log1p']
del adata.uns['batch_colors']

In [190]:
adata.obs.columns

Index(['percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval',
       'region', 'adj_stage', 'adj_sample', 'leiden', 'S_score', 'G2M_score',
       'phase', 'leiden_R', 'celltype', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [191]:
# Remove unwanted columns in uns

In [192]:
#check the format of expression matrix

In [193]:
adata.X

<125955x26252 sparse matrix of type '<class 'numpy.float32'>'
	with 298559457 stored elements in Compressed Sparse Row format>

In [194]:
araw.X

<125955x26252 sparse matrix of type '<class 'numpy.float32'>'
	with 298559457 stored elements in Compressed Sparse Row format>

In [195]:
#Copy raw counts to adata.raw

In [196]:
adata.raw = araw

In [197]:
obs_dtype = adata.obs.dtypes

In [198]:
obs_dtype

percent_mito                                 float32
n_counts                                     float32
n_genes                                        int32
doublet_scores                               float32
bh_pval                                      float32
region                                      category
adj_stage                                   category
adj_sample                                  category
leiden                                      category
S_score                                      float32
G2M_score                                    float32
phase                                       category
leiden_R                                    category
celltype                                    category
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
donor_id                                    category
disease_ontology_term_id                    ca

In [199]:
del adata.raw.var['highly_variableWSSS_THYst8796437']
del adata.raw.var['highly_variableWSSS_THYst9384954']
del adata.raw.var['highly_variable5478STDY7980349']
del adata.raw.var['highly_variable5478STDY7652318']
del adata.raw.var['highly_variableFCAImmP7536758']
del adata.raw.var['highly_variable5478STDY7980348']
del adata.raw.var['highly_variable5386STDY7557336']
del adata.raw.var['highly_variableWSSS_THYst8796440']
del adata.raw.var['highly_variable5386STDY7557335']
del adata.raw.var['highly_variable5386STDY7557337']
del adata.raw.var['highly_variableWSSS_THYst9384957']
del adata.raw.var['highly_variableWSSS_THYst8796441']
del adata.raw.var['highly_variableWSSS_THYst8796438']
del adata.raw.var['highly_variableFCAImmP7536759']
del adata.raw.var['highly_variable5386STDY7537944']
del adata.raw.var['highly_variable5478STDY7717492']
del adata.raw.var['highly_variable5478STDY7717491']
del adata.raw.var['highly_variableWSSS_THYst8796439']
del adata.raw.var['highly_variable5478STDY7935101']
del adata.raw.var['highly_variableWSSS_THYst8796442']
del adata.raw.var['highly_variable5478STDY7935102']
del adata.raw.var['highly_variableWSSS_THYst9384953']
del adata.raw.var['highly_variableWSSS_THYst9384958']
del adata.raw.var['highly_variableWSSS_THYst9384955']
del adata.raw.var['highly_variableWSSS_THYst9384956']
del adata.raw.var['highly_variable_n']
del adata.raw.var['highly_variable']
del adata.raw.var['gene_name']
del adata.raw.var['n_cells']
del adata.raw.var['feature_types']
del adata.raw.var['Deep_WSSS_THYst8796440']
del adata.raw.var['Deep_5478STDY7935102']
del adata.raw.var['Deep_WSSS_THYst8796442']
del adata.raw.var['Deep_WSSS_THYst9384953']
del adata.raw.var['Deep_WSSS_THYst9384958']
del adata.raw.var['Deep_5386STDY7557336']
del adata.raw.var['Deep_5478STDY7935101']
del adata.raw.var['Deep_WSSS_THYst8796439']
del adata.raw.var['Deep_5478STDY7717491']
del adata.raw.var['Deep_5478STDY7717492']
del adata.raw.var['Deep_5386STDY7537944']
del adata.raw.var['Deep_FCAImmP7536759']
del adata.raw.var['Deep_5478STDY7980349']
del adata.raw.var['Deep_5478STDY7652318']
del adata.raw.var['Deep_FCAImmP7536758']
del adata.raw.var['Deep_5478STDY7980348']
del adata.raw.var['Deep_WSSS_THYst8796437']
del adata.raw.var['Deep_WSSS_THYst9384955']
del adata.raw.var['Deep_WSSS_THYst8796438']
del adata.raw.var['Deep_WSSS_THYst9384957']
del adata.raw.var['Deep_5386STDY7557335']
del adata.raw.var['Deep_WSSS_THYst9384956']
del adata.raw.var['Deep_WSSS_THYst9384954']
del adata.raw.var['Deep_WSSS_THYst8796441']
del adata.raw.var['Deep_5386STDY7557337']
del adata.raw.var['Deep_n']

In [200]:
adata.raw.var

ENSG00000243485
ENSG00000238009
ENSG00000239945
ENSG00000236601
ENSG00000235146
...
ENSG00000277196
ENSG00000278384
ENSG00000276345
ENSG00000271254
ENSG00000268674


In [201]:
del adata.var['highly_variableWSSS_THYst8796437']
del adata.var['highly_variableWSSS_THYst9384954']
del adata.var['highly_variable5478STDY7980349']
del adata.var['highly_variable5478STDY7652318']
del adata.var['highly_variableFCAImmP7536758']
del adata.var['highly_variable5478STDY7980348']
del adata.var['highly_variable5386STDY7557336']
del adata.var['highly_variableWSSS_THYst8796440']
del adata.var['highly_variable5386STDY7557335']
del adata.var['highly_variable5386STDY7557337']
del adata.var['highly_variableWSSS_THYst9384957']
del adata.var['highly_variableWSSS_THYst8796441']
del adata.var['highly_variableWSSS_THYst8796438']
del adata.var['highly_variableFCAImmP7536759']
del adata.var['highly_variable5386STDY7537944']
del adata.var['highly_variable5478STDY7717492']
del adata.var['highly_variable5478STDY7717491']
del adata.var['highly_variableWSSS_THYst8796439']
del adata.var['highly_variable5478STDY7935101']
del adata.var['highly_variableWSSS_THYst8796442']
del adata.var['highly_variable5478STDY7935102']
del adata.var['highly_variableWSSS_THYst9384953']
del adata.var['highly_variableWSSS_THYst9384958']
del adata.var['highly_variableWSSS_THYst9384955']
del adata.var['highly_variableWSSS_THYst9384956']
del adata.var['highly_variable_n']
del adata.var['highly_variable']
del adata.var['gene_name']
del adata.var['n_cells']
del adata.var['feature_types']

In [202]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
gene_ids,Unnamed: 1_level_1
ENSG00000243485,False
ENSG00000238009,False
ENSG00000239945,False
ENSG00000236601,False
ENSG00000235146,False
...,...
ENSG00000277196,False
ENSG00000278384,False
ENSG00000276345,False
ENSG00000271254,False


In [None]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/limb_cell_atlas/Final_objects/to_upload/human_limb.h5ad', compression = 'gzip')

In [None]:
adata.obs

In [None]:
adata.obs.columns

In [None]:
adata.raw.var

In [None]:
adata.var

In [None]:
adata.raw.X