### **Curating visium_stitched.h5ad**

Article:   A human embryonic limb cell atlas resolved in space and time

DOI: https://doi.org/10.1038/s41586-023-06806-x

Data Source : https://developmental.cellatlas.io/embryonic-limb

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/limb_cell_atlas/Data/New_data/visium_stitched.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 14976 × 33694
    obs: 'sample', 'batch', 'in_tissue', 'array_row', 'array_col', 'barcode', 'library_id', 'percent_mito', 'C2L: ADH+Fibro', 'C2L: AER-Basal', 'C2L: ArterialEndo', 'C2L: ArtiChon', 'C2L: B', 'C2L: Basal', 'C2L: CMP/GMP', 'C2L: ChondroProg', 'C2L: DC2', 'C2L: DefErythro', 'C2L: DefReticulo', 'C2L: DermFibro', 'C2L: DistalMes', 'C2L: F10+DermFibroProg', 'C2L: HOXC5+DermFibroProg', 'C2L: HyperChon', 'C2L: ISL1+Mes', 'C2L: InterMusFibro', 'C2L: InterZone', 'C2L: LMPP/ELP', 'C2L: LymphEndo', 'C2L: MFAP5+Fibro', 'C2L: MYH3+MyoC', 'C2L: MYL3+MyoC', 'C2L: Macro', 'C2L: Mast', 'C2L: Megakaryo', 'C2L: Melano', 'C2L: Mes1', 'C2L: Mes2', 'C2L: Mes3', 'C2L: Mes4', 'C2L: MesCond', 'C2L: Monocyte', 'C2L: Myelocyte', 'C2L: MyoB1', 'C2L: MyoB2', 'C2L: MyoC1', 'C2L: MyoC2', 'C2L: NK', 'C2L: NeuralFibro', 'C2L: Neuronal', 'C2L: OCP', 'C2L: OsteoB', 'C2L: PAX3+MyoProg', 'C2L: PAX7+MyoProg', 'C2L: Perichon', 'C2L: Pericyte', 'C2L: Periderm', 'C2L: Perimys

##### **X- expression matrix**

In [7]:
# View the expression matrix of the anndata object

In [8]:
adata.X

<14976x33694 sparse matrix of type '<class 'numpy.float32'>'
	with 11368584 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether it holds raw or normalized counts: values with a fractional part indicate normalized counts, whereas whole-number values (even when stored as float32, as here) indicate raw counts.

In [10]:
print(adata.X)

  (0, 238)	1.0
  (0, 372)	1.0
  (0, 1933)	1.0
  (0, 2049)	1.0
  (0, 3683)	1.0
  (0, 4204)	3.0
  (0, 4688)	1.0
  (0, 4848)	1.0
  (0, 4853)	1.0
  (0, 4884)	1.0
  (0, 5533)	1.0
  (0, 5929)	1.0
  (0, 5966)	1.0
  (0, 5984)	1.0
  (0, 6239)	1.0
  (0, 7477)	2.0
  (0, 9818)	1.0
  (0, 10736)	1.0
  (0, 10764)	1.0
  (0, 10780)	1.0
  (0, 11106)	1.0
  (0, 11444)	1.0
  (0, 11571)	1.0
  (0, 12538)	1.0
  (0, 13503)	1.0
  :	:
  (14975, 33010)	1.0
  (14975, 33027)	1.0
  (14975, 33033)	1.0
  (14975, 33113)	1.0
  (14975, 33394)	1.0
  (14975, 33402)	1.0
  (14975, 33431)	1.0
  (14975, 33433)	1.0
  (14975, 33467)	1.0
  (14975, 33480)	2.0
  (14975, 33488)	1.0
  (14975, 33548)	1.0
  (14975, 33550)	1.0
  (14975, 33567)	1.0
  (14975, 33600)	2.0
  (14975, 33635)	2.0
  (14975, 33643)	2.0
  (14975, 33652)	5.0
  (14975, 33654)	1.0
  (14975, 33655)	3.0
  (14975, 33657)	8.0
  (14975, 33658)	7.0
  (14975, 33659)	4.0
  (14975, 33661)	7.0
  (14975, 33664)	2.0


##### **Variables(var)**

In [11]:
#View the var of anndata and raw object

In [12]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome
RP11-34P13.3,ENSG00000243485,Gene Expression,GRCh38
FAM138A,ENSG00000237613,Gene Expression,GRCh38
OR4F5,ENSG00000186092,Gene Expression,GRCh38
RP11-34P13.7,ENSG00000238009,Gene Expression,GRCh38
RP11-34P13.8,ENSG00000239945,Gene Expression,GRCh38
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38
AC233755.1,ENSG00000275063,Gene Expression,GRCh38
AC240274.1,ENSG00000271254,Gene Expression,GRCh38
AC213203.1,ENSG00000277475,Gene Expression,GRCh38


In [13]:
# Keep the gene symbols (the current var index) in a new 'gene_symbols' column of adata.var, then use the Ensembl IDs from the 'gene_ids' column as the var index

In [14]:
adata.var['gene_symbols'] = adata.var_names

In [15]:
adata.var_names = adata.var['gene_ids']

In [16]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38,RP11-34P13.3
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38,RP11-34P13.7
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38,RP11-34P13.8
...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38,AC233755.2
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38,AC233755.1
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38,AC240274.1
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38,AC213203.1


In [17]:
# load the approved genes file

In [18]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [19]:
# Create a lookup dictionary keyed by the feature_id column of the approved genes file (every value is the placeholder 1).

In [20]:
# Lookup table whose keys are the approved feature IDs; 1 is only a placeholder value
genedict = dict.fromkeys(approved_genes.feature_id, 1)

In [21]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [22]:
len(genedict)

119799

In [23]:
# Filter out the genes which are not in the approved genes file

In [24]:
# Collect, in their original order, the var names that appear in the approved-gene lookup
var_to_keep_adata = []
for ensembl_id in adata.var_names:
    if ensembl_id in genedict:
        var_to_keep_adata.append(ensembl_id)

In [25]:
len(var_to_keep_adata)

32831

In [26]:
# Subset the AnnData object to the approved genes and take a copy of the result.

In [27]:
adata = adata[:, var_to_keep_adata].copy()

In [28]:
#  View the var

In [29]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38,RP11-34P13.3
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38,RP11-34P13.7
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38,RP11-34P13.8
...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38,AC233755.2
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38,AC233755.1
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38,AC240274.1
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38,AC213203.1


feature is filtered

In [30]:
# add the feature_is_filtered column, set to False for every gene

In [31]:
# One boolean False per gene (row of adata.var)
adata.var['feature_is_filtered'] = np.zeros(adata.n_vars, dtype=bool)

In [32]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38,RP11-34P13.3,False
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38,FAM138A,False
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38,OR4F5,False
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38,RP11-34P13.7,False
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38,RP11-34P13.8,False
...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38,AC233755.2,False
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38,AC233755.1,False
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38,AC240274.1,False
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38,AC213203.1,False


In [33]:
# Remove the two helper columns from adata.var; the Ensembl IDs remain available as the var index
for helper_column in ('gene_ids', 'gene_symbols'):
    del adata.var[helper_column]

#### **Observations(obs) (Cell metadata)**

In [34]:
#view obs

In [35]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,C2L: SMC,C2L: SMProg,C2L: STMN2+Fibro,C2L: Schwann,C2L: SchwannProg,C2L: SynapSchwann,C2L: Teno,C2L: TenoProg,C2L: TransMes,C2L: VenousEndo
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,,,,,,,,,,
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,0.246522,0.145605,0.009509,0.028623,0.019403,0.008215,0.025422,0.005845,0.018950,0.136347
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,,,,,,,,,,
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,0.077816,0.030349,0.025740,0.014795,0.007393,0.013559,0.208248,0.011997,0.016958,0.028851
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,,,,,,,,,,
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,,,,,,,,,,
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,,,,,,,,,,
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,,,,,,,,,,


In [36]:
adata.obs.columns

Index(['sample', 'batch', 'in_tissue', 'array_row', 'array_col', 'barcode',
       'library_id', 'percent_mito', 'C2L: ADH+Fibro', 'C2L: AER-Basal',
       'C2L: ArterialEndo', 'C2L: ArtiChon', 'C2L: B', 'C2L: Basal',
       'C2L: CMP/GMP', 'C2L: ChondroProg', 'C2L: DC2', 'C2L: DefErythro',
       'C2L: DefReticulo', 'C2L: DermFibro', 'C2L: DistalMes',
       'C2L: F10+DermFibroProg', 'C2L: HOXC5+DermFibroProg', 'C2L: HyperChon',
       'C2L: ISL1+Mes', 'C2L: InterMusFibro', 'C2L: InterZone',
       'C2L: LMPP/ELP', 'C2L: LymphEndo', 'C2L: MFAP5+Fibro', 'C2L: MYH3+MyoC',
       'C2L: MYL3+MyoC', 'C2L: Macro', 'C2L: Mast', 'C2L: Megakaryo',
       'C2L: Melano', 'C2L: Mes1', 'C2L: Mes2', 'C2L: Mes3', 'C2L: Mes4',
       'C2L: MesCond', 'C2L: Monocyte', 'C2L: Myelocyte', 'C2L: MyoB1',
       'C2L: MyoB2', 'C2L: MyoC1', 'C2L: MyoC2', 'C2L: NK', 'C2L: NeuralFibro',
       'C2L: Neuronal', 'C2L: OCP', 'C2L: OsteoB', 'C2L: PAX3+MyoProg',
       'C2L: PAX7+MyoProg', 'C2L: Perichon', 'C2L: Per

#### **assay_ontology_term_id**

In [37]:
# add the assay_ontology_term_id column

In [38]:
# Same assay term for every observation: a scalar is broadcast to all rows of adata.obs
adata.obs['assay_ontology_term_id'] = 'EFO:0010961'

In [39]:
# change datatype of the column

In [40]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [41]:
# view adata.obs

In [42]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,C2L: SMProg,C2L: STMN2+Fibro,C2L: Schwann,C2L: SchwannProg,C2L: SynapSchwann,C2L: Teno,C2L: TenoProg,C2L: TransMes,C2L: VenousEndo,assay_ontology_term_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,,,,,,,,,,EFO:0010961
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,0.145605,0.009509,0.028623,0.019403,0.008215,0.025422,0.005845,0.018950,0.136347,EFO:0010961
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,,,,,,,,,,EFO:0010961
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,0.030349,0.025740,0.014795,0.007393,0.013559,0.208248,0.011997,0.016958,0.028851,EFO:0010961
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,,,,,,,,,,EFO:0010961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,,,,,,,,,,EFO:0010961
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,,,,,,,,,,EFO:0010961
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,,,,,,,,,,EFO:0010961
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,,,,,,,,,,EFO:0010961


#### **cell_type_ontology_term_id**

In [43]:
# get the columns in adata.obs related to cell type annotation

In [44]:
adata.obs.columns

Index(['sample', 'batch', 'in_tissue', 'array_row', 'array_col', 'barcode',
       'library_id', 'percent_mito', 'C2L: ADH+Fibro', 'C2L: AER-Basal',
       'C2L: ArterialEndo', 'C2L: ArtiChon', 'C2L: B', 'C2L: Basal',
       'C2L: CMP/GMP', 'C2L: ChondroProg', 'C2L: DC2', 'C2L: DefErythro',
       'C2L: DefReticulo', 'C2L: DermFibro', 'C2L: DistalMes',
       'C2L: F10+DermFibroProg', 'C2L: HOXC5+DermFibroProg', 'C2L: HyperChon',
       'C2L: ISL1+Mes', 'C2L: InterMusFibro', 'C2L: InterZone',
       'C2L: LMPP/ELP', 'C2L: LymphEndo', 'C2L: MFAP5+Fibro', 'C2L: MYH3+MyoC',
       'C2L: MYL3+MyoC', 'C2L: Macro', 'C2L: Mast', 'C2L: Megakaryo',
       'C2L: Melano', 'C2L: Mes1', 'C2L: Mes2', 'C2L: Mes3', 'C2L: Mes4',
       'C2L: MesCond', 'C2L: Monocyte', 'C2L: Myelocyte', 'C2L: MyoB1',
       'C2L: MyoB2', 'C2L: MyoC1', 'C2L: MyoC2', 'C2L: NK', 'C2L: NeuralFibro',
       'C2L: Neuronal', 'C2L: OCP', 'C2L: OsteoB', 'C2L: PAX3+MyoProg',
       'C2L: PAX7+MyoProg', 'C2L: Perichon', 'C2L: Per

In [45]:
adata.obsm

AxisArrays with keys: X_spatial

In [46]:
# Names of all obs columns whose label starts with 'C2L', in their original order
obs_column_names = adata.obs.columns
c2l_columns = obs_column_names[obs_column_names.str.startswith('C2L')].tolist()

In [47]:
c2l_columns

['C2L: ADH+Fibro',
 'C2L: AER-Basal',
 'C2L: ArterialEndo',
 'C2L: ArtiChon',
 'C2L: B',
 'C2L: Basal',
 'C2L: CMP/GMP',
 'C2L: ChondroProg',
 'C2L: DC2',
 'C2L: DefErythro',
 'C2L: DefReticulo',
 'C2L: DermFibro',
 'C2L: DistalMes',
 'C2L: F10+DermFibroProg',
 'C2L: HOXC5+DermFibroProg',
 'C2L: HyperChon',
 'C2L: ISL1+Mes',
 'C2L: InterMusFibro',
 'C2L: InterZone',
 'C2L: LMPP/ELP',
 'C2L: LymphEndo',
 'C2L: MFAP5+Fibro',
 'C2L: MYH3+MyoC',
 'C2L: MYL3+MyoC',
 'C2L: Macro',
 'C2L: Mast',
 'C2L: Megakaryo',
 'C2L: Melano',
 'C2L: Mes1',
 'C2L: Mes2',
 'C2L: Mes3',
 'C2L: Mes4',
 'C2L: MesCond',
 'C2L: Monocyte',
 'C2L: Myelocyte',
 'C2L: MyoB1',
 'C2L: MyoB2',
 'C2L: MyoC1',
 'C2L: MyoC2',
 'C2L: NK',
 'C2L: NeuralFibro',
 'C2L: Neuronal',
 'C2L: OCP',
 'C2L: OsteoB',
 'C2L: PAX3+MyoProg',
 'C2L: PAX7+MyoProg',
 'C2L: Perichon',
 'C2L: Pericyte',
 'C2L: Periderm',
 'C2L: Perimysium',
 'C2L: PrehyperChon',
 'C2L: PrimErythro1',
 'C2L: PrimErythro2',
 'C2L: ProlifChon',
 'C2L: ProxMes',


In [48]:
# Per-observation C2L values; many rows hold no value in any C2L column (all NaN)
c2l_values = adata.obs[c2l_columns]
has_c2l = c2l_values.notna().any(axis=1).to_numpy()

# Evaluate idxmax only on rows with at least one value: calling it on all-NaN rows is
# deprecated since pandas 2.1 and is slated to raise ValueError. Rows without any C2L
# value keep NaN as their label, exactly as before.
top_c2l_column = np.full(adata.n_obs, np.nan, dtype=object)
top_c2l_column[has_c2l] = c2l_values.loc[has_c2l].idxmax(axis=1).to_numpy()
adata.obs['max_c2l_column'] = top_c2l_column

In [49]:
adata.obs['max_c2l_column_value'] = adata.obs[c2l_columns].max(axis=1)

In [50]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,C2L: Schwann,C2L: SchwannProg,C2L: SynapSchwann,C2L: Teno,C2L: TenoProg,C2L: TransMes,C2L: VenousEndo,assay_ontology_term_id,max_c2l_column,max_c2l_column_value
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,,,,,,,,EFO:0010961,,
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,0.028623,0.019403,0.008215,0.025422,0.005845,0.018950,0.136347,EFO:0010961,C2L: MYL3+MyoC,8.189351
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,,,,,,,,EFO:0010961,,
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,0.014795,0.007393,0.013559,0.208248,0.011997,0.016958,0.028851,EFO:0010961,C2L: F10+DermFibroProg,8.147670
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,,,,,,,,EFO:0010961,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,,,,,,,,EFO:0010961,,
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,,,,,,,,EFO:0010961,,
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,,,,,,,,EFO:0010961,,
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,,,,,,,,EFO:0010961,,


In [51]:
list(adata.obs['max_c2l_column'].unique())

[nan,
 'C2L: MYL3+MyoC',
 'C2L: F10+DermFibroProg',
 'C2L: MFAP5+Fibro',
 'C2L: HOXC5+DermFibroProg',
 'C2L: DefReticulo',
 'C2L: Teno',
 'C2L: Basal',
 'C2L: MYH3+MyoC',
 'C2L: Schwann',
 'C2L: RestingChon',
 'C2L: Periderm',
 'C2L: PAX7+MyoProg',
 'C2L: MesCond',
 'C2L: DermFibro',
 'C2L: Perimysium',
 'C2L: ArterialEndo',
 'C2L: NeuralFibro',
 'C2L: Macro',
 'C2L: DistalMes',
 'C2L: SMC',
 'C2L: Neuronal',
 'C2L: Pericyte',
 'C2L: VenousEndo',
 'C2L: PrehyperChon',
 'C2L: ADH+Fibro',
 'C2L: ArtiChon',
 'C2L: SMProg',
 'C2L: DefErythro',
 'C2L: Megakaryo',
 'C2L: HyperChon',
 'C2L: LymphEndo',
 'C2L: ChondroProg',
 'C2L: AER-Basal',
 'C2L: Mes4',
 'C2L: STMN2+Fibro',
 'C2L: InterZone',
 'C2L: OsteoB',
 'C2L: SynapSchwann',
 'C2L: DC2',
 'C2L: SchwannProg']

In [52]:
# Observations without a top C2L label hold NaN; replace it with the literal string 'NaN'
# (the key used for 'unknown' in the cell type mapping) so every entry is a plain str.
adata.obs['max_c2l_column'] = adata.obs['max_c2l_column'].fillna('NaN').astype(str)


In [53]:
list(adata.obs['max_c2l_column'].unique())

['NaN',
 'C2L: MYL3+MyoC',
 'C2L: F10+DermFibroProg',
 'C2L: MFAP5+Fibro',
 'C2L: HOXC5+DermFibroProg',
 'C2L: DefReticulo',
 'C2L: Teno',
 'C2L: Basal',
 'C2L: MYH3+MyoC',
 'C2L: Schwann',
 'C2L: RestingChon',
 'C2L: Periderm',
 'C2L: PAX7+MyoProg',
 'C2L: MesCond',
 'C2L: DermFibro',
 'C2L: Perimysium',
 'C2L: ArterialEndo',
 'C2L: NeuralFibro',
 'C2L: Macro',
 'C2L: DistalMes',
 'C2L: SMC',
 'C2L: Neuronal',
 'C2L: Pericyte',
 'C2L: VenousEndo',
 'C2L: PrehyperChon',
 'C2L: ADH+Fibro',
 'C2L: ArtiChon',
 'C2L: SMProg',
 'C2L: DefErythro',
 'C2L: Megakaryo',
 'C2L: HyperChon',
 'C2L: LymphEndo',
 'C2L: ChondroProg',
 'C2L: AER-Basal',
 'C2L: Mes4',
 'C2L: STMN2+Fibro',
 'C2L: InterZone',
 'C2L: OsteoB',
 'C2L: SynapSchwann',
 'C2L: DC2',
 'C2L: SchwannProg']

In [54]:
# Ontology term for every top C2L label, written grouped by target term so that the
# labels collapsing onto the same term sit side by side. 'NaN' is the placeholder label
# for observations without a C2L label and maps to 'unknown'.
mapping = {
    c2l_label: ontology_term
    for ontology_term, c2l_labels in {
        'CL:0000057': ['C2L: MFAP5+Fibro', 'C2L: NeuralFibro'],
        'CL:0000058': ['C2L: Perichon'],
        'CL:0000062': ['C2L: OsteoB'],
        'CL:0000078': ['C2L: Periderm'],
        'CL:0000138': [
            'C2L: RestingChon',
            'C2L: MesCond',
            'C2L: ChondroProg',
            'C2L: PrehyperChon',
            'C2L: ProlifChon',
        ],
        'CL:0000187': ['C2L: MYL3+MyoC', 'C2L: MYH3+MyoC', 'C2L: PAX7+MyoProg'],
        'CL:0000192': ['C2L: SMC', 'C2L: SMProg'],
        'CL:0000232': ['C2L: DefErythro'],
        'CL:0000235': ['C2L: Macro'],
        'CL:0000388': ['C2L: Teno', 'C2L: TenoProg'],
        'CL:0000515': ['C2L: PAX3+MyoProg'],
        'CL:0000540': ['C2L: Neuronal'],
        'CL:0000556': ['C2L: Megakaryo'],
        'CL:0000558': ['C2L: DefReticulo'],
        'CL:0000646': ['C2L: Basal', 'C2L: AER-Basal'],
        'CL:0000669': ['C2L: Pericyte'],
        'CL:0000743': ['C2L: HyperChon'],
        'CL:0000990': ['C2L: DC2'],
        'CL:0002138': ['C2L: LymphEndo'],
        'CL:0002320': ['C2L: Perimysium'],
        'CL:0002355': ['C2L: PrimErythro1', 'C2L: PrimErythro2'],
        'CL:0002543': ['C2L: VenousEndo'],
        'CL:0002551': [
            'C2L: F10+DermFibroProg',
            'C2L: HOXC5+DermFibroProg',
            'C2L: DermFibro',
            'C2L: STMN2+Fibro',
        ],
        'CL:0002573': ['C2L: Schwann', 'C2L: SynapSchwann', 'C2L: SchwannProg'],
        'CL:0008019': [
            'C2L: DistalMes',
            'C2L: Mes4',
            'C2L: InterZone',
            'C2L: Mes3',
            'C2L: RDH10+DistalMes',
            'C2L: TransMes',
            'C2L: Mes2',
        ],
        'CL:1000413': ['C2L: ArterialEndo'],
        'CL:1001607': ['C2L: ArtiChon'],
        'CL:1001609': ['C2L: ADH+Fibro'],
        'unknown': ['NaN'],
    }.items()
    for c2l_label in c2l_labels
}

In [55]:
# create a dictionary of cell type and ontology term

In [56]:
# add the cell_type_ontology_term_id column

In [57]:
# Translate each observation's top C2L label into its ontology term
cell_type_terms = adata.obs['max_c2l_column'].map(mapping)

# Series.map leaves NaN for any label that is not a key of `mapping`. The dictionary does
# not list every C2L column, and the name `mapping` is reassigned further down the
# notebook, so stop here instead of silently storing missing ontology terms.
unmapped_labels = adata.obs.loc[cell_type_terms.isna().to_numpy(), 'max_c2l_column'].unique().tolist()
if unmapped_labels:
    raise ValueError(f"No cell type ontology term in `mapping` for: {unmapped_labels}")

adata.obs['cell_type_ontology_term_id'] = cell_type_terms

In [58]:
# change datatype of the column

In [59]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [60]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['unknown',
 'CL:0000187',
 'CL:0002551',
 'CL:0000057',
 'CL:0000558',
 'CL:0000388',
 'CL:0000646',
 'CL:0002573',
 'CL:0000138',
 'CL:0000078',
 'CL:0002320',
 'CL:1000413',
 'CL:0000235',
 'CL:0008019',
 'CL:0000192',
 'CL:0000540',
 'CL:0000669',
 'CL:0002543',
 'CL:1001609',
 'CL:1001607',
 'CL:0000232',
 'CL:0000556',
 'CL:0000743',
 'CL:0002138',
 'CL:0000062',
 'CL:0000990']

In [61]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,C2L: SchwannProg,C2L: SynapSchwann,C2L: Teno,C2L: TenoProg,C2L: TransMes,C2L: VenousEndo,assay_ontology_term_id,max_c2l_column,max_c2l_column_value,cell_type_ontology_term_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,,,,,,,EFO:0010961,,,unknown
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,0.019403,0.008215,0.025422,0.005845,0.018950,0.136347,EFO:0010961,C2L: MYL3+MyoC,8.189351,CL:0000187
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,,,,,,,EFO:0010961,,,unknown
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,0.007393,0.013559,0.208248,0.011997,0.016958,0.028851,EFO:0010961,C2L: F10+DermFibroProg,8.147670,CL:0002551
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,,,,,,,EFO:0010961,,,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,,,,,,,EFO:0010961,,,unknown
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,,,,,,,EFO:0010961,,,unknown
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,,,,,,,EFO:0010961,,,unknown
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,,,,,,,EFO:0010961,,,unknown


#### **donor_id**

In [62]:
#identify the column in adata.obs which provides donor information

In [63]:
adata.obs.columns

Index(['sample', 'batch', 'in_tissue', 'array_row', 'array_col', 'barcode',
       'library_id', 'percent_mito', 'C2L: ADH+Fibro', 'C2L: AER-Basal',
       'C2L: ArterialEndo', 'C2L: ArtiChon', 'C2L: B', 'C2L: Basal',
       'C2L: CMP/GMP', 'C2L: ChondroProg', 'C2L: DC2', 'C2L: DefErythro',
       'C2L: DefReticulo', 'C2L: DermFibro', 'C2L: DistalMes',
       'C2L: F10+DermFibroProg', 'C2L: HOXC5+DermFibroProg', 'C2L: HyperChon',
       'C2L: ISL1+Mes', 'C2L: InterMusFibro', 'C2L: InterZone',
       'C2L: LMPP/ELP', 'C2L: LymphEndo', 'C2L: MFAP5+Fibro', 'C2L: MYH3+MyoC',
       'C2L: MYL3+MyoC', 'C2L: Macro', 'C2L: Mast', 'C2L: Megakaryo',
       'C2L: Melano', 'C2L: Mes1', 'C2L: Mes2', 'C2L: Mes3', 'C2L: Mes4',
       'C2L: MesCond', 'C2L: Monocyte', 'C2L: Myelocyte', 'C2L: MyoB1',
       'C2L: MyoB2', 'C2L: MyoC1', 'C2L: MyoC2', 'C2L: NK', 'C2L: NeuralFibro',
       'C2L: Neuronal', 'C2L: OCP', 'C2L: OsteoB', 'C2L: PAX3+MyoProg',
       'C2L: PAX7+MyoProg', 'C2L: Perichon', 'C2L: Per

In [64]:
list(adata.obs['sample'].unique())

['WSSS_THYst9383360', 'WSSS_THYst9383361', 'WSSS_THYst9383362']

In [65]:
# add the donor_id column

In [66]:
# Copy the 'sample' column into donor_id.
# NOTE(review): 'sample' holds the same WSSS_THYst... identifiers as 'library_id', so it
# looks like a per-library ID rather than a per-individual ID -- confirm that each sample
# comes from a different donor before relying on this column as donor_id.
adata.obs['donor_id'] = adata.obs['sample']

In [67]:
# change datatype of the column

In [68]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [69]:
# view unique values of donor_id column

In [70]:
list(adata.obs['donor_id'].unique())

['WSSS_THYst9383360', 'WSSS_THYst9383361', 'WSSS_THYst9383362']

In [71]:
#view obs

In [72]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,C2L: SynapSchwann,C2L: Teno,C2L: TenoProg,C2L: TransMes,C2L: VenousEndo,assay_ontology_term_id,max_c2l_column,max_c2l_column_value,cell_type_ontology_term_id,donor_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383360
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,0.008215,0.025422,0.005845,0.018950,0.136347,EFO:0010961,C2L: MYL3+MyoC,8.189351,CL:0000187,WSSS_THYst9383360
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383360
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,0.013559,0.208248,0.011997,0.016958,0.028851,EFO:0010961,C2L: F10+DermFibroProg,8.147670,CL:0002551,WSSS_THYst9383360
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383362
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383362
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383362
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383362


In [73]:
adata.obs.columns

Index(['sample', 'batch', 'in_tissue', 'array_row', 'array_col', 'barcode',
       'library_id', 'percent_mito', 'C2L: ADH+Fibro', 'C2L: AER-Basal',
       'C2L: ArterialEndo', 'C2L: ArtiChon', 'C2L: B', 'C2L: Basal',
       'C2L: CMP/GMP', 'C2L: ChondroProg', 'C2L: DC2', 'C2L: DefErythro',
       'C2L: DefReticulo', 'C2L: DermFibro', 'C2L: DistalMes',
       'C2L: F10+DermFibroProg', 'C2L: HOXC5+DermFibroProg', 'C2L: HyperChon',
       'C2L: ISL1+Mes', 'C2L: InterMusFibro', 'C2L: InterZone',
       'C2L: LMPP/ELP', 'C2L: LymphEndo', 'C2L: MFAP5+Fibro', 'C2L: MYH3+MyoC',
       'C2L: MYL3+MyoC', 'C2L: Macro', 'C2L: Mast', 'C2L: Megakaryo',
       'C2L: Melano', 'C2L: Mes1', 'C2L: Mes2', 'C2L: Mes3', 'C2L: Mes4',
       'C2L: MesCond', 'C2L: Monocyte', 'C2L: Myelocyte', 'C2L: MyoB1',
       'C2L: MyoB2', 'C2L: MyoC1', 'C2L: MyoC2', 'C2L: NK', 'C2L: NeuralFibro',
       'C2L: Neuronal', 'C2L: OCP', 'C2L: OsteoB', 'C2L: PAX3+MyoProg',
       'C2L: PAX7+MyoProg', 'C2L: Perichon', 'C2L: Per

In [74]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,C2L: SynapSchwann,C2L: Teno,C2L: TenoProg,C2L: TransMes,C2L: VenousEndo,assay_ontology_term_id,max_c2l_column,max_c2l_column_value,cell_type_ontology_term_id,donor_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383360
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,0.008215,0.025422,0.005845,0.018950,0.136347,EFO:0010961,C2L: MYL3+MyoC,8.189351,CL:0000187,WSSS_THYst9383360
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383360
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,0.013559,0.208248,0.011997,0.016958,0.028851,EFO:0010961,C2L: F10+DermFibroProg,8.147670,CL:0002551,WSSS_THYst9383360
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383362
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383362
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383362
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,,,,,,EFO:0010961,,,unknown,WSSS_THYst9383362


#### **development_stage_ontology_term_id**

In [75]:
# identify the column in adata which corresponds to age

In [76]:
# add the development_stage_ontology_term_id column

In [77]:
# Stage label of every sample ID, written grouped by stage
mapping = {
    sample_id: stage_label
    for stage_label, sample_ids in {
        'PCW8.1': (
            'WSSS_THYst9383359',
            'WSSS_THYst9383360',
            'WSSS_THYst9383361',
            'WSSS_THYst9383362',
        ),
        'PCW7.0': ('WSSS_THYst9699523', 'WSSS_THYst9699524'),
        'PCW6.2': ('WSSS_THYst9699525',),
        'PCW5.6': ('WSSS_THYst9699526',),
    }.items()
    for sample_id in sample_ids
}

In [78]:
adata.obs['stage'] = adata.obs['sample'].map(mapping)

In [79]:
# HsapDv ontology term assigned to each stage label
mapping = {
    'PCW8.1': 'HsapDv:0000030',
    'PCW7.0': 'HsapDv:0000026',
    'PCW6.2': 'HsapDv:0000024',
    'PCW5.6': 'HsapDv:0000023',
}

In [80]:
# Translate each observation's stage label into its HsapDv ontology term
development_stage_terms = adata.obs['stage'].map(mapping)

# Series.map leaves NaN for any value that is not a key of `mapping` (including a stage
# that is itself missing); stop here instead of silently storing missing ontology terms.
unmapped_stages = adata.obs.loc[development_stage_terms.isna().to_numpy(), 'stage'].unique().tolist()
if unmapped_stages:
    raise ValueError(f"No development stage ontology term in `mapping` for: {unmapped_stages}")

adata.obs['development_stage_ontology_term_id'] = development_stage_terms

In [81]:
# change datatype of the column

In [82]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [83]:
# view unique values of development_stage_ontology_term_id column

In [84]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000030']

In [85]:
# view adata.obs

In [86]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,C2L: TenoProg,C2L: TransMes,C2L: VenousEndo,assay_ontology_term_id,max_c2l_column,max_c2l_column_value,cell_type_ontology_term_id,donor_id,stage,development_stage_ontology_term_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,,,,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,0.005845,0.018950,0.136347,EFO:0010961,C2L: MYL3+MyoC,8.189351,CL:0000187,WSSS_THYst9383360,PCW8.1,HsapDv:0000030
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,,,,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,0.011997,0.016958,0.028851,EFO:0010961,C2L: F10+DermFibroProg,8.147670,CL:0002551,WSSS_THYst9383360,PCW8.1,HsapDv:0000030
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,,,,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,,,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,,,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,,,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,,,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030


#### **disease_ontology_term_id**

In [87]:
# Assign normal since all are healthy patients

In [88]:
# add the disease_ontology_term_id column

In [89]:
# Same disease term for every observation: a scalar is broadcast to all rows of adata.obs
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [90]:
#change data type of column

In [91]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [92]:
# view obs

In [93]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,C2L: TransMes,C2L: VenousEndo,assay_ontology_term_id,max_c2l_column,max_c2l_column_value,cell_type_ontology_term_id,donor_id,stage,development_stage_ontology_term_id,disease_ontology_term_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,,,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,0.018950,0.136347,EFO:0010961,C2L: MYL3+MyoC,8.189351,CL:0000187,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,,,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,0.016958,0.028851,EFO:0010961,C2L: F10+DermFibroProg,8.147670,CL:0002551,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,,,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461


#### **is_primary_data**

In [94]:
# Flag every row of obs as primary data; pandas broadcasts the scalar.
adata.obs['is_primary_data'] = True

In [95]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,C2L: VenousEndo,assay_ontology_term_id,max_c2l_column,max_c2l_column_value,cell_type_ontology_term_id,donor_id,stage,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,0.136347,EFO:0010961,C2L: MYL3+MyoC,8.189351,CL:0000187,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,0.028851,EFO:0010961,C2L: F10+DermFibroProg,8.147670,CL:0002551,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True


In [96]:
#change data type of column

In [97]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

#### **organism_ontology_term_id**

In [98]:
# assign organism id 

In [99]:
# Same organism term for every row of obs; pandas broadcasts the scalar.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [100]:
#change data type of column

In [101]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [102]:
# view obs

In [103]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,assay_ontology_term_id,max_c2l_column,max_c2l_column_value,cell_type_ontology_term_id,donor_id,stage,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,EFO:0010961,C2L: MYL3+MyoC,8.189351,CL:0000187,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,EFO:0010961,C2L: F10+DermFibroProg,8.147670,CL:0002551,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,EFO:0010961,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,EFO:0010961,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [104]:
# Ethnicity is recorded as 'unknown' for every row of obs; pandas broadcasts the scalar.
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'unknown'

In [105]:
# change data type

In [106]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [107]:
# view obs

In [108]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,max_c2l_column,max_c2l_column_value,cell_type_ontology_term_id,donor_id,stage,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,C2L: MYL3+MyoC,8.189351,CL:0000187,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,C2L: F10+DermFibroProg,8.147670,CL:0002551,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,,,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,,,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown


In [109]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

['unknown']

#### **sex_ontology_term_id**

In [110]:
# Lookup table: sample ID -> sex label ('F', 'M' or 'unknown').
# It lists more samples than this object contains; only the IDs present in
# adata.obs['sample'] are looked up when the table is passed to .map().
mapping = {
    '5386STDY7537944': 'F',
    '5478STDY7717491': 'M',
    '5478STDY7717492': 'M',
    '5478STDY7652318': 'M',
    '5386STDY7557336': 'M',
    '5386STDY7557337': 'M',
    '5386STDY7557335': 'F',
    'FCAImmP7536758': 'F',
    'FCAImmP7536759': 'F',
    '5478STDY7980348': 'M',
    '5478STDY7980349': 'M',
    '5478STDY7935101': 'F',
    '5478STDY7935102': 'M',
    'WSSS_THYst9384953': 'M',
    'WSSS_THYst9384954': 'M',
    'WSSS_THYst9384955': 'M',
    'WSSS_THYst9384956': 'M',
    'WSSS_THYst9384957': 'M',
    'WSSS_THYst9384958': 'M',
    'WSSS_THYst8796437': 'M',
    'WSSS_THYst8796438': 'M',
    'WSSS_THYst8796439': 'M',
    'WSSS_THYst8796440': 'M',
    'WSSS_THYst8796441': 'M',
    'WSSS_THYst8796442': 'M',
    'WSSS_THYst9383359': 'M',
    'WSSS_THYst9383360': 'M',
    'WSSS_THYst9383361': 'M',
    'WSSS_THYst9383362': 'M',
    'WSSS_THYst9699523': 'F',
    'WSSS_THYst9699524': 'F',
    'WSSS_THYst9699525': 'F',
    'WSSS_THYst9699526': 'unknown',
}

In [111]:
adata.obs['sex'] = adata.obs['sample'].map(mapping)

In [112]:
# Sex label -> value stored in sex_ontology_term_id; 'unknown' is passed through unchanged.
# NOTE: this rebinds `mapping`, replacing the sample -> sex table defined earlier.
mapping = {
    'F': 'PATO:0000383',
    'M': 'PATO:0000384',
    'unknown': 'unknown',
}

In [113]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex'].map(mapping)

In [114]:
# The sample -> sex -> PATO mapping in the preceding cells already fills
# sex_ontology_term_id (all three samples in this object map to 'M', i.e. PATO:0000384).
# The column is therefore NOT overwritten with a hard-coded constant here: doing so
# would discard the mapped values and silently mislabel any female/unknown sample.
# Series.map leaves NaN for keys missing from the dict, so fail loudly if that happened.
unmapped_samples = adata.obs.loc[adata.obs['sex_ontology_term_id'].isna(), 'sample'].unique()
assert len(unmapped_samples) == 0, f"No sex term mapped for samples: {list(unmapped_samples)}"

In [115]:
# change data type

In [116]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [117]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,cell_type_ontology_term_id,donor_id,stage,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,CL:0000187,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,CL:0002551,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,unknown,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,unknown,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384


#### **suspension_type**

In [118]:
# For Visium data the suspension type is 'na'

In [119]:
# Same suspension type for every row of obs; pandas broadcasts the scalar.
adata.obs['suspension_type'] = 'na'

In [120]:
# change data type

In [121]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [122]:
# view obs

In [123]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,donor_id,stage,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id,suspension_type
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,WSSS_THYst9383360,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,WSSS_THYst9383362,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na


#### **tissue_type**

In [124]:
# Same tissue type for every row of obs; pandas broadcasts the scalar.
adata.obs['tissue_type'] = 'tissue'

In [125]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [126]:
list(adata.obs['sample'].unique())

['WSSS_THYst9383360', 'WSSS_THYst9383361', 'WSSS_THYst9383362']

In [127]:
# Sample ID -> tissue ontology term; the three samples in this object share one term.
# NOTE: this rebinds `mapping` once more (it previously held the sex lookups).
mapping = dict.fromkeys(
    ('WSSS_THYst9383360', 'WSSS_THYst9383361', 'WSSS_THYst9383362'),
    'UBERON:0000978',
)

In [128]:
adata.obs['tissue_ontology_term_id'] = adata.obs['sample'].map(mapping)

In [129]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [130]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0000978']

In [131]:
# view obs

In [132]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,array_row,array_col,barcode,library_id,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,...,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,0,16,,,,,,...,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,50,102,AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0.0,6.548307,0.005808,...,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,3,43,,,,,,...,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,59,19,AAACACCAATAACTGC-1,WSSS_THYst9383360,0.0,0.119246,0.005342,...,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,14,94,,,,,,...,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na,tissue,UBERON:0000978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,58,42,,,,,,...,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,60,30,,,,,,...,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,45,27,,,,,,...,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,73,41,,,,,,...,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,M,PATO:0000384,na,tissue,UBERON:0000978


In [133]:
adata.obs.columns

Index(['sample', 'batch', 'in_tissue', 'array_row', 'array_col', 'barcode',
       'library_id', 'percent_mito', 'C2L: ADH+Fibro', 'C2L: AER-Basal',
       'C2L: ArterialEndo', 'C2L: ArtiChon', 'C2L: B', 'C2L: Basal',
       'C2L: CMP/GMP', 'C2L: ChondroProg', 'C2L: DC2', 'C2L: DefErythro',
       'C2L: DefReticulo', 'C2L: DermFibro', 'C2L: DistalMes',
       'C2L: F10+DermFibroProg', 'C2L: HOXC5+DermFibroProg', 'C2L: HyperChon',
       'C2L: ISL1+Mes', 'C2L: InterMusFibro', 'C2L: InterZone',
       'C2L: LMPP/ELP', 'C2L: LymphEndo', 'C2L: MFAP5+Fibro', 'C2L: MYH3+MyoC',
       'C2L: MYL3+MyoC', 'C2L: Macro', 'C2L: Mast', 'C2L: Megakaryo',
       'C2L: Melano', 'C2L: Mes1', 'C2L: Mes2', 'C2L: Mes3', 'C2L: Mes4',
       'C2L: MesCond', 'C2L: Monocyte', 'C2L: Myelocyte', 'C2L: MyoB1',
       'C2L: MyoB2', 'C2L: MyoC1', 'C2L: MyoC2', 'C2L: NK', 'C2L: NeuralFibro',
       'C2L: Neuronal', 'C2L: OCP', 'C2L: OsteoB', 'C2L: PAX3+MyoProg',
       'C2L: PAX7+MyoProg', 'C2L: Perichon', 'C2L: Per

In [134]:
# Remove obs columns that are not kept in the final object, one at a time and in
# this order ('sex', for example, was only an intermediate for sex_ontology_term_id).
# A missing column raises KeyError, exactly as a bare `del` would.
for column in (
    'barcode',
    'max_c2l_column',
    'max_c2l_column_value',
    'array_row',
    'array_col',
    'library_id',
    'sex',
):
    del adata.obs[column]

#### **obsm (Embeddings)**

In [135]:
adata.obsm

AxisArrays with keys: X_spatial

In [136]:
adata.obsm.keys()

KeysView(AxisArrays with keys: X_spatial)

#### **uns (Dataset Metadata)**

In [137]:
adata.uns

OverloadedDict, wrapping:
	{'sample_colors': array(['#1f77b4', '#ff7f0e', '#2ca02c'], dtype=object), 'spatial': {'joint': {'images': {'hires': array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       ...,

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        

In [138]:
adata.uns['image_caption'] = 'Shown here is an image of ten micron thick cryosections of whole embryonic limb samples stained with H&E'

In [139]:
adata.uns['title'] = 'Visium_stitched'

In [140]:
# 'X_spatial' is the only key present in adata.obsm (see the obsm output above),
# so it is the embedding named as the default.
adata.uns['default_embedding'] = 'X_spatial'

In [141]:
adata.uns.keys()

dict_keys(['sample_colors', 'spatial', 'image_caption', 'title', 'default_embedding'])

### **Final checks and adjustments**

In [142]:
adata

AnnData object with n_obs × n_vars = 14976 × 32831
    obs: 'sample', 'batch', 'in_tissue', 'percent_mito', 'C2L: ADH+Fibro', 'C2L: AER-Basal', 'C2L: ArterialEndo', 'C2L: ArtiChon', 'C2L: B', 'C2L: Basal', 'C2L: CMP/GMP', 'C2L: ChondroProg', 'C2L: DC2', 'C2L: DefErythro', 'C2L: DefReticulo', 'C2L: DermFibro', 'C2L: DistalMes', 'C2L: F10+DermFibroProg', 'C2L: HOXC5+DermFibroProg', 'C2L: HyperChon', 'C2L: ISL1+Mes', 'C2L: InterMusFibro', 'C2L: InterZone', 'C2L: LMPP/ELP', 'C2L: LymphEndo', 'C2L: MFAP5+Fibro', 'C2L: MYH3+MyoC', 'C2L: MYL3+MyoC', 'C2L: Macro', 'C2L: Mast', 'C2L: Megakaryo', 'C2L: Melano', 'C2L: Mes1', 'C2L: Mes2', 'C2L: Mes3', 'C2L: Mes4', 'C2L: MesCond', 'C2L: Monocyte', 'C2L: Myelocyte', 'C2L: MyoB1', 'C2L: MyoB2', 'C2L: MyoC1', 'C2L: MyoC2', 'C2L: NK', 'C2L: NeuralFibro', 'C2L: Neuronal', 'C2L: OCP', 'C2L: OsteoB', 'C2L: PAX3+MyoProg', 'C2L: PAX7+MyoProg', 'C2L: Perichon', 'C2L: Pericyte', 'C2L: Periderm', 'C2L: Perimysium', 'C2L: PrehyperChon', 'C2L: PrimErythro1', 'C2

In [143]:
adata.obs.dtypes

sample                                      category
batch                                       category
in_tissue                                      int64
percent_mito                                 float64
C2L: ADH+Fibro                               float64
                                              ...   
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
suspension_type                             category
tissue_type                                 category
tissue_ontology_term_id                     category
Length: 84, dtype: object

In [144]:
# Downcast 64-bit numeric columns of adata.var to their 32-bit counterparts.
# The dtypes are captured once, before any column is converted.
var_dtypes = adata.var.dtypes

float64_var_columns = var_dtypes[var_dtypes == 'float64'].index
for c in float64_var_columns:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")

int64_var_columns = var_dtypes[var_dtypes == 'int64'].index
for c in int64_var_columns:
    adata.var[c] = adata.var[c].astype('int32')
    print(f"changed {c} from int64 to int32")

In [145]:
# Normalise adata.obs column dtypes: float64 -> float32, int64 -> int32,
# object -> category. The dtypes are captured once, before any column is converted.
obs_dtypes = adata.obs.dtypes

float64_obs_columns = obs_dtypes[obs_dtypes == 'float64'].index
for c in float64_obs_columns:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")

int64_obs_columns = obs_dtypes[obs_dtypes == 'int64'].index
for c in int64_obs_columns:
    adata.obs[c] = adata.obs[c].astype('int32')
    print(f"changed {c} from int64 to int32")

object_obs_columns = obs_dtypes[obs_dtypes == 'object'].index
for c in object_obs_columns:
    adata.obs[c] = adata.obs[c].astype('category')
    print(f"changed {c} from object to category")

changed percent_mito from float64 to float32
changed C2L: ADH+Fibro from float64 to float32
changed C2L: AER-Basal from float64 to float32
changed C2L: ArterialEndo from float64 to float32
changed C2L: ArtiChon from float64 to float32
changed C2L: B from float64 to float32
changed C2L: Basal from float64 to float32
changed C2L: CMP/GMP from float64 to float32
changed C2L: ChondroProg from float64 to float32
changed C2L: DC2 from float64 to float32
changed C2L: DefErythro from float64 to float32
changed C2L: DefReticulo from float64 to float32
changed C2L: DermFibro from float64 to float32
changed C2L: DistalMes from float64 to float32
changed C2L: F10+DermFibroProg from float64 to float32
changed C2L: HOXC5+DermFibroProg from float64 to float32
changed C2L: HyperChon from float64 to float32
changed C2L: ISL1+Mes from float64 to float32
changed C2L: InterMusFibro from float64 to float32
changed C2L: InterZone from float64 to float32
changed C2L: LMPP/ELP from float64 to float32
changed 

In [146]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,C2L: ArterialEndo,C2L: ArtiChon,C2L: B,C2L: Basal,...,stage,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,0.0,6.548307,0.005808,0.138850,0.034429,0.005005,0.002333,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,0.0,0.119246,0.005342,0.031597,0.104752,0.010040,0.004520,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978


In [147]:
adata.obs.columns

Index(['sample', 'batch', 'in_tissue', 'percent_mito', 'C2L: ADH+Fibro',
       'C2L: AER-Basal', 'C2L: ArterialEndo', 'C2L: ArtiChon', 'C2L: B',
       'C2L: Basal', 'C2L: CMP/GMP', 'C2L: ChondroProg', 'C2L: DC2',
       'C2L: DefErythro', 'C2L: DefReticulo', 'C2L: DermFibro',
       'C2L: DistalMes', 'C2L: F10+DermFibroProg', 'C2L: HOXC5+DermFibroProg',
       'C2L: HyperChon', 'C2L: ISL1+Mes', 'C2L: InterMusFibro',
       'C2L: InterZone', 'C2L: LMPP/ELP', 'C2L: LymphEndo', 'C2L: MFAP5+Fibro',
       'C2L: MYH3+MyoC', 'C2L: MYL3+MyoC', 'C2L: Macro', 'C2L: Mast',
       'C2L: Megakaryo', 'C2L: Melano', 'C2L: Mes1', 'C2L: Mes2', 'C2L: Mes3',
       'C2L: Mes4', 'C2L: MesCond', 'C2L: Monocyte', 'C2L: Myelocyte',
       'C2L: MyoB1', 'C2L: MyoB2', 'C2L: MyoC1', 'C2L: MyoC2', 'C2L: NK',
       'C2L: NeuralFibro', 'C2L: Neuronal', 'C2L: OCP', 'C2L: OsteoB',
       'C2L: PAX3+MyoProg', 'C2L: PAX7+MyoProg', 'C2L: Perichon',
       'C2L: Pericyte', 'C2L: Periderm', 'C2L: Perimysium',
       

In [148]:
adata.var

Unnamed: 0_level_0,feature_types,genome,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,Gene Expression,GRCh38,False
ENSG00000237613,Gene Expression,GRCh38,False
ENSG00000186092,Gene Expression,GRCh38,False
ENSG00000238009,Gene Expression,GRCh38,False
ENSG00000239945,Gene Expression,GRCh38,False
...,...,...,...
ENSG00000277856,Gene Expression,GRCh38,False
ENSG00000275063,Gene Expression,GRCh38,False
ENSG00000271254,Gene Expression,GRCh38,False
ENSG00000277475,Gene Expression,GRCh38,False


In [149]:
adata.obs

Unnamed: 0,sample,batch,in_tissue,percent_mito,C2L: ADH+Fibro,C2L: AER-Basal,C2L: ArterialEndo,C2L: ArtiChon,C2L: B,C2L: Basal,...,stage,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
WSSS_THYst9383360_AAACAACGAATAGTTC-1,WSSS_THYst9383360,0,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACAAGTATCTCCCA-1,WSSS_THYst9383360,0,0,0.0,6.548307,0.005808,0.138850,0.034429,0.005005,0.002333,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACAATCTACTAGCA-1,WSSS_THYst9383360,0,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACACCAATAACTGC-1,WSSS_THYst9383360,0,1,0.0,0.119246,0.005342,0.031597,0.104752,0.010040,0.004520,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383360_AAACAGAGCGACTCCT-1,WSSS_THYst9383360,0,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_THYst9383362_TTGTTTCACATCCAGG-1,WSSS_THYst9383362,2,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383362_TTGTTTCATTAGTCTA-1,WSSS_THYst9383362,2,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383362_TTGTTTCCATACAACT-1,WSSS_THYst9383362,2,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978
WSSS_THYst9383362_TTGTTTGTATTACACG-1,WSSS_THYst9383362,2,0,,,,,,,,...,PCW8.1,HsapDv:0000030,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000384,na,tissue,UBERON:0000978


In [150]:
adata.obs.columns

Index(['sample', 'batch', 'in_tissue', 'percent_mito', 'C2L: ADH+Fibro',
       'C2L: AER-Basal', 'C2L: ArterialEndo', 'C2L: ArtiChon', 'C2L: B',
       'C2L: Basal', 'C2L: CMP/GMP', 'C2L: ChondroProg', 'C2L: DC2',
       'C2L: DefErythro', 'C2L: DefReticulo', 'C2L: DermFibro',
       'C2L: DistalMes', 'C2L: F10+DermFibroProg', 'C2L: HOXC5+DermFibroProg',
       'C2L: HyperChon', 'C2L: ISL1+Mes', 'C2L: InterMusFibro',
       'C2L: InterZone', 'C2L: LMPP/ELP', 'C2L: LymphEndo', 'C2L: MFAP5+Fibro',
       'C2L: MYH3+MyoC', 'C2L: MYL3+MyoC', 'C2L: Macro', 'C2L: Mast',
       'C2L: Megakaryo', 'C2L: Melano', 'C2L: Mes1', 'C2L: Mes2', 'C2L: Mes3',
       'C2L: Mes4', 'C2L: MesCond', 'C2L: Monocyte', 'C2L: Myelocyte',
       'C2L: MyoB1', 'C2L: MyoB2', 'C2L: MyoC1', 'C2L: MyoC2', 'C2L: NK',
       'C2L: NeuralFibro', 'C2L: Neuronal', 'C2L: OCP', 'C2L: OsteoB',
       'C2L: PAX3+MyoProg', 'C2L: PAX7+MyoProg', 'C2L: Perichon',
       'C2L: Pericyte', 'C2L: Periderm', 'C2L: Perimysium',
       

In [151]:
#check the format of expression matrix

In [152]:
adata.X

<14976x32831 sparse matrix of type '<class 'numpy.float32'>'
	with 11353544 stored elements in Compressed Sparse Row format>

In [153]:
#write the curated object to final_objects folder

In [154]:
# Destination of the curated object inside the Final_objects folder.
curated_h5ad_path = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/limb_cell_atlas/Final_objects/visium_stitched.h5ad'
# Write the AnnData object to disk with gzip compression.
adata.write(curated_h5ad_path, compression='gzip')