### **Curating visium_slide3.h5ad**

Article:   A human embryonic limb cell atlas resolved in space and time

DOI: https://doi.org/10.1038/s41586-023-06806-x

Data Source : https://developmental.cellatlas.io/embryonic-limb

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [None]:
#Import all packages required for curation

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [None]:
# Load the AnnData object

In [None]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/limb_cell_atlas/Data/New_data/visium_slide3.h5ad')

In [None]:
# View the AnnData object

In [None]:
adata

##### **X- expression matrix**

In [None]:
# View the expression matrix of the anndata object

In [None]:
adata.X

In [None]:
# Print the matrix to check whether it holds normalized or raw counts: floating-point values indicate normalized counts, integers indicate raw counts.

In [None]:
print(adata.X)

##### **Variables(var)**

In [None]:
# View the var of the AnnData object

In [None]:
adata.var

In [None]:
# Keep the current gene symbols in a new column of adata.var, then use the Ensembl ids from the existing 'ensg_ids' column as var_names

In [None]:
adata.var['gene_symbols'] = adata.var_names

In [None]:
adata.var_names = adata.var['ensg_ids']

In [None]:
adata.var

In [None]:
# load the approved genes file

In [None]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [None]:
# Create a lookup dictionary keyed by the feature_id column of the approved genes file.

In [None]:
# Lookup table of approved feature ids; only the keys matter, every value is 1.
genedict = dict.fromkeys(approved_genes['feature_id'], 1)

In [None]:
genedict

In [None]:
len(genedict)

In [None]:
# Filter out the genes which are not in the approved genes file

In [None]:
# Keep, in their current order, the var names that appear in the approved lookup.
var_to_keep_adata = [gene_id for gene_id in adata.var_names if gene_id in genedict]

In [None]:
len(var_to_keep_adata)

In [None]:
# Subset the AnnData object to the approved genes.

In [None]:
adata = adata[:, var_to_keep_adata].copy()

In [None]:
#  View the var

In [None]:
adata.var

feature is filtered

In [None]:
# add the feature_is_filtered column (False for every gene)

In [None]:
# Scalar assignment broadcasts False to every row of var.
adata.var['feature_is_filtered'] = False

In [None]:
adata.var

In [None]:
# Remove the ensg_ids and gene_symbols columns from var, one after the other.
for helper_column in ('ensg_ids', 'gene_symbols'):
    del adata.var[helper_column]

#### **Observations(obs) (Cell metadata)**

In [None]:
#view obs

In [None]:
adata.obs

In [None]:
adata.obs.columns

#### **assay_ontology_term_id**

In [None]:
# add the assay_ontology_term_id column

In [None]:
# Every row of obs gets the same assay term (scalar assignment broadcasts).
adata.obs['assay_ontology_term_id'] = 'EFO:0010961'

In [None]:
# change datatype of the column

In [None]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [None]:
# view adata.obs

In [None]:
adata.obs

#### **cell_type_ontology_term_id**

In [None]:
# get the columns in adata.obs related to cell type annotation

In [None]:
adata.obs.columns

In [None]:
adata.obsm

In [None]:
# Names of the obs columns whose name starts with 'C2L', in column order.
c2l_columns = list(filter(lambda name: name.startswith('C2L'), adata.obs.columns))

In [None]:
c2l_columns

In [None]:
# For each row, the name of the C2L column holding the largest value.
c2l_values = adata.obs.loc[:, c2l_columns]
adata.obs['max_c2l_column'] = c2l_values.idxmax(axis='columns')

In [None]:
# For each row, the largest value found across the C2L columns.
c2l_values = adata.obs.loc[:, c2l_columns]
adata.obs['max_c2l_column_value'] = c2l_values.max(axis='columns')

In [None]:
adata.obs

In [None]:
list(adata.obs['max_c2l_column'].unique())

In [None]:
# Replace missing labels with the literal string 'NaN' (the cell type mapping
# below sends 'NaN' to 'unknown') and store every label as a plain string.
adata.obs['max_c2l_column'] = adata.obs['max_c2l_column'].fillna('NaN').astype(str)


In [None]:
list(adata.obs['max_c2l_column'].unique())

In [None]:
# Cell type ontology term id for each C2L column name; the 'NaN' placeholder
# label is sent to 'unknown'.
mapping = {
    'C2L: Teno': 'CL:0000388',
    'C2L: SMC': 'CL:0000192',
    'C2L: F10+DermFibroProg': 'CL:0002551',
    'C2L: MYL3+MyoC': 'CL:0000187',
    'C2L: HOXC5+DermFibroProg': 'CL:0002551',
    'C2L: MFAP5+Fibro': 'CL:0000057',
    'C2L: Pericyte': 'CL:0000669',
    'C2L: HyperChon': 'CL:0000743',
    'C2L: MYH3+MyoC': 'CL:0000187',
    'C2L: Periderm': 'CL:0000078',
    'C2L: Perimysium': 'CL:0002320',
    'C2L: DermFibro': 'CL:0002551',
    'C2L: Schwann': 'CL:0002573',
    'C2L: ADH+Fibro': 'CL:1001609',
    'C2L: RestingChon': 'CL:0000138',
    'C2L: MesCond': 'CL:0000138',
    'C2L: DefErythro': 'CL:0000232',
    'C2L: ArtiChon': 'CL:1001607',
    'C2L: Basal': 'CL:0000646',
    'C2L: PAX7+MyoProg': 'CL:0000187',
    'C2L: Macro': 'CL:0000235',
    'C2L: ArterialEndo': 'CL:1000413',
    'C2L: DefReticulo': 'CL:0000558',
    'C2L: VenousEndo': 'CL:0002543',
    'C2L: OsteoB': 'CL:0000062',
    'C2L: NeuralFibro': 'CL:0000057',
    'C2L: PrimErythro1': 'CL:0002355',
    'C2L: AER-Basal': 'CL:0000646',
    'C2L: ChondroProg': 'CL:0000138',
    'C2L: SMProg': 'CL:0000192',
    'NaN': 'unknown',
    'C2L: DistalMes': 'CL:0008019',
    'C2L: Neuronal': 'CL:0000540',
    'C2L: PrehyperChon': 'CL:0000138',
    'C2L: Megakaryo': 'CL:0000556',
    'C2L: LymphEndo': 'CL:0002138',
    'C2L: Mes4': 'CL:0008019',
    'C2L: STMN2+Fibro': 'CL:0002551',
    'C2L: InterZone': 'CL:0008019',
    'C2L: SynapSchwann': 'CL:0002573',
    'C2L: DC2': 'CL:0000990',
    'C2L: SchwannProg': 'CL:0002573',
    'C2L: PAX3+MyoProg': 'CL:0000515',
    'C2L: Mes3': 'CL:0008019',
    'C2L: Perichon': 'CL:0000058',
    'C2L: TenoProg': 'CL:0000388',
    'C2L: ProlifChon': 'CL:0000138',
    'C2L: RDH10+DistalMes': 'CL:0008019',
    'C2L: PrimErythro2': 'CL:0002355',
    'C2L: TransMes': 'CL:0008019',
    'C2L: Mes2': 'CL:0008019',
}


In [None]:
# create a dictionary of cell type and ontology term

In [None]:
# add the cell_type_ontology_term_id column

In [None]:
# Translate the dominant C2L label of each row into its ontology term id.
cell_type_terms = adata.obs['max_c2l_column'].map(mapping)
# Series.map leaves NaN for labels that are absent from the dictionary; stop
# here instead of carrying missing terms into the final object.
unmapped_labels = adata.obs.loc[cell_type_terms.isna(), 'max_c2l_column'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"No cell type ontology term defined for: {sorted(map(str, unmapped_labels))}"
    )
adata.obs['cell_type_ontology_term_id'] = cell_type_terms

In [None]:
# change datatype of the column

In [None]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [None]:
list(adata.obs['cell_type_ontology_term_id'].unique())

In [None]:
adata.obs

#### **donor_id**

In [None]:
#identify the column in adata.obs which provides donor information

In [None]:
adata.obs.columns

In [None]:
list(adata.obs['library_id'].unique())

In [None]:
# add the donor_id column

In [None]:
# NOTE(review): library_id is copied verbatim as donor_id — confirm that each
# library comes from a distinct donor (several libraries share the same stage
# and sex in the mappings further down).
adata.obs['donor_id'] = adata.obs['library_id']

In [None]:
# change datatype of the column

In [None]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [None]:
# view unique values of donor_id column

In [None]:
list(adata.obs['donor_id'].unique())

In [None]:
#view obs

In [None]:
adata.obs

In [None]:
adata.obs.columns

In [None]:
adata.obs

#### **development_stage_ontology_term_id**

In [None]:
# identify the column in adata which corresponds to age

In [None]:
# add the development_stage_ontology_term_id column

In [None]:
# Stage label for each library_id.
mapping = {
    **dict.fromkeys(
        [
            'WSSS_THYst9383359',
            'WSSS_THYst9383360',
            'WSSS_THYst9383361',
            'WSSS_THYst9383362',
        ],
        'PCW8.1',
    ),
    **dict.fromkeys(['WSSS_THYst9699523', 'WSSS_THYst9699524'], 'PCW7.0'),
    'WSSS_THYst9699525': 'PCW6.2',
    'WSSS_THYst9699526': 'PCW5.6',
}

In [None]:
adata.obs['stage'] = adata.obs['library_id'].map(mapping)

In [None]:
# Development stage ontology term id for each stage label.
mapping = {
    'PCW8.1': 'HsapDv:0000030',
    'PCW7.0': 'HsapDv:0000026',
    'PCW6.2': 'HsapDv:0000024',
    'PCW5.6': 'HsapDv:0000023',
}

In [None]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['stage'].map(mapping)

In [None]:
# change datatype of the column

In [None]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [None]:
# view unique values of development_stage_ontology_term_id column

In [None]:
list(adata.obs['development_stage_ontology_term_id'].unique())

In [None]:
# view adata.obs

In [None]:
adata.obs

#### **disease_ontology_term_id**

In [None]:
# Assign normal since all are healthy patients

In [None]:
# add the disease_ontology_term_id column

In [None]:
# Every row of obs gets the same disease term (scalar assignment broadcasts).
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [None]:
#change data type of column

In [None]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **is_primary_data**

In [None]:
# Flag every row of obs as primary data (scalar assignment broadcasts).
adata.obs['is_primary_data'] = True

In [None]:
adata.obs

In [None]:
#change data type of column

In [None]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

#### **organism_ontology_term_id**

In [None]:
# assign organism id 

In [None]:
# Every row of obs gets the same organism term (scalar assignment broadcasts).
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [None]:
#change data type of column

In [None]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **self_reported_ethnicity_ontology_term_id**

In [None]:
# Ethnicity is recorded as 'unknown' for every row of obs.
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'unknown'

In [None]:
# change data type

In [None]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

In [None]:
list(adata.obs['self_reported_ethnicity_ontology_term_id'].unique())

#### **sex_ontology_term_id**

In [None]:
# Sex label ('F', 'M' or 'unknown') for each library_id.
mapping = {
    '5386STDY7537944': 'F',
    '5478STDY7717491': 'M',
    '5478STDY7717492': 'M',
    '5478STDY7652318': 'M',
    '5386STDY7557336': 'M',
    '5386STDY7557337': 'M',
    '5386STDY7557335': 'F',
    'FCAImmP7536758': 'F',
    'FCAImmP7536759': 'F',
    '5478STDY7980348': 'M',
    '5478STDY7980349': 'M',
    '5478STDY7935101': 'F',
    '5478STDY7935102': 'M',
    'WSSS_THYst9384953': 'M',
    'WSSS_THYst9384954': 'M',
    'WSSS_THYst9384955': 'M',
    'WSSS_THYst9384956': 'M',
    'WSSS_THYst9384957': 'M',
    'WSSS_THYst9384958': 'M',
    'WSSS_THYst8796437': 'M',
    'WSSS_THYst8796438': 'M',
    'WSSS_THYst8796439': 'M',
    'WSSS_THYst8796440': 'M',
    'WSSS_THYst8796441': 'M',
    'WSSS_THYst8796442': 'M',
    'WSSS_THYst9383359': 'M',
    'WSSS_THYst9383360': 'M',
    'WSSS_THYst9383361': 'M',
    'WSSS_THYst9383362': 'M',
    'WSSS_THYst9699523': 'F',
    'WSSS_THYst9699524': 'F',
    'WSSS_THYst9699525': 'F',
    'WSSS_THYst9699526': 'unknown',
}

In [None]:
adata.obs['sex'] = adata.obs['library_id'].map(mapping)

In [None]:
# Sex ontology term id for each sex label; 'unknown' is passed through.
mapping = {
    'F': 'PATO:0000383',
    'M': 'PATO:0000384',
    'unknown': 'unknown',
}

In [None]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex'].map(mapping)

In [None]:
# sex_ontology_term_id has just been derived from the per-library mapping, so
# it is not overwritten with a hard-coded male term here: that would discard
# the mapping and mislabel any female or unknown sample. Instead, fail loudly
# if a library_id had no entry in the mapping (Series.map leaves NaN for it).
unmapped_sex = adata.obs['sex_ontology_term_id'].isna()
if unmapped_sex.any():
    missing_libraries = sorted(
        adata.obs.loc[unmapped_sex, 'library_id'].astype(str).unique()
    )
    raise ValueError(
        f"sex_ontology_term_id is missing for library_id(s): {missing_libraries}"
    )

In [None]:
# change data type

In [None]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [None]:
adata.obs

#### **suspension_type**

In [None]:
# for Visium data the suspension_type is 'na'

In [None]:
# Every row of obs gets suspension_type 'na' (scalar assignment broadcasts).
adata.obs['suspension_type'] = 'na'

In [None]:
# change data type

In [None]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [None]:
# view obs

In [None]:
adata.obs

#### **tissue_type**

In [None]:
# Every row of obs gets tissue_type 'tissue' (scalar assignment broadcasts).
adata.obs['tissue_type'] = 'tissue'

In [None]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [None]:
list(adata.obs['library_id'].unique())

In [None]:
# All three listed libraries share the same tissue ontology term id.
mapping = dict.fromkeys(
    ['WSSS_THYst9383360', 'WSSS_THYst9383361', 'WSSS_THYst9383362'],
    'UBERON:0000978',
)

In [None]:
# Look up the tissue term of each row from its library_id.
tissue_terms = adata.obs['library_id'].map(mapping)
# The tissue mapping lists only a few libraries and Series.map leaves NaN for
# any library_id that is not listed; stop here rather than keep missing terms.
unmapped_tissue = tissue_terms.isna()
if unmapped_tissue.any():
    missing_libraries = sorted(
        adata.obs.loc[unmapped_tissue, 'library_id'].astype(str).unique()
    )
    raise ValueError(
        f"tissue_ontology_term_id is missing for library_id(s): {missing_libraries}"
    )
adata.obs['tissue_ontology_term_id'] = tissue_terms

In [None]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [None]:
list(adata.obs['tissue_ontology_term_id'].unique())

In [None]:
# view obs

In [None]:
adata.obs

In [None]:
adata.obs.columns

In [None]:
# Remove the listed columns from obs, one after the other.
obs_columns_to_remove = (
    'barcode',
    'max_c2l_column',
    'max_c2l_column_value',
    'array_row',
    'array_col',
    'library_id',
    'sex',
)
for obs_column in obs_columns_to_remove:
    del adata.obs[obs_column]

#### **obsm (Embeddings)**

In [None]:
adata.obsm

In [None]:
adata.obsm.keys()

#### **uns (Dataset Metadata)**

In [None]:
adata.uns

In [None]:
adata.uns['image_caption'] = 'Shown here is an image of ten micron thick cryosections of whole embryonic limb samples stained with H&E'

In [None]:
adata.uns['title'] = 'visium_slide3'

In [None]:
# NOTE(review): confirm that 'X_spatial' is one of the keys listed by
# adata.obsm.keys() above.
adata.uns['default_embedding'] = 'X_spatial'

In [None]:
adata.uns.keys()

### **Final checks and adjustments**

In [None]:
adata

In [None]:
adata.obs.dtypes

In [None]:
# Snapshot of the var column dtypes, taken before any conversion.
dty = pd.DataFrame(adata.var.dtypes, columns=['dtype'])
# (current dtype, replacement dtype) pairs, applied in this order.
for old, new in (('float64', 'float32'), ('int64', 'int32')):
    for c in dty.index[dty['dtype'] == old]:
        adata.var[c] = adata.var[c].astype(new)
        print(f"changed {c} from {old} to {new}")

In [None]:
# Snapshot of the obs column dtypes, taken before any conversion.
dty = pd.DataFrame(adata.obs.dtypes, columns=['dtype'])
# (current dtype, replacement dtype) pairs, applied in this order.
for old, new in (('float64', 'float32'), ('int64', 'int32'), ('object', 'category')):
    for c in dty.index[dty['dtype'] == old]:
        adata.obs[c] = adata.obs[c].astype(new)
        print(f"changed {c} from {old} to {new}")

In [None]:
adata.obs

In [None]:
adata.obs.columns

In [None]:
adata.var

In [None]:
adata.obs

In [None]:
adata.obs.columns

In [None]:
#check the format of expression matrix

In [None]:
adata.X

In [None]:
#write the curated object to final_objects folder

In [None]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/limb_cell_atlas/Final_objects/visium_slide3.h5ad', compression = 'gzip')