### **Curating Pla_Camb9518737 datasets**

Article: Spatial multiomics map of trophoblast development in early pregnancy

DOI : https://doi.org/10.1038/s41586-023-05869-0

Data Source : https://www.reproductivecellatlas.org/mfi.html

## Mount farm

mount-farm

## Packages required for curation

### Import all packages required for the curation

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess

## Curation Schema

### X (Matrix Layers)

#### AnnData object

In [2]:
# Load the AnnData object

In [3]:
file = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/new_datasets/Visium_spatial_ID_36173_Pla_Camb9518737_raw'

In [4]:
adata = sc.read_h5ad(file + '.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 4071 × 33538
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

#### Raw Counts matrix

In [7]:
# Check whether adata has raw counts or normalized counts

In [8]:
print(adata.X)

  (1008, 0)	1.0
  (248, 3)	1.0
  (437, 3)	1.0
  (898, 3)	1.0
  (1082, 3)	1.0
  (1465, 3)	1.0
  (1984, 3)	1.0
  (2420, 3)	1.0
  (2761, 3)	1.0
  (3231, 3)	1.0
  (3489, 3)	1.0
  (232, 7)	1.0
  (1428, 7)	1.0
  (92, 12)	1.0
  (139, 12)	1.0
  (156, 12)	1.0
  (175, 12)	1.0
  (208, 12)	1.0
  (221, 12)	1.0
  (275, 12)	1.0
  (441, 12)	1.0
  (463, 12)	1.0
  (466, 12)	1.0
  (612, 12)	1.0
  (758, 12)	1.0
  :	:
  (3377, 33527)	1.0
  (3500, 33527)	1.0
  (3563, 33527)	1.0
  (3703, 33527)	1.0
  (3726, 33527)	1.0
  (3738, 33527)	1.0
  (3767, 33527)	1.0
  (3859, 33527)	1.0
  (3897, 33527)	1.0
  (3993, 33527)	1.0
  (1865, 33528)	1.0
  (2334, 33528)	1.0
  (3610, 33528)	1.0
  (644, 33530)	1.0
  (1331, 33530)	1.0
  (3688, 33530)	1.0
  (3912, 33530)	1.0
  (19, 33535)	1.0
  (477, 33535)	1.0
  (729, 33535)	1.0
  (1216, 33535)	1.0
  (1822, 33535)	1.0
  (1850, 33535)	1.0
  (2276, 33535)	1.0
  (3897, 33535)	1.0


In [9]:
# if its floating point numbers, they are normalized counts.if they are integers, they are raw counts

In [10]:
print(adata.raw.X)

  (0, 27)	7.0
  (0, 29)	1.0
  (0, 32)	1.0
  (0, 39)	5.0
  (0, 47)	1.0
  (0, 48)	1.0
  (0, 51)	1.0
  (0, 52)	5.0
  (0, 54)	2.0
  (0, 61)	1.0
  (0, 66)	2.0
  (0, 71)	1.0
  (0, 74)	1.0
  (0, 78)	3.0
  (0, 89)	2.0
  (0, 93)	1.0
  (0, 98)	1.0
  (0, 117)	1.0
  (0, 121)	2.0
  (0, 122)	1.0
  (0, 151)	1.0
  (0, 154)	4.0
  (0, 157)	1.0
  (0, 163)	1.0
  (0, 166)	4.0
  :	:
  (4070, 30392)	1.0
  (4070, 30683)	1.0
  (4070, 31340)	1.0
  (4070, 31341)	1.0
  (4070, 31342)	2.0
  (4070, 31347)	1.0
  (4070, 31348)	1.0
  (4070, 31441)	1.0
  (4070, 31501)	1.0
  (4070, 31583)	2.0
  (4070, 31588)	1.0
  (4070, 31764)	1.0
  (4070, 32010)	1.0
  (4070, 32293)	1.0
  (4070, 32626)	2.0
  (4070, 32703)	1.0
  (4070, 32727)	2.0
  (4070, 32891)	1.0
  (4070, 33339)	1.0
  (4070, 33396)	1.0
  (4070, 33498)	4.0
  (4070, 33503)	1.0
  (4070, 33504)	1.0
  (4070, 33505)	2.0
  (4070, 33508)	2.0


In [11]:
adata.raw.X

<4071x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 8591161 stored elements in Compressed Sparse Row format>

In [12]:
adata.X

<4071x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 8591161 stored elements in Compressed Sparse Column format>

In [13]:
# Check whether adata and araw has same dimensions

In [14]:
adata

AnnData object with n_obs × n_vars = 4071 × 33538
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

## Variables (var)

In [15]:
# View adata.var and araw.var

In [16]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


In [17]:
adata.raw.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


#### Ensembl IDs

#### Ensembl ids provided in a single column

In [18]:
# Check whether ensembl ids  are provided in  var

In [19]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


##### Approved gene information file

In [20]:
# Give the approved gene information file name and path

In [21]:
var_mapping_file = '/home/jovyan/CellxGene_Curation/Gene_Info_File/genes_approved.csv'

In [22]:
var_map_df = pd.read_csv(var_mapping_file)

In [23]:
var_map_df

Unnamed: 0,feature_id,symb,num,length
0,ERCC-00002,ERCC-00002 (spike-in control),1.0,1061
1,ERCC-00003,ERCC-00003 (spike-in control),1.0,1023
2,ERCC-00004,ERCC-00004 (spike-in control),1.0,523
3,ERCC-00009,ERCC-00009 (spike-in control),1.0,984
4,ERCC-00012,ERCC-00012 (spike-in control),1.0,994
...,...,...,...,...
116179,ENSSASG00005000009,ORF7a,,366
116180,ENSSASG00005000012,ORF7b,,132
116181,ENSSASG00005000008,ORF8,,366
116182,ENSSASG00005000005,N,,1260


In [24]:
#Add a column for Gene IDs based on a column with Gene version if ensembl ids are provided as gene versions

In [25]:
adata.var['gene_symbols'] = adata.var.index

In [26]:
adata.var['ensembl_ids'] = adata.var['gene_ids']

In [27]:
# Make the ensembl ids column as the index column

In [28]:
adata.var = adata.var.set_index('ensembl_ids')

In [29]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3
...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1


##### Filter out genes that dont appear in the approved annotation

In [30]:
var_to_keep = adata.var.index.tolist()

In [31]:
var_to_keep

['ENSG00000243485',
 'ENSG00000237613',
 'ENSG00000186092',
 'ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000239906',
 'ENSG00000241599',
 'ENSG00000236601',
 'ENSG00000284733',
 'ENSG00000235146',
 'ENSG00000284662',
 'ENSG00000229905',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000241180',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000273443',
 'ENSG00000237330',
 'ENSG00000131591',
 'ENSG00000223823',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',


In [32]:
len(var_to_keep)

33538

In [33]:
var_in_approved = adata.var.index[adata.var.index.isin(var_map_df['feature_id'])].tolist()

In [34]:
var_in_approved

['ENSG00000243485',
 'ENSG00000237613',
 'ENSG00000186092',
 'ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000239906',
 'ENSG00000241599',
 'ENSG00000236601',
 'ENSG00000284733',
 'ENSG00000235146',
 'ENSG00000284662',
 'ENSG00000229905',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000241180',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000273443',
 'ENSG00000237330',
 'ENSG00000131591',
 'ENSG00000223823',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',


In [35]:
len(var_in_approved)

33234

In [36]:
var_to_keep = [e for e in var_to_keep if e in var_in_approved]


In [37]:
len(var_to_keep)

33234

In [38]:
adata = adata[:, var_to_keep]

In [39]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3
...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1


#### feature_is_filtered

In [40]:
#This MUST be True if the feature was filtered out in the normalized matrix (X) but is present in the raw matrix (raw.X).Otherwise false 

In [41]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

  adata.var['feature_is_filtered'] = [False] * len(adata.var)


In [42]:
# View var

In [43]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG,False
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A,False
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5,False
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1,False
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3,False
...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2,False
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1,False
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1,False
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1,False


In [44]:
# View adata and araw

In [45]:
adata

AnnData object with n_obs × n_vars = 4071 × 33234
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

## obs (Cell metadata)

In [46]:
# View obs

In [47]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dS1_mRNA_factors,u_sf_dS2_mRNA_factors,u_sf_dS3_mRNA_factors,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,36173_Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,1199.672241,830.871643,1847.062256,308.683197,331.071777,2703.110596,612.525574,375.193420,30358.902344,3932.055176
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,36173_Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,63.943253,84.633728,200.385101,50.112194,52.695633,96.163094,158.929153,243.026108,60.335632,47.268257
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,36173_Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,107.135918,123.268547,332.767090,86.487198,77.772659,276.008575,432.015045,786.160461,127.510620,113.496750
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,36173_Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,113.011932,85.596176,198.011276,65.109856,57.854637,80.722313,1035.936768,454.550720,51.649097,406.043335
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,36173_Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,36.175301,48.243610,85.721558,24.059019,22.264326,22.372486,63.259857,52.433914,17.225506,27.665653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,36173_Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,273.145813,631.961670,2342.851318,200.064514,182.693848,2075.194824,254.549438,175.300339,15494.495117,347.261902
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,36173_Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,71.097427,201.248489,2487.297852,48.246975,45.208618,302.110321,165.335098,133.029633,125.961365,71.110420
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,36173_Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,55.443493,68.418495,489.472809,45.392971,24.001490,219.425446,52.594616,51.875038,227.164246,55.441219
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,36173_Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,82.026070,102.952812,975.242126,45.228523,42.278877,137.239288,257.849213,354.588135,137.183762,98.473907


In [48]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density',
       'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density',
       'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density',
       'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density',
       'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density',
       'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density',
       'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density',
       'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density',
       'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density',
       'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density',
       'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density',
       'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density',
       'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density',
       'w_sf_dNK2_cell_density

### assay_ontology_term_id

In [49]:
# if same for all columns

In [50]:
adata.obs['assay_ontology_term_id'] = ['EFO:0010961'] * len(adata.obs)

In [51]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dS2_mRNA_factors,u_sf_dS3_mRNA_factors,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,36173_Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,830.871643,1847.062256,308.683197,331.071777,2703.110596,612.525574,375.193420,30358.902344,3932.055176,EFO:0010961
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,36173_Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,84.633728,200.385101,50.112194,52.695633,96.163094,158.929153,243.026108,60.335632,47.268257,EFO:0010961
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,36173_Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,123.268547,332.767090,86.487198,77.772659,276.008575,432.015045,786.160461,127.510620,113.496750,EFO:0010961
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,36173_Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,85.596176,198.011276,65.109856,57.854637,80.722313,1035.936768,454.550720,51.649097,406.043335,EFO:0010961
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,36173_Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,48.243610,85.721558,24.059019,22.264326,22.372486,63.259857,52.433914,17.225506,27.665653,EFO:0010961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,36173_Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,631.961670,2342.851318,200.064514,182.693848,2075.194824,254.549438,175.300339,15494.495117,347.261902,EFO:0010961
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,36173_Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,201.248489,2487.297852,48.246975,45.208618,302.110321,165.335098,133.029633,125.961365,71.110420,EFO:0010961
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,36173_Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,68.418495,489.472809,45.392971,24.001490,219.425446,52.594616,51.875038,227.164246,55.441219,EFO:0010961
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,36173_Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,102.952812,975.242126,45.228523,42.278877,137.239288,257.849213,354.588135,137.183762,98.473907,EFO:0010961


### cell_type_ontology_term_id

In [52]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density',
       'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density',
       'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density',
       'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density',
       'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density',
       'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density',
       'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density',
       'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density',
       'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density',
       'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density',
       'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density',
       'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density',
       'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density',
       'w_sf_dNK2_cell_density

In [53]:
cell_density_columns = [col for col in adata.obs.columns if 'cell_density' in col]

In [54]:
cell_density_columns

['w_sf_B_cells_cell_density',
 'w_sf_DC_cell_density',
 'w_sf_EVT_1_cell_density',
 'w_sf_EVT_2_cell_density',
 'w_sf_Endo_F_cell_density',
 'w_sf_Endo_L_cell_density',
 'w_sf_Endo_M_cell_density',
 'w_sf_GC_cell_density',
 'w_sf_Granulocytes_cell_density',
 'w_sf_HOFB_cell_density',
 'w_sf_ILC3_cell_density',
 'w_sf_M3_cell_density',
 'w_sf_MO_cell_density',
 'w_sf_NK_cell_density',
 'w_sf_PV_cell_density',
 'w_sf_Plasma_cell_density',
 'w_sf_SCT_cell_density',
 'w_sf_T_cells_cell_density',
 'w_sf_VCT_cell_density',
 'w_sf_VCT_CCC_cell_density',
 'w_sf_VCT_fusing_cell_density',
 'w_sf_VCT_p_cell_density',
 'w_sf_dDC_cell_density',
 'w_sf_dEpi_lumenal_cell_density',
 'w_sf_dEpi_secretory_cell_density',
 'w_sf_dM1_cell_density',
 'w_sf_dM2_cell_density',
 'w_sf_dNK1_cell_density',
 'w_sf_dNK2_cell_density',
 'w_sf_dNK3_cell_density',
 'w_sf_dS1_cell_density',
 'w_sf_dS2_cell_density',
 'w_sf_dS3_cell_density',
 'w_sf_dT_cells_cell_density',
 'w_sf_dT_regs_cell_density',
 'w_sf_eEVT_cell

In [55]:
max_columns = adata.obs[cell_density_columns].apply(lambda x: x[x == x.max()].index.tolist(), axis=1)

In [56]:
max_columns

barcode_sample
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737              [w_sf_iEVT_cell_density]
AAACACCAATAACTGC-1_36173_Pla_Camb9518737               [w_sf_SCT_cell_density]
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737               [w_sf_VCT_cell_density]
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737               [w_sf_fF1_cell_density]
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737    [w_sf_dEpi_secretory_cell_density]
                                                           ...                
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737              [w_sf_iEVT_cell_density]
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737               [w_sf_dS3_cell_density]
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737               [w_sf_dS3_cell_density]
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737               [w_sf_SCT_cell_density]
TTGTTTGTATTACACG-1_36173_Pla_Camb9518737               [w_sf_SCT_cell_density]
Length: 4071, dtype: object

In [57]:
columns_with_multiple_max = max_columns.apply(lambda x: [col for col in x if x.count(col)>1])

In [58]:
adata.obs['columns_with_multiple_max'] = columns_with_multiple_max

In [59]:
adata.obs['columns_with_multiple_max']

barcode_sample
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737    []
AAACACCAATAACTGC-1_36173_Pla_Camb9518737    []
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737    []
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737    []
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737    []
                                            ..
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737    []
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737    []
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737    []
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737    []
TTGTTTGTATTACACG-1_36173_Pla_Camb9518737    []
Name: columns_with_multiple_max, Length: 4071, dtype: object

In [60]:
non_empty_columns = columns_with_multiple_max[columns_with_multiple_max .apply(lambda x: len(x) > 0)]

In [61]:
len(non_empty_columns )

0

In [62]:
#max_columns = max_columns.apply(lambda x: [col.replace('w_sf_','').replace('_cell_density','') for col in x])

In [63]:
max_columns = [col[0].replace('w_sf_','').replace('_cell_density','')if len(col) > 0 else '' for col in max_columns]

In [64]:
adata.obs['highest_cell_density_columns'] = max_columns

In [65]:
adata.obs['highest_cell_density_columns']

barcode_sample
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737              iEVT
AAACACCAATAACTGC-1_36173_Pla_Camb9518737               SCT
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737               VCT
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737               fF1
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737    dEpi_secretory
                                                 ...      
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737              iEVT
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737               dS3
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737               dS3
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737               SCT
TTGTTTGTATTACACG-1_36173_Pla_Camb9518737               SCT
Name: highest_cell_density_columns, Length: 4071, dtype: object

In [66]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,36173_Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,308.683197,331.071777,2703.110596,612.525574,375.193420,30358.902344,3932.055176,EFO:0010961,[],iEVT
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,36173_Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,50.112194,52.695633,96.163094,158.929153,243.026108,60.335632,47.268257,EFO:0010961,[],SCT
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,36173_Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,86.487198,77.772659,276.008575,432.015045,786.160461,127.510620,113.496750,EFO:0010961,[],VCT
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,36173_Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,65.109856,57.854637,80.722313,1035.936768,454.550720,51.649097,406.043335,EFO:0010961,[],fF1
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,36173_Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,24.059019,22.264326,22.372486,63.259857,52.433914,17.225506,27.665653,EFO:0010961,[],dEpi_secretory
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,36173_Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,200.064514,182.693848,2075.194824,254.549438,175.300339,15494.495117,347.261902,EFO:0010961,[],iEVT
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,36173_Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,48.246975,45.208618,302.110321,165.335098,133.029633,125.961365,71.110420,EFO:0010961,[],dS3
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,36173_Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,45.392971,24.001490,219.425446,52.594616,51.875038,227.164246,55.441219,EFO:0010961,[],dS3
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,36173_Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,45.228523,42.278877,137.239288,257.849213,354.588135,137.183762,98.473907,EFO:0010961,[],SCT


In [67]:
adata.obs['highest_cell_density_columns']

barcode_sample
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737              iEVT
AAACACCAATAACTGC-1_36173_Pla_Camb9518737               SCT
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737               VCT
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737               fF1
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737    dEpi_secretory
                                                 ...      
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737              iEVT
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737               dS3
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737               dS3
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737               SCT
TTGTTTGTATTACACG-1_36173_Pla_Camb9518737               SCT
Name: highest_cell_density_columns, Length: 4071, dtype: object

In [68]:
list(adata.obs['highest_cell_density_columns'].unique())

['iEVT',
 'SCT',
 'VCT',
 'fF1',
 'dEpi_secretory',
 'dS3',
 'uSMC',
 'VCT_p',
 'EVT_1',
 'HOFB',
 'fF2',
 'VCT_CCC',
 'VCT_fusing',
 'EVT_2',
 'Endo_F',
 'dNK1',
 'Endo_L',
 'dM1',
 'eEVT',
 'GC',
 'M3']

In [69]:
# if different samples have different terms

In [70]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['highest_cell_density_columns']

In [71]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['iEVT',
 'SCT',
 'VCT',
 'fF1',
 'dEpi_secretory',
 'dS3',
 'uSMC',
 'VCT_p',
 'EVT_1',
 'HOFB',
 'fF2',
 'VCT_CCC',
 'VCT_fusing',
 'EVT_2',
 'Endo_F',
 'dNK1',
 'Endo_L',
 'dM1',
 'eEVT',
 'GC',
 'M3']

In [72]:
#create a dictionary called mapping with version and assay_ontology_term_id

In [73]:
cl_tropho = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/cl_tropho.csv')

In [74]:
mapping = dict(zip(cl_tropho['cell_type'], cl_tropho['CL ID']))

In [75]:
mapping

{'dNK1': 'CL:0002343',
 'dT_cells': 'CL:0000084',
 'T_cells': 'CL:0000084',
 'dNK2': 'CL:0002343',
 'ILC3': 'CL:0001078',
 'dNK3': 'CL:0002343',
 'dT_regs': 'CL:0000815',
 'dM1': 'CL:0000235',
 'HOFB': 'CL:3000001',
 'B_cells': 'CL:0000236',
 'dDC': 'CL:0000451',
 'dM2': 'CL:0000235',
 'dS2': 'CL:0000499',
 'M3': 'CL:0000235',
 'VCT': 'CL:2000060',
 'NK': 'CL:0000623',
 'Granulocytes': 'CL:0000094',
 'uSMC': 'CL:0002601',
 'Endo_F': 'CL:0009092',
 'DC': 'CL:0000451',
 'dEpi_secretory': 'CL:0000066',
 'MO': 'CL:0000235',
 'EVT_2': 'CL:0008036',
 'Endo_M': 'CL:0009095',
 'SCT': 'CL:0000525',
 'dS1': 'CL:0000499',
 'dS3': 'CL:0000499',
 'dEpi_lumenal': 'CL:0000066',
 'iEVT': 'CL:0008036',
 'Endo_L': 'CL:0002138',
 'PV MMP11': 'CL:0000003',
 'PVMMP11': 'CL:0000003',
 'PVSTEAP4': 'CL:0000003',
 'PV STEAP4': 'CL:0000003',
 'EVT_1': 'CL:0008036',
 'PVAOC3': 'CL:0000003',
 'PV AOC3': 'CL:0000003',
 'fF1': 'CL:2000042',
 'VCT_CCC': 'CL:2000060',
 'fF2': 'CL:2000042',
 'VCT_p': 'CL:2000060',
 'P

In [76]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [77]:
# change the data type 

In [78]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [79]:
# if mapping from supplementary file

In [80]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,36173_Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,331.071777,2703.110596,612.525574,375.193420,30358.902344,3932.055176,EFO:0010961,[],iEVT,CL:0008036
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,36173_Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,52.695633,96.163094,158.929153,243.026108,60.335632,47.268257,EFO:0010961,[],SCT,CL:0000525
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,36173_Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,77.772659,276.008575,432.015045,786.160461,127.510620,113.496750,EFO:0010961,[],VCT,CL:2000060
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,36173_Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,57.854637,80.722313,1035.936768,454.550720,51.649097,406.043335,EFO:0010961,[],fF1,CL:2000042
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,36173_Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,22.264326,22.372486,63.259857,52.433914,17.225506,27.665653,EFO:0010961,[],dEpi_secretory,CL:0000066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,36173_Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,182.693848,2075.194824,254.549438,175.300339,15494.495117,347.261902,EFO:0010961,[],iEVT,CL:0008036
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,36173_Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,45.208618,302.110321,165.335098,133.029633,125.961365,71.110420,EFO:0010961,[],dS3,CL:0000499
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,36173_Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,24.001490,219.425446,52.594616,51.875038,227.164246,55.441219,EFO:0010961,[],dS3,CL:0000499
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,36173_Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,42.278877,137.239288,257.849213,354.588135,137.183762,98.473907,EFO:0010961,[],SCT,CL:0000525


## development_stage_ontology_term_id

In [81]:
# if same for all columns

In [82]:
suppl_info = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/suppl_info/visium_metadata.csv')

In [83]:
adata.obs['sample'] = adata.obs['sample'].str.split('_',n=1).str[1]

In [84]:
list(adata.obs['sample'].unique())

['Pla_Camb9518737']

In [85]:
list(adata.obs['sample'].unique())

['Pla_Camb9518737']

In [86]:
mapping = dict(zip(suppl_info['sample'], suppl_info['Stage_PCW']))

In [87]:
mapping

{'Pla_HDBR9518710': '6_PCW',
 'WS_PLA_S9101764': '8-9_PCW',
 'WS_PLA_S9101765': '8-9_PCW',
 'WS_PLA_S9101766': '8-9_PCW',
 'WS_PLA_S9101767': '8-9_PCW',
 'Pla_Camb9518737': '8-9_PCW',
 'WS_PLA_S9101769': '8_PCW',
 'WS_PLA_S9101770': '8_PCW'}

In [88]:
adata.obs['development_stage'] = adata.obs['sample'].map(mapping)

In [89]:
list(adata.obs['development_stage'].unique())

['8-9_PCW']

In [90]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,2703.110596,612.525574,375.193420,30358.902344,3932.055176,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,96.163094,158.929153,243.026108,60.335632,47.268257,EFO:0010961,[],SCT,CL:0000525,8-9_PCW
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,276.008575,432.015045,786.160461,127.510620,113.496750,EFO:0010961,[],VCT,CL:2000060,8-9_PCW
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,80.722313,1035.936768,454.550720,51.649097,406.043335,EFO:0010961,[],fF1,CL:2000042,8-9_PCW
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,22.372486,63.259857,52.433914,17.225506,27.665653,EFO:0010961,[],dEpi_secretory,CL:0000066,8-9_PCW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,2075.194824,254.549438,175.300339,15494.495117,347.261902,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,302.110321,165.335098,133.029633,125.961365,71.110420,EFO:0010961,[],dS3,CL:0000499,8-9_PCW
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,219.425446,52.594616,51.875038,227.164246,55.441219,EFO:0010961,[],dS3,CL:0000499,8-9_PCW
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,137.239288,257.849213,354.588135,137.183762,98.473907,EFO:0010961,[],SCT,CL:0000525,8-9_PCW


In [91]:
mapping = {'10_PCW':'HsapDv:0000048',
'8_PCW':'HsapDv:0000030',
'6_PCW':'HsapDv:0000025',
'7-8_PCW':'HsapDv:0000029',
'4-5_PCW':'HsapDv:0000021',
'9_PCW':'HsapDv:0000047',
'12_PCW':'HsapDv:0000049',
'12-13_PCW':'HsapDv:0000050',
'5_PCW':'HsapDv:0000023',
'8-9_PCW':'HsapDv:0000046',
'4_PCW':'HsapDv:0000020'}

In [92]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage'].map(mapping)

In [93]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,612.525574,375.193420,30358.902344,3932.055176,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,158.929153,243.026108,60.335632,47.268257,EFO:0010961,[],SCT,CL:0000525,8-9_PCW,HsapDv:0000046
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,432.015045,786.160461,127.510620,113.496750,EFO:0010961,[],VCT,CL:2000060,8-9_PCW,HsapDv:0000046
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,1035.936768,454.550720,51.649097,406.043335,EFO:0010961,[],fF1,CL:2000042,8-9_PCW,HsapDv:0000046
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,63.259857,52.433914,17.225506,27.665653,EFO:0010961,[],dEpi_secretory,CL:0000066,8-9_PCW,HsapDv:0000046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,254.549438,175.300339,15494.495117,347.261902,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,165.335098,133.029633,125.961365,71.110420,EFO:0010961,[],dS3,CL:0000499,8-9_PCW,HsapDv:0000046
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,52.594616,51.875038,227.164246,55.441219,EFO:0010961,[],dS3,CL:0000499,8-9_PCW,HsapDv:0000046
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,257.849213,354.588135,137.183762,98.473907,EFO:0010961,[],SCT,CL:0000525,8-9_PCW,HsapDv:0000046


## disease_ontology_term_id

In [94]:
# if same for all columns

In [95]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461'] * len(adata.obs)

In [96]:
# change the data type 

In [97]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [98]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,375.193420,30358.902344,3932.055176,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,243.026108,60.335632,47.268257,EFO:0010961,[],SCT,CL:0000525,8-9_PCW,HsapDv:0000046,PATO:0000461
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,786.160461,127.510620,113.496750,EFO:0010961,[],VCT,CL:2000060,8-9_PCW,HsapDv:0000046,PATO:0000461
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,454.550720,51.649097,406.043335,EFO:0010961,[],fF1,CL:2000042,8-9_PCW,HsapDv:0000046,PATO:0000461
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,52.433914,17.225506,27.665653,EFO:0010961,[],dEpi_secretory,CL:0000066,8-9_PCW,HsapDv:0000046,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,175.300339,15494.495117,347.261902,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,133.029633,125.961365,71.110420,EFO:0010961,[],dS3,CL:0000499,8-9_PCW,HsapDv:0000046,PATO:0000461
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,51.875038,227.164246,55.441219,EFO:0010961,[],dS3,CL:0000499,8-9_PCW,HsapDv:0000046,PATO:0000461
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,354.588135,137.183762,98.473907,EFO:0010961,[],SCT,CL:0000525,8-9_PCW,HsapDv:0000046,PATO:0000461


## donor_id

In [99]:
mapping = dict(zip(suppl_info['sample'], suppl_info['Donor']))

In [100]:
mapping

{'Pla_HDBR9518710': 'Hrv43',
 'WS_PLA_S9101764': 'P13',
 'WS_PLA_S9101765': 'P13',
 'WS_PLA_S9101766': 'P13',
 'WS_PLA_S9101767': 'P13',
 'Pla_Camb9518737': 'P13',
 'WS_PLA_S9101769': 'P14',
 'WS_PLA_S9101770': 'P14'}

In [101]:
adata.obs['donor_id'] = adata.obs['sample'].map(mapping)

In [102]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,30358.902344,3932.055176,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,60.335632,47.268257,EFO:0010961,[],SCT,CL:0000525,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,127.510620,113.496750,EFO:0010961,[],VCT,CL:2000060,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,51.649097,406.043335,EFO:0010961,[],fF1,CL:2000042,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,17.225506,27.665653,EFO:0010961,[],dEpi_secretory,CL:0000066,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,15494.495117,347.261902,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,125.961365,71.110420,EFO:0010961,[],dS3,CL:0000499,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,227.164246,55.441219,EFO:0010961,[],dS3,CL:0000499,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,137.183762,98.473907,EFO:0010961,[],SCT,CL:0000525,8-9_PCW,HsapDv:0000046,PATO:0000461,P13


## is_primary_data

In [103]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [104]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [105]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [106]:
# change the data type 

In [107]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

## self_reported_ethnicity_ontology_term_id

In [108]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

## sex_ontology_term_id

In [109]:
# if same for all columns

In [110]:
adata.obs['sex_ontology_term_id'] = ['unknown'] * len(adata.obs)

## suspension_type

In [111]:
# if same for all columns

In [112]:
adata.obs['suspension_type'] = ['na'] * len(adata.obs)

In [113]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,CL:0000525,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,CL:2000060,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,CL:2000042,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,CL:0000066,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,CL:0000499,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,CL:0000499,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,CL:0000525,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na


## tissue_ontology_term_id

In [114]:
suppl_tissue = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/suppl_info/tissue_info.csv')

In [115]:
mapping = dict(zip(suppl_tissue['Donor'], suppl_tissue['Tissue']))

In [116]:
mapping

{'P13': 'Placenta, decidua basalis',
 'P14': 'Placenta, decidua basalis',
 '5': 'Placenta, decidua basalis',
 '10': 'Placenta, decidua basalis',
 '11': 'Placenta',
 'H2': 'Decidua basalis',
 'H7': 'Decidua basalis',
 'H9': 'Decidua basalis',
 'Hrv98': 'Placenta',
 'Hrv99': 'Placenta',
 'Hrv100': 'Placenta',
 'Hrv43': 'Decidua basalis',
 'Hrv46': 'Placenta, decidua basalis',
 'P34': 'Placenta, decidua basalis',
 'Hrv81': 'Decidua basalis',
 '6': 'Decidua basalis, peripheral blood',
 '7': 'Decidua basalis, peripheral blood',
 '8': 'Decidua basalis, peripheral blood',
 '9': 'Decidua basalis, peripheral blood',
 '12': 'Placenta, decidua basalis'}

In [117]:
adata.obs['tissue_ontology_term_id'] = adata.obs['donor_id'].map(mapping)

In [118]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"


In [119]:
list(adata.obs['tissue_ontology_term_id'].unique())

['Placenta, decidua basalis']

In [120]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0000453'] * len(adata.obs)

In [121]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453


In [122]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density',
       ...
       'development_stage', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=102)

### Data types

In [123]:
adata.obs.dtypes

in_tissue                                   category
array_row                                    float32
array_col                                    float32
sample                                        object
n_counts                                     float32
                                              ...   
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id      object
sex_ontology_term_id                          object
suspension_type                               object
tissue_ontology_term_id                       object
Length: 102, dtype: object

### obsm (Embeddings)

In [124]:
adata.obsm

AxisArrays with keys: X_spatial

### uns

In [125]:
adata.uns

OverloadedDict, wrapping:
	{'spatial': {'spaceranger110_count_36173_Pla_Camb9518737_GRCh38-3_0_0_premrna': {'images': {'hires': array([[[0.9372549 , 0.93333334, 0.9529412 ],
        [0.9372549 , 0.93333334, 0.9529412 ],
        [0.9372549 , 0.93333334, 0.9529412 ],
        ...,
        [0.92941177, 0.9254902 , 0.94509804],
        [0.92941177, 0.9254902 , 0.94509804],
        [0.92941177, 0.9254902 , 0.94509804]],

       [[0.9372549 , 0.93333334, 0.9529412 ],
        [0.9372549 , 0.93333334, 0.9529412 ],
        [0.9372549 , 0.93333334, 0.9529412 ],
        ...,
        [0.93333334, 0.92941177, 0.9490196 ],
        [0.93333334, 0.92941177, 0.9490196 ],
        [0.93333334, 0.92941177, 0.9490196 ]],

       [[0.9372549 , 0.93333334, 0.9490196 ],
        [0.9372549 , 0.93333334, 0.9490196 ],
        [0.9372549 , 0.93333334, 0.9490196 ],
        ...,
        [0.93333334, 0.92941177, 0.94509804],
        [0.93333334, 0.92941177, 0.94509804],
        [0.93333334, 0.92941177, 0.94509804]],


In [126]:
# define a field in uns

In [127]:
adata.uns['schema_version'] = '3.0.0'
adata.uns['default_embedding'] = 'X_spatial'
adata.uns['title'] = 'Pla_Camb9518737'
adata.uns['image_caption'] = 'Shown here is an image of 10 μm cryosections of human maternal-fetal interface (decidua, placenta) tissue stained with H&E'

### Final checks

In [128]:
# View Anndata object

In [129]:
adata

AnnData object with n_obs × n_vars = 4071 × 33234
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

In [130]:
#View var

In [131]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG,False
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A,False
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5,False
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1,False
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3,False
...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2,False
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1,False
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1,False
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1,False


In [132]:
#View obs

In [133]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_Camb9518737,1,50.0,102.0,Pla_Camb9518737,9363.0,9.144521,3418.0,0.005020,0.044958,0.080001,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACACCAATAACTGC-1_36173_Pla_Camb9518737,1,59.0,19.0,Pla_Camb9518737,598.0,6.393591,483.0,0.046823,0.001171,0.002001,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGCTTTCAGAAG-1_36173_Pla_Camb9518737,1,43.0,9.0,Pla_Camb9518737,1559.0,7.351800,1132.0,0.041693,0.006170,0.006138,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGGGTCTATATT-1_36173_Pla_Camb9518737,1,47.0,13.0,Pla_Camb9518737,949.0,6.855409,761.0,0.054795,0.001952,0.003842,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGTGTTCCTGGG-1_36173_Pla_Camb9518737,1,73.0,43.0,Pla_Camb9518737,139.0,4.934474,124.0,0.050360,0.000267,0.000180,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_36173_Pla_Camb9518737,1,31.0,77.0,Pla_Camb9518737,5667.0,8.642415,2491.0,0.007411,0.020403,0.029312,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCACATCCAGG-1_36173_Pla_Camb9518737,1,58.0,42.0,Pla_Camb9518737,1262.0,7.140453,834.0,0.036450,0.001771,0.004233,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCATTAGTCTA-1_36173_Pla_Camb9518737,1,60.0,30.0,Pla_Camb9518737,403.0,5.998937,314.0,0.044665,0.000331,0.001034,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCCATACAACT-1_36173_Pla_Camb9518737,1,45.0,27.0,Pla_Camb9518737,820.0,6.709304,587.0,0.043902,0.001003,0.001412,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453


In [134]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density',
       ...
       'development_stage', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=102)

In [135]:
del adata.obs['columns_with_multiple_max']
del adata.obs['development_stage']
del adata.obs['highest_cell_density_columns']

In [136]:
#check data type of obs

In [137]:
adata.obs.dtypes

in_tissue                                   category
array_row                                    float32
array_col                                    float32
sample                                        object
n_counts                                     float32
                                              ...   
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id      object
sex_ontology_term_id                          object
suspension_type                               object
tissue_ontology_term_id                       object
Length: 99, dtype: object

In [138]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed sample from object to category
changed assay_ontology_term_id from object to category
changed development_stage_ontology_term_id from object to category
changed donor_id from object to category
changed self_reported_ethnicity_ontology_term_id from object to category
changed sex_ontology_term_id from object to category
changed suspension_type from object to category
changed tissue_ontology_term_id from object to category


In [139]:
#change data type

In [140]:
#check data type of var

In [141]:
adata.var.dtypes

gene_ids                 object
feature_types          category
genome                 category
gene_symbols             object
feature_is_filtered        bool
dtype: object

In [142]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [143]:
# delete columns not required

In [144]:
# Convert expression matrix to sparse matrix format

In [145]:
adata.X

<4071x33234 sparse matrix of type '<class 'numpy.float32'>'
	with 8586259 stored elements in Compressed Sparse Column format>

In [146]:
# check the format of the expression matrix. If the matrix is in CSR (Compressed sparse row) format keep it as such
# If the file format of the matrix is compressed sparse column (CSC) format or dense numpy array, convert the expression matrix into CSR format.

In [147]:
adata.X = scipy.sparse.csr_matrix(adata.X)

In [148]:
adata.X

<4071x33234 sparse matrix of type '<class 'numpy.float32'>'
	with 8586259 stored elements in Compressed Sparse Row format>

In [149]:
del adata.raw

In [150]:
# save the final object

In [151]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/final_objects/Pla_Camb9518737.h5ad', compression = 'gzip')