### **Curating WS_PLA_S9101767 datasets**

Article: Spatial multiomics map of trophoblast development in early pregnancy

DOI : https://doi.org/10.1038/s41586-023-05869-0

Data Source : https://www.reproductivecellatlas.org/mfi.html

## Mount farm

mount-farm

## Packages required for curation

### Import all packages required for the curation

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess

## Curation Schema

### X (Matrix Layers)

#### AnnData object

In [2]:
# Load the AnnData object

In [3]:
file = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/new_datasets/Visium_spatial_ID_34914_WS_PLA_S9101767_raw'

In [4]:
adata = sc.read_h5ad(file + '.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 4186 × 33538
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

#### Raw Counts matrix

In [7]:
# Check whether adata has raw counts or normalized counts

In [8]:
print(adata.X)

  (2384, 0)	1.0
  (2726, 0)	1.0
  (132, 3)	1.0
  (690, 3)	1.0
  (1351, 3)	1.0
  (1720, 3)	1.0
  (3175, 3)	1.0
  (3467, 3)	1.0
  (3487, 3)	1.0
  (3760, 3)	1.0
  (1388, 9)	1.0
  (1171, 11)	1.0
  (4, 12)	1.0
  (15, 12)	1.0
  (31, 12)	2.0
  (41, 12)	1.0
  (45, 12)	1.0
  (132, 12)	1.0
  (147, 12)	1.0
  (159, 12)	1.0
  (182, 12)	1.0
  (197, 12)	1.0
  (309, 12)	1.0
  (336, 12)	1.0
  (378, 12)	1.0
  :	:
  (4108, 33527)	1.0
  (499, 33528)	1.0
  (1320, 33528)	1.0
  (1631, 33528)	1.0
  (2094, 33528)	1.0
  (2363, 33528)	1.0
  (2439, 33528)	1.0
  (2809, 33528)	1.0
  (3698, 33528)	1.0
  (1910, 33530)	1.0
  (3574, 33530)	1.0
  (1816, 33532)	1.0
  (15, 33535)	1.0
  (439, 33535)	2.0
  (624, 33535)	1.0
  (795, 33535)	1.0
  (1317, 33535)	1.0
  (1441, 33535)	1.0
  (1545, 33535)	1.0
  (1792, 33535)	1.0
  (1998, 33535)	1.0
  (2589, 33535)	1.0
  (3502, 33535)	1.0
  (3800, 33535)	1.0
  (4077, 33535)	1.0


In [9]:
# if its floating point numbers, they are normalized counts.if they are integers, they are raw counts

In [10]:
print(adata.raw.X)

  (0, 22)	1.0
  (0, 27)	3.0
  (0, 29)	1.0
  (0, 39)	1.0
  (0, 52)	2.0
  (0, 55)	1.0
  (0, 61)	1.0
  (0, 66)	2.0
  (0, 86)	1.0
  (0, 89)	2.0
  (0, 128)	1.0
  (0, 167)	1.0
  (0, 174)	1.0
  (0, 190)	1.0
  (0, 193)	1.0
  (0, 201)	2.0
  (0, 216)	1.0
  (0, 219)	1.0
  (0, 269)	2.0
  (0, 277)	1.0
  (0, 378)	1.0
  (0, 390)	1.0
  (0, 391)	1.0
  (0, 412)	2.0
  (0, 449)	2.0
  :	:
  (4185, 31670)	1.0
  (4185, 31760)	1.0
  (4185, 31796)	1.0
  (4185, 32425)	1.0
  (4185, 32445)	1.0
  (4185, 32626)	4.0
  (4185, 32643)	1.0
  (4185, 32677)	1.0
  (4185, 32883)	1.0
  (4185, 32891)	3.0
  (4185, 32963)	1.0
  (4185, 33211)	1.0
  (4185, 33227)	1.0
  (4185, 33245)	1.0
  (4185, 33249)	1.0
  (4185, 33279)	1.0
  (4185, 33343)	1.0
  (4185, 33474)	1.0
  (4185, 33479)	1.0
  (4185, 33498)	3.0
  (4185, 33499)	1.0
  (4185, 33501)	1.0
  (4185, 33502)	1.0
  (4185, 33505)	6.0
  (4185, 33508)	1.0


In [11]:
adata.raw.X

<4186x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 10348056 stored elements in Compressed Sparse Row format>

In [12]:
adata.X

<4186x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 10348056 stored elements in Compressed Sparse Column format>

In [13]:
# Check whether adata and araw has same dimensions

In [14]:
adata

AnnData object with n_obs × n_vars = 4186 × 33538
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

## Variables (var)

In [15]:
# View adata.var and araw.var

In [16]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


In [17]:
adata.raw.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


#### Ensembl IDs

#### Ensembl ids provided in a single column

In [18]:
# Check whether ensembl ids  are provided in  var

In [19]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


##### Approved gene information file

In [20]:
# Give the approved gene information file name and path

In [21]:
var_mapping_file = '/home/jovyan/CellxGene_Curation/Gene_Info_File/genes_approved.csv'

In [22]:
var_map_df = pd.read_csv(var_mapping_file)

In [23]:
var_map_df

Unnamed: 0,feature_id,symb,num,length
0,ERCC-00002,ERCC-00002 (spike-in control),1.0,1061
1,ERCC-00003,ERCC-00003 (spike-in control),1.0,1023
2,ERCC-00004,ERCC-00004 (spike-in control),1.0,523
3,ERCC-00009,ERCC-00009 (spike-in control),1.0,984
4,ERCC-00012,ERCC-00012 (spike-in control),1.0,994
...,...,...,...,...
116179,ENSSASG00005000009,ORF7a,,366
116180,ENSSASG00005000012,ORF7b,,132
116181,ENSSASG00005000008,ORF8,,366
116182,ENSSASG00005000005,N,,1260


In [24]:
#Add a column for Gene IDs based on a column with Gene version if ensembl ids are provided as gene versions

In [25]:
adata.var['gene_symbols'] = adata.var.index

In [26]:
adata.var['ensembl_ids'] = adata.var['gene_ids']

In [27]:
# Make the ensembl ids column as the index column

In [28]:
adata.var = adata.var.set_index('ensembl_ids')

In [29]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3
...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1


##### Filter out genes that dont appear in the approved annotation

In [30]:
var_to_keep = adata.var.index.tolist()

In [31]:
var_to_keep

['ENSG00000243485',
 'ENSG00000237613',
 'ENSG00000186092',
 'ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000239906',
 'ENSG00000241599',
 'ENSG00000236601',
 'ENSG00000284733',
 'ENSG00000235146',
 'ENSG00000284662',
 'ENSG00000229905',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000241180',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000273443',
 'ENSG00000237330',
 'ENSG00000131591',
 'ENSG00000223823',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',


In [32]:
len(var_to_keep)

33538

In [33]:
var_in_approved = adata.var.index[adata.var.index.isin(var_map_df['feature_id'])].tolist()

In [34]:
var_in_approved

['ENSG00000243485',
 'ENSG00000237613',
 'ENSG00000186092',
 'ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000239906',
 'ENSG00000241599',
 'ENSG00000236601',
 'ENSG00000284733',
 'ENSG00000235146',
 'ENSG00000284662',
 'ENSG00000229905',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000241180',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000273443',
 'ENSG00000237330',
 'ENSG00000131591',
 'ENSG00000223823',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',


In [35]:
len(var_in_approved)

33234

In [36]:
var_to_keep = [e for e in var_to_keep if e in var_in_approved]


In [37]:
len(var_to_keep)

33234

In [38]:
adata = adata[:, var_to_keep]

In [39]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3
...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1


#### feature_is_filtered

In [40]:
#This MUST be True if the feature was filtered out in the normalized matrix (X) but is present in the raw matrix (raw.X).Otherwise false 

In [41]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

  adata.var['feature_is_filtered'] = [False] * len(adata.var)


In [42]:
# View var

In [43]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG,False
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A,False
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5,False
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1,False
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3,False
...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2,False
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1,False
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1,False
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1,False


In [44]:
# View adata and araw

In [45]:
adata

AnnData object with n_obs × n_vars = 4186 × 33234
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

## obs (Cell metadata)

In [46]:
# View obs

In [47]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dS1_mRNA_factors,u_sf_dS2_mRNA_factors,u_sf_dS3_mRNA_factors,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,34914_WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,743.564087,355.548645,1150.687012,126.238251,128.070709,1406.656128,306.358154,173.590988,8294.897461,3075.969971
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,34914_WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,71.883698,122.199226,732.177673,53.983013,53.841377,208.338104,112.841003,95.587929,1559.006470,51.520386
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,34914_WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,4601.211914,383.557220,1345.497681,392.848633,582.459290,1128.393921,1044.214111,550.602661,9758.227539,23789.837891
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,34914_WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,46.941204,70.719322,203.270523,45.126789,51.200832,191.662689,77.115181,70.012558,1337.785278,36.196022
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,34914_WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,114.575546,187.518723,753.329590,48.913094,53.900780,228.461136,115.910271,100.460350,1444.847168,125.302269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,34914_WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,574.680664,1334.836792,9450.599609,345.471588,357.875610,2537.288086,466.144989,343.690277,28483.130859,1039.982788
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,34914_WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,127.064224,153.082428,896.967712,58.284164,48.407951,176.824936,115.590141,105.315323,976.930420,292.549713
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,34914_WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,162.322647,163.319290,438.616211,57.976601,50.677765,349.197845,179.898666,189.069473,1907.454834,329.403473
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,34914_WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,121.750328,186.041382,348.748077,100.279839,99.180519,1172.445068,235.975174,322.210358,2903.570557,154.361496


In [48]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density',
       'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density',
       'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density',
       'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density',
       'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density',
       'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density',
       'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density',
       'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density',
       'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density',
       'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density',
       'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density',
       'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density',
       'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density',
       'w_sf_dNK2_cell_density

### assay_ontology_term_id

In [49]:
# if same for all columns

In [50]:
adata.obs['assay_ontology_term_id'] = ['EFO:0010961'] * len(adata.obs)

In [51]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dS2_mRNA_factors,u_sf_dS3_mRNA_factors,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,34914_WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,355.548645,1150.687012,126.238251,128.070709,1406.656128,306.358154,173.590988,8294.897461,3075.969971,EFO:0010961
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,34914_WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,122.199226,732.177673,53.983013,53.841377,208.338104,112.841003,95.587929,1559.006470,51.520386,EFO:0010961
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,34914_WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,383.557220,1345.497681,392.848633,582.459290,1128.393921,1044.214111,550.602661,9758.227539,23789.837891,EFO:0010961
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,34914_WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,70.719322,203.270523,45.126789,51.200832,191.662689,77.115181,70.012558,1337.785278,36.196022,EFO:0010961
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,34914_WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,187.518723,753.329590,48.913094,53.900780,228.461136,115.910271,100.460350,1444.847168,125.302269,EFO:0010961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,34914_WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,1334.836792,9450.599609,345.471588,357.875610,2537.288086,466.144989,343.690277,28483.130859,1039.982788,EFO:0010961
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,34914_WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,153.082428,896.967712,58.284164,48.407951,176.824936,115.590141,105.315323,976.930420,292.549713,EFO:0010961
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,34914_WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,163.319290,438.616211,57.976601,50.677765,349.197845,179.898666,189.069473,1907.454834,329.403473,EFO:0010961
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,34914_WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,186.041382,348.748077,100.279839,99.180519,1172.445068,235.975174,322.210358,2903.570557,154.361496,EFO:0010961


### cell_type_ontology_term_id

In [52]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density',
       'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density',
       'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density',
       'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density',
       'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density',
       'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density',
       'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density',
       'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density',
       'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density',
       'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density',
       'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density',
       'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density',
       'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density',
       'w_sf_dNK2_cell_density

In [53]:
cell_density_columns = [col for col in adata.obs.columns if 'cell_density' in col]

In [54]:
cell_density_columns

['w_sf_B_cells_cell_density',
 'w_sf_DC_cell_density',
 'w_sf_EVT_1_cell_density',
 'w_sf_EVT_2_cell_density',
 'w_sf_Endo_F_cell_density',
 'w_sf_Endo_L_cell_density',
 'w_sf_Endo_M_cell_density',
 'w_sf_GC_cell_density',
 'w_sf_Granulocytes_cell_density',
 'w_sf_HOFB_cell_density',
 'w_sf_ILC3_cell_density',
 'w_sf_M3_cell_density',
 'w_sf_MO_cell_density',
 'w_sf_NK_cell_density',
 'w_sf_PV_cell_density',
 'w_sf_Plasma_cell_density',
 'w_sf_SCT_cell_density',
 'w_sf_T_cells_cell_density',
 'w_sf_VCT_cell_density',
 'w_sf_VCT_CCC_cell_density',
 'w_sf_VCT_fusing_cell_density',
 'w_sf_VCT_p_cell_density',
 'w_sf_dDC_cell_density',
 'w_sf_dEpi_lumenal_cell_density',
 'w_sf_dEpi_secretory_cell_density',
 'w_sf_dM1_cell_density',
 'w_sf_dM2_cell_density',
 'w_sf_dNK1_cell_density',
 'w_sf_dNK2_cell_density',
 'w_sf_dNK3_cell_density',
 'w_sf_dS1_cell_density',
 'w_sf_dS2_cell_density',
 'w_sf_dS3_cell_density',
 'w_sf_dT_cells_cell_density',
 'w_sf_dT_regs_cell_density',
 'w_sf_eEVT_cell

In [55]:
max_columns = adata.obs[cell_density_columns].apply(lambda x: x[x == x.max()].index.tolist(), axis=1)

In [56]:
max_columns

barcode_sample
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767     [w_sf_uSMC_cell_density]
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767     [w_sf_iEVT_cell_density]
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767     [w_sf_uSMC_cell_density]
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767     [w_sf_iEVT_cell_density]
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767    [w_sf_EVT_2_cell_density]
                                                      ...            
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767     [w_sf_iEVT_cell_density]
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767       [w_sf_GC_cell_density]
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767     [w_sf_iEVT_cell_density]
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767      [w_sf_SCT_cell_density]
TTGTTTGTATTACACG-1_34914_WS_PLA_S9101767     [w_sf_iEVT_cell_density]
Length: 4186, dtype: object

In [57]:
columns_with_multiple_max = max_columns.apply(lambda x: [col for col in x if x.count(col)>1])

In [58]:
adata.obs['columns_with_multiple_max'] = columns_with_multiple_max

In [59]:
adata.obs['columns_with_multiple_max']

barcode_sample
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767    []
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767    []
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767    []
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767    []
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767    []
                                            ..
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767    []
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767    []
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767    []
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767    []
TTGTTTGTATTACACG-1_34914_WS_PLA_S9101767    []
Name: columns_with_multiple_max, Length: 4186, dtype: object

In [60]:
non_empty_columns = columns_with_multiple_max[columns_with_multiple_max .apply(lambda x: len(x) > 0)]

In [61]:
len(non_empty_columns )

0

In [62]:
#max_columns = max_columns.apply(lambda x: [col.replace('w_sf_','').replace('_cell_density','') for col in x])

In [63]:
max_columns = [col[0].replace('w_sf_','').replace('_cell_density','')if len(col) > 0 else '' for col in max_columns]

In [64]:
adata.obs['highest_cell_density_columns'] = max_columns

In [65]:
adata.obs['highest_cell_density_columns']

barcode_sample
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767     uSMC
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767     iEVT
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767     uSMC
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767     iEVT
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767    EVT_2
                                            ...  
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767     iEVT
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767       GC
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767     iEVT
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767      SCT
TTGTTTGTATTACACG-1_34914_WS_PLA_S9101767     iEVT
Name: highest_cell_density_columns, Length: 4186, dtype: object

In [66]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,34914_WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,126.238251,128.070709,1406.656128,306.358154,173.590988,8294.897461,3075.969971,EFO:0010961,[],uSMC
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,34914_WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,53.983013,53.841377,208.338104,112.841003,95.587929,1559.006470,51.520386,EFO:0010961,[],iEVT
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,34914_WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,392.848633,582.459290,1128.393921,1044.214111,550.602661,9758.227539,23789.837891,EFO:0010961,[],uSMC
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,34914_WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,45.126789,51.200832,191.662689,77.115181,70.012558,1337.785278,36.196022,EFO:0010961,[],iEVT
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,34914_WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,48.913094,53.900780,228.461136,115.910271,100.460350,1444.847168,125.302269,EFO:0010961,[],EVT_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,34914_WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,345.471588,357.875610,2537.288086,466.144989,343.690277,28483.130859,1039.982788,EFO:0010961,[],iEVT
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,34914_WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,58.284164,48.407951,176.824936,115.590141,105.315323,976.930420,292.549713,EFO:0010961,[],GC
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,34914_WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,57.976601,50.677765,349.197845,179.898666,189.069473,1907.454834,329.403473,EFO:0010961,[],iEVT
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,34914_WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,100.279839,99.180519,1172.445068,235.975174,322.210358,2903.570557,154.361496,EFO:0010961,[],SCT


In [67]:
adata.obs['highest_cell_density_columns']

barcode_sample
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767     uSMC
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767     iEVT
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767     uSMC
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767     iEVT
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767    EVT_2
                                            ...  
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767     iEVT
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767       GC
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767     iEVT
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767      SCT
TTGTTTGTATTACACG-1_34914_WS_PLA_S9101767     iEVT
Name: highest_cell_density_columns, Length: 4186, dtype: object

In [68]:
list(adata.obs['highest_cell_density_columns'].unique())

['uSMC',
 'iEVT',
 'EVT_2',
 'GC',
 'dEpi_secretory',
 'dS3',
 'SCT',
 'fF2',
 'VCT_CCC',
 'eEVT',
 'dNK1',
 'EVT_1',
 'fF1',
 'VCT_p',
 'Endo_F',
 'Endo_L',
 'VCT_fusing',
 'HOFB',
 'VCT']

In [69]:
# if different samples have different terms

In [70]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['highest_cell_density_columns']

In [71]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['uSMC',
 'iEVT',
 'EVT_2',
 'GC',
 'dEpi_secretory',
 'dS3',
 'SCT',
 'fF2',
 'VCT_CCC',
 'eEVT',
 'dNK1',
 'EVT_1',
 'fF1',
 'VCT_p',
 'Endo_F',
 'Endo_L',
 'VCT_fusing',
 'HOFB',
 'VCT']

In [72]:
#create a dictionary called mapping with version and assay_ontology_term_id

In [73]:
cl_tropho = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/cl_tropho.csv')

In [74]:
mapping = dict(zip(cl_tropho['cell_type'], cl_tropho['CL ID']))

In [75]:
mapping

{'dNK1': 'CL:0002343',
 'dT_cells': 'CL:0000084',
 'T_cells': 'CL:0000084',
 'dNK2': 'CL:0002343',
 'ILC3': 'CL:0001078',
 'dNK3': 'CL:0002343',
 'dT_regs': 'CL:0000815',
 'dM1': 'CL:0000235',
 'HOFB': 'CL:3000001',
 'B_cells': 'CL:0000236',
 'dDC': 'CL:0000451',
 'dM2': 'CL:0000235',
 'dS2': 'CL:0000499',
 'M3': 'CL:0000235',
 'VCT': 'CL:2000060',
 'NK': 'CL:0000623',
 'Granulocytes': 'CL:0000094',
 'uSMC': 'CL:0002601',
 'Endo_F': 'CL:0009092',
 'DC': 'CL:0000451',
 'dEpi_secretory': 'CL:0000066',
 'MO': 'CL:0000235',
 'EVT_2': 'CL:0008036',
 'Endo_M': 'CL:0009095',
 'SCT': 'CL:0000525',
 'dS1': 'CL:0000499',
 'dS3': 'CL:0000499',
 'dEpi_lumenal': 'CL:0000066',
 'iEVT': 'CL:0008036',
 'Endo_L': 'CL:0002138',
 'PV MMP11': 'CL:0000003',
 'PVMMP11': 'CL:0000003',
 'PVSTEAP4': 'CL:0000003',
 'PV STEAP4': 'CL:0000003',
 'EVT_1': 'CL:0008036',
 'PVAOC3': 'CL:0000003',
 'PV AOC3': 'CL:0000003',
 'fF1': 'CL:2000042',
 'VCT_CCC': 'CL:2000060',
 'fF2': 'CL:2000042',
 'VCT_p': 'CL:2000060',
 'P

In [76]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [77]:
# change the data type 

In [78]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [79]:
# if mapping from supplementary file

In [80]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,34914_WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,128.070709,1406.656128,306.358154,173.590988,8294.897461,3075.969971,EFO:0010961,[],uSMC,CL:0002601
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,34914_WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,53.841377,208.338104,112.841003,95.587929,1559.006470,51.520386,EFO:0010961,[],iEVT,CL:0008036
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,34914_WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,582.459290,1128.393921,1044.214111,550.602661,9758.227539,23789.837891,EFO:0010961,[],uSMC,CL:0002601
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,34914_WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,51.200832,191.662689,77.115181,70.012558,1337.785278,36.196022,EFO:0010961,[],iEVT,CL:0008036
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,34914_WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,53.900780,228.461136,115.910271,100.460350,1444.847168,125.302269,EFO:0010961,[],EVT_2,CL:0008036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,34914_WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,357.875610,2537.288086,466.144989,343.690277,28483.130859,1039.982788,EFO:0010961,[],iEVT,CL:0008036
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,34914_WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,48.407951,176.824936,115.590141,105.315323,976.930420,292.549713,EFO:0010961,[],GC,CL:0002488
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,34914_WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,50.677765,349.197845,179.898666,189.069473,1907.454834,329.403473,EFO:0010961,[],iEVT,CL:0008036
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,34914_WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,99.180519,1172.445068,235.975174,322.210358,2903.570557,154.361496,EFO:0010961,[],SCT,CL:0000525


## development_stage_ontology_term_id

In [81]:
# if same for all columns

In [82]:
suppl_info = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/suppl_info/visium_metadata.csv')

In [83]:
adata.obs['sample'] = adata.obs['sample'].str.split('_',n=1).str[1]

In [84]:
list(adata.obs['sample'].unique())

['WS_PLA_S9101767']

In [85]:
list(adata.obs['sample'].unique())

['WS_PLA_S9101767']

In [86]:
mapping = dict(zip(suppl_info['sample'], suppl_info['Stage_PCW']))

In [87]:
mapping

{'Pla_HDBR9518710': '6_PCW',
 'WS_PLA_S9101764': '8-9_PCW',
 'WS_PLA_S9101765': '8-9_PCW',
 'WS_PLA_S9101766': '8-9_PCW',
 'WS_PLA_S9101767': '8-9_PCW',
 'Pla_Camb9518737': '8-9_PCW',
 'WS_PLA_S9101769': '8_PCW',
 'WS_PLA_S9101770': '8_PCW'}

In [88]:
adata.obs['development_stage'] = adata.obs['sample'].map(mapping)

In [89]:
list(adata.obs['development_stage'].unique())

['8-9_PCW']

In [90]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,1406.656128,306.358154,173.590988,8294.897461,3075.969971,EFO:0010961,[],uSMC,CL:0002601,8-9_PCW
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,208.338104,112.841003,95.587929,1559.006470,51.520386,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,1128.393921,1044.214111,550.602661,9758.227539,23789.837891,EFO:0010961,[],uSMC,CL:0002601,8-9_PCW
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,191.662689,77.115181,70.012558,1337.785278,36.196022,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,228.461136,115.910271,100.460350,1444.847168,125.302269,EFO:0010961,[],EVT_2,CL:0008036,8-9_PCW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,2537.288086,466.144989,343.690277,28483.130859,1039.982788,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,176.824936,115.590141,105.315323,976.930420,292.549713,EFO:0010961,[],GC,CL:0002488,8-9_PCW
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,349.197845,179.898666,189.069473,1907.454834,329.403473,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,1172.445068,235.975174,322.210358,2903.570557,154.361496,EFO:0010961,[],SCT,CL:0000525,8-9_PCW


In [91]:
mapping = {'10_PCW':'HsapDv:0000048',
'8_PCW':'HsapDv:0000030',
'6_PCW':'HsapDv:0000025',
'7-8_PCW':'HsapDv:0000029',
'4-5_PCW':'HsapDv:0000021',
'9_PCW':'HsapDv:0000047',
'12_PCW':'HsapDv:0000049',
'12-13_PCW':'HsapDv:0000050',
'5_PCW':'HsapDv:0000023',
'8-9_PCW':'HsapDv:0000046',
'4_PCW':'HsapDv:0000020'}

In [92]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage'].map(mapping)

In [93]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,306.358154,173.590988,8294.897461,3075.969971,EFO:0010961,[],uSMC,CL:0002601,8-9_PCW,HsapDv:0000046
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,112.841003,95.587929,1559.006470,51.520386,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,1044.214111,550.602661,9758.227539,23789.837891,EFO:0010961,[],uSMC,CL:0002601,8-9_PCW,HsapDv:0000046
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,77.115181,70.012558,1337.785278,36.196022,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,115.910271,100.460350,1444.847168,125.302269,EFO:0010961,[],EVT_2,CL:0008036,8-9_PCW,HsapDv:0000046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,466.144989,343.690277,28483.130859,1039.982788,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,115.590141,105.315323,976.930420,292.549713,EFO:0010961,[],GC,CL:0002488,8-9_PCW,HsapDv:0000046
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,179.898666,189.069473,1907.454834,329.403473,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,235.975174,322.210358,2903.570557,154.361496,EFO:0010961,[],SCT,CL:0000525,8-9_PCW,HsapDv:0000046


## disease_ontology_term_id

In [94]:
# if same for all columns

In [95]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461'] * len(adata.obs)

In [96]:
# change the data type 

In [97]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [98]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,173.590988,8294.897461,3075.969971,EFO:0010961,[],uSMC,CL:0002601,8-9_PCW,HsapDv:0000046,PATO:0000461
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,95.587929,1559.006470,51.520386,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,550.602661,9758.227539,23789.837891,EFO:0010961,[],uSMC,CL:0002601,8-9_PCW,HsapDv:0000046,PATO:0000461
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,70.012558,1337.785278,36.196022,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,100.460350,1444.847168,125.302269,EFO:0010961,[],EVT_2,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,343.690277,28483.130859,1039.982788,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,105.315323,976.930420,292.549713,EFO:0010961,[],GC,CL:0002488,8-9_PCW,HsapDv:0000046,PATO:0000461
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,189.069473,1907.454834,329.403473,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,322.210358,2903.570557,154.361496,EFO:0010961,[],SCT,CL:0000525,8-9_PCW,HsapDv:0000046,PATO:0000461


## donor_id

In [99]:
mapping = dict(zip(suppl_info['sample'], suppl_info['Donor']))

In [100]:
mapping

{'Pla_HDBR9518710': 'Hrv43',
 'WS_PLA_S9101764': 'P13',
 'WS_PLA_S9101765': 'P13',
 'WS_PLA_S9101766': 'P13',
 'WS_PLA_S9101767': 'P13',
 'Pla_Camb9518737': 'P13',
 'WS_PLA_S9101769': 'P14',
 'WS_PLA_S9101770': 'P14'}

In [101]:
adata.obs['donor_id'] = adata.obs['sample'].map(mapping)

In [102]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,8294.897461,3075.969971,EFO:0010961,[],uSMC,CL:0002601,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,1559.006470,51.520386,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,9758.227539,23789.837891,EFO:0010961,[],uSMC,CL:0002601,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,1337.785278,36.196022,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,1444.847168,125.302269,EFO:0010961,[],EVT_2,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,28483.130859,1039.982788,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,976.930420,292.549713,EFO:0010961,[],GC,CL:0002488,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,1907.454834,329.403473,EFO:0010961,[],iEVT,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,2903.570557,154.361496,EFO:0010961,[],SCT,CL:0000525,8-9_PCW,HsapDv:0000046,PATO:0000461,P13


## is_primary_data

In [103]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [104]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [105]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [106]:
# change the data type 

In [107]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

## self_reported_ethnicity_ontology_term_id

In [108]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

## sex_ontology_term_id

In [109]:
# if same for all columns

In [110]:
adata.obs['sex_ontology_term_id'] = ['unknown'] * len(adata.obs)

## suspension_type

In [111]:
# if same for all columns

In [112]:
adata.obs['suspension_type'] = ['na'] * len(adata.obs)

In [113]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,CL:0002601,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,CL:0002601,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,CL:0002488,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,CL:0008036,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,CL:0000525,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na


## tissue_ontology_term_id

In [114]:
suppl_tissue = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/suppl_info/tissue_info.csv')

In [115]:
mapping = dict(zip(suppl_tissue['Donor'], suppl_tissue['Tissue']))

In [116]:
mapping

{'P13': 'Placenta, decidua basalis',
 'P14': 'Placenta, decidua basalis',
 '5': 'Placenta, decidua basalis',
 '10': 'Placenta, decidua basalis',
 '11': 'Placenta',
 'H2': 'Decidua basalis',
 'H7': 'Decidua basalis',
 'H9': 'Decidua basalis',
 'Hrv98': 'Placenta',
 'Hrv99': 'Placenta',
 'Hrv100': 'Placenta',
 'Hrv43': 'Decidua basalis',
 'Hrv46': 'Placenta, decidua basalis',
 'P34': 'Placenta, decidua basalis',
 'Hrv81': 'Decidua basalis',
 '6': 'Decidua basalis, peripheral blood',
 '7': 'Decidua basalis, peripheral blood',
 '8': 'Decidua basalis, peripheral blood',
 '9': 'Decidua basalis, peripheral blood',
 '12': 'Placenta, decidua basalis'}

In [117]:
adata.obs['tissue_ontology_term_id'] = adata.obs['donor_id'].map(mapping)

In [118]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"


In [119]:
list(adata.obs['tissue_ontology_term_id'].unique())

['Placenta, decidua basalis']

In [120]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0000453'] * len(adata.obs)

In [121]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453


In [122]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density',
       ...
       'development_stage', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=102)

### Data types

In [123]:
adata.obs.dtypes

in_tissue                                   category
array_row                                    float32
array_col                                    float32
sample                                        object
n_counts                                     float32
                                              ...   
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id      object
sex_ontology_term_id                          object
suspension_type                               object
tissue_ontology_term_id                       object
Length: 102, dtype: object

### obsm (Embeddings)

In [124]:
adata.obsm

AxisArrays with keys: X_spatial

### uns

In [125]:
adata.uns

OverloadedDict, wrapping:
	{'spatial': {'spaceranger110_count_34914_WS_PLA_S9101767_GRCh38-3_0_0_premrna': {'images': {'hires': array([[[0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        ...,
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ]],

       [[0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        ...,
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ]],

       [[0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        ...,
        [0.92941177, 0.94509804, 0.9529412 ],
        [0.92941177, 0.94509804, 0.9529412 ],
        [0.93333334, 0.94509804, 0.9529412 ]],


In [126]:
# define a field in uns

In [127]:
adata.uns['schema_version'] = '3.0.0'
adata.uns['default_embedding'] = 'X_spatial'
adata.uns['title'] = 'WS_PLA_S9101767'
adata.uns['image_caption'] = 'Shown here is an image of 10 μm cryosections of human maternal-fetal interface (decidua, placenta) tissue stained with H&E'

### Final checks

In [128]:
# View Anndata object

In [129]:
adata

AnnData object with n_obs × n_vars = 4186 × 33234
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

In [130]:
#View var

In [131]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG,False
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A,False
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5,False
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1,False
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3,False
...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2,False
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1,False
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1,False
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1,False


In [132]:
#View obs

In [133]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34914_WS_PLA_S9101767,1,50.0,102.0,WS_PLA_S9101767,3916.0,8.272826,1935.0,0.025792,0.012564,0.030343,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACACCAATAACTGC-1_34914_WS_PLA_S9101767,1,59.0,19.0,WS_PLA_S9101767,1006.0,6.913737,647.0,0.112326,0.001363,0.001164,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGAGCGACTCCT-1_34914_WS_PLA_S9101767,1,14.0,94.0,WS_PLA_S9101767,12754.0,9.453600,3791.0,0.019366,0.080814,0.368133,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGCTTTCAGAAG-1_34914_WS_PLA_S9101767,1,43.0,9.0,WS_PLA_S9101767,522.0,6.257668,383.0,0.086207,0.000602,0.001444,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGGGTCTATATT-1_34914_WS_PLA_S9101767,1,47.0,13.0,WS_PLA_S9101767,1079.0,6.983790,730.0,0.064875,0.001288,0.002116,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1_34914_WS_PLA_S9101767,1,31.0,77.0,WS_PLA_S9101767,12373.0,9.423272,4009.0,0.008890,0.108494,0.119178,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCACATCCAGG-1_34914_WS_PLA_S9101767,1,58.0,42.0,WS_PLA_S9101767,992.0,6.899723,707.0,0.015121,0.001353,0.001753,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCATTAGTCTA-1_34914_WS_PLA_S9101767,1,60.0,30.0,WS_PLA_S9101767,1068.0,6.973543,738.0,0.029026,0.001423,0.002527,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCCATACAACT-1_34914_WS_PLA_S9101767,1,45.0,27.0,WS_PLA_S9101767,2199.0,7.695758,1390.0,0.020919,0.003304,0.006372,...,8-9_PCW,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453


In [134]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density',
       ...
       'development_stage', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=102)

In [135]:
del adata.obs['columns_with_multiple_max']
del adata.obs['development_stage']
del adata.obs['highest_cell_density_columns']

In [136]:
#check data type of obs

In [137]:
adata.obs.dtypes

in_tissue                                   category
array_row                                    float32
array_col                                    float32
sample                                        object
n_counts                                     float32
                                              ...   
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id      object
sex_ontology_term_id                          object
suspension_type                               object
tissue_ontology_term_id                       object
Length: 99, dtype: object

In [138]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed sample from object to category
changed assay_ontology_term_id from object to category
changed development_stage_ontology_term_id from object to category
changed donor_id from object to category
changed self_reported_ethnicity_ontology_term_id from object to category
changed sex_ontology_term_id from object to category
changed suspension_type from object to category
changed tissue_ontology_term_id from object to category


In [139]:
#change data type

In [140]:
#check data type of var

In [141]:
adata.var.dtypes

gene_ids                 object
feature_types          category
genome                 category
gene_symbols             object
feature_is_filtered        bool
dtype: object

In [142]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [143]:
# delete columns not required

In [144]:
# Convert expression matrix to sparse matrix format

In [145]:
adata.X

<4186x33234 sparse matrix of type '<class 'numpy.float32'>'
	with 10341691 stored elements in Compressed Sparse Column format>

In [146]:
# check the format of the expression matrix. If the matrix is in CSR (Compressed sparse row) format keep it as such
# If the file format of the matrix is compressed sparse column (CSC) format or dense numpy array, convert the expression matrix into CSR format.

In [147]:
adata.X = scipy.sparse.csr_matrix(adata.X)

In [148]:
adata.X

<4186x33234 sparse matrix of type '<class 'numpy.float32'>'
	with 10341691 stored elements in Compressed Sparse Row format>

In [149]:
del adata.raw

In [150]:
# save the final object

In [151]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/final_objects/WS_PLA_S9101767.h5ad', compression = 'gzip')