### **Curating WS_PLA_S9101770 datasets**

Article: Spatial multiomics map of trophoblast development in early pregnancy

DOI : https://doi.org/10.1038/s41586-023-05869-0

Data Source : https://www.reproductivecellatlas.org/mfi.html

## Mount farm

mount-farm

## Packages required for curation

### Import all packages required for the curation

In [1]:
import os
import subprocess
import warnings

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from scipy import sparse
from scipy.sparse import csr_matrix
from tqdm import tqdm

## Curation Schema

### X (Matrix Layers)

#### AnnData object

In [2]:
# Load the AnnData object

In [3]:
# Path stem of the input dataset (no extension); '.h5ad' is appended when the file is read.
file = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/new_datasets/Visium_spatial_ID_34882_WS_PLA_S9101770_raw'

In [4]:
# Read the input AnnData from '<file>.h5ad'.
adata = sc.read_h5ad(f'{file}.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 3355 × 33538
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

#### Raw Counts matrix

In [7]:
# Check whether adata has raw counts or normalized counts

In [8]:
print(adata.X)

  (87, 0)	1.0
  (2348, 3)	1.0
  (2744, 3)	1.0
  (149, 12)	1.0
  (215, 12)	1.0
  (232, 12)	1.0
  (268, 12)	1.0
  (282, 12)	1.0
  (309, 12)	2.0
  (352, 12)	1.0
  (368, 12)	1.0
  (688, 12)	1.0
  (755, 12)	1.0
  (764, 12)	1.0
  (987, 12)	1.0
  (1059, 12)	1.0
  (1089, 12)	1.0
  (1106, 12)	1.0
  (1113, 12)	1.0
  (1157, 12)	1.0
  (1173, 12)	1.0
  (1193, 12)	1.0
  (1277, 12)	1.0
  (1326, 12)	1.0
  (1384, 12)	1.0
  :	:
  (1740, 33528)	1.0
  (1775, 33528)	1.0
  (1848, 33528)	1.0
  (2196, 33528)	1.0
  (2347, 33528)	1.0
  (2667, 33528)	1.0
  (2922, 33528)	1.0
  (3069, 33528)	1.0
  (3071, 33528)	1.0
  (3109, 33528)	1.0
  (3197, 33528)	1.0
  (3247, 33528)	1.0
  (2260, 33530)	1.0
  (2425, 33530)	1.0
  (3149, 33530)	1.0
  (1555, 33532)	1.0
  (183, 33535)	1.0
  (486, 33535)	1.0
  (817, 33535)	1.0
  (1694, 33535)	1.0
  (1695, 33535)	1.0
  (2225, 33535)	1.0
  (2900, 33535)	1.0
  (3007, 33535)	1.0
  (3017, 33535)	1.0


In [9]:
# If the values have fractional parts they are normalized counts; if they are all whole numbers they are raw counts (the dtype can be float32 either way, so inspect the values, not the dtype)

In [10]:
print(adata.raw.X)

  (0, 26)	1.0
  (0, 27)	1.0
  (0, 39)	1.0
  (0, 43)	1.0
  (0, 46)	2.0
  (0, 51)	1.0
  (0, 52)	1.0
  (0, 53)	3.0
  (0, 54)	1.0
  (0, 61)	2.0
  (0, 66)	2.0
  (0, 71)	2.0
  (0, 72)	1.0
  (0, 93)	1.0
  (0, 160)	1.0
  (0, 161)	1.0
  (0, 166)	2.0
  (0, 190)	1.0
  (0, 201)	3.0
  (0, 259)	2.0
  (0, 269)	1.0
  (0, 270)	3.0
  (0, 274)	1.0
  (0, 338)	2.0
  (0, 343)	1.0
  :	:
  (3354, 33296)	1.0
  (3354, 33297)	2.0
  (3354, 33313)	2.0
  (3354, 33326)	1.0
  (3354, 33343)	2.0
  (3354, 33388)	1.0
  (3354, 33395)	3.0
  (3354, 33396)	3.0
  (3354, 33412)	4.0
  (3354, 33413)	1.0
  (3354, 33446)	6.0
  (3354, 33447)	2.0
  (3354, 33451)	2.0
  (3354, 33465)	13.0
  (3354, 33474)	7.0
  (3354, 33479)	11.0
  (3354, 33484)	2.0
  (3354, 33496)	5.0
  (3354, 33498)	16.0
  (3354, 33499)	23.0
  (3354, 33501)	7.0
  (3354, 33502)	15.0
  (3354, 33503)	7.0
  (3354, 33505)	15.0
  (3354, 33508)	11.0


In [11]:
adata.raw.X

<3355x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 8348905 stored elements in Compressed Sparse Row format>

In [12]:
adata.X

<3355x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 8348905 stored elements in Compressed Sparse Column format>

In [13]:
# Check whether adata and adata.raw have the same dimensions

In [14]:
adata

AnnData object with n_obs × n_vars = 3355 × 33538
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

## Variables (var)

In [15]:
# View adata.var and adata.raw.var

In [16]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


In [17]:
adata.raw.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


#### Ensembl IDs

#### Ensembl ids provided in a single column

In [18]:
# Check whether ensembl ids  are provided in  var

In [19]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


##### Approved gene information file

In [20]:
# Give the approved gene information file name and path

In [21]:
var_mapping_file = '/home/jovyan/CellxGene_Curation/Gene_Info_File/genes_approved.csv'

In [22]:
# Load the approved gene annotation table; its 'feature_id' column is what the
# Ensembl IDs in adata.var are checked against when filtering genes below.
var_map_df = pd.read_csv(var_mapping_file)

In [23]:
var_map_df

Unnamed: 0,feature_id,symb,num,length
0,ERCC-00002,ERCC-00002 (spike-in control),1.0,1061
1,ERCC-00003,ERCC-00003 (spike-in control),1.0,1023
2,ERCC-00004,ERCC-00004 (spike-in control),1.0,523
3,ERCC-00009,ERCC-00009 (spike-in control),1.0,984
4,ERCC-00012,ERCC-00012 (spike-in control),1.0,994
...,...,...,...,...
116179,ENSSASG00005000009,ORF7a,,366
116180,ENSSASG00005000012,ORF7b,,132
116181,ENSSASG00005000008,ORF8,,366
116182,ENSSASG00005000005,N,,1260


In [24]:
# Keep the gene symbols (the current index) in their own column and copy the Ensembl IDs into a column that will become the new index

In [25]:
# Preserve the gene symbols (currently the var index) as a regular column.
adata.var['gene_symbols'] = adata.var_names

In [26]:
# Duplicate the Ensembl IDs into a helper column that is promoted to the index in the
# next step; 'gene_ids' itself stays available as a regular column.
adata.var['ensembl_ids'] = adata.var['gene_ids']

In [27]:
# Make the ensembl ids column as the index column

In [28]:
# Re-index var by Ensembl ID. set_index consumes the 'ensembl_ids' column, so this cell
# cannot be re-run on its own: the cell that creates 'ensembl_ids' must be run again first.
adata.var = adata.var.set_index('ensembl_ids')

In [29]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3
...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1


##### Filter out genes that don't appear in the approved annotation

In [30]:
# Start from every Ensembl ID currently in var (order preserved).
var_to_keep = list(adata.var.index)

In [31]:
var_to_keep

['ENSG00000243485',
 'ENSG00000237613',
 'ENSG00000186092',
 'ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000239906',
 'ENSG00000241599',
 'ENSG00000236601',
 'ENSG00000284733',
 'ENSG00000235146',
 'ENSG00000284662',
 'ENSG00000229905',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000241180',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000273443',
 'ENSG00000237330',
 'ENSG00000131591',
 'ENSG00000223823',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',


In [32]:
len(var_to_keep)

33538

In [33]:
# Ensembl IDs from adata.var that also occur in the approved annotation's 'feature_id' column,
# kept in var order.
var_in_approved = [
    ensembl_id
    for ensembl_id, is_approved in zip(adata.var.index, adata.var.index.isin(var_map_df['feature_id']))
    if is_approved
]

In [34]:
var_in_approved

['ENSG00000243485',
 'ENSG00000237613',
 'ENSG00000186092',
 'ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000239906',
 'ENSG00000241599',
 'ENSG00000236601',
 'ENSG00000284733',
 'ENSG00000235146',
 'ENSG00000284662',
 'ENSG00000229905',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000241180',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000273443',
 'ENSG00000237330',
 'ENSG00000131591',
 'ENSG00000223823',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',


In [35]:
len(var_in_approved)

33234

In [36]:
# Keep only approved Ensembl IDs, preserving the original var order.
# Membership is tested against a set: `var_in_approved` is a list of ~33k IDs, and
# scanning it once per candidate gene is needlessly quadratic.
approved_ids = set(var_in_approved)
var_to_keep = [ensembl_id for ensembl_id in var_to_keep if ensembl_id in approved_ids]


In [37]:
len(var_to_keep)

33234

In [38]:
# Subset to the approved genes. `.copy()` materialises the subset as an independent
# AnnData rather than a view, so the later assignments to adata.var / adata.obs do not
# go through AnnData's implicit view-to-copy conversion (which emits a warning).
adata = adata[:, var_to_keep].copy()

In [39]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3
...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1


#### feature_is_filtered

In [40]:
# This MUST be True if the feature was filtered out in the normalized matrix (X) but is present in the raw matrix (raw.X); otherwise False.

In [41]:
# No gene is flagged as filtered: broadcast a single False to every row of var.
adata.var['feature_is_filtered'] = False

  adata.var['feature_is_filtered'] = [False] * len(adata.var)


In [42]:
# View var

In [43]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG,False
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A,False
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5,False
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1,False
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3,False
...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2,False
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1,False
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1,False
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1,False


In [44]:
# View adata and adata.raw

In [45]:
adata

AnnData object with n_obs × n_vars = 3355 × 33234
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

## obs (Cell metadata)

In [46]:
# View obs

In [47]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dS1_mRNA_factors,u_sf_dS2_mRNA_factors,u_sf_dS3_mRNA_factors,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,34882_WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,2728.074463,157.309937,605.748230,239.617447,275.482971,153.291245,921.484741,460.963104,108.836113,10553.602539
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,34882_WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,3947.781982,178.701279,904.025391,375.245056,547.461304,216.464584,1056.127441,464.913910,244.670883,19342.654297
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,34882_WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,3197.495850,199.795425,860.068298,295.273682,396.949036,902.207275,1472.548828,633.330017,358.682037,12967.927734
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,34882_WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,987.470581,904.483276,8303.371094,397.534363,476.457520,1851.460571,485.803802,430.542084,9944.848633,2940.474121
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,34882_WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,5811.375977,280.924744,1232.750244,458.634827,694.911987,813.135620,1477.735229,737.192383,3389.728516,32378.585938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,34882_WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,1386.279541,145.023972,380.883209,130.913956,184.568710,530.714233,558.749268,447.065552,922.456177,5791.125000
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,34882_WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,1619.141968,671.208801,8929.078125,290.276672,401.398743,196.923264,504.522736,287.190399,172.603683,5928.634766
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,34882_WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,3531.526611,190.365295,1060.193115,240.476318,361.460449,132.703949,1362.227051,675.339722,136.083572,13892.478516
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,34882_WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,387.770569,131.884964,270.589020,96.215614,61.268623,260.624420,181.867569,228.044693,1717.996948,1477.301758


In [48]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density',
       'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density',
       'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density',
       'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density',
       'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density',
       'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density',
       'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density',
       'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density',
       'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density',
       'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density',
       'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density',
       'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density',
       'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density',
       'w_sf_dNK2_cell_density

### assay_ontology_term_id

In [49]:
# If the assay is the same for all observations

In [50]:
# Every observation shares the same assay term, so broadcast the single value to all rows.
adata.obs['assay_ontology_term_id'] = 'EFO:0010961'

In [51]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dS2_mRNA_factors,u_sf_dS3_mRNA_factors,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,34882_WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,157.309937,605.748230,239.617447,275.482971,153.291245,921.484741,460.963104,108.836113,10553.602539,EFO:0010961
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,34882_WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,178.701279,904.025391,375.245056,547.461304,216.464584,1056.127441,464.913910,244.670883,19342.654297,EFO:0010961
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,34882_WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,199.795425,860.068298,295.273682,396.949036,902.207275,1472.548828,633.330017,358.682037,12967.927734,EFO:0010961
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,34882_WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,904.483276,8303.371094,397.534363,476.457520,1851.460571,485.803802,430.542084,9944.848633,2940.474121,EFO:0010961
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,34882_WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,280.924744,1232.750244,458.634827,694.911987,813.135620,1477.735229,737.192383,3389.728516,32378.585938,EFO:0010961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,34882_WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,145.023972,380.883209,130.913956,184.568710,530.714233,558.749268,447.065552,922.456177,5791.125000,EFO:0010961
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,34882_WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,671.208801,8929.078125,290.276672,401.398743,196.923264,504.522736,287.190399,172.603683,5928.634766,EFO:0010961
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,34882_WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,190.365295,1060.193115,240.476318,361.460449,132.703949,1362.227051,675.339722,136.083572,13892.478516,EFO:0010961
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,34882_WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,131.884964,270.589020,96.215614,61.268623,260.624420,181.867569,228.044693,1717.996948,1477.301758,EFO:0010961


### cell_type_ontology_term_id

In [52]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density',
       'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density',
       'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density',
       'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density',
       'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density',
       'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density',
       'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density',
       'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density',
       'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density',
       'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density',
       'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density',
       'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density',
       'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density',
       'w_sf_dNK2_cell_density

In [53]:
# Select the per-cell-type density columns. Matching on the '_cell_density' suffix
# (instead of the bare substring 'cell_density') keeps the derived
# 'highest_cell_density_columns' column, which is added to adata.obs further down,
# out of this list when the cell is re-run.
cell_density_columns = [col for col in adata.obs.columns if col.endswith('_cell_density')]

In [54]:
cell_density_columns

['w_sf_B_cells_cell_density',
 'w_sf_DC_cell_density',
 'w_sf_EVT_1_cell_density',
 'w_sf_EVT_2_cell_density',
 'w_sf_Endo_F_cell_density',
 'w_sf_Endo_L_cell_density',
 'w_sf_Endo_M_cell_density',
 'w_sf_GC_cell_density',
 'w_sf_Granulocytes_cell_density',
 'w_sf_HOFB_cell_density',
 'w_sf_ILC3_cell_density',
 'w_sf_M3_cell_density',
 'w_sf_MO_cell_density',
 'w_sf_NK_cell_density',
 'w_sf_PV_cell_density',
 'w_sf_Plasma_cell_density',
 'w_sf_SCT_cell_density',
 'w_sf_T_cells_cell_density',
 'w_sf_VCT_cell_density',
 'w_sf_VCT_CCC_cell_density',
 'w_sf_VCT_fusing_cell_density',
 'w_sf_VCT_p_cell_density',
 'w_sf_dDC_cell_density',
 'w_sf_dEpi_lumenal_cell_density',
 'w_sf_dEpi_secretory_cell_density',
 'w_sf_dM1_cell_density',
 'w_sf_dM2_cell_density',
 'w_sf_dNK1_cell_density',
 'w_sf_dNK2_cell_density',
 'w_sf_dNK3_cell_density',
 'w_sf_dS1_cell_density',
 'w_sf_dS2_cell_density',
 'w_sf_dS3_cell_density',
 'w_sf_dT_cells_cell_density',
 'w_sf_dT_regs_cell_density',
 'w_sf_eEVT_cell

In [55]:
# For every observation, collect the name(s) of the cell-density column(s) holding the row maximum.
max_columns = adata.obs.loc[:, cell_density_columns].apply(
    lambda row: row.index[(row == row.max()).to_numpy()].tolist(),
    axis=1,
)

In [56]:
max_columns

barcode_sample
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770    [w_sf_uSMC_cell_density]
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770    [w_sf_uSMC_cell_density]
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770    [w_sf_uSMC_cell_density]
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770     [w_sf_dS3_cell_density]
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770    [w_sf_uSMC_cell_density]
                                                      ...           
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770    [w_sf_uSMC_cell_density]
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770    [w_sf_uSMC_cell_density]
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770    [w_sf_uSMC_cell_density]
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770    [w_sf_uSMC_cell_density]
TTGTTTCCATACAACT-1_34882_WS_PLA_S9101770    [w_sf_uSMC_cell_density]
Length: 3355, dtype: object

In [57]:
# Flag observations whose maximum cell density is shared by more than one cell type.
# Each entry of `max_columns` lists the column(s) holding the row maximum, so a tie means
# the list has more than one element. Counting repeated names inside the list cannot
# detect this, since each column appears at most once in such a list.
# Tied rows keep their full list of tied columns; all other rows get an empty list.
columns_with_multiple_max = max_columns.apply(lambda cols: cols if len(cols) > 1 else [])

In [58]:
# Keep the per-observation tie lists in obs for inspection.
# NOTE(review): this column holds Python lists — presumably it has to be dropped before
# the object is written to h5ad; confirm against the later cells.
adata.obs['columns_with_multiple_max'] = columns_with_multiple_max

In [59]:
adata.obs['columns_with_multiple_max']

barcode_sample
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770    []
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770    []
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770    []
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770    []
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770    []
                                            ..
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770    []
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770    []
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770    []
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770    []
TTGTTTCCATACAACT-1_34882_WS_PLA_S9101770    []
Name: columns_with_multiple_max, Length: 3355, dtype: object

In [60]:
# Keep only the observations whose tie list is non-empty.
non_empty_columns = columns_with_multiple_max.loc[columns_with_multiple_max.map(len).gt(0)]

In [61]:
len(non_empty_columns )

0

In [62]:
#max_columns = max_columns.apply(lambda x: [col.replace('w_sf_','').replace('_cell_density','') for col in x])

In [63]:
# Turn each list of top-density columns into a bare cell-type label: take the first column
# and strip the 'w_sf_' / '_cell_density' affixes; an empty list becomes an empty label.
max_columns = list(map(
    lambda cols: cols[0].replace('w_sf_', '').replace('_cell_density', '') if cols else '',
    max_columns,
))

In [64]:
# Store the dominant cell-type label per observation (max_columns is by now a plain list
# of label strings in obs row order).
adata.obs['highest_cell_density_columns'] = max_columns

In [65]:
adata.obs['highest_cell_density_columns']

barcode_sample
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770    uSMC
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770    uSMC
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770    uSMC
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770     dS3
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770    uSMC
                                            ... 
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770    uSMC
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770    uSMC
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770    uSMC
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770    uSMC
TTGTTTCCATACAACT-1_34882_WS_PLA_S9101770    uSMC
Name: highest_cell_density_columns, Length: 3355, dtype: object

In [66]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,34882_WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,239.617447,275.482971,153.291245,921.484741,460.963104,108.836113,10553.602539,EFO:0010961,[],uSMC
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,34882_WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,375.245056,547.461304,216.464584,1056.127441,464.913910,244.670883,19342.654297,EFO:0010961,[],uSMC
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,34882_WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,295.273682,396.949036,902.207275,1472.548828,633.330017,358.682037,12967.927734,EFO:0010961,[],uSMC
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,34882_WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,397.534363,476.457520,1851.460571,485.803802,430.542084,9944.848633,2940.474121,EFO:0010961,[],dS3
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,34882_WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,458.634827,694.911987,813.135620,1477.735229,737.192383,3389.728516,32378.585938,EFO:0010961,[],uSMC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,34882_WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,130.913956,184.568710,530.714233,558.749268,447.065552,922.456177,5791.125000,EFO:0010961,[],uSMC
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,34882_WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,290.276672,401.398743,196.923264,504.522736,287.190399,172.603683,5928.634766,EFO:0010961,[],uSMC
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,34882_WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,240.476318,361.460449,132.703949,1362.227051,675.339722,136.083572,13892.478516,EFO:0010961,[],uSMC
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,34882_WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,96.215614,61.268623,260.624420,181.867569,228.044693,1717.996948,1477.301758,EFO:0010961,[],uSMC


In [67]:
adata.obs['highest_cell_density_columns']

barcode_sample
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770    uSMC
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770    uSMC
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770    uSMC
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770     dS3
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770    uSMC
                                            ... 
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770    uSMC
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770    uSMC
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770    uSMC
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770    uSMC
TTGTTTCCATACAACT-1_34882_WS_PLA_S9101770    uSMC
Name: highest_cell_density_columns, Length: 3355, dtype: object

In [68]:
list(adata.obs['highest_cell_density_columns'].unique())

['uSMC',
 'dS3',
 'iEVT',
 'dEpi_secretory',
 'EVT_2',
 'eEVT',
 'SCT',
 'EVT_1',
 'dNK1',
 'Endo_M']

In [69]:
# if different samples have different terms

In [70]:
# Seed the ontology column with the raw cell-type labels; they are translated to
# CL term IDs through `mapping` in a later cell.
adata.obs['cell_type_ontology_term_id'] = adata.obs['highest_cell_density_columns']

In [71]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['uSMC',
 'dS3',
 'iEVT',
 'dEpi_secretory',
 'EVT_2',
 'eEVT',
 'SCT',
 'EVT_1',
 'dNK1',
 'Endo_M']

In [72]:
# create a dictionary called mapping from the cell type labels to their Cell Ontology (CL) term IDs

In [73]:
# Lookup table whose 'cell_type' and 'CL ID' columns are paired into `mapping` in the next cell.
cl_tropho = pd.read_csv(filepath_or_buffer='/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/cl_tropho.csv')

In [74]:
# cell_type label -> CL ID; when a label occurs more than once, its last row wins.
mapping = {label: cl_id for label, cl_id in zip(cl_tropho['cell_type'], cl_tropho['CL ID'])}

In [75]:
mapping

{'dNK1': 'CL:0002343',
 'dT_cells': 'CL:0000084',
 'T_cells': 'CL:0000084',
 'dNK2': 'CL:0002343',
 'ILC3': 'CL:0001078',
 'dNK3': 'CL:0002343',
 'dT_regs': 'CL:0000815',
 'dM1': 'CL:0000235',
 'HOFB': 'CL:3000001',
 'B_cells': 'CL:0000236',
 'dDC': 'CL:0000451',
 'dM2': 'CL:0000235',
 'dS2': 'CL:0000499',
 'M3': 'CL:0000235',
 'VCT': 'CL:2000060',
 'NK': 'CL:0000623',
 'Granulocytes': 'CL:0000094',
 'uSMC': 'CL:0002601',
 'Endo_F': 'CL:0009092',
 'DC': 'CL:0000451',
 'dEpi_secretory': 'CL:0000066',
 'MO': 'CL:0000235',
 'EVT_2': 'CL:0008036',
 'Endo_M': 'CL:0009095',
 'SCT': 'CL:0000525',
 'dS1': 'CL:0000499',
 'dS3': 'CL:0000499',
 'dEpi_lumenal': 'CL:0000066',
 'iEVT': 'CL:0008036',
 'Endo_L': 'CL:0002138',
 'PV MMP11': 'CL:0000003',
 'PVMMP11': 'CL:0000003',
 'PVSTEAP4': 'CL:0000003',
 'PV STEAP4': 'CL:0000003',
 'EVT_1': 'CL:0008036',
 'PVAOC3': 'CL:0000003',
 'PV AOC3': 'CL:0000003',
 'fF1': 'CL:2000042',
 'VCT_CCC': 'CL:2000060',
 'fF2': 'CL:2000042',
 'VCT_p': 'CL:2000060',
 'P

In [76]:
# Translate the cell type labels into the ontology IDs stored in `mapping`.
# The lookup reads from the untouched label column rather than from the target
# column itself, so the cell is idempotent: re-running it no longer feeds
# already-converted IDs back through `mapping` (which would turn them all into NaN).
adata.obs['cell_type_ontology_term_id'] = adata.obs['highest_cell_density_columns'].map(mapping)

# Series.map yields NaN for any label that is not a key of `mapping`; stop here
# instead of silently carrying missing ontology terms into the curated object.
unmapped_labels = sorted(
    adata.obs.loc[adata.obs['cell_type_ontology_term_id'].isna(), 'highest_cell_density_columns']
    .astype(str)
    .unique()
)
if unmapped_labels:
    raise ValueError(f"No ontology term found in `mapping` for cell type labels: {unmapped_labels}")

In [77]:
# change the data type 

In [78]:
# Store the mapped IDs as a pandas categorical column.
adata.obs['cell_type_ontology_term_id'] = pd.Categorical(adata.obs['cell_type_ontology_term_id'])

In [79]:
# if mapping from supplementary file

In [80]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,34882_WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,275.482971,153.291245,921.484741,460.963104,108.836113,10553.602539,EFO:0010961,[],uSMC,CL:0002601
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,34882_WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,547.461304,216.464584,1056.127441,464.913910,244.670883,19342.654297,EFO:0010961,[],uSMC,CL:0002601
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,34882_WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,396.949036,902.207275,1472.548828,633.330017,358.682037,12967.927734,EFO:0010961,[],uSMC,CL:0002601
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,34882_WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,476.457520,1851.460571,485.803802,430.542084,9944.848633,2940.474121,EFO:0010961,[],dS3,CL:0000499
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,34882_WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,694.911987,813.135620,1477.735229,737.192383,3389.728516,32378.585938,EFO:0010961,[],uSMC,CL:0002601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,34882_WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,184.568710,530.714233,558.749268,447.065552,922.456177,5791.125000,EFO:0010961,[],uSMC,CL:0002601
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,34882_WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,401.398743,196.923264,504.522736,287.190399,172.603683,5928.634766,EFO:0010961,[],uSMC,CL:0002601
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,34882_WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,361.460449,132.703949,1362.227051,675.339722,136.083572,13892.478516,EFO:0010961,[],uSMC,CL:0002601
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,34882_WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,61.268623,260.624420,181.867569,228.044693,1717.996948,1477.301758,EFO:0010961,[],uSMC,CL:0002601


## development_stage_ontology_term_id

In [81]:
# if mapping per-sample values from the supplementary metadata file

In [82]:
# Per-sample metadata table; its 'sample', 'Stage_PCW' and 'Donor' columns are used by later cells.
suppl_info = pd.read_csv(filepath_or_buffer='/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/suppl_info/visium_metadata.csv')

In [83]:
# Remove the leading numeric prefix (e.g. '34882_') so the sample IDs match the
# 'sample' keys of the supplementary metadata ('34882_WS_PLA_S9101770' -> 'WS_PLA_S9101770').
# The pattern only matches a digits-only prefix at the start of the string, so
# re-running this cell leaves an already-stripped ID unchanged; splitting on the
# first '_' would instead shorten it again to 'PLA_S9101770' and break the lookups below.
adata.obs['sample'] = adata.obs['sample'].str.replace(r'^\d+_', '', n=1, regex=True)

In [84]:
list(adata.obs['sample'].unique())

['WS_PLA_S9101770']

In [85]:
list(adata.obs['sample'].unique())

['WS_PLA_S9101770']

In [86]:
# sample ID -> 'Stage_PCW' value from the supplementary table (last row wins on repeated sample IDs).
mapping = {sample_id: stage for sample_id, stage in zip(suppl_info['sample'], suppl_info['Stage_PCW'])}

In [87]:
mapping

{'Pla_HDBR9518710': '6_PCW',
 'WS_PLA_S9101764': '8-9_PCW',
 'WS_PLA_S9101765': '8-9_PCW',
 'WS_PLA_S9101766': '8-9_PCW',
 'WS_PLA_S9101767': '8-9_PCW',
 'Pla_Camb9518737': '8-9_PCW',
 'WS_PLA_S9101769': '8_PCW',
 'WS_PLA_S9101770': '8_PCW'}

In [88]:
# Look up each spot's stage label by its sample ID; samples missing from `mapping` become NaN.
adata.obs['development_stage'] = adata.obs['sample'].map(mapping, na_action=None)

In [89]:
list(adata.obs['development_stage'].unique())

['8_PCW']

In [90]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,153.291245,921.484741,460.963104,108.836113,10553.602539,EFO:0010961,[],uSMC,CL:0002601,8_PCW
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,216.464584,1056.127441,464.913910,244.670883,19342.654297,EFO:0010961,[],uSMC,CL:0002601,8_PCW
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,902.207275,1472.548828,633.330017,358.682037,12967.927734,EFO:0010961,[],uSMC,CL:0002601,8_PCW
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,1851.460571,485.803802,430.542084,9944.848633,2940.474121,EFO:0010961,[],dS3,CL:0000499,8_PCW
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,813.135620,1477.735229,737.192383,3389.728516,32378.585938,EFO:0010961,[],uSMC,CL:0002601,8_PCW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,530.714233,558.749268,447.065552,922.456177,5791.125000,EFO:0010961,[],uSMC,CL:0002601,8_PCW
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,196.923264,504.522736,287.190399,172.603683,5928.634766,EFO:0010961,[],uSMC,CL:0002601,8_PCW
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,132.703949,1362.227051,675.339722,136.083572,13892.478516,EFO:0010961,[],uSMC,CL:0002601,8_PCW
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,260.624420,181.867569,228.044693,1717.996948,1477.301758,EFO:0010961,[],uSMC,CL:0002601,8_PCW


In [91]:
# Stage label (as written in the 'Stage_PCW' metadata column) -> HsapDv term ID.
mapping = {
    '10_PCW':    'HsapDv:0000048',
    '8_PCW':     'HsapDv:0000030',
    '6_PCW':     'HsapDv:0000025',
    '7-8_PCW':   'HsapDv:0000029',
    '4-5_PCW':   'HsapDv:0000021',
    '9_PCW':     'HsapDv:0000047',
    '12_PCW':    'HsapDv:0000049',
    '12-13_PCW': 'HsapDv:0000050',
    '5_PCW':     'HsapDv:0000023',
    '8-9_PCW':   'HsapDv:0000046',
    '4_PCW':     'HsapDv:0000020',
}

In [92]:
# Convert the stage labels to HsapDv IDs; labels missing from `mapping` become NaN.
adata.obs['development_stage_ontology_term_id'] = adata.obs.loc[:, 'development_stage'].map(mapping)

In [93]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,921.484741,460.963104,108.836113,10553.602539,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,1056.127441,464.913910,244.670883,19342.654297,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,1472.548828,633.330017,358.682037,12967.927734,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,485.803802,430.542084,9944.848633,2940.474121,EFO:0010961,[],dS3,CL:0000499,8_PCW,HsapDv:0000030
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,1477.735229,737.192383,3389.728516,32378.585938,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,558.749268,447.065552,922.456177,5791.125000,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,504.522736,287.190399,172.603683,5928.634766,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,1362.227051,675.339722,136.083572,13892.478516,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,181.867569,228.044693,1717.996948,1477.301758,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030


## disease_ontology_term_id

In [94]:
# if same for all columns

In [95]:
# Every observation receives the same disease term, so broadcast a single value down the column.
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [96]:
# change the data type 

In [97]:
# Store the disease term as a pandas categorical column.
adata.obs['disease_ontology_term_id'] = pd.Categorical(adata.obs['disease_ontology_term_id'])

In [98]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,460.963104,108.836113,10553.602539,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,464.913910,244.670883,19342.654297,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,633.330017,358.682037,12967.927734,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,430.542084,9944.848633,2940.474121,EFO:0010961,[],dS3,CL:0000499,8_PCW,HsapDv:0000030,PATO:0000461
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,737.192383,3389.728516,32378.585938,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,447.065552,922.456177,5791.125000,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,287.190399,172.603683,5928.634766,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,675.339722,136.083572,13892.478516,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,228.044693,1717.996948,1477.301758,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461


## donor_id

In [99]:
# sample ID -> 'Donor' value from the supplementary table (last row wins on repeated sample IDs).
mapping = {sample_id: donor for sample_id, donor in zip(suppl_info['sample'], suppl_info['Donor'])}

In [100]:
mapping

{'Pla_HDBR9518710': 'Hrv43',
 'WS_PLA_S9101764': 'P13',
 'WS_PLA_S9101765': 'P13',
 'WS_PLA_S9101766': 'P13',
 'WS_PLA_S9101767': 'P13',
 'Pla_Camb9518737': 'P13',
 'WS_PLA_S9101769': 'P14',
 'WS_PLA_S9101770': 'P14'}

In [101]:
# Assign each spot its donor by sample ID; samples missing from `mapping` become NaN.
adata.obs['donor_id'] = adata.obs['sample'].map(mapping, na_action=None)

In [102]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,108.836113,10553.602539,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,244.670883,19342.654297,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,358.682037,12967.927734,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,9944.848633,2940.474121,EFO:0010961,[],dS3,CL:0000499,8_PCW,HsapDv:0000030,PATO:0000461,P14
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,3389.728516,32378.585938,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,922.456177,5791.125000,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,172.603683,5928.634766,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,136.083572,13892.478516,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,1717.996948,1477.301758,EFO:0010961,[],uSMC,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14


## is_primary_data

In [103]:
# Flag every observation as primary data by broadcasting one boolean down the column.
adata.obs['is_primary_data'] = True

In [104]:
# Make the boolean dtype of the flag explicit.
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype(bool)

## organism_ontology_term_id

In [105]:
# One organism term for the whole dataset, broadcast down the column.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [106]:
# change the data type 

In [107]:
# Store the organism term as a pandas categorical column.
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

## self_reported_ethnicity_ontology_term_id

In [108]:
# Every observation gets the same 'unknown' value for ethnicity.
adata.obs['self_reported_ethnicity_ontology_term_id'] = pd.Series('unknown', index=adata.obs.index)

## sex_ontology_term_id

In [109]:
# if same for all columns

In [110]:
# Every observation gets the same 'unknown' value for sex.
adata.obs['sex_ontology_term_id'] = pd.Series('unknown', index=adata.obs.index)

## suspension_type

In [111]:
# if same for all columns

In [112]:
# Every observation gets the same 'na' suspension type.
adata.obs['suspension_type'] = pd.Series('na', index=adata.obs.index)

In [113]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,CL:0000499,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,CL:0002601,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na


## tissue_ontology_term_id

In [114]:
# Per-donor tissue table; its 'Donor' and 'Tissue' columns are paired into `mapping` in the next cell.
suppl_tissue = pd.read_csv(filepath_or_buffer='/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/suppl_info/tissue_info.csv')

In [115]:
# donor ID -> tissue description (last row wins on repeated donor IDs).
mapping = {donor: tissue for donor, tissue in zip(suppl_tissue['Donor'], suppl_tissue['Tissue'])}

In [116]:
mapping

{'P13': 'Placenta, decidua basalis',
 'P14': 'Placenta, decidua basalis',
 '5': 'Placenta, decidua basalis',
 '10': 'Placenta, decidua basalis',
 '11': 'Placenta',
 'H2': 'Decidua basalis',
 'H7': 'Decidua basalis',
 'H9': 'Decidua basalis',
 'Hrv98': 'Placenta',
 'Hrv99': 'Placenta',
 'Hrv100': 'Placenta',
 'Hrv43': 'Decidua basalis',
 'Hrv46': 'Placenta, decidua basalis',
 'P34': 'Placenta, decidua basalis',
 'Hrv81': 'Decidua basalis',
 '6': 'Decidua basalis, peripheral blood',
 '7': 'Decidua basalis, peripheral blood',
 '8': 'Decidua basalis, peripheral blood',
 '9': 'Decidua basalis, peripheral blood',
 '12': 'Placenta, decidua basalis'}

In [117]:
# Temporarily fill the column with the donor's free-text tissue description so it can be inspected;
# a later cell overwrites it with the ontology ID.
adata.obs['tissue_ontology_term_id'] = adata.obs.loc[:, 'donor_id'].map(mapping)

In [118]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,"Placenta, decidua basalis"


In [119]:
list(adata.obs['tissue_ontology_term_id'].unique())

['Placenta, decidua basalis']

In [120]:
# Replace the free-text tissue description with one UBERON ID for every observation.
adata.obs['tissue_ontology_term_id'] = 'UBERON:0000453'

In [121]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453


In [122]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density',
       ...
       'development_stage', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=102)

### Data types

In [123]:
adata.obs.dtypes

in_tissue                                   category
array_row                                    float32
array_col                                    float32
sample                                        object
n_counts                                     float32
                                              ...   
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id      object
sex_ontology_term_id                          object
suspension_type                               object
tissue_ontology_term_id                       object
Length: 102, dtype: object

### obsm (Embeddings)

In [124]:
adata.obsm

AxisArrays with keys: X_spatial

### uns

In [125]:
adata.uns

OverloadedDict, wrapping:
	{'spatial': {'spaceranger110_count_34882_WS_PLA_S9101770_GRCh38-3_0_0-premrna': {'images': {'hires': array([[[0.92941177, 0.94509804, 0.9411765 ],
        [0.92941177, 0.94509804, 0.9411765 ],
        [0.92941177, 0.94509804, 0.9411765 ],
        ...,
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ]],

       [[0.92941177, 0.94509804, 0.9411765 ],
        [0.92941177, 0.94509804, 0.9411765 ],
        [0.92941177, 0.94509804, 0.9411765 ],
        ...,
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ]],

       [[0.92941177, 0.94509804, 0.9411765 ],
        [0.92941177, 0.94509804, 0.9411765 ],
        [0.92941177, 0.94509804, 0.9411765 ],
        ...,
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ],
        [0.92941177, 0.9411765 , 0.9529412 ]],


In [126]:
# define a field in uns

In [127]:
# Dataset-level fields, written to `uns` in a single call (insertion order matches the listing).
adata.uns.update({
    'schema_version': '3.0.0',
    'default_embedding': 'X_spatial',  # the key listed by adata.obsm
    'title': 'WS_PLA_S9101770',
    'image_caption': 'Shown here is an image of 10 μm cryosections of human maternal-fetal interface (decidua, placenta) tissue stained with H&E',
})

### Final checks

In [128]:
# View Anndata object

In [129]:
adata

AnnData object with n_obs × n_vars = 3355 × 33234
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

In [130]:
#View var

In [131]:
# Display the var annotation table (one row per gene, indexed by Ensembl ID).
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG,False
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A,False
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5,False
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1,False
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3,False
...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2,False
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1,False
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1,False
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1,False


In [132]:
#View obs

In [133]:
# Display the obs annotation table (one row per barcode).
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_34882_WS_PLA_S9101770,1,50.0,102.0,WS_PLA_S9101770,5503.0,8.613049,2246.0,0.012175,0.033157,0.200659,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACACCAATAACTGC-1_34882_WS_PLA_S9101770,1,59.0,19.0,WS_PLA_S9101770,6995.0,8.852951,2735.0,0.010579,0.050046,0.310905,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGAGCGACTCCT-1_34882_WS_PLA_S9101770,1,14.0,94.0,WS_PLA_S9101770,7279.0,8.892749,2955.0,0.017035,0.054557,0.181182,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGCTTTCAGAAG-1_34882_WS_PLA_S9101770,1,43.0,9.0,WS_PLA_S9101770,8354.0,9.030496,3348.0,0.008858,0.106138,0.181287,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACAGGGTCTATATT-1_34882_WS_PLA_S9101770,1,47.0,13.0,WS_PLA_S9101770,11508.0,9.350798,3894.0,0.009906,0.129946,0.463597,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1_34882_WS_PLA_S9101770,1,24.0,64.0,WS_PLA_S9101770,3786.0,8.239065,1809.0,0.039884,0.016253,0.062807,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTGTGTGTCAAGA-1_34882_WS_PLA_S9101770,1,31.0,77.0,WS_PLA_S9101770,7387.0,8.907477,2950.0,0.008664,0.172363,0.308769,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCACATCCAGG-1_34882_WS_PLA_S9101770,1,58.0,42.0,WS_PLA_S9101770,5839.0,8.672315,2485.0,0.011646,0.048071,0.229763,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTTCATTAGTCTA-1_34882_WS_PLA_S9101770,1,60.0,30.0,WS_PLA_S9101770,1189.0,7.080868,845.0,0.007569,0.002062,0.005490,...,8_PCW,HsapDv:0000030,PATO:0000461,P14,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453


In [134]:
# List the obs column names; the following cell removes some of them.
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density',
       ...
       'development_stage', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=102)

In [135]:
# Remove obs columns that must not be carried into the final object.
# The membership check keeps this cell re-runnable: a bare `del` raises
# KeyError when the cell is executed a second time, because the column is
# already gone.
# NOTE(review): 'development_stage' is presumably dropped because only the
# *_ontology_term_id column is submitted -- confirm against the target schema.
for obsolete_column in (
    'columns_with_multiple_max',
    'development_stage',
    'highest_cell_density_columns',
):
    if obsolete_column in adata.obs.columns:
        del adata.obs[obsolete_column]

In [136]:
#check data type of obs

In [137]:
# Show the dtype of every obs column before the dtype conversion below.
adata.obs.dtypes

in_tissue                                   category
array_row                                    float32
array_col                                    float32
sample                                        object
n_counts                                     float32
                                              ...   
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id      object
sex_ontology_term_id                          object
suspension_type                               object
tissue_ontology_term_id                       object
Length: 99, dtype: object

In [138]:
# Normalise obs dtypes: 64-bit floats/ints are narrowed to 32-bit and object
# columns become categoricals. The dtypes are snapshotted once, before any
# column is modified, and each (source, target) pair is processed in order.
obs_dtype_snapshot = adata.obs.dtypes
obs_conversions = (
    ('float64', 'float32'),
    ('int64', 'int32'),
    ('object', 'category'),
)
for old_dtype, new_dtype in obs_conversions:
    # Column names whose dtype at snapshot time matched the source dtype.
    matching_columns = obs_dtype_snapshot[obs_dtype_snapshot == old_dtype].index.values
    for column_name in matching_columns:
        adata.obs[column_name] = adata.obs[column_name].astype(new_dtype)
        print(f"changed {column_name} from {old_dtype} to {new_dtype}")

changed sample from object to category
changed assay_ontology_term_id from object to category
changed development_stage_ontology_term_id from object to category
changed donor_id from object to category
changed self_reported_ethnicity_ontology_term_id from object to category
changed sex_ontology_term_id from object to category
changed suspension_type from object to category
changed tissue_ontology_term_id from object to category


In [139]:
#change data type

In [140]:
#check data type of var

In [141]:
# Show the dtype of every var column before the dtype conversion below.
adata.var.dtypes

gene_ids                 object
feature_types          category
genome                 category
gene_symbols             object
feature_is_filtered        bool
dtype: object

In [142]:
# Narrow 64-bit numeric var columns to 32-bit. Object columns are left as they
# are here (unlike obs, no conversion to category is applied).
var_dtype_snapshot = adata.var.dtypes
var_conversions = (
    ('float64', 'float32'),
    ('int64', 'int32'),
)
for old_dtype, new_dtype in var_conversions:
    # Column names whose dtype at snapshot time matched the source dtype.
    matching_columns = var_dtype_snapshot[var_dtype_snapshot == old_dtype].index.values
    for column_name in matching_columns:
        adata.var[column_name] = adata.var[column_name].astype(new_dtype)
        print(f"changed {column_name} from {old_dtype} to {new_dtype}")

In [143]:
# delete columns not required

In [144]:
# Convert expression matrix to sparse matrix format

In [145]:
# Inspect the expression matrix: shape, dtype and sparse storage format.
adata.X

<3355x33234 sparse matrix of type '<class 'numpy.float32'>'
	with 8343347 stored elements in Compressed Sparse Column format>

In [146]:
# Check the storage format of the expression matrix. If it is already in CSR (Compressed Sparse Row) format, keep it as is.
# If it is in CSC (Compressed Sparse Column) format or is a dense numpy array, convert it to CSR format.

In [147]:
# Rebuild X as a Compressed Sparse Row matrix, using the csr_matrix name
# imported in the notebook's first cell.
adata.X = csr_matrix(adata.X)

In [148]:
# Re-display X to confirm the storage format after the conversion above.
adata.X

<3355x33234 sparse matrix of type '<class 'numpy.float32'>'
	with 8343347 stored elements in Compressed Sparse Row format>

In [149]:
# Drop the .raw slot so it is not written out with the final object.
# NOTE(review): assumes adata.X already holds the raw counts -- confirm.
del adata.raw

In [150]:
# save the final object

In [151]:
# Destination of the curated object; written as a gzip-compressed h5ad file.
final_object_path = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/final_objects/WS_PLA_S9101770.h5ad'
adata.write(final_object_path, compression = 'gzip')