### **Curating Pla_HDBR9518710 datasets**

Article: Spatial multiomics map of trophoblast development in early pregnancy

DOI : https://doi.org/10.1038/s41586-023-05869-0

Data Source : https://www.reproductivecellatlas.org/mfi.html

## Mount farm

mount-farm

## Packages required for curation

### Import all packages required for the curation

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess

## Curation Schema

### X (Matrix Layers)

#### AnnData object

In [2]:
# Load the AnnData object

In [3]:
file = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/new_datasets/Visium_spatial_ID_36173_Pla_HDBR9518710_raw'

In [4]:
adata = sc.read_h5ad(file + '.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 1216 × 33538
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

#### Raw Counts matrix

In [7]:
# Check whether adata has raw counts or normalized counts

In [8]:
print(adata.X)

  (14, 3)	1.0
  (233, 3)	1.0
  (287, 3)	1.0
  (88, 12)	2.0
  (105, 12)	1.0
  (141, 12)	1.0
  (169, 12)	1.0
  (204, 12)	1.0
  (225, 12)	1.0
  (228, 12)	1.0
  (279, 12)	1.0
  (295, 12)	1.0
  (299, 12)	1.0
  (345, 12)	1.0
  (464, 12)	1.0
  (499, 12)	1.0
  (502, 12)	1.0
  (596, 12)	1.0
  (604, 12)	1.0
  (620, 12)	1.0
  (630, 12)	1.0
  (673, 12)	1.0
  (680, 12)	1.0
  (702, 12)	1.0
  (807, 12)	1.0
  :	:
  (822, 33527)	1.0
  (836, 33527)	1.0
  (850, 33527)	1.0
  (853, 33527)	1.0
  (887, 33527)	1.0
  (906, 33527)	1.0
  (917, 33527)	1.0
  (921, 33527)	1.0
  (943, 33527)	1.0
  (954, 33527)	1.0
  (980, 33527)	1.0
  (1008, 33527)	1.0
  (1038, 33527)	1.0
  (1148, 33527)	1.0
  (1178, 33527)	1.0
  (1183, 33527)	1.0
  (1213, 33527)	1.0
  (214, 33528)	1.0
  (474, 33528)	1.0
  (484, 33528)	1.0
  (137, 33530)	1.0
  (266, 33532)	1.0
  (66, 33535)	1.0
  (487, 33535)	1.0
  (933, 33535)	1.0


In [9]:
# if its floating point numbers, they are normalized counts.if they are integers, they are raw counts

In [10]:
print(adata.raw.X)

  (0, 21)	1.0
  (0, 26)	1.0
  (0, 27)	4.0
  (0, 29)	2.0
  (0, 39)	6.0
  (0, 46)	1.0
  (0, 47)	1.0
  (0, 49)	1.0
  (0, 51)	2.0
  (0, 52)	4.0
  (0, 53)	3.0
  (0, 54)	1.0
  (0, 55)	2.0
  (0, 61)	1.0
  (0, 64)	1.0
  (0, 66)	3.0
  (0, 70)	1.0
  (0, 77)	1.0
  (0, 78)	5.0
  (0, 86)	1.0
  (0, 93)	2.0
  (0, 94)	1.0
  (0, 100)	1.0
  (0, 117)	1.0
  (0, 119)	1.0
  :	:
  (1215, 33445)	2.0
  (1215, 33446)	13.0
  (1215, 33447)	2.0
  (1215, 33448)	1.0
  (1215, 33451)	1.0
  (1215, 33465)	9.0
  (1215, 33470)	1.0
  (1215, 33474)	12.0
  (1215, 33479)	9.0
  (1215, 33487)	2.0
  (1215, 33491)	2.0
  (1215, 33492)	1.0
  (1215, 33495)	2.0
  (1215, 33496)	21.0
  (1215, 33497)	23.0
  (1215, 33498)	114.0
  (1215, 33499)	135.0
  (1215, 33500)	1.0
  (1215, 33501)	66.0
  (1215, 33502)	80.0
  (1215, 33503)	43.0
  (1215, 33504)	4.0
  (1215, 33505)	54.0
  (1215, 33506)	10.0
  (1215, 33508)	39.0


In [11]:
adata.raw.X

<1216x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 4248883 stored elements in Compressed Sparse Row format>

In [12]:
adata.X

<1216x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 4248883 stored elements in Compressed Sparse Column format>

In [13]:
# Check whether adata and araw has same dimensions

In [14]:
adata

AnnData object with n_obs × n_vars = 1216 × 33538
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

## Variables (var)

In [15]:
# View adata.var and araw.var

In [16]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


In [17]:
adata.raw.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


#### Ensembl IDs

#### Ensembl ids provided in a single column

In [18]:
# Check whether ensembl ids  are provided in  var

In [19]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


##### Approved gene information file

In [20]:
# Give the approved gene information file name and path

In [21]:
var_mapping_file = '/home/jovyan/CellxGene_Curation/Gene_Info_File/genes_approved.csv'

In [22]:
var_map_df = pd.read_csv(var_mapping_file)

In [23]:
var_map_df

Unnamed: 0,feature_id,symb,num,length
0,ERCC-00002,ERCC-00002 (spike-in control),1.0,1061
1,ERCC-00003,ERCC-00003 (spike-in control),1.0,1023
2,ERCC-00004,ERCC-00004 (spike-in control),1.0,523
3,ERCC-00009,ERCC-00009 (spike-in control),1.0,984
4,ERCC-00012,ERCC-00012 (spike-in control),1.0,994
...,...,...,...,...
116179,ENSSASG00005000009,ORF7a,,366
116180,ENSSASG00005000012,ORF7b,,132
116181,ENSSASG00005000008,ORF8,,366
116182,ENSSASG00005000005,N,,1260


In [24]:
#Add a column for Gene IDs based on a column with Gene version if ensembl ids are provided as gene versions

In [25]:
adata.var['gene_symbols'] = adata.var.index

In [26]:
adata.var['ensembl_ids'] = adata.var['gene_ids']

In [27]:
# Make the ensembl ids column as the index column

In [28]:
adata.var = adata.var.set_index('ensembl_ids')

In [29]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3
...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1


##### Filter out genes that dont appear in the approved annotation

In [30]:
var_to_keep = adata.var.index.tolist()

In [31]:
var_to_keep

['ENSG00000243485',
 'ENSG00000237613',
 'ENSG00000186092',
 'ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000239906',
 'ENSG00000241599',
 'ENSG00000236601',
 'ENSG00000284733',
 'ENSG00000235146',
 'ENSG00000284662',
 'ENSG00000229905',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000241180',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000273443',
 'ENSG00000237330',
 'ENSG00000131591',
 'ENSG00000223823',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',


In [32]:
len(var_to_keep)

33538

In [33]:
var_in_approved = adata.var.index[adata.var.index.isin(var_map_df['feature_id'])].tolist()

In [34]:
var_in_approved

['ENSG00000243485',
 'ENSG00000237613',
 'ENSG00000186092',
 'ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000239906',
 'ENSG00000241599',
 'ENSG00000236601',
 'ENSG00000284733',
 'ENSG00000235146',
 'ENSG00000284662',
 'ENSG00000229905',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000241180',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000273443',
 'ENSG00000237330',
 'ENSG00000131591',
 'ENSG00000223823',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',


In [35]:
len(var_in_approved)

33234

In [36]:
var_to_keep = [e for e in var_to_keep if e in var_in_approved]


In [37]:
len(var_to_keep)

33234

In [38]:
adata = adata[:, var_to_keep]

In [39]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3
...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1


#### feature_is_filtered

In [40]:
#This MUST be True if the feature was filtered out in the normalized matrix (X) but is present in the raw matrix (raw.X).Otherwise false 

In [41]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

  adata.var['feature_is_filtered'] = [False] * len(adata.var)


In [42]:
# View var

In [43]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG,False
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A,False
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5,False
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1,False
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3,False
...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2,False
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1,False
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1,False
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1,False


In [44]:
# View adata and araw

In [45]:
adata

AnnData object with n_obs × n_vars = 1216 × 33234
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

## obs (Cell metadata)

In [46]:
# View obs

In [47]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dS1_mRNA_factors,u_sf_dS2_mRNA_factors,u_sf_dS3_mRNA_factors,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,36173_Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,188.050827,350.598846,1691.233398,152.945587,225.546341,537.210815,218.911438,162.125534,3484.553223,189.585800
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,36173_Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,157.091644,431.066956,1393.642944,172.194626,223.162811,27507.046875,354.363403,437.217773,11929.161133,217.947678
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,36173_Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,269.140289,560.676025,6834.166016,156.910828,209.411835,382.280914,208.857254,153.404907,2987.002441,434.273956
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,36173_Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,186.057449,875.587952,14099.362305,233.736801,293.893097,1212.644287,238.906815,197.554626,2711.770508,175.694397
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,36173_Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,145.302048,123.342484,384.155060,53.093903,68.910667,65.358505,153.708038,112.560204,107.849228,251.829956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,36173_Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,46.524765,42.274155,108.727554,55.207237,77.061707,36.361221,219.334869,67.158279,32.876560,47.490475
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,36173_Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,300.398773,899.075195,14302.301758,188.231461,168.888947,269.134735,333.773987,192.444794,3235.163330,674.082703
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,36173_Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,662.582581,156.414963,2054.435791,74.568214,88.453781,68.000168,290.162689,208.038116,84.349480,2258.859131
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,36173_Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,132.217545,434.935364,7151.744141,168.481979,178.552078,98.128784,147.273499,107.159935,94.280006,88.141670


In [48]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density',
       'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density',
       'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density',
       'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density',
       'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density',
       'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density',
       'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density',
       'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density',
       'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density',
       'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density',
       'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density',
       'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density',
       'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density',
       'w_sf_dNK2_cell_density

### assay_ontology_term_id

In [49]:
# if same for all columns

In [50]:
adata.obs['assay_ontology_term_id'] = ['EFO:0010961'] * len(adata.obs)

In [51]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dS2_mRNA_factors,u_sf_dS3_mRNA_factors,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,36173_Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,350.598846,1691.233398,152.945587,225.546341,537.210815,218.911438,162.125534,3484.553223,189.585800,EFO:0010961
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,36173_Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,431.066956,1393.642944,172.194626,223.162811,27507.046875,354.363403,437.217773,11929.161133,217.947678,EFO:0010961
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,36173_Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,560.676025,6834.166016,156.910828,209.411835,382.280914,208.857254,153.404907,2987.002441,434.273956,EFO:0010961
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,36173_Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,875.587952,14099.362305,233.736801,293.893097,1212.644287,238.906815,197.554626,2711.770508,175.694397,EFO:0010961
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,36173_Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,123.342484,384.155060,53.093903,68.910667,65.358505,153.708038,112.560204,107.849228,251.829956,EFO:0010961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,36173_Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,42.274155,108.727554,55.207237,77.061707,36.361221,219.334869,67.158279,32.876560,47.490475,EFO:0010961
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,36173_Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,899.075195,14302.301758,188.231461,168.888947,269.134735,333.773987,192.444794,3235.163330,674.082703,EFO:0010961
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,36173_Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,156.414963,2054.435791,74.568214,88.453781,68.000168,290.162689,208.038116,84.349480,2258.859131,EFO:0010961
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,36173_Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,434.935364,7151.744141,168.481979,178.552078,98.128784,147.273499,107.159935,94.280006,88.141670,EFO:0010961


### cell_type_ontology_term_id

In [52]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density',
       'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density',
       'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density',
       'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density',
       'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density',
       'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density',
       'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density',
       'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density',
       'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density',
       'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density',
       'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density',
       'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density',
       'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density',
       'w_sf_dNK2_cell_density

In [53]:
cell_density_columns = [col for col in adata.obs.columns if 'cell_density' in col]

In [54]:
cell_density_columns

['w_sf_B_cells_cell_density',
 'w_sf_DC_cell_density',
 'w_sf_EVT_1_cell_density',
 'w_sf_EVT_2_cell_density',
 'w_sf_Endo_F_cell_density',
 'w_sf_Endo_L_cell_density',
 'w_sf_Endo_M_cell_density',
 'w_sf_GC_cell_density',
 'w_sf_Granulocytes_cell_density',
 'w_sf_HOFB_cell_density',
 'w_sf_ILC3_cell_density',
 'w_sf_M3_cell_density',
 'w_sf_MO_cell_density',
 'w_sf_NK_cell_density',
 'w_sf_PV_cell_density',
 'w_sf_Plasma_cell_density',
 'w_sf_SCT_cell_density',
 'w_sf_T_cells_cell_density',
 'w_sf_VCT_cell_density',
 'w_sf_VCT_CCC_cell_density',
 'w_sf_VCT_fusing_cell_density',
 'w_sf_VCT_p_cell_density',
 'w_sf_dDC_cell_density',
 'w_sf_dEpi_lumenal_cell_density',
 'w_sf_dEpi_secretory_cell_density',
 'w_sf_dM1_cell_density',
 'w_sf_dM2_cell_density',
 'w_sf_dNK1_cell_density',
 'w_sf_dNK2_cell_density',
 'w_sf_dNK3_cell_density',
 'w_sf_dS1_cell_density',
 'w_sf_dS2_cell_density',
 'w_sf_dS3_cell_density',
 'w_sf_dT_cells_cell_density',
 'w_sf_dT_regs_cell_density',
 'w_sf_eEVT_cell

In [55]:
max_columns = adata.obs[cell_density_columns].apply(lambda x: x[x == x.max()].index.tolist(), axis=1)

In [56]:
max_columns

barcode_sample
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710    [w_sf_dEpi_secretory_cell_density]
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710              [w_sf_eEVT_cell_density]
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710    [w_sf_dEpi_secretory_cell_density]
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710               [w_sf_dS3_cell_density]
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710    [w_sf_dEpi_secretory_cell_density]
                                                           ...                
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710             [w_sf_VCT_p_cell_density]
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710               [w_sf_dS3_cell_density]
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710                [w_sf_PV_cell_density]
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710               [w_sf_dS3_cell_density]
TTGTTTCATTAGTCTA-1_36173_Pla_HDBR9518710              [w_sf_iEVT_cell_density]
Length: 1216, dtype: object

In [57]:
columns_with_multiple_max = max_columns.apply(lambda x: [col for col in x if x.count(col)>1])

In [58]:
adata.obs['columns_with_multiple_max'] = columns_with_multiple_max

In [59]:
adata.obs['columns_with_multiple_max']

barcode_sample
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710    []
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710    []
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710    []
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710    []
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710    []
                                            ..
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710    []
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710    []
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710    []
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710    []
TTGTTTCATTAGTCTA-1_36173_Pla_HDBR9518710    []
Name: columns_with_multiple_max, Length: 1216, dtype: object

In [60]:
non_empty_columns = columns_with_multiple_max[columns_with_multiple_max .apply(lambda x: len(x) > 0)]

In [61]:
len(non_empty_columns )

0

In [62]:
#max_columns = max_columns.apply(lambda x: [col.replace('w_sf_','').replace('_cell_density','') for col in x])

In [63]:
max_columns = [col[0].replace('w_sf_','').replace('_cell_density','')if len(col) > 0 else '' for col in max_columns]

In [64]:
adata.obs['highest_cell_density_columns'] = max_columns

In [65]:
adata.obs['highest_cell_density_columns']

barcode_sample
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710    dEpi_secretory
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710              eEVT
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710    dEpi_secretory
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710               dS3
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710    dEpi_secretory
                                                 ...      
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710             VCT_p
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710               dS3
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710                PV
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710               dS3
TTGTTTCATTAGTCTA-1_36173_Pla_HDBR9518710              iEVT
Name: highest_cell_density_columns, Length: 1216, dtype: object

In [66]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dT_cells_mRNA_factors,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,36173_Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,152.945587,225.546341,537.210815,218.911438,162.125534,3484.553223,189.585800,EFO:0010961,[],dEpi_secretory
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,36173_Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,172.194626,223.162811,27507.046875,354.363403,437.217773,11929.161133,217.947678,EFO:0010961,[],eEVT
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,36173_Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,156.910828,209.411835,382.280914,208.857254,153.404907,2987.002441,434.273956,EFO:0010961,[],dEpi_secretory
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,36173_Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,233.736801,293.893097,1212.644287,238.906815,197.554626,2711.770508,175.694397,EFO:0010961,[],dS3
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,36173_Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,53.093903,68.910667,65.358505,153.708038,112.560204,107.849228,251.829956,EFO:0010961,[],dEpi_secretory
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,36173_Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,55.207237,77.061707,36.361221,219.334869,67.158279,32.876560,47.490475,EFO:0010961,[],VCT_p
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,36173_Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,188.231461,168.888947,269.134735,333.773987,192.444794,3235.163330,674.082703,EFO:0010961,[],dS3
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,36173_Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,74.568214,88.453781,68.000168,290.162689,208.038116,84.349480,2258.859131,EFO:0010961,[],PV
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,36173_Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,168.481979,178.552078,98.128784,147.273499,107.159935,94.280006,88.141670,EFO:0010961,[],dS3


In [67]:
adata.obs['highest_cell_density_columns']

barcode_sample
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710    dEpi_secretory
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710              eEVT
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710    dEpi_secretory
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710               dS3
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710    dEpi_secretory
                                                 ...      
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710             VCT_p
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710               dS3
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710                PV
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710               dS3
TTGTTTCATTAGTCTA-1_36173_Pla_HDBR9518710              iEVT
Name: highest_cell_density_columns, Length: 1216, dtype: object

In [68]:
list(adata.obs['highest_cell_density_columns'].unique())

['dEpi_secretory',
 'eEVT',
 'dS3',
 'B_cells',
 'Endo_L',
 'uSMC',
 'dNK1',
 'iEVT',
 'GC',
 'Endo_M',
 'VCT_p',
 'VCT',
 'VCT_fusing',
 'dM1',
 'SCT',
 'fF2',
 'EVT_2',
 'PV']

In [69]:
# if different samples have different terms

In [70]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['highest_cell_density_columns']

In [71]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['dEpi_secretory',
 'eEVT',
 'dS3',
 'B_cells',
 'Endo_L',
 'uSMC',
 'dNK1',
 'iEVT',
 'GC',
 'Endo_M',
 'VCT_p',
 'VCT',
 'VCT_fusing',
 'dM1',
 'SCT',
 'fF2',
 'EVT_2',
 'PV']

In [72]:
#create a dictionary called mapping with version and assay_ontology_term_id

In [73]:
cl_tropho = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/cl_tropho.csv')

In [74]:
mapping = dict(zip(cl_tropho['cell_type'], cl_tropho['CL ID']))

In [75]:
mapping

{'dNK1': 'CL:0002343',
 'dT_cells': 'CL:0000084',
 'T_cells': 'CL:0000084',
 'dNK2': 'CL:0002343',
 'ILC3': 'CL:0001078',
 'dNK3': 'CL:0002343',
 'dT_regs': 'CL:0000815',
 'dM1': 'CL:0000235',
 'HOFB': 'CL:3000001',
 'B_cells': 'CL:0000236',
 'dDC': 'CL:0000451',
 'dM2': 'CL:0000235',
 'dS2': 'CL:0000499',
 'M3': 'CL:0000235',
 'VCT': 'CL:2000060',
 'NK': 'CL:0000623',
 'Granulocytes': 'CL:0000094',
 'uSMC': 'CL:0002601',
 'Endo_F': 'CL:0009092',
 'DC': 'CL:0000451',
 'dEpi_secretory': 'CL:0000066',
 'MO': 'CL:0000235',
 'EVT_2': 'CL:0008036',
 'Endo_M': 'CL:0009095',
 'SCT': 'CL:0000525',
 'dS1': 'CL:0000499',
 'dS3': 'CL:0000499',
 'dEpi_lumenal': 'CL:0000066',
 'iEVT': 'CL:0008036',
 'Endo_L': 'CL:0002138',
 'PVMMP11': 'CL:0000003',
 'PVSTEAP4': 'CL:0000003',
 'EVT_1': 'CL:0008036',
 'PVAOC3': 'CL:0000003',
 'fF1': 'CL:2000042',
 'VCT_CCC': 'CL:2000060',
 'fF2': 'CL:2000042',
 'VCT_p': 'CL:2000060',
 'Plasma': 'CL:0000786',
 'VCT_fusing': 'CL:2000060',
 'eEVT': 'CL:0008036',
 'GC': 

In [76]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [77]:
# change the data type 

In [78]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [79]:
# if mapping from supplementary file

In [80]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_dT_regs_mRNA_factors,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,36173_Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,225.546341,537.210815,218.911438,162.125534,3484.553223,189.585800,EFO:0010961,[],dEpi_secretory,CL:0000066
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,36173_Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,223.162811,27507.046875,354.363403,437.217773,11929.161133,217.947678,EFO:0010961,[],eEVT,CL:0008036
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,36173_Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,209.411835,382.280914,208.857254,153.404907,2987.002441,434.273956,EFO:0010961,[],dEpi_secretory,CL:0000066
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,36173_Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,293.893097,1212.644287,238.906815,197.554626,2711.770508,175.694397,EFO:0010961,[],dS3,CL:0000499
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,36173_Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,68.910667,65.358505,153.708038,112.560204,107.849228,251.829956,EFO:0010961,[],dEpi_secretory,CL:0000066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,36173_Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,77.061707,36.361221,219.334869,67.158279,32.876560,47.490475,EFO:0010961,[],VCT_p,CL:2000060
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,36173_Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,168.888947,269.134735,333.773987,192.444794,3235.163330,674.082703,EFO:0010961,[],dS3,CL:0000499
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,36173_Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,88.453781,68.000168,290.162689,208.038116,84.349480,2258.859131,EFO:0010961,[],PV,CL:0000003
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,36173_Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,178.552078,98.128784,147.273499,107.159935,94.280006,88.141670,EFO:0010961,[],dS3,CL:0000499


## development_stage_ontology_term_id

In [81]:
# if same for all columns

In [82]:
suppl_info = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/suppl_info/visium_metadata.csv')

In [83]:
adata.obs['sample'] = adata.obs['sample'].str.split('_',n=1).str[1]

In [84]:
list(adata.obs['sample'].unique())

['Pla_HDBR9518710']

In [85]:
list(adata.obs['sample'].unique())

['Pla_HDBR9518710']

In [86]:
mapping = dict(zip(suppl_info['sample'], suppl_info['Stage_PCW']))

In [87]:
mapping

{'Pla_HDBR9518710': '6_PCW',
 'WS_PLA_S9101764': '8-9_PCW',
 'WS_PLA_S9101765': '8-9_PCW',
 'WS_PLA_S9101766': '8-9_PCW',
 'WS_PLA_S9101767': '8-9_PCW',
 'Pla_Camb9518737': '8-9_PCW',
 'WS_PLA_S9101769': '8_PCW',
 'WS_PLA_S9101770': '8_PCW'}

In [88]:
adata.obs['development_stage'] = adata.obs['sample'].map(mapping)

In [89]:
list(adata.obs['development_stage'].unique())

['6_PCW']

In [90]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_eEVT_mRNA_factors,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,537.210815,218.911438,162.125534,3484.553223,189.585800,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,27507.046875,354.363403,437.217773,11929.161133,217.947678,EFO:0010961,[],eEVT,CL:0008036,6_PCW
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,382.280914,208.857254,153.404907,2987.002441,434.273956,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,1212.644287,238.906815,197.554626,2711.770508,175.694397,EFO:0010961,[],dS3,CL:0000499,6_PCW
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,65.358505,153.708038,112.560204,107.849228,251.829956,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,36.361221,219.334869,67.158279,32.876560,47.490475,EFO:0010961,[],VCT_p,CL:2000060,6_PCW
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,269.134735,333.773987,192.444794,3235.163330,674.082703,EFO:0010961,[],dS3,CL:0000499,6_PCW
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,68.000168,290.162689,208.038116,84.349480,2258.859131,EFO:0010961,[],PV,CL:0000003,6_PCW
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,98.128784,147.273499,107.159935,94.280006,88.141670,EFO:0010961,[],dS3,CL:0000499,6_PCW


In [91]:
mapping = {'10_PCW':'HsapDv:0000048',
'8_PCW':'HsapDv:0000030',
'6_PCW':'HsapDv:0000025',
'7-8_PCW':'HsapDv:0000029',
'4-5_PCW':'HsapDv:0000021',
'9_PCW':'HsapDv:0000047',
'12_PCW':'HsapDv:0000049',
'12-13_PCW':'HsapDv:0000050',
'5_PCW':'HsapDv:0000023',
'8-9_PCW':'HsapDv:0000046',
'4_PCW':'HsapDv:0000020'}

In [92]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage'].map(mapping)

In [93]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_fF1_mRNA_factors,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,218.911438,162.125534,3484.553223,189.585800,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW,HsapDv:0000025
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,354.363403,437.217773,11929.161133,217.947678,EFO:0010961,[],eEVT,CL:0008036,6_PCW,HsapDv:0000025
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,208.857254,153.404907,2987.002441,434.273956,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW,HsapDv:0000025
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,238.906815,197.554626,2711.770508,175.694397,EFO:0010961,[],dS3,CL:0000499,6_PCW,HsapDv:0000025
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,153.708038,112.560204,107.849228,251.829956,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW,HsapDv:0000025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,219.334869,67.158279,32.876560,47.490475,EFO:0010961,[],VCT_p,CL:2000060,6_PCW,HsapDv:0000025
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,333.773987,192.444794,3235.163330,674.082703,EFO:0010961,[],dS3,CL:0000499,6_PCW,HsapDv:0000025
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,290.162689,208.038116,84.349480,2258.859131,EFO:0010961,[],PV,CL:0000003,6_PCW,HsapDv:0000025
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,147.273499,107.159935,94.280006,88.141670,EFO:0010961,[],dS3,CL:0000499,6_PCW,HsapDv:0000025


## disease_ontology_term_id

In [94]:
# if same for all columns

In [95]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461'] * len(adata.obs)

In [96]:
# change the data type 

In [97]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [98]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_fF2_mRNA_factors,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,162.125534,3484.553223,189.585800,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW,HsapDv:0000025,PATO:0000461
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,437.217773,11929.161133,217.947678,EFO:0010961,[],eEVT,CL:0008036,6_PCW,HsapDv:0000025,PATO:0000461
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,153.404907,2987.002441,434.273956,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW,HsapDv:0000025,PATO:0000461
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,197.554626,2711.770508,175.694397,EFO:0010961,[],dS3,CL:0000499,6_PCW,HsapDv:0000025,PATO:0000461
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,112.560204,107.849228,251.829956,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW,HsapDv:0000025,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,67.158279,32.876560,47.490475,EFO:0010961,[],VCT_p,CL:2000060,6_PCW,HsapDv:0000025,PATO:0000461
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,192.444794,3235.163330,674.082703,EFO:0010961,[],dS3,CL:0000499,6_PCW,HsapDv:0000025,PATO:0000461
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,208.038116,84.349480,2258.859131,EFO:0010961,[],PV,CL:0000003,6_PCW,HsapDv:0000025,PATO:0000461
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,107.159935,94.280006,88.141670,EFO:0010961,[],dS3,CL:0000499,6_PCW,HsapDv:0000025,PATO:0000461


## donor_id

In [99]:
mapping = dict(zip(suppl_info['sample'], suppl_info['Donor']))

In [100]:
mapping

{'Pla_HDBR9518710': 'Hrv43',
 'WS_PLA_S9101764': 'P13',
 'WS_PLA_S9101765': 'P13',
 'WS_PLA_S9101766': 'P13',
 'WS_PLA_S9101767': 'P13',
 'Pla_Camb9518737': 'P13',
 'WS_PLA_S9101769': 'P14',
 'WS_PLA_S9101770': 'P14'}

In [101]:
adata.obs['donor_id'] = adata.obs['sample'].map(mapping)

In [102]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,u_sf_iEVT_mRNA_factors,u_sf_uSMC_mRNA_factors,assay_ontology_term_id,columns_with_multiple_max,highest_cell_density_columns,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,3484.553223,189.585800,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,11929.161133,217.947678,EFO:0010961,[],eEVT,CL:0008036,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,2987.002441,434.273956,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,2711.770508,175.694397,EFO:0010961,[],dS3,CL:0000499,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,107.849228,251.829956,EFO:0010961,[],dEpi_secretory,CL:0000066,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,32.876560,47.490475,EFO:0010961,[],VCT_p,CL:2000060,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,3235.163330,674.082703,EFO:0010961,[],dS3,CL:0000499,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,84.349480,2258.859131,EFO:0010961,[],PV,CL:0000003,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,94.280006,88.141670,EFO:0010961,[],dS3,CL:0000499,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43


## is_primary_data

In [103]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [104]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [105]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [106]:
# change the data type 

In [107]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

## self_reported_ethnicity_ontology_term_id

In [108]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

## sex_ontology_term_id

In [109]:
# if same for all columns

In [110]:
adata.obs['sex_ontology_term_id'] = ['unknown'] * len(adata.obs)

## suspension_type

In [111]:
# if same for all columns

In [112]:
adata.obs['suspension_type'] = ['na'] * len(adata.obs)

In [113]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,CL:0000066,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,CL:0008036,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,CL:0000066,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,CL:0000499,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,CL:0000066,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,CL:2000060,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,CL:0000499,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,CL:0000003,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,CL:0000499,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na


## tissue_ontology_term_id

In [114]:
suppl_tissue = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/suppl_info/tissue_info.csv')

In [115]:
mapping = dict(zip(suppl_tissue['Donor'], suppl_tissue['Tissue']))

In [116]:
mapping

{'P13': 'Placenta, decidua basalis',
 'P14': 'Placenta, decidua basalis',
 '5': 'Placenta, decidua basalis',
 '10': 'Placenta, decidua basalis',
 '11': 'Placenta',
 'H2': 'Decidua basalis',
 'H7': 'Decidua basalis',
 'H9': 'Decidua basalis',
 'Hrv98': 'Placenta',
 'Hrv99': 'Placenta',
 'Hrv100': 'Placenta',
 'Hrv43': 'Decidua basalis',
 'Hrv46': 'Placenta, decidua basalis',
 'P34': 'Placenta, decidua basalis',
 'Hrv81': 'Decidua basalis',
 '6': 'Decidua basalis, peripheral blood',
 '7': 'Decidua basalis, peripheral blood',
 '8': 'Decidua basalis, peripheral blood',
 '9': 'Decidua basalis, peripheral blood',
 '12': 'Placenta, decidua basalis'}

In [117]:
adata.obs['tissue_ontology_term_id'] = adata.obs['donor_id'].map(mapping)

In [118]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,Decidua basalis
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,Decidua basalis
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,Decidua basalis
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,Decidua basalis
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,Decidua basalis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,Decidua basalis
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,Decidua basalis
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,Decidua basalis
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,Decidua basalis


In [119]:
list(adata.obs['tissue_ontology_term_id'].unique())

['Decidua basalis']

In [120]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0000453'] * len(adata.obs)

In [121]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453


In [122]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density',
       ...
       'development_stage', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=102)

### Data types

In [123]:
adata.obs.dtypes

in_tissue                                   category
array_row                                    float32
array_col                                    float32
sample                                        object
n_counts                                     float32
                                              ...   
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id      object
sex_ontology_term_id                          object
suspension_type                               object
tissue_ontology_term_id                       object
Length: 102, dtype: object

### obsm (Embeddings)

In [124]:
adata.obsm

AxisArrays with keys: X_spatial

### uns

In [125]:
adata.uns

OverloadedDict, wrapping:
	{'spatial': {'spaceranger110_count_36173_Pla_HDBR9518710_GRCh38-3_0_0_premrna': {'images': {'hires': array([[[0.93333334, 0.93333334, 0.9411765 ],
        [0.93333334, 0.93333334, 0.9411765 ],
        [0.93333334, 0.93333334, 0.9411765 ],
        ...,
        [0.9372549 , 0.9372549 , 0.94509804],
        [0.9372549 , 0.9372549 , 0.94509804],
        [0.9372549 , 0.9372549 , 0.94509804]],

       [[0.93333334, 0.93333334, 0.9411765 ],
        [0.93333334, 0.93333334, 0.9411765 ],
        [0.93333334, 0.93333334, 0.9411765 ],
        ...,
        [0.9372549 , 0.9372549 , 0.94509804],
        [0.9372549 , 0.9372549 , 0.94509804],
        [0.9372549 , 0.9372549 , 0.94509804]],

       [[0.93333334, 0.93333334, 0.9411765 ],
        [0.93333334, 0.93333334, 0.9411765 ],
        [0.93333334, 0.93333334, 0.9411765 ],
        ...,
        [0.9372549 , 0.9372549 , 0.94509804],
        [0.9372549 , 0.9372549 , 0.94509804],
        [0.9372549 , 0.9372549 , 0.94509804]],


In [126]:
# define a field in uns

In [127]:
adata.uns['schema_version'] = '3.0.0'
adata.uns['default_embedding'] = 'X_spatial'
adata.uns['title'] = 'Pla_HDBR9518710'
adata.uns['image_caption'] = 'Shown here is an image of 10 μm cryosections of human maternal-fetal interface (decidua, placenta) tissue stained with H&E'

### Final checks

In [128]:
# View Anndata object

In [129]:
adata

AnnData object with n_obs × n_vars = 1216 × 33234
    obs: 'in_tissue', 'array_row', 'array_col', 'sample', 'n_counts', 'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density', 'w_sf_DC_cell_density', 'w_sf_EVT_1_cell_density', 'w_sf_EVT_2_cell_density', 'w_sf_Endo_F_cell_density', 'w_sf_Endo_L_cell_density', 'w_sf_Endo_M_cell_density', 'w_sf_GC_cell_density', 'w_sf_Granulocytes_cell_density', 'w_sf_HOFB_cell_density', 'w_sf_ILC3_cell_density', 'w_sf_M3_cell_density', 'w_sf_MO_cell_density', 'w_sf_NK_cell_density', 'w_sf_PV_cell_density', 'w_sf_Plasma_cell_density', 'w_sf_SCT_cell_density', 'w_sf_T_cells_cell_density', 'w_sf_VCT_cell_density', 'w_sf_VCT_CCC_cell_density', 'w_sf_VCT_fusing_cell_density', 'w_sf_VCT_p_cell_density', 'w_sf_dDC_cell_density', 'w_sf_dEpi_lumenal_cell_density', 'w_sf_dEpi_secretory_cell_density', 'w_sf_dM1_cell_density', 'w_sf_dM2_cell_density', 'w_sf_dNK1_cell_density', 'w_sf_dNK2_cell_density', 'w_sf_dNK3_cell_density', 'w_sf_dS1_cell_density',

In [130]:
#View var

In [131]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,gene_symbols,feature_is_filtered
ensembl_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna,MIR1302-2HG,False
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna,FAM138A,False
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna,OR4F5,False
ENSG00000238009,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna,AL627309.1,False
ENSG00000239945,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna,AL627309.3,False
...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna,AC233755.2,False
ENSG00000275063,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna,AC233755.1,False
ENSG00000271254,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna,AC240274.1,False
ENSG00000277475,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna,AC213203.1,False


In [132]:
#View obs

In [133]:
adata.obs

Unnamed: 0_level_0,in_tissue,array_row,array_col,sample,n_counts,log_counts,n_genes,percent_mito,w_sf_B_cells_cell_density,w_sf_DC_cell_density,...,development_stage,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1_36173_Pla_HDBR9518710,1,50.0,102.0,Pla_HDBR9518710,13725.0,9.526975,3763.0,0.029727,0.033085,0.028762,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACACCAATAACTGC-1_36173_Pla_HDBR9518710,1,59.0,19.0,Pla_HDBR9518710,41607.0,10.636024,5942.0,0.028986,0.026961,0.020040,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACATGGTGAGAGGA-1_36173_Pla_HDBR9518710,1,62.0,0.0,Pla_HDBR9518710,18644.0,9.833280,4274.0,0.030036,0.051780,0.061757,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACATTTCCCGGATT-1_36173_Pla_HDBR9518710,1,61.0,97.0,Pla_HDBR9518710,17960.0,9.795902,4695.0,0.013029,0.062498,0.090028,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
AAACCGTTCGTCCAGG-1_36173_Pla_HDBR9518710,1,52.0,42.0,Pla_HDBR9518710,3544.0,8.173012,1546.0,0.096219,0.025342,0.004594,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTGGCCCTGACAGT-1_36173_Pla_HDBR9518710,1,18.0,60.0,Pla_HDBR9518710,4317.0,8.370316,1626.0,0.146861,0.001302,0.001700,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTGGTAGGAGGGAT-1_36173_Pla_HDBR9518710,1,50.0,28.0,Pla_HDBR9518710,15422.0,9.643550,4277.0,0.018480,0.041558,0.062895,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTGGTGGTACTAAG-1_36173_Pla_HDBR9518710,1,63.0,95.0,Pla_HDBR9518710,5191.0,8.554682,2210.0,0.020613,0.006534,0.019164,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453
TTGTTAGCAAATTCGA-1_36173_Pla_HDBR9518710,1,22.0,42.0,Pla_HDBR9518710,11098.0,9.314520,3274.0,0.012345,0.029796,0.135006,...,6_PCW,HsapDv:0000025,PATO:0000461,Hrv43,True,NCBITaxon:9606,unknown,unknown,na,UBERON:0000453


In [134]:
adata.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'sample', 'n_counts',
       'log_counts', 'n_genes', 'percent_mito', 'w_sf_B_cells_cell_density',
       'w_sf_DC_cell_density',
       ...
       'development_stage', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object', length=102)

In [135]:
del adata.obs['columns_with_multiple_max']
del adata.obs['development_stage']

In [136]:
#check data type of obs

In [137]:
adata.obs.dtypes

in_tissue                                   category
array_row                                    float32
array_col                                    float32
sample                                        object
n_counts                                     float32
                                              ...   
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id      object
sex_ontology_term_id                          object
suspension_type                               object
tissue_ontology_term_id                       object
Length: 100, dtype: object

In [138]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed sample from object to category
changed assay_ontology_term_id from object to category
changed highest_cell_density_columns from object to category
changed development_stage_ontology_term_id from object to category
changed donor_id from object to category
changed self_reported_ethnicity_ontology_term_id from object to category
changed sex_ontology_term_id from object to category
changed suspension_type from object to category
changed tissue_ontology_term_id from object to category


In [139]:
#change data type

In [140]:
#check data type of var

In [141]:
adata.var.dtypes

gene_ids                 object
feature_types          category
genome                 category
gene_symbols             object
feature_is_filtered        bool
dtype: object

In [142]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [143]:
# delete columns not required

In [144]:
# Convert expression matrix to sparse matrix format

In [145]:
adata.X

<1216x33234 sparse matrix of type '<class 'numpy.float32'>'
	with 4246385 stored elements in Compressed Sparse Column format>

In [146]:
# check the format of the expression matrix. If the matrix is in CSR (Compressed sparse row) format keep it as such
# If the file format of the matrix is compressed sparse column (CSC) format or dense numpy array, convert the expression matrix into CSR format.

In [147]:
adata.X = scipy.sparse.csr_matrix(adata.X)

In [148]:
adata.X

<1216x33234 sparse matrix of type '<class 'numpy.float32'>'
	with 4246385 stored elements in Compressed Sparse Row format>

In [149]:
del adata.raw

In [150]:
# save the final object

In [151]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/final_objects/Pla_HDBR9518710.h5ad', compression = 'gzip')