#  Curating All_donors_all_celltypes dataset 

In [1]:
Article: 'Spatial multiomics map of trophoblast development in early pregnancy'

In [2]:
DOI : 'https://doi.org/10.1038/s41586-023-05869-0'

In [3]:
Data_Source : 'https://www.reproductivecellatlas.org/mfi.html'

### Mount farm

mount-farm

### Packages required for curation

#### Import all packages required for the curation

In [4]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math

### Curation Schema

#### X (Matrix Layers)

#### AnnData object

In [5]:
# Load the AnnData object

In [6]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/All_donors_all_celltypes.h5ad')

In [7]:
# View the AnnData object

In [8]:
adata

AnnData object with n_obs × n_vars = 325665 × 30800
    obs: 'batch', 'cell_type', 'celltype_predictions', 'coarse_annot', 'dataset', 'dev_age', 'donor', 'number_of_individuals_multiplexed', 'origin_M_F', 'phase', 'sample', 'technology', 'tissue', 'n_counts', 'barcode_sample_copy'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-14', 'feature_types-14', 'genome-14', 'n_cells-14', 'gene_ids-15', 'feature_types-15', 'genome-15', 'n_cells-15', 'gene_ids-16', 'feature_types-16', 'genome-16', 'n_cells-16', 'gene_ids-17', 'feature_types-17', 'genome-17', 'n_cells-17', 'gene_ids-18', 'feature_types-18', 'genome-18', 'n_cells-18', 'gene_ids-19', 'feature_types-19', 

##### Raw Counts matrix

In [9]:
# Check whether adata has raw counts or normalized counts

In [10]:
print(adata.X)

  (7, 0)	0.41699895
  (20, 0)	0.44862214
  (21, 0)	0.79338884
  (23, 0)	0.9673984
  (25, 0)	0.8753028
  (42, 0)	0.64698803
  (43, 0)	0.4615943
  (50, 0)	0.56738895
  (51, 0)	0.64946383
  (60, 0)	0.8226497
  (68, 0)	1.1083244
  (76, 0)	0.47893676
  (84, 0)	0.7075762
  (89, 0)	0.7554972
  (108, 0)	0.9997589
  (116, 0)	0.49977666
  (139, 0)	1.3935777
  (142, 0)	1.4833972
  (161, 0)	0.34759593
  (171, 0)	1.0793464
  (195, 0)	0.5643817
  (199, 0)	0.33001572
  (211, 0)	0.724423
  (214, 0)	0.96127605
  (228, 0)	0.7969091
  :	:
  (325619, 30799)	0.23219529
  (325621, 30799)	0.6936047
  (325623, 30799)	0.46866632
  (325627, 30799)	1.1931299
  (325629, 30799)	0.79786247
  (325630, 30799)	1.2642754
  (325632, 30799)	1.9188838
  (325633, 30799)	0.44599012
  (325634, 30799)	0.30507258
  (325635, 30799)	0.36560437
  (325637, 30799)	0.61832994
  (325640, 30799)	0.3259356
  (325642, 30799)	1.0064696
  (325643, 30799)	0.44146213
  (325645, 30799)	0.37743264
  (325647, 30799)	0.34338242
  (325648, 30799

In [11]:
print(adata.raw.X)

  (7, 0)	1.0
  (20, 0)	1.0
  (21, 0)	1.0
  (23, 0)	1.0
  (25, 0)	1.0
  (42, 0)	1.0
  (43, 0)	1.0
  (50, 0)	1.0
  (51, 0)	2.0
  (60, 0)	1.0
  (68, 0)	2.0
  (76, 0)	1.0
  (84, 0)	1.0
  (89, 0)	1.0
  (108, 0)	1.0
  (116, 0)	1.0
  (139, 0)	2.0
  (142, 0)	1.0
  (161, 0)	1.0
  (171, 0)	1.0
  (195, 0)	1.0
  (199, 0)	1.0
  (211, 0)	1.0
  (214, 0)	1.0
  (228, 0)	1.0
  :	:
  (325619, 30799)	1.0
  (325621, 30799)	2.0
  (325623, 30799)	1.0
  (325627, 30799)	1.0
  (325629, 30799)	1.0
  (325630, 30799)	1.0
  (325632, 30799)	2.0
  (325633, 30799)	3.0
  (325634, 30799)	1.0
  (325635, 30799)	2.0
  (325637, 30799)	1.0
  (325640, 30799)	1.0
  (325642, 30799)	5.0
  (325643, 30799)	1.0
  (325645, 30799)	1.0
  (325647, 30799)	1.0
  (325648, 30799)	1.0
  (325650, 30799)	2.0
  (325652, 30799)	2.0
  (325653, 30799)	3.0
  (325654, 30799)	4.0
  (325655, 30799)	1.0
  (325660, 30799)	1.0
  (325661, 30799)	3.0
  (325663, 30799)	1.0


In [12]:
#since raw counts are present in adata.raw, copy the counts to araw

In [13]:
araw = adata.raw.to_adata()

In [14]:
araw

AnnData object with n_obs × n_vars = 325665 × 30800
    obs: 'batch', 'cell_type', 'celltype_predictions', 'coarse_annot', 'dataset', 'dev_age', 'donor', 'number_of_individuals_multiplexed', 'origin_M_F', 'phase', 'sample', 'technology', 'tissue', 'n_counts', 'barcode_sample_copy'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-14', 'feature_types-14', 'genome-14', 'n_cells-14', 'gene_ids-15', 'feature_types-15', 'genome-15', 'n_cells-15', 'gene_ids-16', 'feature_types-16', 'genome-16', 'n_cells-16', 'gene_ids-17', 'feature_types-17', 'genome-17', 'n_cells-17', 'gene_ids-18', 'feature_types-18', 'genome-18', 'n_cells-18', 'gene_ids-19', 'feature_types-19', 

#### Variables (var)

In [15]:
# View var

In [16]:
adata.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,genome-7,n_cells-7,gene_ids-8,feature_types-8,genome-8,n_cells-8,gene_ids-9,feature_types-9,genome-9,n_cells-9
A1BG,ENSG00000121410,Gene Expression,GRCh38,58.0,ENSG00000121410,Gene Expression,GRCh38,118.0,ENSG00000121410,Gene Expression,...,GRCh38,140.0,ENSG00000121410,Gene Expression,GRCh38,185.0,ENSG00000121410,Gene Expression,GRCh38,608.0
A1BG-AS1,ENSG00000268895,Gene Expression,GRCh38,7.0,ENSG00000268895,Gene Expression,GRCh38,21.0,ENSG00000268895,Gene Expression,...,GRCh38,14.0,ENSG00000268895,Gene Expression,GRCh38,20.0,ENSG00000268895,Gene Expression,GRCh38,91.0
A1CF,,,,,,,,,,,...,,,,,,,,,,
A2M,ENSG00000175899,Gene Expression,GRCh38,66.0,ENSG00000175899,Gene Expression,GRCh38,171.0,ENSG00000175899,Gene Expression,...,,,ENSG00000175899,Gene Expression,GRCh38,5.0,ENSG00000175899,Gene Expression,GRCh38,2117.0
A2M-AS1,ENSG00000245105,Gene Expression,GRCh38,6.0,ENSG00000245105,Gene Expression,GRCh38,5.0,ENSG00000245105,Gene Expression,...,GRCh38,59.0,ENSG00000245105,Gene Expression,GRCh38,61.0,ENSG00000245105,Gene Expression,GRCh38,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,ENSG00000070476,Gene Expression,GRCh38,11.0,ENSG00000070476,Gene Expression,GRCh38,21.0,ENSG00000070476,Gene Expression,...,GRCh38,26.0,ENSG00000070476,Gene Expression,GRCh38,32.0,ENSG00000070476,Gene Expression,GRCh38,122.0
ZYG11A,,,,,,,,,,,...,,,,,,,,,,
ZYG11B,ENSG00000162378,Gene Expression,GRCh38,15.0,ENSG00000162378,Gene Expression,GRCh38,41.0,ENSG00000162378,Gene Expression,...,GRCh38,22.0,ENSG00000162378,Gene Expression,GRCh38,47.0,ENSG00000162378,Gene Expression,GRCh38,183.0
ZYX,ENSG00000159840,Gene Expression,GRCh38,134.0,ENSG00000159840,Gene Expression,GRCh38,284.0,ENSG00000159840,Gene Expression,...,GRCh38,265.0,ENSG00000159840,Gene Expression,GRCh38,319.0,ENSG00000159840,Gene Expression,GRCh38,2351.0


In [17]:
araw.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,genome-7,n_cells-7,gene_ids-8,feature_types-8,genome-8,n_cells-8,gene_ids-9,feature_types-9,genome-9,n_cells-9
A1BG,ENSG00000121410,Gene Expression,GRCh38,58.0,ENSG00000121410,Gene Expression,GRCh38,118.0,ENSG00000121410,Gene Expression,...,GRCh38,140.0,ENSG00000121410,Gene Expression,GRCh38,185.0,ENSG00000121410,Gene Expression,GRCh38,608.0
A1BG-AS1,ENSG00000268895,Gene Expression,GRCh38,7.0,ENSG00000268895,Gene Expression,GRCh38,21.0,ENSG00000268895,Gene Expression,...,GRCh38,14.0,ENSG00000268895,Gene Expression,GRCh38,20.0,ENSG00000268895,Gene Expression,GRCh38,91.0
A1CF,,,,,,,,,,,...,,,,,,,,,,
A2M,ENSG00000175899,Gene Expression,GRCh38,66.0,ENSG00000175899,Gene Expression,GRCh38,171.0,ENSG00000175899,Gene Expression,...,,,ENSG00000175899,Gene Expression,GRCh38,5.0,ENSG00000175899,Gene Expression,GRCh38,2117.0
A2M-AS1,ENSG00000245105,Gene Expression,GRCh38,6.0,ENSG00000245105,Gene Expression,GRCh38,5.0,ENSG00000245105,Gene Expression,...,GRCh38,59.0,ENSG00000245105,Gene Expression,GRCh38,61.0,ENSG00000245105,Gene Expression,GRCh38,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,ENSG00000070476,Gene Expression,GRCh38,11.0,ENSG00000070476,Gene Expression,GRCh38,21.0,ENSG00000070476,Gene Expression,...,GRCh38,26.0,ENSG00000070476,Gene Expression,GRCh38,32.0,ENSG00000070476,Gene Expression,GRCh38,122.0
ZYG11A,,,,,,,,,,,...,,,,,,,,,,
ZYG11B,ENSG00000162378,Gene Expression,GRCh38,15.0,ENSG00000162378,Gene Expression,GRCh38,41.0,ENSG00000162378,Gene Expression,...,GRCh38,22.0,ENSG00000162378,Gene Expression,GRCh38,47.0,ENSG00000162378,Gene Expression,GRCh38,183.0
ZYX,ENSG00000159840,Gene Expression,GRCh38,134.0,ENSG00000159840,Gene Expression,GRCh38,284.0,ENSG00000159840,Gene Expression,...,GRCh38,265.0,ENSG00000159840,Gene Expression,GRCh38,319.0,ENSG00000159840,Gene Expression,GRCh38,2351.0


### Ensembl IDs

In [18]:
#Ensembl ids are provided in multiple columns in adata.var

In [19]:
#Select columns from adata.var that have column name starting with gene_ids and assigns them to a variable gene_names

In [20]:
gene_names = adata.var.loc[:,adata.var.columns.str.startswith('gene_ids')]

In [21]:
gene_names

Unnamed: 0,gene_ids-0,gene_ids-1,gene_ids-10,gene_ids-11,gene_ids-12,gene_ids-13,gene_ids-14,gene_ids-15,gene_ids-16,gene_ids-17,...,gene_ids-56,gene_ids-57,gene_ids-58,gene_ids-59,gene_ids-6,gene_ids-60,gene_ids-61,gene_ids-7,gene_ids-8,gene_ids-9
A1BG,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,...,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410
A1BG-AS1,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,...,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895
A1CF,,,,,,,ENSG00000148584,,,,...,ENSG00000148584,ENSG00000148584,ENSG00000148584,ENSG00000148584,,ENSG00000148584,ENSG00000148584,,,
A2M,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,...,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,ENSG00000175899,,ENSG00000175899,ENSG00000175899
A2M-AS1,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,...,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,...,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476
ZYG11A,,,,ENSG00000203995,,,ENSG00000203995,,ENSG00000203995,,...,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,,ENSG00000203995,ENSG00000203995,,,
ZYG11B,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,...,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378
ZYX,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,...,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840


In [22]:
len(gene_names)

30800

In [23]:
#Creates an empty list to store extracted gene names

In [24]:
ensg = []

for k in tqdm(range(0,len(adata.var))):
    gene_valcount = gene_names.iloc[k].value_counts()
    if sum(gene_names.iloc[k].value_counts().index.str.startswith('ENSG')) > 0:
        ensg.append(gene_valcount[gene_valcount.index.str.startswith('ENSG')].index[0])
    else:
        ensg.append('nan')

100%|██████████| 30800/30800 [00:51<00:00, 594.33it/s]


In [25]:
ensg

['ENSG00000121410',
 'ENSG00000268895',
 'ENSG00000148584',
 'ENSG00000175899',
 'ENSG00000245105',
 'ENSG00000166535',
 'ENSG00000256661',
 'ENSG00000184389',
 'ENSG00000128274',
 'ENSG00000118017',
 'ENSG00000094914',
 'ENSG00000081760',
 'ENSG00000114771',
 'ENSG00000197953',
 'ENSG00000242908',
 'ENSG00000188984',
 'ENSG00000204518',
 'ENSG00000109576',
 'ENSG00000158122',
 'ENSG00000103591',
 'ENSG00000115977',
 'ENSG00000087884',
 'ENSG00000127837',
 'ENSG00000129673',
 'ENSG00000131043',
 'ENSG00000205002',
 'ENSG00000090861',
 'ENSG00000124608',
 'ENSG00000266967',
 'ENSG00000157426',
 'ENSG00000149313',
 'ENSG00000008311',
 'ENSG00000215458',
 'ENSG00000275700',
 'ENSG00000181409',
 'ENSG00000254180',
 'ENSG00000281376',
 'ENSG00000183044',
 'ENSG00000165029',
 'ENSG00000154263',
 'ENSG00000144452',
 'ENSG00000179869',
 'ENSG00000107331',
 'ENSG00000167972',
 'ENSG00000198691',
 'ENSG00000154265',
 'ENSG00000154262',
 'ENSG00000064687',
 'ENSG00000141338',
 'ENSG00000154258',


In [26]:
len(ensg)

30800

In [27]:
# copy the index column values to a new column called gene_symbols

In [28]:
adata.var['gene_symbols'] = adata.var_names
araw.var['gene_symbols'] = adata.var_names

In [29]:
#set ensembl ids as index column

In [30]:
adata.var_names = ensg

In [31]:
araw.var_names = ensg

In [32]:
#Load the approved genes file in the curation

In [33]:
approved_genes = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/gene_info/genes_approved.csv')

In [34]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [35]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [36]:
# Filter out genes that don't appear in the approved annotation

In [37]:
var_to_keep_adata = [x for x in ensg if (x in genedict)]

In [38]:
len(var_to_keep_adata)

30534

In [39]:
adata = adata[:, var_to_keep_adata].copy()
araw = araw[:, var_to_keep_adata].copy()

In [40]:
araw.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,n_cells-7,gene_ids-8,feature_types-8,genome-8,n_cells-8,gene_ids-9,feature_types-9,genome-9,n_cells-9,gene_symbols
ENSG00000121410,ENSG00000121410,Gene Expression,GRCh38,58.0,ENSG00000121410,Gene Expression,GRCh38,118.0,ENSG00000121410,Gene Expression,...,140.0,ENSG00000121410,Gene Expression,GRCh38,185.0,ENSG00000121410,Gene Expression,GRCh38,608.0,A1BG
ENSG00000268895,ENSG00000268895,Gene Expression,GRCh38,7.0,ENSG00000268895,Gene Expression,GRCh38,21.0,ENSG00000268895,Gene Expression,...,14.0,ENSG00000268895,Gene Expression,GRCh38,20.0,ENSG00000268895,Gene Expression,GRCh38,91.0,A1BG-AS1
ENSG00000148584,,,,,,,,,,,...,,,,,,,,,,A1CF
ENSG00000175899,ENSG00000175899,Gene Expression,GRCh38,66.0,ENSG00000175899,Gene Expression,GRCh38,171.0,ENSG00000175899,Gene Expression,...,,ENSG00000175899,Gene Expression,GRCh38,5.0,ENSG00000175899,Gene Expression,GRCh38,2117.0,A2M
ENSG00000245105,ENSG00000245105,Gene Expression,GRCh38,6.0,ENSG00000245105,Gene Expression,GRCh38,5.0,ENSG00000245105,Gene Expression,...,59.0,ENSG00000245105,Gene Expression,GRCh38,61.0,ENSG00000245105,Gene Expression,GRCh38,13.0,A2M-AS1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000070476,ENSG00000070476,Gene Expression,GRCh38,11.0,ENSG00000070476,Gene Expression,GRCh38,21.0,ENSG00000070476,Gene Expression,...,26.0,ENSG00000070476,Gene Expression,GRCh38,32.0,ENSG00000070476,Gene Expression,GRCh38,122.0,ZXDC
ENSG00000203995,,,,,,,,,,,...,,,,,,,,,,ZYG11A
ENSG00000162378,ENSG00000162378,Gene Expression,GRCh38,15.0,ENSG00000162378,Gene Expression,GRCh38,41.0,ENSG00000162378,Gene Expression,...,22.0,ENSG00000162378,Gene Expression,GRCh38,47.0,ENSG00000162378,Gene Expression,GRCh38,183.0,ZYG11B
ENSG00000159840,ENSG00000159840,Gene Expression,GRCh38,134.0,ENSG00000159840,Gene Expression,GRCh38,284.0,ENSG00000159840,Gene Expression,...,265.0,ENSG00000159840,Gene Expression,GRCh38,319.0,ENSG00000159840,Gene Expression,GRCh38,2351.0,ZYX


#### feature is filtered

In [41]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [42]:
#View var

In [43]:
adata.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,gene_ids-8,feature_types-8,genome-8,n_cells-8,gene_ids-9,feature_types-9,genome-9,n_cells-9,gene_symbols,feature_is_filtered
ENSG00000121410,ENSG00000121410,Gene Expression,GRCh38,58.0,ENSG00000121410,Gene Expression,GRCh38,118.0,ENSG00000121410,Gene Expression,...,ENSG00000121410,Gene Expression,GRCh38,185.0,ENSG00000121410,Gene Expression,GRCh38,608.0,A1BG,False
ENSG00000268895,ENSG00000268895,Gene Expression,GRCh38,7.0,ENSG00000268895,Gene Expression,GRCh38,21.0,ENSG00000268895,Gene Expression,...,ENSG00000268895,Gene Expression,GRCh38,20.0,ENSG00000268895,Gene Expression,GRCh38,91.0,A1BG-AS1,False
ENSG00000148584,,,,,,,,,,,...,,,,,,,,,A1CF,False
ENSG00000175899,ENSG00000175899,Gene Expression,GRCh38,66.0,ENSG00000175899,Gene Expression,GRCh38,171.0,ENSG00000175899,Gene Expression,...,ENSG00000175899,Gene Expression,GRCh38,5.0,ENSG00000175899,Gene Expression,GRCh38,2117.0,A2M,False
ENSG00000245105,ENSG00000245105,Gene Expression,GRCh38,6.0,ENSG00000245105,Gene Expression,GRCh38,5.0,ENSG00000245105,Gene Expression,...,ENSG00000245105,Gene Expression,GRCh38,61.0,ENSG00000245105,Gene Expression,GRCh38,13.0,A2M-AS1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000070476,ENSG00000070476,Gene Expression,GRCh38,11.0,ENSG00000070476,Gene Expression,GRCh38,21.0,ENSG00000070476,Gene Expression,...,ENSG00000070476,Gene Expression,GRCh38,32.0,ENSG00000070476,Gene Expression,GRCh38,122.0,ZXDC,False
ENSG00000203995,,,,,,,,,,,...,,,,,,,,,ZYG11A,False
ENSG00000162378,ENSG00000162378,Gene Expression,GRCh38,15.0,ENSG00000162378,Gene Expression,GRCh38,41.0,ENSG00000162378,Gene Expression,...,ENSG00000162378,Gene Expression,GRCh38,47.0,ENSG00000162378,Gene Expression,GRCh38,183.0,ZYG11B,False
ENSG00000159840,ENSG00000159840,Gene Expression,GRCh38,134.0,ENSG00000159840,Gene Expression,GRCh38,284.0,ENSG00000159840,Gene Expression,...,ENSG00000159840,Gene Expression,GRCh38,319.0,ENSG00000159840,Gene Expression,GRCh38,2351.0,ZYX,False


In [44]:
araw.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,n_cells-7,gene_ids-8,feature_types-8,genome-8,n_cells-8,gene_ids-9,feature_types-9,genome-9,n_cells-9,gene_symbols
ENSG00000121410,ENSG00000121410,Gene Expression,GRCh38,58.0,ENSG00000121410,Gene Expression,GRCh38,118.0,ENSG00000121410,Gene Expression,...,140.0,ENSG00000121410,Gene Expression,GRCh38,185.0,ENSG00000121410,Gene Expression,GRCh38,608.0,A1BG
ENSG00000268895,ENSG00000268895,Gene Expression,GRCh38,7.0,ENSG00000268895,Gene Expression,GRCh38,21.0,ENSG00000268895,Gene Expression,...,14.0,ENSG00000268895,Gene Expression,GRCh38,20.0,ENSG00000268895,Gene Expression,GRCh38,91.0,A1BG-AS1
ENSG00000148584,,,,,,,,,,,...,,,,,,,,,,A1CF
ENSG00000175899,ENSG00000175899,Gene Expression,GRCh38,66.0,ENSG00000175899,Gene Expression,GRCh38,171.0,ENSG00000175899,Gene Expression,...,,ENSG00000175899,Gene Expression,GRCh38,5.0,ENSG00000175899,Gene Expression,GRCh38,2117.0,A2M
ENSG00000245105,ENSG00000245105,Gene Expression,GRCh38,6.0,ENSG00000245105,Gene Expression,GRCh38,5.0,ENSG00000245105,Gene Expression,...,59.0,ENSG00000245105,Gene Expression,GRCh38,61.0,ENSG00000245105,Gene Expression,GRCh38,13.0,A2M-AS1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000070476,ENSG00000070476,Gene Expression,GRCh38,11.0,ENSG00000070476,Gene Expression,GRCh38,21.0,ENSG00000070476,Gene Expression,...,26.0,ENSG00000070476,Gene Expression,GRCh38,32.0,ENSG00000070476,Gene Expression,GRCh38,122.0,ZXDC
ENSG00000203995,,,,,,,,,,,...,,,,,,,,,,ZYG11A
ENSG00000162378,ENSG00000162378,Gene Expression,GRCh38,15.0,ENSG00000162378,Gene Expression,GRCh38,41.0,ENSG00000162378,Gene Expression,...,22.0,ENSG00000162378,Gene Expression,GRCh38,47.0,ENSG00000162378,Gene Expression,GRCh38,183.0,ZYG11B
ENSG00000159840,ENSG00000159840,Gene Expression,GRCh38,134.0,ENSG00000159840,Gene Expression,GRCh38,284.0,ENSG00000159840,Gene Expression,...,265.0,ENSG00000159840,Gene Expression,GRCh38,319.0,ENSG00000159840,Gene Expression,GRCh38,2351.0,ZYX


## obs (Cell metadata)

In [45]:
#view obs

In [46]:
adata.obs

Unnamed: 0_level_0,batch,cell_type,celltype_predictions,coarse_annot,dataset,dev_age,donor,number_of_individuals_multiplexed,origin_M_F,phase,sample,technology,tissue,n_counts,barcode_sample_copy
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AAACGGGCATTGGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5992.0,AAACGGGCATTGGCGC-1_FCA7167219
AAACGGGTCGCGATCG-1_FCA7167219,0,dT_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,3525.0,AAACGGGTCGCGATCG-1_FCA7167219
AAAGATGAGCAATATG-1_FCA7167219,0,T_cells,2_dNK2,T,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,4452.0,AAAGATGAGCAATATG-1_FCA7167219
AAAGATGAGTTCGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,6888.0,AAAGATGAGTTCGCGC-1_FCA7167219
AAAGATGCATGTCGAT-1_FCA7167219,0,T_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5882.0,AAAGATGCATGTCGAT-1_FCA7167219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,61,uSMC,0_dS1,dS_uSMC,Ashley_collection_sn,8-9_PCW,P13,1,M,S,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,5227.0,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,61,SCT,20_SCT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,17379.0,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920
TTTGTTGGTCACAGCG-1_Pla_Camb10714920,61,HOFB,18_HB,Myeloid,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,7779.0,TTTGTTGGTCACAGCG-1_Pla_Camb10714920
TTTGTTGGTTTACTTG-1_Pla_Camb10714920,61,SCT,16_EVT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,20078.0,TTTGTTGGTTTACTTG-1_Pla_Camb10714920


#### assay_ontology_term_id

In [47]:
list(adata.obs['technology'].unique())

["10X_3'_scRNA-seq", "10X_3'_snRNA-seq", '10X_multiome']

In [48]:
mapping= {"10X_3'_scRNA-seq": 'EFO:0030003', "10X_3'_snRNA-seq": 'EFO:0030003', '10X_multiome':'EFO:0030059'}

In [49]:
adata.obs['assay_ontology_term_id'] = adata.obs['technology'].map(mapping)

In [50]:
adata.obs

Unnamed: 0_level_0,batch,cell_type,celltype_predictions,coarse_annot,dataset,dev_age,donor,number_of_individuals_multiplexed,origin_M_F,phase,sample,technology,tissue,n_counts,barcode_sample_copy,assay_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AAACGGGCATTGGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5992.0,AAACGGGCATTGGCGC-1_FCA7167219,EFO:0030003
AAACGGGTCGCGATCG-1_FCA7167219,0,dT_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,3525.0,AAACGGGTCGCGATCG-1_FCA7167219,EFO:0030003
AAAGATGAGCAATATG-1_FCA7167219,0,T_cells,2_dNK2,T,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,4452.0,AAAGATGAGCAATATG-1_FCA7167219,EFO:0030003
AAAGATGAGTTCGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,6888.0,AAAGATGAGTTCGCGC-1_FCA7167219,EFO:0030003
AAAGATGCATGTCGAT-1_FCA7167219,0,T_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5882.0,AAAGATGCATGTCGAT-1_FCA7167219,EFO:0030003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,61,uSMC,0_dS1,dS_uSMC,Ashley_collection_sn,8-9_PCW,P13,1,M,S,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,5227.0,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,EFO:0030059
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,61,SCT,20_SCT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,17379.0,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,EFO:0030059
TTTGTTGGTCACAGCG-1_Pla_Camb10714920,61,HOFB,18_HB,Myeloid,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,7779.0,TTTGTTGGTCACAGCG-1_Pla_Camb10714920,EFO:0030059
TTTGTTGGTTTACTTG-1_Pla_Camb10714920,61,SCT,16_EVT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,20078.0,TTTGTTGGTTTACTTG-1_Pla_Camb10714920,EFO:0030059


#### cell_type_ontology_term_id

In [51]:
#get the column in adata.obs related. to cell type annotation

In [52]:
list(adata.obs['cell_type'].unique())

['dNK1',
 'dT_cells',
 'T_cells',
 'dNK2',
 'ILC3',
 'dNK3',
 'dT_regs',
 'dM1',
 'HOFB',
 'B_cells',
 'dDC',
 'dM2',
 'dS2',
 'M3',
 'VCT',
 'NK',
 'Granulocytes',
 'uSMC',
 'Endo_F',
 'DC',
 'dEpi_secretory',
 'MO',
 'EVT_2',
 'Endo_M',
 'SCT',
 'dS1',
 'dS3',
 'dEpi_lumenal',
 'iEVT',
 'Endo_L',
 'PV MMP11',
 'PV STEAP4',
 'EVT_1',
 'PV AOC3',
 'fF1',
 'VCT_CCC',
 'fF2',
 'VCT_p',
 'Plasma',
 'VCT_fusing',
 'eEVT',
 'GC']

In [53]:
cl_tropho = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/cl_tropho.csv')

In [54]:
mapping = dict(zip(cl_tropho['cell_type'], cl_tropho['CL ID']))

In [55]:
mapping

{'dNK1': 'CL:0002343',
 'dT_cells': 'CL:0000084',
 'T_cells': 'CL:0000084',
 'dNK2': 'CL:0002343',
 'ILC3': 'CL:0001078',
 'dNK3': 'CL:0002343',
 'dT_regs': 'CL:0000815',
 'dM1': 'CL:0000235',
 'HOFB': 'CL:3000001',
 'B_cells': 'CL:0000236',
 'dDC': 'CL:0000451',
 'dM2': 'CL:0000235',
 'dS2': 'CL:0000499',
 'M3': 'CL:0000235',
 'VCT': 'CL:2000060',
 'NK': 'CL:0000623',
 'Granulocytes': 'CL:0000094',
 'uSMC': 'CL:0002601',
 'Endo_F': 'CL:0009092',
 'DC': 'CL:0000451',
 'dEpi_secretory': 'CL:0000066',
 'MO': 'CL:0000235',
 'EVT_2': 'CL:0008036',
 'Endo_M': 'CL:0009095',
 'SCT': 'CL:0000525',
 'dS1': 'CL:0000499',
 'dS3': 'CL:0000499',
 'dEpi_lumenal': 'CL:0000066',
 'iEVT': 'CL:0008036',
 'Endo_L': 'CL:0002138',
 'PV MMP11': 'CL:0000003',
 'PVMMP11': 'CL:0000003',
 'PVSTEAP4': 'CL:0000003',
 'PV STEAP4': 'CL:0000003',
 'EVT_1': 'CL:0008036',
 'PVAOC3': 'CL:0000003',
 'PV AOC3': 'CL:0000003',
 'fF1': 'CL:2000042',
 'VCT_CCC': 'CL:2000060',
 'fF2': 'CL:2000042',
 'VCT_p': 'CL:2000060',
 'P

In [56]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type'].map(mapping)

In [57]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [58]:
adata.obs

Unnamed: 0_level_0,batch,cell_type,celltype_predictions,coarse_annot,dataset,dev_age,donor,number_of_individuals_multiplexed,origin_M_F,phase,sample,technology,tissue,n_counts,barcode_sample_copy,assay_ontology_term_id,cell_type_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AAACGGGCATTGGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5992.0,AAACGGGCATTGGCGC-1_FCA7167219,EFO:0030003,CL:0002343
AAACGGGTCGCGATCG-1_FCA7167219,0,dT_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,3525.0,AAACGGGTCGCGATCG-1_FCA7167219,EFO:0030003,CL:0000084
AAAGATGAGCAATATG-1_FCA7167219,0,T_cells,2_dNK2,T,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,4452.0,AAAGATGAGCAATATG-1_FCA7167219,EFO:0030003,CL:0000084
AAAGATGAGTTCGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,6888.0,AAAGATGAGTTCGCGC-1_FCA7167219,EFO:0030003,CL:0002343
AAAGATGCATGTCGAT-1_FCA7167219,0,T_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5882.0,AAAGATGCATGTCGAT-1_FCA7167219,EFO:0030003,CL:0000084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,61,uSMC,0_dS1,dS_uSMC,Ashley_collection_sn,8-9_PCW,P13,1,M,S,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,5227.0,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,EFO:0030059,CL:0002601
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,61,SCT,20_SCT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,17379.0,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,EFO:0030059,CL:0000525
TTTGTTGGTCACAGCG-1_Pla_Camb10714920,61,HOFB,18_HB,Myeloid,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,7779.0,TTTGTTGGTCACAGCG-1_Pla_Camb10714920,EFO:0030059,CL:3000001
TTTGTTGGTTTACTTG-1_Pla_Camb10714920,61,SCT,16_EVT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,20078.0,TTTGTTGGTTTACTTG-1_Pla_Camb10714920,EFO:0030059,CL:0000525


## development_stage_ontology_term_id

In [59]:
list(adata.obs['dev_age'].unique())

['10_PCW',
 '8_PCW',
 '6_PCW',
 '7-8_PCW',
 '4-5_PCW',
 '9_PCW',
 '12_PCW',
 '12-13_PCW',
 '5_PCW',
 '8-9_PCW']

In [60]:
mapping = {'10_PCW':'HsapDv:0000048',
 '8_PCW':'HsapDv:0000030',
 '6_PCW':'HsapDv:0000025',
 '7-8_PCW':'HsapDv:0000029',
 '4-5_PCW':'HsapDv:0000021',
 '9_PCW':'HsapDv:0000047',
 '12_PCW':'HsapDv:0000049',
 '12-13_PCW':'HsapDv:0000050',
 '5_PCW':'HsapDv:0000023',
 '8-9_PCW':'HsapDv:0000046'
          }

In [61]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['dev_age'].map(mapping)

In [62]:
adata.obs

Unnamed: 0_level_0,batch,cell_type,celltype_predictions,coarse_annot,dataset,dev_age,donor,number_of_individuals_multiplexed,origin_M_F,phase,sample,technology,tissue,n_counts,barcode_sample_copy,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AAACGGGCATTGGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5992.0,AAACGGGCATTGGCGC-1_FCA7167219,EFO:0030003,CL:0002343,HsapDv:0000048
AAACGGGTCGCGATCG-1_FCA7167219,0,dT_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,3525.0,AAACGGGTCGCGATCG-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048
AAAGATGAGCAATATG-1_FCA7167219,0,T_cells,2_dNK2,T,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,4452.0,AAAGATGAGCAATATG-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048
AAAGATGAGTTCGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,6888.0,AAAGATGAGTTCGCGC-1_FCA7167219,EFO:0030003,CL:0002343,HsapDv:0000048
AAAGATGCATGTCGAT-1_FCA7167219,0,T_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5882.0,AAAGATGCATGTCGAT-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,61,uSMC,0_dS1,dS_uSMC,Ashley_collection_sn,8-9_PCW,P13,1,M,S,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,5227.0,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,EFO:0030059,CL:0002601,HsapDv:0000046
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,61,SCT,20_SCT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,17379.0,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,EFO:0030059,CL:0000525,HsapDv:0000046
TTTGTTGGTCACAGCG-1_Pla_Camb10714920,61,HOFB,18_HB,Myeloid,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,7779.0,TTTGTTGGTCACAGCG-1_Pla_Camb10714920,EFO:0030059,CL:3000001,HsapDv:0000046
TTTGTTGGTTTACTTG-1_Pla_Camb10714920,61,SCT,16_EVT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,20078.0,TTTGTTGGTTTACTTG-1_Pla_Camb10714920,EFO:0030059,CL:0000525,HsapDv:0000046


#### disease_ontology_term_id

In [63]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461']* len(adata.obs)

In [64]:
adata.obs

Unnamed: 0_level_0,batch,cell_type,celltype_predictions,coarse_annot,dataset,dev_age,donor,number_of_individuals_multiplexed,origin_M_F,phase,sample,technology,tissue,n_counts,barcode_sample_copy,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
AAACGGGCATTGGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5992.0,AAACGGGCATTGGCGC-1_FCA7167219,EFO:0030003,CL:0002343,HsapDv:0000048,PATO:0000461
AAACGGGTCGCGATCG-1_FCA7167219,0,dT_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,3525.0,AAACGGGTCGCGATCG-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461
AAAGATGAGCAATATG-1_FCA7167219,0,T_cells,2_dNK2,T,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,4452.0,AAAGATGAGCAATATG-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461
AAAGATGAGTTCGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,6888.0,AAAGATGAGTTCGCGC-1_FCA7167219,EFO:0030003,CL:0002343,HsapDv:0000048,PATO:0000461
AAAGATGCATGTCGAT-1_FCA7167219,0,T_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5882.0,AAAGATGCATGTCGAT-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,61,uSMC,0_dS1,dS_uSMC,Ashley_collection_sn,8-9_PCW,P13,1,M,S,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,5227.0,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,EFO:0030059,CL:0002601,HsapDv:0000046,PATO:0000461
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,61,SCT,20_SCT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,17379.0,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,EFO:0030059,CL:0000525,HsapDv:0000046,PATO:0000461
TTTGTTGGTCACAGCG-1_Pla_Camb10714920,61,HOFB,18_HB,Myeloid,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,7779.0,TTTGTTGGTCACAGCG-1_Pla_Camb10714920,EFO:0030059,CL:3000001,HsapDv:0000046,PATO:0000461
TTTGTTGGTTTACTTG-1_Pla_Camb10714920,61,SCT,16_EVT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,20078.0,TTTGTTGGTTTACTTG-1_Pla_Camb10714920,EFO:0030059,CL:0000525,HsapDv:0000046,PATO:0000461


#### donor_id

In [65]:
adata.obs['donor_id'] = adata.obs['donor']

In [66]:
list(adata.obs['donor_id'].unique())

['6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 'Hrv43',
 'Hrv46',
 'H2',
 'R0',
 'R1',
 'Hrv98',
 'Hrv99',
 'Hrv100',
 'P13',
 'P14',
 'P34']

In [67]:
adata.obs

Unnamed: 0_level_0,batch,cell_type,celltype_predictions,coarse_annot,dataset,dev_age,donor,number_of_individuals_multiplexed,origin_M_F,phase,sample,technology,tissue,n_counts,barcode_sample_copy,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,donor_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AAACGGGCATTGGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5992.0,AAACGGGCATTGGCGC-1_FCA7167219,EFO:0030003,CL:0002343,HsapDv:0000048,PATO:0000461,6
AAACGGGTCGCGATCG-1_FCA7167219,0,dT_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,3525.0,AAACGGGTCGCGATCG-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461,6
AAAGATGAGCAATATG-1_FCA7167219,0,T_cells,2_dNK2,T,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,4452.0,AAAGATGAGCAATATG-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461,6
AAAGATGAGTTCGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,FCA7167219,10X_3'_scRNA-seq,decidua_immune,6888.0,AAAGATGAGTTCGCGC-1_FCA7167219,EFO:0030003,CL:0002343,HsapDv:0000048,PATO:0000461,6
AAAGATGCATGTCGAT-1_FCA7167219,0,T_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,FCA7167219,10X_3'_scRNA-seq,decidua_immune,5882.0,AAAGATGCATGTCGAT-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,61,uSMC,0_dS1,dS_uSMC,Ashley_collection_sn,8-9_PCW,P13,1,M,S,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,5227.0,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,EFO:0030059,CL:0002601,HsapDv:0000046,PATO:0000461,P13
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,61,SCT,20_SCT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,17379.0,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,EFO:0030059,CL:0000525,HsapDv:0000046,PATO:0000461,P13
TTTGTTGGTCACAGCG-1_Pla_Camb10714920,61,HOFB,18_HB,Myeloid,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,7779.0,TTTGTTGGTCACAGCG-1_Pla_Camb10714920,EFO:0030059,CL:3000001,HsapDv:0000046,PATO:0000461,P13
TTTGTTGGTTTACTTG-1_Pla_Camb10714920,61,SCT,16_EVT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,Pla_Camb10714920,10X_multiome,decidua_placenta_myometrium,20078.0,TTTGTTGGTTTACTTG-1_Pla_Camb10714920,EFO:0030059,CL:0000525,HsapDv:0000046,PATO:0000461,P13


#### is_primary_data

In [68]:
list(adata.obs['dataset'].unique())

['Vento_Nature',
 'HDBR_new',
 'samples_MFI_new_sc_batch_1',
 'samples_MFI_new_sc_batch_2',
 'Ashley_collection_sn']

In [69]:
mapping = {'Vento_Nature':'False',
 'HDBR_new':'True',
 'samples_MFI_new_sc_batch_1':'True',
 'samples_MFI_new_sc_batch_2':'True',
 'Ashley_collection_sn':'True'}

In [70]:
adata.obs['is_primary_data'] = adata.obs['dataset'].map(mapping)

In [71]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

#### organism_ontology_term_id

In [72]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [73]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [74]:
adata.obs['organism_ontology_term_id']

barcode_sample
AAACGGGCATTGGCGC-1_FCA7167219          NCBITaxon:9606
AAACGGGTCGCGATCG-1_FCA7167219          NCBITaxon:9606
AAAGATGAGCAATATG-1_FCA7167219          NCBITaxon:9606
AAAGATGAGTTCGCGC-1_FCA7167219          NCBITaxon:9606
AAAGATGCATGTCGAT-1_FCA7167219          NCBITaxon:9606
                                            ...      
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920    NCBITaxon:9606
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920    NCBITaxon:9606
TTTGTTGGTCACAGCG-1_Pla_Camb10714920    NCBITaxon:9606
TTTGTTGGTTTACTTG-1_Pla_Camb10714920    NCBITaxon:9606
TTTGTTGGTTTGAGCA-1_Pla_Camb10714920    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 325665, dtype: category
Categories (1, object): ['NCBITaxon:9606']

#### self_reported_ethnicity_ontology_term_id

In [75]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [76]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [77]:
adata.obs['self_reported_ethnicity_ontology_term_id']

barcode_sample
AAACGGGCATTGGCGC-1_FCA7167219          unknown
AAACGGGTCGCGATCG-1_FCA7167219          unknown
AAAGATGAGCAATATG-1_FCA7167219          unknown
AAAGATGAGTTCGCGC-1_FCA7167219          unknown
AAAGATGCATGTCGAT-1_FCA7167219          unknown
                                        ...   
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920    unknown
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920    unknown
TTTGTTGGTCACAGCG-1_Pla_Camb10714920    unknown
TTTGTTGGTTTACTTG-1_Pla_Camb10714920    unknown
TTTGTTGGTTTGAGCA-1_Pla_Camb10714920    unknown
Name: self_reported_ethnicity_ontology_term_id, Length: 325665, dtype: category
Categories (1, object): ['unknown']

In [78]:
adata.obs.columns

Index(['batch', 'cell_type', 'celltype_predictions', 'coarse_annot', 'dataset',
       'dev_age', 'donor', 'number_of_individuals_multiplexed', 'origin_M_F',
       'phase', 'sample', 'technology', 'tissue', 'n_counts',
       'barcode_sample_copy', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

#### sex_ontology_term_id

In [79]:
adata.obs['sex_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [80]:
adata.obs

Unnamed: 0_level_0,batch,cell_type,celltype_predictions,coarse_annot,dataset,dev_age,donor,number_of_individuals_multiplexed,origin_M_F,phase,...,barcode_sample_copy,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACGGGCATTGGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,...,AAACGGGCATTGGCGC-1_FCA7167219,EFO:0030003,CL:0002343,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown
AAACGGGTCGCGATCG-1_FCA7167219,0,dT_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,...,AAACGGGTCGCGATCG-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown
AAAGATGAGCAATATG-1_FCA7167219,0,T_cells,2_dNK2,T,Vento_Nature,10_PCW,6,1,M,G1,...,AAAGATGAGCAATATG-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown
AAAGATGAGTTCGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,...,AAAGATGAGTTCGCGC-1_FCA7167219,EFO:0030003,CL:0002343,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown
AAAGATGCATGTCGAT-1_FCA7167219,0,T_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,...,AAAGATGCATGTCGAT-1_FCA7167219,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,61,uSMC,0_dS1,dS_uSMC,Ashley_collection_sn,8-9_PCW,P13,1,M,S,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,EFO:0030059,CL:0002601,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,61,SCT,20_SCT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,EFO:0030059,CL:0000525,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown
TTTGTTGGTCACAGCG-1_Pla_Camb10714920,61,HOFB,18_HB,Myeloid,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920,EFO:0030059,CL:3000001,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown
TTTGTTGGTTTACTTG-1_Pla_Camb10714920,61,SCT,16_EVT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920,EFO:0030059,CL:0000525,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown


### suspension_type

In [81]:
list(adata.obs['sample'].unique())

['FCA7167219',
 'FCA7167221',
 'FCA7167222',
 'FCA7167223',
 'FCA7167224',
 'FCA7167226',
 'FCA7167230',
 'FCA7167231',
 'FCA7167232',
 'FCA7196218',
 'FCA7196219',
 'FCA7196220',
 'FCA7196224',
 'FCA7196225',
 'FCA7196226',
 'FCA7196229',
 'FCA7196231',
 'FCA7474062',
 'FCA7474063',
 'FCA7474064',
 'FCA7474065',
 'FCA7474068',
 'FCA7511881',
 'FCA7511882',
 'FCA7511884',
 'FCA7474066',
 'FCA7474069',
 'FCA7511883',
 'FCA7511885',
 'FCA7511886',
 'Pla_HDBR10142767',
 'Pla_HDBR10142768',
 'Pla_HDBR10142769',
 'Pla_HDBR10142770',
 'Pla_HDBR8624430',
 'Pla_HDBR8624431',
 'Pla_HDBR8768477',
 'Pla_HDBR8715512',
 'Pla_HDBR8715514',
 'Pla_HDBR10917730',
 'Pla_HDBR10917731',
 'Pla_HDBR10917733',
 'Pla_HDBR10701667',
 'Pla_HDBR10701668',
 'WSSS_PLA8764121',
 'WSSS_PLA8764122',
 'WSSS_PLA8810750',
 'WSSS_PLA8810751',
 'WSSS_PLA8811068',
 'WSSS_PLA8811069',
 'WSSS_PLA8811070',
 'Pla_HDBR10142863',
 'Pla_HDBR10142864',
 'Pla_HDBR10142865',
 'Pla_Camb10691970',
 'Pla_Camb10691971',
 'Pla_Camb106919

In [82]:
suspension = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/suspension.csv')

In [83]:
mapping = dict(zip(suspension['sample'], suspension['suspension']))

In [84]:
mapping

{'FCA7167219': 'cell',
 'FCA7167221': 'cell',
 'FCA7167222': 'cell',
 'FCA7167223': 'cell',
 'FCA7167224': 'cell',
 'FCA7167226': 'cell',
 'FCA7167230': 'cell',
 'FCA7167231': 'cell',
 'FCA7167232': 'cell',
 'FCA7196218': 'cell',
 'FCA7196219': 'cell',
 'FCA7196220': 'cell',
 'FCA7196224': 'cell',
 'FCA7196225': 'cell',
 'FCA7196226': 'cell',
 'FCA7196229': 'cell',
 'FCA7196231': 'cell',
 'FCA7474062': 'cell',
 'FCA7474063': 'cell',
 'FCA7474064': 'cell',
 'FCA7474065': 'cell',
 'FCA7474068': 'cell',
 'FCA7511881': 'cell',
 'FCA7511882': 'cell',
 'FCA7511884': 'cell',
 'FCA7474066': 'cell',
 'FCA7474069': 'cell',
 'FCA7511883': 'cell',
 'FCA7511885': 'cell',
 'FCA7511886': 'cell',
 'Pla_HDBR10142767': 'cell',
 'Pla_HDBR10142768': 'cell',
 'Pla_HDBR10142769': 'cell',
 'Pla_HDBR10142770': 'cell',
 'Pla_HDBR8624430': 'cell',
 'Pla_HDBR8624431': 'cell',
 'Pla_HDBR8768477': 'cell',
 'Pla_HDBR8715512': 'cell',
 'Pla_HDBR8715514': 'cell',
 'Pla_HDBR10917730': 'cell',
 'Pla_HDBR10917731': 'cel

In [85]:
adata.obs['suspension_type'] = adata.obs['sample'].map(mapping)

In [86]:
adata.obs

Unnamed: 0_level_0,batch,cell_type,celltype_predictions,coarse_annot,dataset,dev_age,donor,number_of_individuals_multiplexed,origin_M_F,phase,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACGGGCATTGGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,...,EFO:0030003,CL:0002343,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell
AAACGGGTCGCGATCG-1_FCA7167219,0,dT_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,...,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell
AAAGATGAGCAATATG-1_FCA7167219,0,T_cells,2_dNK2,T,Vento_Nature,10_PCW,6,1,M,G1,...,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell
AAAGATGAGTTCGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,...,EFO:0030003,CL:0002343,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell
AAAGATGCATGTCGAT-1_FCA7167219,0,T_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,...,EFO:0030003,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,61,uSMC,0_dS1,dS_uSMC,Ashley_collection_sn,8-9_PCW,P13,1,M,S,...,EFO:0030059,CL:0002601,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,61,SCT,20_SCT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,EFO:0030059,CL:0000525,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus
TTTGTTGGTCACAGCG-1_Pla_Camb10714920,61,HOFB,18_HB,Myeloid,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,EFO:0030059,CL:3000001,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus
TTTGTTGGTTTACTTG-1_Pla_Camb10714920,61,SCT,16_EVT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,EFO:0030059,CL:0000525,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus


#### tissue_ontology_term_id

In [87]:
list(adata.obs['tissue'].unique())

['decidua_immune',
 'decidua_non_immune',
 'blood',
 'placenta',
 'decidua',
 'decidua_placenta_myometrium',
 'decidua_myometrium']

In [88]:
mapping={'decidua_immune':'UBERON:0002450',#not immune
 'decidua_non_immune':'UBERON:0002450',#not non immune
 'placenta':'UBERON:0001987',
 'decidua':'UBERON:0002450',
 'decidua_placenta_myometrium':'UBERON:0000453', #not sure 
 'decidua_myometrium': 'UBERON:0000453' ,#not sure
         'blood':'UBERON:0000178'
        }

In [89]:
adata.obs['tissue_ontology_term_id'] =adata.obs['tissue'].map(mapping)

In [90]:
adata.obs.columns

Index(['batch', 'cell_type', 'celltype_predictions', 'coarse_annot', 'dataset',
       'dev_age', 'donor', 'number_of_individuals_multiplexed', 'origin_M_F',
       'phase', 'sample', 'technology', 'tissue', 'n_counts',
       'barcode_sample_copy', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

In [91]:
adata.obs

Unnamed: 0_level_0,batch,cell_type,celltype_predictions,coarse_annot,dataset,dev_age,donor,number_of_individuals_multiplexed,origin_M_F,phase,...,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACGGGCATTGGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,...,CL:0002343,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0002450
AAACGGGTCGCGATCG-1_FCA7167219,0,dT_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,...,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0002450
AAAGATGAGCAATATG-1_FCA7167219,0,T_cells,2_dNK2,T,Vento_Nature,10_PCW,6,1,M,G1,...,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0002450
AAAGATGAGTTCGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,...,CL:0002343,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0002450
AAAGATGCATGTCGAT-1_FCA7167219,0,T_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,...,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0002450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,61,uSMC,0_dS1,dS_uSMC,Ashley_collection_sn,8-9_PCW,P13,1,M,S,...,CL:0002601,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus,UBERON:0000453
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,61,SCT,20_SCT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,CL:0000525,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus,UBERON:0000453
TTTGTTGGTCACAGCG-1_Pla_Camb10714920,61,HOFB,18_HB,Myeloid,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,CL:3000001,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus,UBERON:0000453
TTTGTTGGTTTACTTG-1_Pla_Camb10714920,61,SCT,16_EVT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,CL:0000525,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus,UBERON:0000453


### obsm (Embeddings)

In [92]:
adata.obsm

AxisArrays with keys: X_scVI_n_latent_14_sample_n_layers_3, X_scVI_n_latent_20_sample_n_layers_3, X_umap, X_umap_scVI_n_latent_14_sample_n_layers_3, X_umap_scVI_n_latent_20_sample_n_layers_3

#### uns (Dataset Metadata)

In [93]:
adata.uns

OverloadedDict, wrapping:
	{'cell_labels_colors': array(['#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6',
       '#a30059', '#ffdbe5', '#7a4900', '#0000a6', '#63ffac', '#b79762',
       '#004d43', '#8fb0ff', '#997d87', '#5a0007', '#809693', '#6a3a4c',
       '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
       '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa',
       '#d16100', '#ddefff', '#000035', '#7b4f4b', '#a1c299', '#300018',
       '#0aa6d8', '#013349', '#00846f', '#372101', '#ffb500', '#c2ffed'],
      dtype=object), 'dev_age_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61', '#17becf', '#aec7e8'], dtype=object), 'log1p': {}, 'neighbors_scVI_n_latent_14_sample_n_layers_3': {'connectivities_key': 'neighbors_scVI_n_latent_14_sample_n_layers_3_connectivities', 'distances_key': 'neighbors_scVI_n_latent_14_sample_n_layers_3_distances', 'params': {'method': 'umap', 'metric'

In [94]:
adata.uns['schema_version'] = '3.0.0'

In [95]:
adata.uns['title'] = 'All donors all cell states (in vivo)'

In [96]:
adata.uns['default_embedding'] = 'X_umap'

### Final checks and adjustments

In [97]:
adata

AnnData object with n_obs × n_vars = 325665 × 30534
    obs: 'batch', 'cell_type', 'celltype_predictions', 'coarse_annot', 'dataset', 'dev_age', 'donor', 'number_of_individuals_multiplexed', 'origin_M_F', 'phase', 'sample', 'technology', 'tissue', 'n_counts', 'barcode_sample_copy', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-14', 'feature_types-14', 'genome-14', '

In [98]:
adata.obs.dtypes

batch                                       category
cell_type                                   category
celltype_predictions                        category
coarse_annot                                category
dataset                                     category
dev_age                                     category
donor                                       category
number_of_individuals_multiplexed           category
origin_M_F                                  category
phase                                       category
sample                                      category
technology                                  category
tissue                                      category
n_counts                                     float32
barcode_sample_copy                           object
assay_ontology_term_id                        object
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                      

In [99]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

changed n_cells-0 from float64 to float32
changed n_cells-1 from float64 to float32
changed n_cells-10 from float64 to float32
changed n_cells-11 from float64 to float32
changed n_cells-12 from float64 to float32
changed n_cells-13 from float64 to float32
changed n_cells-14 from float64 to float32
changed n_cells-15 from float64 to float32
changed n_cells-16 from float64 to float32
changed n_cells-17 from float64 to float32
changed n_cells-18 from float64 to float32
changed n_cells-19 from float64 to float32
changed n_cells-2 from float64 to float32
changed n_cells-20 from float64 to float32
changed n_cells-21 from float64 to float32
changed n_cells-22 from float64 to float32
changed n_cells-23 from float64 to float32
changed n_cells-24 from float64 to float32
changed n_cells-25 from float64 to float32
changed n_cells-26 from float64 to float32
changed n_cells-27 from float64 to float32
changed n_cells-28 from float64 to float32
changed n_cells-29 from float64 to float32
changed n_cell

In [100]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed barcode_sample_copy from object to category
changed assay_ontology_term_id from object to category
changed disease_ontology_term_id from object to category
changed sex_ontology_term_id from object to category
changed suspension_type from object to category
changed tissue_ontology_term_id from object to category


In [101]:
adata.obs

Unnamed: 0_level_0,batch,cell_type,celltype_predictions,coarse_annot,dataset,dev_age,donor,number_of_individuals_multiplexed,origin_M_F,phase,...,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACGGGCATTGGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,...,CL:0002343,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0002450
AAACGGGTCGCGATCG-1_FCA7167219,0,dT_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,...,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0002450
AAAGATGAGCAATATG-1_FCA7167219,0,T_cells,2_dNK2,T,Vento_Nature,10_PCW,6,1,M,G1,...,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0002450
AAAGATGAGTTCGCGC-1_FCA7167219,0,dNK1,5_dNK1,NK,Vento_Nature,10_PCW,6,1,M,G1,...,CL:0002343,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0002450
AAAGATGCATGTCGAT-1_FCA7167219,0,T_cells,8_Tcells,T,Vento_Nature,10_PCW,6,1,M,S,...,CL:0000084,HsapDv:0000048,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0002450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,61,uSMC,0_dS1,dS_uSMC,Ashley_collection_sn,8-9_PCW,P13,1,M,S,...,CL:0002601,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus,UBERON:0000453
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,61,SCT,20_SCT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,CL:0000525,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus,UBERON:0000453
TTTGTTGGTCACAGCG-1_Pla_Camb10714920,61,HOFB,18_HB,Myeloid,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,CL:3000001,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus,UBERON:0000453
TTTGTTGGTTTACTTG-1_Pla_Camb10714920,61,SCT,16_EVT,Trophoblast,Ashley_collection_sn,8-9_PCW,P13,1,F,G1,...,CL:0000525,HsapDv:0000046,PATO:0000461,P13,True,NCBITaxon:9606,unknown,unknown,nucleus,UBERON:0000453


In [102]:
del adata.obs['cell_type']
del adata.obs['tissue']

In [103]:
#check the format of expression matrix

In [104]:
adata.X

<325665x30534 sparse matrix of type '<class 'numpy.float32'>'
	with 746108294 stored elements in Compressed Sparse Column format>

In [105]:
# Convert expression matrix to sparse matrix format

In [106]:
adata.X = scipy.sparse.csr_matrix(adata.X)

In [107]:
adata.X

<325665x30534 sparse matrix of type '<class 'numpy.float32'>'
	with 746108294 stored elements in Compressed Sparse Row format>

In [108]:
adata.raw = araw

In [109]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/final_objects/All_donors_all_cell_states_final.h5ad', compression = 'gzip')