#  Curating primary_trophoblast_organoid dataset 

In [1]:
Article: 'Spatial multiomics map of trophoblast development in early pregnancy'

In [2]:
DOI : 'https://doi.org/10.1038/s41586-023-05869-0'

In [3]:
Data_Source : 'https://www.reproductivecellatlas.org/mfi.html'

### Mount farm

mount-farm

### Packages required for curation

#### Import all packages required for the curation

In [4]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math

### Curation Schema

#### X (Matrix Layers)

#### AnnData object

In [5]:
# Load the AnnData object

In [6]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/primary_trophoblast_organoid.h5ad')

In [7]:
# View the AnnData object

In [8]:
adata

AnnData object with n_obs × n_vars = 26853 × 23281
    obs: 'n_genes', 'time_point', 'media', 'sample', 'percent_mito', 'barcode', 'donor', 'leiden', 'S_score', 'G2M_score', 'phase', 'cell_annotation'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7', 'gene_ids-8', 'feature_types-8', 'genome-8', 'n_cells-8', 'gene_ids-9', 'featu

##### Raw Counts matrix

In [9]:
# Check whether adata has raw counts or normalized counts

In [10]:
print(adata.X)

  (0, 14790)	2.4958725
  (0, 14796)	1.4304695
  (0, 14794)	1.7528441
  (0, 14793)	1.7528441
  (0, 14789)	1.9962397
  (0, 14785)	1.9962397
  (0, 14788)	1.9962397
  (0, 14787)	1.9962397
  (0, 14792)	1.7528441
  (0, 17214)	1.4304695
  (0, 14195)	0.95178556
  (0, 19786)	0.95178556
  (0, 17500)	0.95178556
  (0, 18397)	0.95178556
  (0, 8312)	3.1469343
  (0, 9618)	1.9962397
  (0, 9925)	0.95178556
  (0, 11488)	1.9962397
  (0, 9094)	0.95178556
  (0, 21567)	1.4304695
  (0, 14586)	0.95178556
  (0, 6004)	1.4304695
  (0, 10555)	0.95178556
  (0, 19685)	0.95178556
  (0, 20707)	1.4304695
  :	:
  (26852, 10949)	0.45426166
  (26852, 9479)	1.722781
  (26852, 17931)	0.7654772
  (26852, 9614)	1.1939347
  (26852, 3897)	0.45426166
  (26852, 16146)	1.4929177
  (26852, 22023)	0.7654772
  (26852, 6922)	0.7654772
  (26852, 12545)	0.7654772
  (26852, 3378)	0.7654772
  (26852, 18278)	2.9041815
  (26852, 13787)	1.0024796
  (26852, 17930)	0.45426166
  (26852, 9744)	0.7654772
  (26852, 17199)	0.45426166
  (26852, 150

In [11]:
print(adata.raw.X)

  (0, 14790)	7.0
  (0, 14796)	2.0
  (0, 14794)	3.0
  (0, 14793)	3.0
  (0, 14789)	4.0
  (0, 14785)	4.0
  (0, 14788)	4.0
  (0, 14787)	4.0
  (0, 14792)	3.0
  (0, 17214)	2.0
  (0, 14195)	1.0
  (0, 19786)	1.0
  (0, 17500)	1.0
  (0, 18397)	1.0
  (0, 8312)	14.0
  (0, 9618)	4.0
  (0, 9925)	1.0
  (0, 11488)	4.0
  (0, 9094)	1.0
  (0, 21567)	2.0
  (0, 14586)	1.0
  (0, 6004)	2.0
  (0, 10555)	1.0
  (0, 19685)	1.0
  (0, 20707)	2.0
  :	:
  (26852, 10949)	1.0
  (26852, 9479)	8.0
  (26852, 17931)	2.0
  (26852, 9614)	4.0
  (26852, 3897)	1.0
  (26852, 16146)	6.0
  (26852, 22023)	2.0
  (26852, 6922)	2.0
  (26852, 12545)	2.0
  (26852, 3378)	2.0
  (26852, 18278)	30.0
  (26852, 13787)	3.0
  (26852, 17930)	1.0
  (26852, 9744)	2.0
  (26852, 17199)	1.0
  (26852, 15051)	2.0
  (26852, 7416)	1.0
  (26852, 19988)	1.0
  (26852, 22153)	1.0
  (26852, 14674)	2.0
  (26852, 6069)	6.0
  (26852, 12016)	2.0
  (26852, 3347)	1.0
  (26852, 21761)	2.0
  (26852, 6627)	1.0


In [12]:
#since raw counts are present in adata.raw, copy the counts to araw

In [13]:
araw = adata.raw.to_adata()

In [14]:
araw

AnnData object with n_obs × n_vars = 26853 × 23281
    obs: 'n_genes', 'time_point', 'media', 'sample', 'percent_mito', 'barcode', 'donor', 'leiden', 'S_score', 'G2M_score', 'phase', 'cell_annotation'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7', 'gene_ids-8', 'feature_types-8', 'genome-8', 'n_cells-8', 'gene_ids-9', 'featu

#### Variables (var)

In [15]:
# View var

In [16]:
adata.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,n_cells-8,gene_ids-9,feature_types-9,genome-9,n_cells-9,highly_variable,highly_variable_rank,means,variances,variances_norm
A1BG,ENSG00000121410,Gene Expression,GRCh38,609.0,ENSG00000121410,Gene Expression,GRCh38,861.0,ENSG00000121410,Gene Expression,...,381.0,ENSG00000121410,Gene Expression,GRCh38,602.0,True,3734.0,0.135962,0.198965,1.075057
A1BG-AS1,ENSG00000268895,Gene Expression,GRCh38,116.0,ENSG00000268895,Gene Expression,GRCh38,174.0,ENSG00000268895,Gene Expression,...,59.0,ENSG00000268895,Gene Expression,GRCh38,116.0,False,,0.026701,0.028745,0.853681
A2M,,,,,,,,,,,...,3.0,,,,,False,,0.000186,0.000186,0.974953
A2M-AS1,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,...,5.0,ENSG00000245105,Gene Expression,GRCh38,9.0,False,,0.002495,0.002489,0.907474
A2ML1,ENSG00000166535,Gene Expression,GRCh38,53.0,ENSG00000166535,Gene Expression,GRCh38,49.0,ENSG00000166535,Gene Expression,...,40.0,ENSG00000166535,Gene Expression,GRCh38,51.0,False,,0.011209,0.011754,0.880022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,ENSG00000070476,Gene Expression,GRCh38,466.0,ENSG00000070476,Gene Expression,GRCh38,630.0,ENSG00000070476,Gene Expression,...,206.0,ENSG00000070476,Gene Expression,GRCh38,269.0,False,,0.104420,0.114077,0.816581
ZYG11A,ENSG00000203995,Gene Expression,GRCh38,99.0,ENSG00000203995,Gene Expression,GRCh38,156.0,ENSG00000203995,Gene Expression,...,44.0,ENSG00000203995,Gene Expression,GRCh38,74.0,False,,0.023536,0.024845,0.842500
ZYG11B,ENSG00000162378,Gene Expression,GRCh38,1648.0,ENSG00000162378,Gene Expression,GRCh38,2233.0,ENSG00000162378,Gene Expression,...,1442.0,ENSG00000162378,Gene Expression,GRCh38,1862.0,True,3835.0,0.672364,1.462962,1.068170
ZYX,ENSG00000159840,Gene Expression,GRCh38,2152.0,ENSG00000159840,Gene Expression,GRCh38,3590.0,ENSG00000159840,Gene Expression,...,2277.0,ENSG00000159840,Gene Expression,GRCh38,3658.0,True,2806.0,0.790266,2.025278,1.173732


In [17]:
araw.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,genome-7,n_cells-7,gene_ids-8,feature_types-8,genome-8,n_cells-8,gene_ids-9,feature_types-9,genome-9,n_cells-9
A1BG,ENSG00000121410,Gene Expression,GRCh38,609.0,ENSG00000121410,Gene Expression,GRCh38,861.0,ENSG00000121410,Gene Expression,...,GRCh38,280.0,ENSG00000121410,Gene Expression,GRCh38,381.0,ENSG00000121410,Gene Expression,GRCh38,602.0
A1BG-AS1,ENSG00000268895,Gene Expression,GRCh38,116.0,ENSG00000268895,Gene Expression,GRCh38,174.0,ENSG00000268895,Gene Expression,...,GRCh38,90.0,ENSG00000268895,Gene Expression,GRCh38,59.0,ENSG00000268895,Gene Expression,GRCh38,116.0
A2M,,,,,,,,,,,...,,,ENSG00000175899,Gene Expression,GRCh38,3.0,,,,
A2M-AS1,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,...,GRCh38,20.0,ENSG00000245105,Gene Expression,GRCh38,5.0,ENSG00000245105,Gene Expression,GRCh38,9.0
A2ML1,ENSG00000166535,Gene Expression,GRCh38,53.0,ENSG00000166535,Gene Expression,GRCh38,49.0,ENSG00000166535,Gene Expression,...,GRCh38,99.0,ENSG00000166535,Gene Expression,GRCh38,40.0,ENSG00000166535,Gene Expression,GRCh38,51.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,ENSG00000070476,Gene Expression,GRCh38,466.0,ENSG00000070476,Gene Expression,GRCh38,630.0,ENSG00000070476,Gene Expression,...,GRCh38,446.0,ENSG00000070476,Gene Expression,GRCh38,206.0,ENSG00000070476,Gene Expression,GRCh38,269.0
ZYG11A,ENSG00000203995,Gene Expression,GRCh38,99.0,ENSG00000203995,Gene Expression,GRCh38,156.0,ENSG00000203995,Gene Expression,...,GRCh38,121.0,ENSG00000203995,Gene Expression,GRCh38,44.0,ENSG00000203995,Gene Expression,GRCh38,74.0
ZYG11B,ENSG00000162378,Gene Expression,GRCh38,1648.0,ENSG00000162378,Gene Expression,GRCh38,2233.0,ENSG00000162378,Gene Expression,...,GRCh38,2235.0,ENSG00000162378,Gene Expression,GRCh38,1442.0,ENSG00000162378,Gene Expression,GRCh38,1862.0
ZYX,ENSG00000159840,Gene Expression,GRCh38,2152.0,ENSG00000159840,Gene Expression,GRCh38,3590.0,ENSG00000159840,Gene Expression,...,GRCh38,2223.0,ENSG00000159840,Gene Expression,GRCh38,2277.0,ENSG00000159840,Gene Expression,GRCh38,3658.0


### Ensembl IDs

In [18]:
#Ensembl ids are provided in multiple columns in adata.var

In [19]:
#Select columns from adata.var that have column name starting with gene_ids and assigns them to a variable gene_names

In [20]:
gene_names = adata.var.loc[:,adata.var.columns.str.startswith('gene_ids')]

In [21]:
gene_names

Unnamed: 0,gene_ids-0,gene_ids-1,gene_ids-10,gene_ids-11,gene_ids-12,gene_ids-13,gene_ids-2,gene_ids-3,gene_ids-4,gene_ids-5,gene_ids-6,gene_ids-7,gene_ids-8,gene_ids-9
A1BG,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410,ENSG00000121410
A1BG-AS1,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895,ENSG00000268895
A2M,,,,,ENSG00000175899,,,,,,,,ENSG00000175899,
A2M-AS1,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105,ENSG00000245105
A2ML1,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535,ENSG00000166535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476,ENSG00000070476
ZYG11A,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995,ENSG00000203995
ZYG11B,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378,ENSG00000162378
ZYX,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840,ENSG00000159840


In [22]:
len(gene_names)

23281

In [23]:
#Creates an empty list to store extracted gene names

In [24]:
ensg = []

for k in tqdm(range(0,len(adata.var))):
    gene_valcount = gene_names.iloc[k].value_counts()
    if sum(gene_names.iloc[k].value_counts().index.str.startswith('ENSG')) > 0:
        ensg.append(gene_valcount[gene_valcount.index.str.startswith('ENSG')].index[0])
    else:
        ensg.append('nan')

100%|██████████| 23281/23281 [00:25<00:00, 898.84it/s]


In [25]:
ensg

['ENSG00000121410',
 'ENSG00000268895',
 'ENSG00000175899',
 'ENSG00000245105',
 'ENSG00000166535',
 'ENSG00000256661',
 'ENSG00000256904',
 'ENSG00000128274',
 'ENSG00000094914',
 'ENSG00000081760',
 'ENSG00000114771',
 'ENSG00000197953',
 'ENSG00000242908',
 'ENSG00000188984',
 'ENSG00000109576',
 'ENSG00000158122',
 'ENSG00000103591',
 'ENSG00000115977',
 'ENSG00000087884',
 'ENSG00000127837',
 'ENSG00000129673',
 'ENSG00000131043',
 'ENSG00000205002',
 'ENSG00000090861',
 'ENSG00000124608',
 'ENSG00000266967',
 'ENSG00000157426',
 'ENSG00000149313',
 'ENSG00000008311',
 'ENSG00000215458',
 'ENSG00000275700',
 'ENSG00000181409',
 'ENSG00000281376',
 'ENSG00000183044',
 'ENSG00000165029',
 'ENSG00000154263',
 'ENSG00000144452',
 'ENSG00000179869',
 'ENSG00000107331',
 'ENSG00000167972',
 'ENSG00000198691',
 'ENSG00000154265',
 'ENSG00000154262',
 'ENSG00000064687',
 'ENSG00000141338',
 'ENSG00000154258',
 'ENSG00000231749',
 'ENSG00000085563',
 'ENSG00000135776',
 'ENSG00000073734',


In [26]:
len(ensg)

23281

In [27]:
# copy the index column values to a new column called gebe_symbola

In [28]:
adata.var['gene_symbols'] = adata.var_names
araw.var['gene_symbols'] = adata.var_names

In [29]:
#set ensembl ids as index column

In [30]:
adata.var_names = ensg

In [31]:
araw.var_names = ensg

In [32]:
#Load the approved genes file in the curation

In [33]:
approved_genes = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/gene_info/genes_approved.csv')

In [34]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [35]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [36]:
# Filter out genes that don't appear in the approved annotation

In [37]:
var_to_keep_adata = [x for x in ensg if (x in genedict)]

In [38]:
len(var_to_keep_adata)

23133

In [39]:
adata = adata[:, var_to_keep_adata].copy()
araw = araw[:, var_to_keep_adata].copy()

In [40]:
adata.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,gene_ids-9,feature_types-9,genome-9,n_cells-9,highly_variable,highly_variable_rank,means,variances,variances_norm,gene_symbols
ENSG00000121410,ENSG00000121410,Gene Expression,GRCh38,609.0,ENSG00000121410,Gene Expression,GRCh38,861.0,ENSG00000121410,Gene Expression,...,ENSG00000121410,Gene Expression,GRCh38,602.0,True,3734.0,0.135962,0.198965,1.075057,A1BG
ENSG00000268895,ENSG00000268895,Gene Expression,GRCh38,116.0,ENSG00000268895,Gene Expression,GRCh38,174.0,ENSG00000268895,Gene Expression,...,ENSG00000268895,Gene Expression,GRCh38,116.0,False,,0.026701,0.028745,0.853681,A1BG-AS1
ENSG00000175899,,,,,,,,,,,...,,,,,False,,0.000186,0.000186,0.974953,A2M
ENSG00000245105,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,...,ENSG00000245105,Gene Expression,GRCh38,9.0,False,,0.002495,0.002489,0.907474,A2M-AS1
ENSG00000166535,ENSG00000166535,Gene Expression,GRCh38,53.0,ENSG00000166535,Gene Expression,GRCh38,49.0,ENSG00000166535,Gene Expression,...,ENSG00000166535,Gene Expression,GRCh38,51.0,False,,0.011209,0.011754,0.880022,A2ML1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000070476,ENSG00000070476,Gene Expression,GRCh38,466.0,ENSG00000070476,Gene Expression,GRCh38,630.0,ENSG00000070476,Gene Expression,...,ENSG00000070476,Gene Expression,GRCh38,269.0,False,,0.104420,0.114077,0.816581,ZXDC
ENSG00000203995,ENSG00000203995,Gene Expression,GRCh38,99.0,ENSG00000203995,Gene Expression,GRCh38,156.0,ENSG00000203995,Gene Expression,...,ENSG00000203995,Gene Expression,GRCh38,74.0,False,,0.023536,0.024845,0.842500,ZYG11A
ENSG00000162378,ENSG00000162378,Gene Expression,GRCh38,1648.0,ENSG00000162378,Gene Expression,GRCh38,2233.0,ENSG00000162378,Gene Expression,...,ENSG00000162378,Gene Expression,GRCh38,1862.0,True,3835.0,0.672364,1.462962,1.068170,ZYG11B
ENSG00000159840,ENSG00000159840,Gene Expression,GRCh38,2152.0,ENSG00000159840,Gene Expression,GRCh38,3590.0,ENSG00000159840,Gene Expression,...,ENSG00000159840,Gene Expression,GRCh38,3658.0,True,2806.0,0.790266,2.025278,1.173732,ZYX


In [41]:
araw.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,n_cells-7,gene_ids-8,feature_types-8,genome-8,n_cells-8,gene_ids-9,feature_types-9,genome-9,n_cells-9,gene_symbols
ENSG00000121410,ENSG00000121410,Gene Expression,GRCh38,609.0,ENSG00000121410,Gene Expression,GRCh38,861.0,ENSG00000121410,Gene Expression,...,280.0,ENSG00000121410,Gene Expression,GRCh38,381.0,ENSG00000121410,Gene Expression,GRCh38,602.0,A1BG
ENSG00000268895,ENSG00000268895,Gene Expression,GRCh38,116.0,ENSG00000268895,Gene Expression,GRCh38,174.0,ENSG00000268895,Gene Expression,...,90.0,ENSG00000268895,Gene Expression,GRCh38,59.0,ENSG00000268895,Gene Expression,GRCh38,116.0,A1BG-AS1
ENSG00000175899,,,,,,,,,,,...,,ENSG00000175899,Gene Expression,GRCh38,3.0,,,,,A2M
ENSG00000245105,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,...,20.0,ENSG00000245105,Gene Expression,GRCh38,5.0,ENSG00000245105,Gene Expression,GRCh38,9.0,A2M-AS1
ENSG00000166535,ENSG00000166535,Gene Expression,GRCh38,53.0,ENSG00000166535,Gene Expression,GRCh38,49.0,ENSG00000166535,Gene Expression,...,99.0,ENSG00000166535,Gene Expression,GRCh38,40.0,ENSG00000166535,Gene Expression,GRCh38,51.0,A2ML1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000070476,ENSG00000070476,Gene Expression,GRCh38,466.0,ENSG00000070476,Gene Expression,GRCh38,630.0,ENSG00000070476,Gene Expression,...,446.0,ENSG00000070476,Gene Expression,GRCh38,206.0,ENSG00000070476,Gene Expression,GRCh38,269.0,ZXDC
ENSG00000203995,ENSG00000203995,Gene Expression,GRCh38,99.0,ENSG00000203995,Gene Expression,GRCh38,156.0,ENSG00000203995,Gene Expression,...,121.0,ENSG00000203995,Gene Expression,GRCh38,44.0,ENSG00000203995,Gene Expression,GRCh38,74.0,ZYG11A
ENSG00000162378,ENSG00000162378,Gene Expression,GRCh38,1648.0,ENSG00000162378,Gene Expression,GRCh38,2233.0,ENSG00000162378,Gene Expression,...,2235.0,ENSG00000162378,Gene Expression,GRCh38,1442.0,ENSG00000162378,Gene Expression,GRCh38,1862.0,ZYG11B
ENSG00000159840,ENSG00000159840,Gene Expression,GRCh38,2152.0,ENSG00000159840,Gene Expression,GRCh38,3590.0,ENSG00000159840,Gene Expression,...,2223.0,ENSG00000159840,Gene Expression,GRCh38,2277.0,ENSG00000159840,Gene Expression,GRCh38,3658.0,ZYX


#### feature is filtered

In [42]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [43]:
#View var

In [44]:
adata.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,feature_types-9,genome-9,n_cells-9,highly_variable,highly_variable_rank,means,variances,variances_norm,gene_symbols,feature_is_filtered
ENSG00000121410,ENSG00000121410,Gene Expression,GRCh38,609.0,ENSG00000121410,Gene Expression,GRCh38,861.0,ENSG00000121410,Gene Expression,...,Gene Expression,GRCh38,602.0,True,3734.0,0.135962,0.198965,1.075057,A1BG,False
ENSG00000268895,ENSG00000268895,Gene Expression,GRCh38,116.0,ENSG00000268895,Gene Expression,GRCh38,174.0,ENSG00000268895,Gene Expression,...,Gene Expression,GRCh38,116.0,False,,0.026701,0.028745,0.853681,A1BG-AS1,False
ENSG00000175899,,,,,,,,,,,...,,,,False,,0.000186,0.000186,0.974953,A2M,False
ENSG00000245105,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,...,Gene Expression,GRCh38,9.0,False,,0.002495,0.002489,0.907474,A2M-AS1,False
ENSG00000166535,ENSG00000166535,Gene Expression,GRCh38,53.0,ENSG00000166535,Gene Expression,GRCh38,49.0,ENSG00000166535,Gene Expression,...,Gene Expression,GRCh38,51.0,False,,0.011209,0.011754,0.880022,A2ML1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000070476,ENSG00000070476,Gene Expression,GRCh38,466.0,ENSG00000070476,Gene Expression,GRCh38,630.0,ENSG00000070476,Gene Expression,...,Gene Expression,GRCh38,269.0,False,,0.104420,0.114077,0.816581,ZXDC,False
ENSG00000203995,ENSG00000203995,Gene Expression,GRCh38,99.0,ENSG00000203995,Gene Expression,GRCh38,156.0,ENSG00000203995,Gene Expression,...,Gene Expression,GRCh38,74.0,False,,0.023536,0.024845,0.842500,ZYG11A,False
ENSG00000162378,ENSG00000162378,Gene Expression,GRCh38,1648.0,ENSG00000162378,Gene Expression,GRCh38,2233.0,ENSG00000162378,Gene Expression,...,Gene Expression,GRCh38,1862.0,True,3835.0,0.672364,1.462962,1.068170,ZYG11B,False
ENSG00000159840,ENSG00000159840,Gene Expression,GRCh38,2152.0,ENSG00000159840,Gene Expression,GRCh38,3590.0,ENSG00000159840,Gene Expression,...,Gene Expression,GRCh38,3658.0,True,2806.0,0.790266,2.025278,1.173732,ZYX,False


In [45]:
araw.var

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,n_cells-7,gene_ids-8,feature_types-8,genome-8,n_cells-8,gene_ids-9,feature_types-9,genome-9,n_cells-9,gene_symbols
ENSG00000121410,ENSG00000121410,Gene Expression,GRCh38,609.0,ENSG00000121410,Gene Expression,GRCh38,861.0,ENSG00000121410,Gene Expression,...,280.0,ENSG00000121410,Gene Expression,GRCh38,381.0,ENSG00000121410,Gene Expression,GRCh38,602.0,A1BG
ENSG00000268895,ENSG00000268895,Gene Expression,GRCh38,116.0,ENSG00000268895,Gene Expression,GRCh38,174.0,ENSG00000268895,Gene Expression,...,90.0,ENSG00000268895,Gene Expression,GRCh38,59.0,ENSG00000268895,Gene Expression,GRCh38,116.0,A1BG-AS1
ENSG00000175899,,,,,,,,,,,...,,ENSG00000175899,Gene Expression,GRCh38,3.0,,,,,A2M
ENSG00000245105,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,...,20.0,ENSG00000245105,Gene Expression,GRCh38,5.0,ENSG00000245105,Gene Expression,GRCh38,9.0,A2M-AS1
ENSG00000166535,ENSG00000166535,Gene Expression,GRCh38,53.0,ENSG00000166535,Gene Expression,GRCh38,49.0,ENSG00000166535,Gene Expression,...,99.0,ENSG00000166535,Gene Expression,GRCh38,40.0,ENSG00000166535,Gene Expression,GRCh38,51.0,A2ML1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000070476,ENSG00000070476,Gene Expression,GRCh38,466.0,ENSG00000070476,Gene Expression,GRCh38,630.0,ENSG00000070476,Gene Expression,...,446.0,ENSG00000070476,Gene Expression,GRCh38,206.0,ENSG00000070476,Gene Expression,GRCh38,269.0,ZXDC
ENSG00000203995,ENSG00000203995,Gene Expression,GRCh38,99.0,ENSG00000203995,Gene Expression,GRCh38,156.0,ENSG00000203995,Gene Expression,...,121.0,ENSG00000203995,Gene Expression,GRCh38,44.0,ENSG00000203995,Gene Expression,GRCh38,74.0,ZYG11A
ENSG00000162378,ENSG00000162378,Gene Expression,GRCh38,1648.0,ENSG00000162378,Gene Expression,GRCh38,2233.0,ENSG00000162378,Gene Expression,...,2235.0,ENSG00000162378,Gene Expression,GRCh38,1442.0,ENSG00000162378,Gene Expression,GRCh38,1862.0,ZYG11B
ENSG00000159840,ENSG00000159840,Gene Expression,GRCh38,2152.0,ENSG00000159840,Gene Expression,GRCh38,3590.0,ENSG00000159840,Gene Expression,...,2223.0,ENSG00000159840,Gene Expression,GRCh38,2277.0,ENSG00000159840,Gene Expression,GRCh38,3658.0,ZYX


## obs (Cell metadata)

In [46]:
#view obs

In [47]:
adata.obs

Unnamed: 0,n_genes,time_point,media,sample,percent_mito,barcode,donor,leiden,S_score,G2M_score,phase,cell_annotation
AAACCCAGTGCACGCT-1,2116,3H,EVTM,6044STDY8640561,0.005407,AAACCCAGTGCACGCT-1,2,6,-0.276754,-0.224035,G1,EVT-2 early
AAACCCAGTTGGCCTG-1,2672,3H,EVTM,6044STDY8640561,0.111556,AAACCCAGTTGGCCTG-1,2,0,0.246288,-0.224035,S,VCT-p
AAACGAACAAGTCGTT-1,2244,3H,EVTM,6044STDY8640561,0.074968,AAACGAACAAGTCGTT-1,2,6,-0.217102,-0.362766,G1,EVT-2 early
AAACGAAGTGCATTAC-1,4490,3H,EVTM,6044STDY8640561,0.078677,AAACGAAGTGCATTAC-1,3,8,-1.352279,-1.537638,G1,iEVT
AAACGCTGTTGACTAC-1,3265,3H,EVTM,6044STDY8640561,0.093415,AAACGCTGTTGACTAC-1,2,8,-0.618792,-1.059937,G1,iEVT
...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACTTGAGT-1,2237,96H,TOM,Pla_Camb10123934,0.077096,TTTGTTGCACTTGAGT-1,4,5,-0.369944,-0.516462,G1,VCT
TTTGTTGGTACGATTC-1,4295,96H,TOM,Pla_Camb10123934,0.096090,TTTGTTGGTACGATTC-1,4,2,-0.343062,-1.141500,G1,VCT
TTTGTTGGTATCGGTT-1,3899,96H,TOM,Pla_Camb10123934,0.098137,TTTGTTGGTATCGGTT-1,6,5,-0.462622,-0.716627,G1,VCT
TTTGTTGGTTGTGGCC-1,3576,96H,TOM,Pla_Camb10123934,0.086538,TTTGTTGGTTGTGGCC-1,4,7,-0.544803,-0.977851,G1,VCT-fusing


#### assay_ontology_term_id

In [48]:
adata.obs['assay_ontology_term_id'] = ['EFO:0030080']* len(adata.obs)

In [49]:
list(adata.obs['sample'].unique())

['6044STDY8640561',
 '6044STDY8640563',
 '6044STDY8640565',
 'Pla_Camb10123928',
 'Pla_Camb10123929',
 'Pla_Camb10123930',
 'Pla_Camb10123932',
 'Pla_Camb10123934']

In [50]:
adata.obs

Unnamed: 0,n_genes,time_point,media,sample,percent_mito,barcode,donor,leiden,S_score,G2M_score,phase,cell_annotation,assay_ontology_term_id
AAACCCAGTGCACGCT-1,2116,3H,EVTM,6044STDY8640561,0.005407,AAACCCAGTGCACGCT-1,2,6,-0.276754,-0.224035,G1,EVT-2 early,EFO:0030080
AAACCCAGTTGGCCTG-1,2672,3H,EVTM,6044STDY8640561,0.111556,AAACCCAGTTGGCCTG-1,2,0,0.246288,-0.224035,S,VCT-p,EFO:0030080
AAACGAACAAGTCGTT-1,2244,3H,EVTM,6044STDY8640561,0.074968,AAACGAACAAGTCGTT-1,2,6,-0.217102,-0.362766,G1,EVT-2 early,EFO:0030080
AAACGAAGTGCATTAC-1,4490,3H,EVTM,6044STDY8640561,0.078677,AAACGAAGTGCATTAC-1,3,8,-1.352279,-1.537638,G1,iEVT,EFO:0030080
AAACGCTGTTGACTAC-1,3265,3H,EVTM,6044STDY8640561,0.093415,AAACGCTGTTGACTAC-1,2,8,-0.618792,-1.059937,G1,iEVT,EFO:0030080
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACTTGAGT-1,2237,96H,TOM,Pla_Camb10123934,0.077096,TTTGTTGCACTTGAGT-1,4,5,-0.369944,-0.516462,G1,VCT,EFO:0030080
TTTGTTGGTACGATTC-1,4295,96H,TOM,Pla_Camb10123934,0.096090,TTTGTTGGTACGATTC-1,4,2,-0.343062,-1.141500,G1,VCT,EFO:0030080
TTTGTTGGTATCGGTT-1,3899,96H,TOM,Pla_Camb10123934,0.098137,TTTGTTGGTATCGGTT-1,6,5,-0.462622,-0.716627,G1,VCT,EFO:0030080
TTTGTTGGTTGTGGCC-1,3576,96H,TOM,Pla_Camb10123934,0.086538,TTTGTTGGTTGTGGCC-1,4,7,-0.544803,-0.977851,G1,VCT-fusing,EFO:0030080


#### cell_type_ontology_term_id

In [51]:
#get the column in adata.obs related. to cell type annotation

In [52]:
list(adata.obs['cell_annotation'].unique())

['EVT-2 early',
 'VCT-p',
 'iEVT',
 'EVT-2',
 'VCT-CCC',
 'VCT-fusing',
 'EVT-1',
 'VCT',
 'preSCT']

In [53]:
cl_tropho = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/cl_tropho.csv')

In [54]:
mapping = dict(zip(cl_tropho['cell_type'], cl_tropho['CL ID']))

In [55]:
mapping

{'dNK1': 'CL:0002343',
 'dT_cells': 'CL:0000084',
 'T_cells': 'CL:0000084',
 'dNK2': 'CL:0002343',
 'ILC3': 'CL:0001078',
 'dNK3': 'CL:0002343',
 'dT_regs': 'CL:0000815',
 'dM1': 'CL:0000235',
 'HOFB': 'CL:3000001',
 'B_cells': 'CL:0000236',
 'dDC': 'CL:0000451',
 'dM2': 'CL:0000235',
 'dS2': 'CL:0000499',
 'M3': 'CL:0000235',
 'VCT': 'CL:2000060',
 'NK': 'CL:0000623',
 'Granulocytes': 'CL:0000094',
 'uSMC': 'CL:0002601',
 'Endo_F': 'CL:0009092',
 'DC': 'CL:0000451',
 'dEpi_secretory': 'CL:0000066',
 'MO': 'CL:0000235',
 'EVT_2': 'CL:0008036',
 'Endo_M': 'CL:0009095',
 'SCT': 'CL:0000525',
 'dS1': 'CL:0000499',
 'dS3': 'CL:0000499',
 'dEpi_lumenal': 'CL:0000066',
 'iEVT': 'CL:0008036',
 'Endo_L': 'CL:0002138',
 'PV MMP11': 'CL:0000003',
 'PVMMP11': 'CL:0000003',
 'PVSTEAP4': 'CL:0000003',
 'PV STEAP4': 'CL:0000003',
 'EVT_1': 'CL:0008036',
 'PVAOC3': 'CL:0000003',
 'PV AOC3': 'CL:0000003',
 'fF1': 'CL:2000042',
 'VCT_CCC': 'CL:2000060',
 'fF2': 'CL:2000042',
 'VCT_p': 'CL:2000060',
 'P

In [56]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_annotation'].map(mapping)

In [57]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [58]:
adata.obs

Unnamed: 0,n_genes,time_point,media,sample,percent_mito,barcode,donor,leiden,S_score,G2M_score,phase,cell_annotation,assay_ontology_term_id,cell_type_ontology_term_id
AAACCCAGTGCACGCT-1,2116,3H,EVTM,6044STDY8640561,0.005407,AAACCCAGTGCACGCT-1,2,6,-0.276754,-0.224035,G1,EVT-2 early,EFO:0030080,CL:0008036
AAACCCAGTTGGCCTG-1,2672,3H,EVTM,6044STDY8640561,0.111556,AAACCCAGTTGGCCTG-1,2,0,0.246288,-0.224035,S,VCT-p,EFO:0030080,CL:2000060
AAACGAACAAGTCGTT-1,2244,3H,EVTM,6044STDY8640561,0.074968,AAACGAACAAGTCGTT-1,2,6,-0.217102,-0.362766,G1,EVT-2 early,EFO:0030080,CL:0008036
AAACGAAGTGCATTAC-1,4490,3H,EVTM,6044STDY8640561,0.078677,AAACGAAGTGCATTAC-1,3,8,-1.352279,-1.537638,G1,iEVT,EFO:0030080,CL:0008036
AAACGCTGTTGACTAC-1,3265,3H,EVTM,6044STDY8640561,0.093415,AAACGCTGTTGACTAC-1,2,8,-0.618792,-1.059937,G1,iEVT,EFO:0030080,CL:0008036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACTTGAGT-1,2237,96H,TOM,Pla_Camb10123934,0.077096,TTTGTTGCACTTGAGT-1,4,5,-0.369944,-0.516462,G1,VCT,EFO:0030080,CL:2000060
TTTGTTGGTACGATTC-1,4295,96H,TOM,Pla_Camb10123934,0.096090,TTTGTTGGTACGATTC-1,4,2,-0.343062,-1.141500,G1,VCT,EFO:0030080,CL:2000060
TTTGTTGGTATCGGTT-1,3899,96H,TOM,Pla_Camb10123934,0.098137,TTTGTTGGTATCGGTT-1,6,5,-0.462622,-0.716627,G1,VCT,EFO:0030080,CL:2000060
TTTGTTGGTTGTGGCC-1,3576,96H,TOM,Pla_Camb10123934,0.086538,TTTGTTGGTTGTGGCC-1,4,7,-0.544803,-0.977851,G1,VCT-fusing,EFO:0030080,CL:2000060


In [59]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['CL:0008036', 'CL:2000060', 'CL:0000525']

## development_stage_ontology_term_id

In [60]:
adata.obs['development_stage_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [61]:
adata.obs

Unnamed: 0,n_genes,time_point,media,sample,percent_mito,barcode,donor,leiden,S_score,G2M_score,phase,cell_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
AAACCCAGTGCACGCT-1,2116,3H,EVTM,6044STDY8640561,0.005407,AAACCCAGTGCACGCT-1,2,6,-0.276754,-0.224035,G1,EVT-2 early,EFO:0030080,CL:0008036,unknown
AAACCCAGTTGGCCTG-1,2672,3H,EVTM,6044STDY8640561,0.111556,AAACCCAGTTGGCCTG-1,2,0,0.246288,-0.224035,S,VCT-p,EFO:0030080,CL:2000060,unknown
AAACGAACAAGTCGTT-1,2244,3H,EVTM,6044STDY8640561,0.074968,AAACGAACAAGTCGTT-1,2,6,-0.217102,-0.362766,G1,EVT-2 early,EFO:0030080,CL:0008036,unknown
AAACGAAGTGCATTAC-1,4490,3H,EVTM,6044STDY8640561,0.078677,AAACGAAGTGCATTAC-1,3,8,-1.352279,-1.537638,G1,iEVT,EFO:0030080,CL:0008036,unknown
AAACGCTGTTGACTAC-1,3265,3H,EVTM,6044STDY8640561,0.093415,AAACGCTGTTGACTAC-1,2,8,-0.618792,-1.059937,G1,iEVT,EFO:0030080,CL:0008036,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACTTGAGT-1,2237,96H,TOM,Pla_Camb10123934,0.077096,TTTGTTGCACTTGAGT-1,4,5,-0.369944,-0.516462,G1,VCT,EFO:0030080,CL:2000060,unknown
TTTGTTGGTACGATTC-1,4295,96H,TOM,Pla_Camb10123934,0.096090,TTTGTTGGTACGATTC-1,4,2,-0.343062,-1.141500,G1,VCT,EFO:0030080,CL:2000060,unknown
TTTGTTGGTATCGGTT-1,3899,96H,TOM,Pla_Camb10123934,0.098137,TTTGTTGGTATCGGTT-1,6,5,-0.462622,-0.716627,G1,VCT,EFO:0030080,CL:2000060,unknown
TTTGTTGGTTGTGGCC-1,3576,96H,TOM,Pla_Camb10123934,0.086538,TTTGTTGGTTGTGGCC-1,4,7,-0.544803,-0.977851,G1,VCT-fusing,EFO:0030080,CL:2000060,unknown


#### disease_ontology_term_id

In [62]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461']* len(adata.obs)

In [63]:
adata.obs

Unnamed: 0,n_genes,time_point,media,sample,percent_mito,barcode,donor,leiden,S_score,G2M_score,phase,cell_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
AAACCCAGTGCACGCT-1,2116,3H,EVTM,6044STDY8640561,0.005407,AAACCCAGTGCACGCT-1,2,6,-0.276754,-0.224035,G1,EVT-2 early,EFO:0030080,CL:0008036,unknown,PATO:0000461
AAACCCAGTTGGCCTG-1,2672,3H,EVTM,6044STDY8640561,0.111556,AAACCCAGTTGGCCTG-1,2,0,0.246288,-0.224035,S,VCT-p,EFO:0030080,CL:2000060,unknown,PATO:0000461
AAACGAACAAGTCGTT-1,2244,3H,EVTM,6044STDY8640561,0.074968,AAACGAACAAGTCGTT-1,2,6,-0.217102,-0.362766,G1,EVT-2 early,EFO:0030080,CL:0008036,unknown,PATO:0000461
AAACGAAGTGCATTAC-1,4490,3H,EVTM,6044STDY8640561,0.078677,AAACGAAGTGCATTAC-1,3,8,-1.352279,-1.537638,G1,iEVT,EFO:0030080,CL:0008036,unknown,PATO:0000461
AAACGCTGTTGACTAC-1,3265,3H,EVTM,6044STDY8640561,0.093415,AAACGCTGTTGACTAC-1,2,8,-0.618792,-1.059937,G1,iEVT,EFO:0030080,CL:0008036,unknown,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACTTGAGT-1,2237,96H,TOM,Pla_Camb10123934,0.077096,TTTGTTGCACTTGAGT-1,4,5,-0.369944,-0.516462,G1,VCT,EFO:0030080,CL:2000060,unknown,PATO:0000461
TTTGTTGGTACGATTC-1,4295,96H,TOM,Pla_Camb10123934,0.096090,TTTGTTGGTACGATTC-1,4,2,-0.343062,-1.141500,G1,VCT,EFO:0030080,CL:2000060,unknown,PATO:0000461
TTTGTTGGTATCGGTT-1,3899,96H,TOM,Pla_Camb10123934,0.098137,TTTGTTGGTATCGGTT-1,6,5,-0.462622,-0.716627,G1,VCT,EFO:0030080,CL:2000060,unknown,PATO:0000461
TTTGTTGGTTGTGGCC-1,3576,96H,TOM,Pla_Camb10123934,0.086538,TTTGTTGGTTGTGGCC-1,4,7,-0.544803,-0.977851,G1,VCT-fusing,EFO:0030080,CL:2000060,unknown,PATO:0000461


#### donor_id

In [64]:
adata.obs['donor_id'] = adata.obs['donor']

In [65]:
adata.obs

Unnamed: 0,n_genes,time_point,media,sample,percent_mito,barcode,donor,leiden,S_score,G2M_score,phase,cell_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,donor_id
AAACCCAGTGCACGCT-1,2116,3H,EVTM,6044STDY8640561,0.005407,AAACCCAGTGCACGCT-1,2,6,-0.276754,-0.224035,G1,EVT-2 early,EFO:0030080,CL:0008036,unknown,PATO:0000461,2
AAACCCAGTTGGCCTG-1,2672,3H,EVTM,6044STDY8640561,0.111556,AAACCCAGTTGGCCTG-1,2,0,0.246288,-0.224035,S,VCT-p,EFO:0030080,CL:2000060,unknown,PATO:0000461,2
AAACGAACAAGTCGTT-1,2244,3H,EVTM,6044STDY8640561,0.074968,AAACGAACAAGTCGTT-1,2,6,-0.217102,-0.362766,G1,EVT-2 early,EFO:0030080,CL:0008036,unknown,PATO:0000461,2
AAACGAAGTGCATTAC-1,4490,3H,EVTM,6044STDY8640561,0.078677,AAACGAAGTGCATTAC-1,3,8,-1.352279,-1.537638,G1,iEVT,EFO:0030080,CL:0008036,unknown,PATO:0000461,3
AAACGCTGTTGACTAC-1,3265,3H,EVTM,6044STDY8640561,0.093415,AAACGCTGTTGACTAC-1,2,8,-0.618792,-1.059937,G1,iEVT,EFO:0030080,CL:0008036,unknown,PATO:0000461,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACTTGAGT-1,2237,96H,TOM,Pla_Camb10123934,0.077096,TTTGTTGCACTTGAGT-1,4,5,-0.369944,-0.516462,G1,VCT,EFO:0030080,CL:2000060,unknown,PATO:0000461,4
TTTGTTGGTACGATTC-1,4295,96H,TOM,Pla_Camb10123934,0.096090,TTTGTTGGTACGATTC-1,4,2,-0.343062,-1.141500,G1,VCT,EFO:0030080,CL:2000060,unknown,PATO:0000461,4
TTTGTTGGTATCGGTT-1,3899,96H,TOM,Pla_Camb10123934,0.098137,TTTGTTGGTATCGGTT-1,6,5,-0.462622,-0.716627,G1,VCT,EFO:0030080,CL:2000060,unknown,PATO:0000461,6
TTTGTTGGTTGTGGCC-1,3576,96H,TOM,Pla_Camb10123934,0.086538,TTTGTTGGTTGTGGCC-1,4,7,-0.544803,-0.977851,G1,VCT-fusing,EFO:0030080,CL:2000060,unknown,PATO:0000461,4


In [66]:
list(adata.obs['donor_id'].unique())

['2', '3', '1', '4', '5', '6']

#### is_primary_data

In [67]:
adata.obs['is_primary_data'] = ['True'] * len(adata.obs)

In [68]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

#### organism_ontology_term_id

In [69]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [70]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [71]:
adata.obs['organism_ontology_term_id']

AAACCCAGTGCACGCT-1    NCBITaxon:9606
AAACCCAGTTGGCCTG-1    NCBITaxon:9606
AAACGAACAAGTCGTT-1    NCBITaxon:9606
AAACGAAGTGCATTAC-1    NCBITaxon:9606
AAACGCTGTTGACTAC-1    NCBITaxon:9606
                           ...      
TTTGTTGCACTTGAGT-1    NCBITaxon:9606
TTTGTTGGTACGATTC-1    NCBITaxon:9606
TTTGTTGGTATCGGTT-1    NCBITaxon:9606
TTTGTTGGTTGTGGCC-1    NCBITaxon:9606
TTTGTTGTCGCATTAG-1    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 26853, dtype: category
Categories (1, object): ['NCBITaxon:9606']

#### self_reported_ethnicity_ontology_term_id

In [72]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [73]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [74]:
adata.obs['self_reported_ethnicity_ontology_term_id']

AAACCCAGTGCACGCT-1    unknown
AAACCCAGTTGGCCTG-1    unknown
AAACGAACAAGTCGTT-1    unknown
AAACGAAGTGCATTAC-1    unknown
AAACGCTGTTGACTAC-1    unknown
                       ...   
TTTGTTGCACTTGAGT-1    unknown
TTTGTTGGTACGATTC-1    unknown
TTTGTTGGTATCGGTT-1    unknown
TTTGTTGGTTGTGGCC-1    unknown
TTTGTTGTCGCATTAG-1    unknown
Name: self_reported_ethnicity_ontology_term_id, Length: 26853, dtype: category
Categories (1, object): ['unknown']

In [75]:
adata.obs.columns

Index(['n_genes', 'time_point', 'media', 'sample', 'percent_mito', 'barcode',
       'donor', 'leiden', 'S_score', 'G2M_score', 'phase', 'cell_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'donor_id', 'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

#### sex_ontology_term_id

In [76]:
adata.obs['sex_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [77]:
adata.obs

Unnamed: 0,n_genes,time_point,media,sample,percent_mito,barcode,donor,leiden,S_score,G2M_score,...,cell_annotation,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCCAGTGCACGCT-1,2116,3H,EVTM,6044STDY8640561,0.005407,AAACCCAGTGCACGCT-1,2,6,-0.276754,-0.224035,...,EVT-2 early,EFO:0030080,CL:0008036,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown
AAACCCAGTTGGCCTG-1,2672,3H,EVTM,6044STDY8640561,0.111556,AAACCCAGTTGGCCTG-1,2,0,0.246288,-0.224035,...,VCT-p,EFO:0030080,CL:2000060,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown
AAACGAACAAGTCGTT-1,2244,3H,EVTM,6044STDY8640561,0.074968,AAACGAACAAGTCGTT-1,2,6,-0.217102,-0.362766,...,EVT-2 early,EFO:0030080,CL:0008036,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown
AAACGAAGTGCATTAC-1,4490,3H,EVTM,6044STDY8640561,0.078677,AAACGAAGTGCATTAC-1,3,8,-1.352279,-1.537638,...,iEVT,EFO:0030080,CL:0008036,unknown,PATO:0000461,3,True,NCBITaxon:9606,unknown,unknown
AAACGCTGTTGACTAC-1,3265,3H,EVTM,6044STDY8640561,0.093415,AAACGCTGTTGACTAC-1,2,8,-0.618792,-1.059937,...,iEVT,EFO:0030080,CL:0008036,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACTTGAGT-1,2237,96H,TOM,Pla_Camb10123934,0.077096,TTTGTTGCACTTGAGT-1,4,5,-0.369944,-0.516462,...,VCT,EFO:0030080,CL:2000060,unknown,PATO:0000461,4,True,NCBITaxon:9606,unknown,unknown
TTTGTTGGTACGATTC-1,4295,96H,TOM,Pla_Camb10123934,0.096090,TTTGTTGGTACGATTC-1,4,2,-0.343062,-1.141500,...,VCT,EFO:0030080,CL:2000060,unknown,PATO:0000461,4,True,NCBITaxon:9606,unknown,unknown
TTTGTTGGTATCGGTT-1,3899,96H,TOM,Pla_Camb10123934,0.098137,TTTGTTGGTATCGGTT-1,6,5,-0.462622,-0.716627,...,VCT,EFO:0030080,CL:2000060,unknown,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown
TTTGTTGGTTGTGGCC-1,3576,96H,TOM,Pla_Camb10123934,0.086538,TTTGTTGGTTGTGGCC-1,4,7,-0.544803,-0.977851,...,VCT-fusing,EFO:0030080,CL:2000060,unknown,PATO:0000461,4,True,NCBITaxon:9606,unknown,unknown


### suspension_type

In [78]:
suspension = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/trophoblast/suspension.csv')

In [79]:
mapping = dict(zip(suspension['sample'], suspension['suspension']))

In [80]:
adata.obs['suspension_type'] = adata.obs['sample'].map(mapping)

In [81]:
adata.obs.columns

Index(['n_genes', 'time_point', 'media', 'sample', 'percent_mito', 'barcode',
       'donor', 'leiden', 'S_score', 'G2M_score', 'phase', 'cell_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'donor_id', 'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type'],
      dtype='object')

In [82]:
list(adata.obs['suspension_type'].unique())

['cell']

#### tissue_ontology_term_id

In [83]:
adata.obs['tissue_ontology_term_id'] =['UBERON:0000088'] * len(adata.obs)

In [84]:
adata.obs.columns

Index(['n_genes', 'time_point', 'media', 'sample', 'percent_mito', 'barcode',
       'donor', 'leiden', 'S_score', 'G2M_score', 'phase', 'cell_annotation',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'donor_id', 'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

In [85]:
adata.obs

Unnamed: 0,n_genes,time_point,media,sample,percent_mito,barcode,donor,leiden,S_score,G2M_score,...,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
AAACCCAGTGCACGCT-1,2116,3H,EVTM,6044STDY8640561,0.005407,AAACCCAGTGCACGCT-1,2,6,-0.276754,-0.224035,...,CL:0008036,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
AAACCCAGTTGGCCTG-1,2672,3H,EVTM,6044STDY8640561,0.111556,AAACCCAGTTGGCCTG-1,2,0,0.246288,-0.224035,...,CL:2000060,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
AAACGAACAAGTCGTT-1,2244,3H,EVTM,6044STDY8640561,0.074968,AAACGAACAAGTCGTT-1,2,6,-0.217102,-0.362766,...,CL:0008036,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
AAACGAAGTGCATTAC-1,4490,3H,EVTM,6044STDY8640561,0.078677,AAACGAAGTGCATTAC-1,3,8,-1.352279,-1.537638,...,CL:0008036,unknown,PATO:0000461,3,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
AAACGCTGTTGACTAC-1,3265,3H,EVTM,6044STDY8640561,0.093415,AAACGCTGTTGACTAC-1,2,8,-0.618792,-1.059937,...,CL:0008036,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACTTGAGT-1,2237,96H,TOM,Pla_Camb10123934,0.077096,TTTGTTGCACTTGAGT-1,4,5,-0.369944,-0.516462,...,CL:2000060,unknown,PATO:0000461,4,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
TTTGTTGGTACGATTC-1,4295,96H,TOM,Pla_Camb10123934,0.096090,TTTGTTGGTACGATTC-1,4,2,-0.343062,-1.141500,...,CL:2000060,unknown,PATO:0000461,4,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
TTTGTTGGTATCGGTT-1,3899,96H,TOM,Pla_Camb10123934,0.098137,TTTGTTGGTATCGGTT-1,6,5,-0.462622,-0.716627,...,CL:2000060,unknown,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
TTTGTTGGTTGTGGCC-1,3576,96H,TOM,Pla_Camb10123934,0.086538,TTTGTTGGTTGTGGCC-1,4,7,-0.544803,-0.977851,...,CL:2000060,unknown,PATO:0000461,4,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088


### obsm (Embeddings)

In [86]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

#### uns (Dataset Metadata)

In [87]:
adata.uns

OverloadedDict, wrapping:
	OrderedDict()
With overloaded keys:
	['neighbors'].

In [88]:
adata.uns['schema_version'] = '3.0.0'

In [89]:
adata.uns['title'] = 'primary_trophoblast_organoid'

In [90]:
adata.uns['default_embedding'] = 'X_umap'

### Final checks and adjustments

In [91]:
adata

AnnData object with n_obs × n_vars = 26853 × 23133
    obs: 'n_genes', 'time_point', 'media', 'sample', 'percent_mito', 'barcode', 'donor', 'leiden', 'S_score', 'G2M_score', 'phase', 'cell_annotation', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 

In [92]:
adata.obs.dtypes

n_genes                                        int64
time_point                                  category
media                                       category
sample                                      category
percent_mito                                 float32
barcode                                       object
donor                                       category
leiden                                      category
S_score                                      float64
G2M_score                                    float64
phase                                       category
cell_annotation                             category
assay_ontology_term_id                        object
cell_type_ontology_term_id                  category
development_stage_ontology_term_id            object
disease_ontology_term_id                      object
donor_id                                    category
is_primary_data                                 bool
organism_ontology_term_id                   ca

In [93]:
adata.obs

Unnamed: 0,n_genes,time_point,media,sample,percent_mito,barcode,donor,leiden,S_score,G2M_score,...,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
AAACCCAGTGCACGCT-1,2116,3H,EVTM,6044STDY8640561,0.005407,AAACCCAGTGCACGCT-1,2,6,-0.276754,-0.224035,...,CL:0008036,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
AAACCCAGTTGGCCTG-1,2672,3H,EVTM,6044STDY8640561,0.111556,AAACCCAGTTGGCCTG-1,2,0,0.246288,-0.224035,...,CL:2000060,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
AAACGAACAAGTCGTT-1,2244,3H,EVTM,6044STDY8640561,0.074968,AAACGAACAAGTCGTT-1,2,6,-0.217102,-0.362766,...,CL:0008036,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
AAACGAAGTGCATTAC-1,4490,3H,EVTM,6044STDY8640561,0.078677,AAACGAAGTGCATTAC-1,3,8,-1.352279,-1.537638,...,CL:0008036,unknown,PATO:0000461,3,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
AAACGCTGTTGACTAC-1,3265,3H,EVTM,6044STDY8640561,0.093415,AAACGCTGTTGACTAC-1,2,8,-0.618792,-1.059937,...,CL:0008036,unknown,PATO:0000461,2,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACTTGAGT-1,2237,96H,TOM,Pla_Camb10123934,0.077096,TTTGTTGCACTTGAGT-1,4,5,-0.369944,-0.516462,...,CL:2000060,unknown,PATO:0000461,4,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
TTTGTTGGTACGATTC-1,4295,96H,TOM,Pla_Camb10123934,0.096090,TTTGTTGGTACGATTC-1,4,2,-0.343062,-1.141500,...,CL:2000060,unknown,PATO:0000461,4,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
TTTGTTGGTATCGGTT-1,3899,96H,TOM,Pla_Camb10123934,0.098137,TTTGTTGGTATCGGTT-1,6,5,-0.462622,-0.716627,...,CL:2000060,unknown,PATO:0000461,6,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088
TTTGTTGGTTGTGGCC-1,3576,96H,TOM,Pla_Camb10123934,0.086538,TTTGTTGGTTGTGGCC-1,4,7,-0.544803,-0.977851,...,CL:2000060,unknown,PATO:0000461,4,True,NCBITaxon:9606,unknown,unknown,cell,UBERON:0000088


In [94]:
#check the format of expression matrix

In [95]:
adata.X

<26853x23133 sparse matrix of type '<class 'numpy.float32'>'
	with 92916252 stored elements in Compressed Sparse Row format>

In [96]:
# Convert expression matrix to sparse matrix format

In [97]:
adata.X = scipy.sparse.csr_matrix(adata.X)

In [98]:
adata.X

<26853x23133 sparse matrix of type '<class 'numpy.float32'>'
	with 92916252 stored elements in Compressed Sparse Row format>

In [99]:
adata.raw = araw

In [100]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/trophoblast/final_objects/primary_trophoblast_organoid.h5ad', compression = 'gzip')