# **Curating Fetal and Pediatric Cell Atlas datasets:**

### Fetal dataset

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

Do not forget to run `mount-farm` before loading the data.

In [2]:
# Processed fetal object; the repr shown below reports 26757 genes plus PCA/UMAP embeddings.
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/gutcellatlas/final_fetal_object_cellxgene.h5ad')

In [3]:
# Companion object read from the RAWCOUNTS file (labelled "raw" further down): same cells, 33694 genes.
adata2 = sc.read('/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/gutcellatlas/fetal_RAWCOUNTS_cellxgene.h5ad')

In [4]:
adata2

AnnData object with n_obs × n_vars = 62849 × 33694
    obs: 'CRL', 'Enrichment_fraction', 'PCW', 'Donor_nb', 'Donor_id', 'Purification', 'Organ', 'Sample', 'doublet_scores_observed_cells', 'n_genes', 'percent_mito', 'n_counts', 'cell_type_group', 'cell_name', 'cell_name_detailed'
    var: 'gene_ids'

In [5]:
adata

AnnData object with n_obs × n_vars = 62849 × 26757
    obs: 'CRL', 'Enrichment_fraction', 'PCW', 'Donor_nb', 'Donor_id', 'Purification', 'Organ', 'Sample', 'doublet_scores_observed_cells', 'n_genes', 'percent_mito', 'n_counts', 'cell_type_group', 'cell_name', 'cell_name_detailed'
    var: 'gene_ids', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'cell_name_detailed_colors'
    obsm: 'X_pca', 'X_umap'

In [6]:
adata

AnnData object with n_obs × n_vars = 62849 × 26757
    obs: 'CRL', 'Enrichment_fraction', 'PCW', 'Donor_nb', 'Donor_id', 'Purification', 'Organ', 'Sample', 'doublet_scores_observed_cells', 'n_genes', 'percent_mito', 'n_counts', 'cell_type_group', 'cell_name', 'cell_name_detailed'
    var: 'gene_ids', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'cell_name_detailed_colors'
    obsm: 'X_pca', 'X_umap'

## 2. Prepare the data before schema adjustments

### Load raw matrix

In [7]:
adata

AnnData object with n_obs × n_vars = 62849 × 26757
    obs: 'CRL', 'Enrichment_fraction', 'PCW', 'Donor_nb', 'Donor_id', 'Purification', 'Organ', 'Sample', 'doublet_scores_observed_cells', 'n_genes', 'percent_mito', 'n_counts', 'cell_type_group', 'cell_name', 'cell_name_detailed'
    var: 'gene_ids', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'cell_name_detailed_colors'
    obsm: 'X_pca', 'X_umap'

In [8]:
adata2 # raw

AnnData object with n_obs × n_vars = 62849 × 33694
    obs: 'CRL', 'Enrichment_fraction', 'PCW', 'Donor_nb', 'Donor_id', 'Purification', 'Organ', 'Sample', 'doublet_scores_observed_cells', 'n_genes', 'percent_mito', 'n_counts', 'cell_type_group', 'cell_name', 'cell_name_detailed'
    var: 'gene_ids'

### Remove the genes whose Ensembl IDs are not found in the gene–Ensembl ID reference file

In [9]:
# Reference table of Ensembl IDs (the preview below shows both ENSG and ENST identifiers).
# The file has no header row, so the column names are supplied here.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the matching
# DtypeWarning (presumably the truncated warning captured in this cell's output — TODO confirm).
geno = pd.read_csv('~/genes_homo_sapiens.csv', header = None, names = ['ENSEMBLid', 'gena','no'],
                   low_memory = False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [10]:
geno

Unnamed: 0,ENSEMBLid,gena,no
0,ENSG00000223972,DDX11L1,5
1,ENST00000456328,DDX11L1-202,2
2,ENST00000450305,DDX11L1-201,2
3,ENSG00000227232,WASH7P,5
4,ENST00000488147,WASH7P-201,1
...,...,...,...
297782,ENST00000621424,ENSG110105435.4-001,4
297783,ENSG00000278625,U6,1
297784,ENST00000616830,U6.88-201,1
297785,ENSG00000277374,U1,1


In [11]:
# Ensembl gene IDs of the processed object; the Series is indexed by the current var_names (gene symbols).
ens_ids = adata.var['gene_ids']

In [12]:
ens_ids

index
RP11-34P13.7    ENSG00000238009
RP11-34P13.8    ENSG00000239945
FO538757.3      ENSG00000279928
FO538757.2      ENSG00000279457
AP006222.2      ENSG00000228463
                     ...       
AC007325.4      ENSG00000278817
AC007325.2      ENSG00000277196
AL354822.1      ENSG00000278384
AC004556.1      ENSG00000276345
AC240274.1      ENSG00000271254
Name: gene_ids, Length: 26757, dtype: object

In [13]:
len(ens_ids)

26757

In [14]:
# Lookup table keyed by every Ensembl ID of the reference file; the value 1 is a
# placeholder, the dict is used below for membership tests.
genedict = dict.fromkeys(geno.ENSEMBLid, 1)

In [15]:
genedict

{'ENSG00000223972': 1,
 'ENST00000456328': 1,
 'ENST00000450305': 1,
 'ENSG00000227232': 1,
 'ENST00000488147': 1,
 'ENSG00000278267': 1,
 'ENST00000619216': 1,
 'ENSG00000243485': 1,
 'ENST00000473358': 1,
 'ENST00000469289': 1,
 'ENSG00000284332': 1,
 'ENST00000607096': 1,
 'ENSG00000237613': 1,
 'ENST00000417324': 1,
 'ENST00000461467': 1,
 'ENSG00000268020': 1,
 'ENST00000606857': 1,
 'ENSG00000240361': 1,
 'ENST00000642116': 1,
 'ENST00000492842': 1,
 'ENSG00000186092': 1,
 'ENST00000641515': 1,
 'ENSG00000238009': 1,
 'ENST00000466430': 1,
 'ENST00000477740': 1,
 'ENST00000471248': 1,
 'ENST00000610542': 1,
 'ENST00000453576': 1,
 'ENSG00000239945': 1,
 'ENST00000495576': 1,
 'ENSG00000233750': 1,
 'ENST00000442987': 1,
 'ENSG00000268903': 1,
 'ENST00000494149': 1,
 'ENSG00000269981': 1,
 'ENST00000595919': 1,
 'ENSG00000239906': 1,
 'ENST00000493797': 1,
 'ENSG00000241860': 1,
 'ENST00000484859': 1,
 'ENST00000490997': 1,
 'ENST00000466557': 1,
 'ENST00000662089': 1,
 'ENST00000

In [16]:
# Keep, in their original order, the gene IDs of `adata` that are present in the reference table.
ens_ids2 = list(filter(genedict.__contains__, ens_ids))

In [17]:
ens_ids2

['ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000279928',
 'ENSG00000279457',
 'ENSG00000228463',
 'ENSG00000237094',
 'ENSG00000230021',
 'ENSG00000229905',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000241180',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000237330',
 'ENSG00000131591',
 'ENSG00000223823',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',
 'ENSG00000169962',
 'ENSG00000107404',
 'ENSG00000162576',
 'ENSG00000175756',
 'ENSG00000221978',


In [18]:
len(ens_ids2)

26328

In [19]:
# Keep the current index (gene symbols) as a column, then promote the Ensembl IDs
# to be the index; pop() hands back the 'gene_ids' column and drops it from var.
adata.var['gene_symbols'] = adata.var_names
adata.var_names = adata.var.pop('gene_ids')

In [20]:
adata.var

Unnamed: 0_level_0,n_cells,highly_variable,means,dispersions,dispersions_norm,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000238009,163,False,1.201170e-03,-0.515088,-0.191493,RP11-34P13.7
ENSG00000239945,7,False,7.863549e-05,-0.088078,0.753295,RP11-34P13.8
ENSG00000279928,3,False,1.000000e-12,,,FO538757.3
ENSG00000279457,20175,False,2.108105e-01,-0.289174,0.308356,FO538757.2
ENSG00000228463,16981,False,1.145107e-01,-0.373800,0.121116,AP006222.2
...,...,...,...,...,...,...
ENSG00000278817,3967,False,5.811303e-02,-0.388853,0.087810,AC007325.4
ENSG00000277196,646,False,1.151530e-02,-0.473779,-0.100094,AC007325.2
ENSG00000278384,106,False,1.361509e-03,-0.592318,-0.362370,AL354822.1
ENSG00000276345,15487,False,2.281273e-01,-0.240747,0.415504,AC004556.1


In [21]:
# Restrict the processed object to the genes found in the reference table
# (26757 -> 26328 per the outputs above/below); .copy() turns the view into a real object.
adata = adata[:, ens_ids2].copy()

  res = method(*args, **kwargs)


In [22]:
# Same re-indexing as for `adata`: gene symbols become a column and the Ensembl
# IDs (removed from var by pop()) become the index of the raw-count object.
adata2.var['gene_symbols'] = adata2.var_names
adata2.var_names = adata2.var.pop('gene_ids')

In [23]:
# Apply the same gene selection (and order) to the raw-count object so that both
# objects end up with the same 26328 genes.
# NOTE(review): this relies on every ID in ens_ids2 (taken from `adata`) also
# existing in adata2 — confirm if the input files change.
adata2 = adata2[:, ens_ids2].copy()

In [24]:
adata

AnnData object with n_obs × n_vars = 62849 × 26328
    obs: 'CRL', 'Enrichment_fraction', 'PCW', 'Donor_nb', 'Donor_id', 'Purification', 'Organ', 'Sample', 'doublet_scores_observed_cells', 'n_genes', 'percent_mito', 'n_counts', 'cell_type_group', 'cell_name', 'cell_name_detailed'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols'
    uns: 'cell_name_detailed_colors'
    obsm: 'X_pca', 'X_umap'

In [25]:
adata2

AnnData object with n_obs × n_vars = 62849 × 26328
    obs: 'CRL', 'Enrichment_fraction', 'PCW', 'Donor_nb', 'Donor_id', 'Purification', 'Organ', 'Sample', 'doublet_scores_observed_cells', 'n_genes', 'percent_mito', 'n_counts', 'cell_type_group', 'cell_name', 'cell_name_detailed'
    var: 'gene_symbols'

In [26]:
# Every feature is annotated as a gene; the scalar is broadcast to all rows of var.
adata.var['feature_biotype'] = "gene"

In [27]:
# Same annotation for the raw-count object, which becomes adata.raw below.
adata2.var['feature_biotype'] = "gene"

In [28]:
# Attach the raw-count object as .raw of the processed object; both were
# subset to the same gene list above.
adata.raw = adata2

---

---

## 3. Apply the schema requirements:

# <ins>X</ins>

Both the raw and the normalized count matrices are in the object, so no additional adjustment is needed!

In [29]:
adata

AnnData object with n_obs × n_vars = 62849 × 26328
    obs: 'CRL', 'Enrichment_fraction', 'PCW', 'Donor_nb', 'Donor_id', 'Purification', 'Organ', 'Sample', 'doublet_scores_observed_cells', 'n_genes', 'percent_mito', 'n_counts', 'cell_type_group', 'cell_name', 'cell_name_detailed'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols', 'feature_biotype'
    uns: 'cell_name_detailed_colors'
    obsm: 'X_pca', 'X_umap'

# <ins>obs</ins>

In [33]:
adata.obs

Unnamed: 0_level_0,CRL,Enrichment_fraction,PCW,Donor_nb,Donor_id,Purification,Organ,Sample,doublet_scores_observed_cells,n_genes,percent_mito,n_counts,cell_type_group,cell_name,cell_name_detailed
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AAACCTGAGCTGCAAG-1-4918STDY7321513,17,pos,F6.1,1,2029,none,duojejunum,4918STDY7321513,0.025319,2241,0.046902,5842.0,mesenchymal,mesoderm 1,Mesoderm 1
AAACCTGCACTGAAGG-1-4918STDY7321513,17,pos,F6.1,1,2029,none,duojejunum,4918STDY7321513,0.029722,1563,0.037261,4294.0,mesenchymal,WNT4 FLC,WNT4+ fibroblasts
AAACCTGCATCCCACT-1-4918STDY7321513,17,pos,F6.1,1,2029,none,duojejunum,4918STDY7321513,0.061127,2562,0.028831,7145.0,mesenchymal,mesoderm 1,Mesoderm 1
AAACGGGAGAGTACAT-1-4918STDY7321513,17,pos,F6.1,1,2029,none,duojejunum,4918STDY7321513,0.048913,2297,0.026690,6257.0,mesenchymal,mesoderm 1,Mesoderm 1
AAACGGGAGAGTTGGC-1-4918STDY7321513,17,pos,F6.1,1,2029,none,duojejunum,4918STDY7321513,0.076267,3645,0.045447,16965.0,immune,immune cells,Immune cells
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTCCTCTCACAGGCC-1-4918STDY7718977,47,pos,F10,9,2134,MACS,colon,4918STDY7718977,0.066214,3244,0.008273,9670.0,mesenchymal,mesoderm 2,Mesoderm 2
TTTGCGCTCTCTGTCG-1-4918STDY7718977,47,pos,F10,9,2134,MACS,colon,4918STDY7718977,0.030959,2759,0.016800,10238.0,epithelium,Colonic Epi,Colonocytes
TTTGGTTCACTTGGAT-1-4918STDY7718977,47,pos,F10,9,2134,MACS,colon,4918STDY7718977,0.109415,3011,0.016479,9163.0,epithelium,Secretory Epi,Enteroendocrine
TTTGTCAAGAGCTGGT-1-4918STDY7718977,47,pos,F10,9,2134,MACS,colon,4918STDY7718977,0.045616,3113,0.016294,15527.0,epithelium,Colonic Epi,Colonocytes


## assay_ontology_term_id

10x 3' v2	"EFO:0009899"  
10x 3' v3	"EFO:0009922"  
10x 5' v1	"EFO:0011025"  
10x 5' v2	"EFO:0009900"  
Smart-seq	"EFO:0008930"  
Smart-seq2	"EFO:0008931"

> Single-cell suspension for each primary intestinal or organoid sample was loaded onto a separate channel of a Chromium 10x Genomics single cell 3’v2 library chip as per manufacturer's protocol

In [34]:
# All cells share the assay term listed for "10x 3' v2" in the table above; the scalar is broadcast to every row.
adata.obs['assay_ontology_term_id'] = 'EFO:0009899'

In [35]:
adata.obs['assay_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    EFO:0009899
AAACCTGCACTGAAGG-1-4918STDY7321513    EFO:0009899
AAACCTGCATCCCACT-1-4918STDY7321513    EFO:0009899
AAACGGGAGAGTACAT-1-4918STDY7321513    EFO:0009899
AAACGGGAGAGTTGGC-1-4918STDY7321513    EFO:0009899
                                         ...     
TTTCCTCTCACAGGCC-1-4918STDY7718977    EFO:0009899
TTTGCGCTCTCTGTCG-1-4918STDY7718977    EFO:0009899
TTTGGTTCACTTGGAT-1-4918STDY7718977    EFO:0009899
TTTGTCAAGAGCTGGT-1-4918STDY7718977    EFO:0009899
TTTGTCACATACCATG-1-4918STDY7718977    EFO:0009899
Name: assay_ontology_term_id, Length: 62849, dtype: object

## cell_type_ontology_term_id

In [37]:
# Seed the column with the author labels; a later cell replaces them with CL term IDs via `mapping`.
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_name_detailed']

In [38]:
adata.obs['cell_type_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513             Mesoderm 1
AAACCTGCACTGAAGG-1-4918STDY7321513      WNT4+ fibroblasts
AAACCTGCATCCCACT-1-4918STDY7321513             Mesoderm 1
AAACGGGAGAGTACAT-1-4918STDY7321513             Mesoderm 1
AAACGGGAGAGTTGGC-1-4918STDY7321513           Immune cells
                                             ...         
TTTCCTCTCACAGGCC-1-4918STDY7718977             Mesoderm 2
TTTGCGCTCTCTGTCG-1-4918STDY7718977            Colonocytes
TTTGGTTCACTTGGAT-1-4918STDY7718977        Enteroendocrine
TTTGTCAAGAGCTGGT-1-4918STDY7718977            Colonocytes
TTTGTCACATACCATG-1-4918STDY7718977    Smooth muscle cells
Name: cell_type_ontology_term_id, Length: 62849, dtype: category
Categories (28, object): ['Arterial endothelial cell', 'BEST4+ enterocyte', 'Enteric neurons', 'Erythroblasts', ..., 'Mesoderm 1', 'Mesoderm 2', 'Myofibroblast', 'Pericyte']

In [39]:
(list(adata.obs['cell_type_ontology_term_id'].unique()))

['Mesoderm 1',
 'WNT4+ fibroblasts',
 'Immune cells',
 'Smooth muscle cells',
 'Erythroblasts',
 'Mesoderm 2',
 'Neural crest cells',
 'Enteric neurons',
 'Uniform progenitors',
 'Serosa/mesothelial cells',
 'Pericyte',
 'Venous endothelial cell',
 'FOXL1+ fibroblasts',
 'Arterial endothelial cell',
 'Lymphatic endothelial cell',
 'Myofibroblast',
 'Enterocytes',
 'FRZB+ fibroblasts',
 'Colonocytes',
 'Interstitial cells of Cajal',
 'BEST4+ enterocyte',
 'Enteroendocrine',
 'Enterocyte',
 'G2M/S enterocytes',
 'Goblet',
 'Early enterocyte',
 'LGR5 stem',
 'NTS+ epithelial cells']

In [40]:
# Author label ('cell_name_detailed') -> Cell Ontology term ID.
# The mapping is many-to-one: several labels share a term, and the trailing
# "not ..." comments mark labels whose qualifier is not captured by the chosen term.
# NOTE: the name `mapping` is rebound by later cells (development stage, tissue),
# so the `.map(mapping)` cell that follows must be run before those cells.
mapping = {'Mesoderm 1': 'CL:0000222',
 'WNT4+ fibroblasts': 'CL:0000057',
 'Immune cells': 'CL:0000988', # not sure
 'Smooth muscle cells': 'CL:0002504', # not part1/capn3+
 'Erythroblasts': 'CL:0000765',
 'Mesoderm 2': 'CL:0000222',
 'Neural crest cells': 'CL:0011012',
 'Enteric neurons': 'CL:0007011',
 'Uniform progenitors': 'CL:0011026',
 'Serosa/mesothelial cells': 'CL:0000077',
 'Pericyte': 'CL:0000669',
 'Venous endothelial cell': 'CL:0002543',
 'FOXL1+ fibroblasts': 'CL:0000057',
 'Arterial endothelial cell': 'CL:1000413',
 'Lymphatic endothelial cell': 'CL:0002138',
 'Myofibroblast': 'CL:0000186',
 'Enterocytes': 'CL:0000584',
 'FRZB+ fibroblasts': 'CL:0000057',
 'Colonocytes': 'CL:0011108',
 'Interstitial cells of Cajal': 'CL:0002088',
 'BEST4+ enterocyte': 'CL:0000584', # not best4+
 'Enteroendocrine': 'CL:0000164',
 'Enterocyte': 'CL:0000584',
 'G2M/S enterocytes': 'CL:0000584', # not g2m/s
 'Goblet': 'CL:0019031',
 'Early enterocyte': 'CL:0000584', # not early
 'LGR5 stem': 'CL:0000034', # not lgr5
 'NTS+ epithelial cells': 'CL:0000066'}

In [41]:
# Derive the CL term IDs from the author labels rather than from the column that
# is being overwritten: re-running this cell then gives the same result instead
# of turning every value into NaN.
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_name_detailed'].map(mapping)
# map() yields NaN for labels that are not keys of `mapping` (e.g. when `mapping`
# has been rebound by a later cell); fail loudly instead of exporting gaps.
assert adata.obs['cell_type_ontology_term_id'].notna().all(), \
    "some 'cell_name_detailed' labels have no CL term in `mapping`"

In [42]:
adata.obs['cell_type_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    CL:0000222
AAACCTGCACTGAAGG-1-4918STDY7321513    CL:0000057
AAACCTGCATCCCACT-1-4918STDY7321513    CL:0000222
AAACGGGAGAGTACAT-1-4918STDY7321513    CL:0000222
AAACGGGAGAGTTGGC-1-4918STDY7321513    CL:0000988
                                         ...    
TTTCCTCTCACAGGCC-1-4918STDY7718977    CL:0000222
TTTGCGCTCTCTGTCG-1-4918STDY7718977    CL:0011108
TTTGGTTCACTTGGAT-1-4918STDY7718977    CL:0000164
TTTGTCAAGAGCTGGT-1-4918STDY7718977    CL:0011108
TTTGTCACATACCATG-1-4918STDY7718977    CL:0002504
Name: cell_type_ontology_term_id, Length: 62849, dtype: object

## development_stage_ontology_term_id

In [43]:
# Seed the column with the donor number; a later cell translates donors to HsapDv terms via `mapping`.
adata.obs['development_stage_ontology_term_id'] = adata.obs['Donor_nb']

In [44]:
# Treat the donor numbers as categorical labels before they are mapped to term IDs.
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [45]:
list(adata.obs['development_stage_ontology_term_id'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [46]:
# Donor_nb -> HsapDv development-stage term ID. The trailing comments give the
# donor's PCW label (the obs preview shows donor 1 <-> F6.1 and donor 9 <-> F10).
# NOTE(review): donors 2, 6, 8 and 9 (F8.4, F9.2, F9.9, F10) all share
# HsapDv:0000030, whereas donor 3 (F10.2) uses HsapDv:0000047 — confirm that the
# F9.x/F10 donors are not meant to receive later-stage terms.
# NOTE: this rebinds the name `mapping` used earlier for cell types.
mapping = {1: 'HsapDv:0000024', #F6.1
           2: 'HsapDv:0000030', #F8.4
           3: 'HsapDv:0000047', #F10.2
           4: 'HsapDv:0000025', #F6.7
           5: 'HsapDv:0000026', #F6.9
           6: 'HsapDv:0000030', #F9.2
           7: 'HsapDv:0000029', #F7.9
           8: 'HsapDv:0000030', #F9.9
           9: 'HsapDv:0000030'} #F10

In [47]:
# Translate donor numbers to HsapDv term IDs. Reading from 'Donor_nb' (not from
# the column being overwritten) keeps the cell re-runnable without producing NaN.
adata.obs['development_stage_ontology_term_id'] = adata.obs['Donor_nb'].map(mapping)
# map() yields NaN for donors that are not keys of `mapping`; fail loudly instead of exporting gaps.
assert adata.obs['development_stage_ontology_term_id'].notna().all(), \
    "some 'Donor_nb' values have no HsapDv term in `mapping`"

In [48]:
adata.obs['development_stage_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    HsapDv:0000024
AAACCTGCACTGAAGG-1-4918STDY7321513    HsapDv:0000024
AAACCTGCATCCCACT-1-4918STDY7321513    HsapDv:0000024
AAACGGGAGAGTACAT-1-4918STDY7321513    HsapDv:0000024
AAACGGGAGAGTTGGC-1-4918STDY7321513    HsapDv:0000024
                                           ...      
TTTCCTCTCACAGGCC-1-4918STDY7718977    HsapDv:0000030
TTTGCGCTCTCTGTCG-1-4918STDY7718977    HsapDv:0000030
TTTGGTTCACTTGGAT-1-4918STDY7718977    HsapDv:0000030
TTTGTCAAGAGCTGGT-1-4918STDY7718977    HsapDv:0000030
TTTGTCACATACCATG-1-4918STDY7718977    HsapDv:0000030
Name: development_stage_ontology_term_id, Length: 62849, dtype: object

## disease_ontology_term_id

In [49]:
# One disease term for the whole dataset; the scalar is broadcast to every cell.
adata.obs['disease_ontology_term_id'] = "PATO:0000461"

In [50]:
adata.obs['disease_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    PATO:0000461
AAACCTGCACTGAAGG-1-4918STDY7321513    PATO:0000461
AAACCTGCATCCCACT-1-4918STDY7321513    PATO:0000461
AAACGGGAGAGTACAT-1-4918STDY7321513    PATO:0000461
AAACGGGAGAGTTGGC-1-4918STDY7321513    PATO:0000461
                                          ...     
TTTCCTCTCACAGGCC-1-4918STDY7718977    PATO:0000461
TTTGCGCTCTCTGTCG-1-4918STDY7718977    PATO:0000461
TTTGGTTCACTTGGAT-1-4918STDY7718977    PATO:0000461
TTTGTCAAGAGCTGGT-1-4918STDY7718977    PATO:0000461
TTTGTCACATACCATG-1-4918STDY7718977    PATO:0000461
Name: disease_ontology_term_id, Length: 62849, dtype: object

## ethnicity_ontology_term_id

In [51]:
# Ethnicity is not recorded for these samples, so every cell gets 'unknown'.
adata.obs['ethnicity_ontology_term_id'] = 'unknown'

In [52]:
adata.obs['ethnicity_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    unknown
AAACCTGCACTGAAGG-1-4918STDY7321513    unknown
AAACCTGCATCCCACT-1-4918STDY7321513    unknown
AAACGGGAGAGTACAT-1-4918STDY7321513    unknown
AAACGGGAGAGTTGGC-1-4918STDY7321513    unknown
                                       ...   
TTTCCTCTCACAGGCC-1-4918STDY7718977    unknown
TTTGCGCTCTCTGTCG-1-4918STDY7718977    unknown
TTTGGTTCACTTGGAT-1-4918STDY7718977    unknown
TTTGTCAAGAGCTGGT-1-4918STDY7718977    unknown
TTTGTCACATACCATG-1-4918STDY7718977    unknown
Name: ethnicity_ontology_term_id, Length: 62849, dtype: object

## is_primary_data

In [53]:
# Flag every cell as primary data; the scalar is broadcast and yields a bool column.
adata.obs['is_primary_data'] = True

In [54]:
adata.obs['is_primary_data']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    True
AAACCTGCACTGAAGG-1-4918STDY7321513    True
AAACCTGCATCCCACT-1-4918STDY7321513    True
AAACGGGAGAGTACAT-1-4918STDY7321513    True
AAACGGGAGAGTTGGC-1-4918STDY7321513    True
                                      ... 
TTTCCTCTCACAGGCC-1-4918STDY7718977    True
TTTGCGCTCTCTGTCG-1-4918STDY7718977    True
TTTGGTTCACTTGGAT-1-4918STDY7718977    True
TTTGTCAAGAGCTGGT-1-4918STDY7718977    True
TTTGTCACATACCATG-1-4918STDY7718977    True
Name: is_primary_data, Length: 62849, dtype: bool

In [55]:
# Explicit cast to bool; the output above shows the column is already bool, so this is a no-op safeguard.
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [56]:
# Single organism term for the whole dataset; the scalar is broadcast to every cell.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [57]:
adata.obs['organism_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    NCBITaxon:9606
AAACCTGCACTGAAGG-1-4918STDY7321513    NCBITaxon:9606
AAACCTGCATCCCACT-1-4918STDY7321513    NCBITaxon:9606
AAACGGGAGAGTACAT-1-4918STDY7321513    NCBITaxon:9606
AAACGGGAGAGTTGGC-1-4918STDY7321513    NCBITaxon:9606
                                           ...      
TTTCCTCTCACAGGCC-1-4918STDY7718977    NCBITaxon:9606
TTTGCGCTCTCTGTCG-1-4918STDY7718977    NCBITaxon:9606
TTTGGTTCACTTGGAT-1-4918STDY7718977    NCBITaxon:9606
TTTGTCAAGAGCTGGT-1-4918STDY7718977    NCBITaxon:9606
TTTGTCACATACCATG-1-4918STDY7718977    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 62849, dtype: object

In [58]:
# Store the organism term as a categorical column (a single category).
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [59]:
adata.obs['organism_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    NCBITaxon:9606
AAACCTGCACTGAAGG-1-4918STDY7321513    NCBITaxon:9606
AAACCTGCATCCCACT-1-4918STDY7321513    NCBITaxon:9606
AAACGGGAGAGTACAT-1-4918STDY7321513    NCBITaxon:9606
AAACGGGAGAGTTGGC-1-4918STDY7321513    NCBITaxon:9606
                                           ...      
TTTCCTCTCACAGGCC-1-4918STDY7718977    NCBITaxon:9606
TTTGCGCTCTCTGTCG-1-4918STDY7718977    NCBITaxon:9606
TTTGGTTCACTTGGAT-1-4918STDY7718977    NCBITaxon:9606
TTTGTCAAGAGCTGGT-1-4918STDY7718977    NCBITaxon:9606
TTTGTCACATACCATG-1-4918STDY7718977    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 62849, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [60]:
# Donor sex is not recorded here, so every cell gets 'unknown'.
adata.obs['sex_ontology_term_id'] = 'unknown'

In [61]:
adata.obs['sex_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    unknown
AAACCTGCACTGAAGG-1-4918STDY7321513    unknown
AAACCTGCATCCCACT-1-4918STDY7321513    unknown
AAACGGGAGAGTACAT-1-4918STDY7321513    unknown
AAACGGGAGAGTTGGC-1-4918STDY7321513    unknown
                                       ...   
TTTCCTCTCACAGGCC-1-4918STDY7718977    unknown
TTTGCGCTCTCTGTCG-1-4918STDY7718977    unknown
TTTGGTTCACTTGGAT-1-4918STDY7718977    unknown
TTTGTCAAGAGCTGGT-1-4918STDY7718977    unknown
TTTGTCACATACCATG-1-4918STDY7718977    unknown
Name: sex_ontology_term_id, Length: 62849, dtype: object

## tissue_ontology_term_id

In [63]:
# Seed the column with the author's organ labels; a later cell translates them to UBERON terms via `mapping`.
adata.obs['tissue_ontology_term_id'] = adata.obs['Organ']

In [64]:
adata.obs['tissue_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    duojejunum
AAACCTGCACTGAAGG-1-4918STDY7321513    duojejunum
AAACCTGCATCCCACT-1-4918STDY7321513    duojejunum
AAACGGGAGAGTACAT-1-4918STDY7321513    duojejunum
AAACGGGAGAGTTGGC-1-4918STDY7321513    duojejunum
                                         ...    
TTTCCTCTCACAGGCC-1-4918STDY7718977         colon
TTTGCGCTCTCTGTCG-1-4918STDY7718977         colon
TTTGGTTCACTTGGAT-1-4918STDY7718977         colon
TTTGTCAAGAGCTGGT-1-4918STDY7718977         colon
TTTGTCACATACCATG-1-4918STDY7718977         colon
Name: tissue_ontology_term_id, Length: 62849, dtype: category
Categories (3, object): ['duojejunum', 'ileum', 'colon']

In [65]:
list(adata.obs['tissue_ontology_term_id'].unique())

['duojejunum', 'ileum', 'colon']

In [66]:
# Organ label -> UBERON term ID, covering the three organs present in the data.
# NOTE: this rebinds the name `mapping` used earlier for cell types and development stages.
mapping = {'duojejunum': 'UBERON:8410000',
           'ileum': 'UBERON:0002116', 
           'colon': 'UBERON:0001155'}

In [67]:
# Translate organ labels to UBERON term IDs. Reading from 'Organ' (not from the
# column being overwritten) keeps the cell re-runnable without producing NaN.
adata.obs['tissue_ontology_term_id'] = adata.obs['Organ'].map(mapping)
# map() yields NaN for organs that are not keys of `mapping`; fail loudly instead of exporting gaps.
assert adata.obs['tissue_ontology_term_id'].notna().all(), \
    "some 'Organ' values have no UBERON term in `mapping`"

In [68]:
adata.obs['tissue_ontology_term_id']

index
AAACCTGAGCTGCAAG-1-4918STDY7321513    UBERON:8410000
AAACCTGCACTGAAGG-1-4918STDY7321513    UBERON:8410000
AAACCTGCATCCCACT-1-4918STDY7321513    UBERON:8410000
AAACGGGAGAGTACAT-1-4918STDY7321513    UBERON:8410000
AAACGGGAGAGTTGGC-1-4918STDY7321513    UBERON:8410000
                                           ...      
TTTCCTCTCACAGGCC-1-4918STDY7718977    UBERON:0001155
TTTGCGCTCTCTGTCG-1-4918STDY7718977    UBERON:0001155
TTTGGTTCACTTGGAT-1-4918STDY7718977    UBERON:0001155
TTTGTCAAGAGCTGGT-1-4918STDY7718977    UBERON:0001155
TTTGTCACATACCATG-1-4918STDY7718977    UBERON:0001155
Name: tissue_ontology_term_id, Length: 62849, dtype: category
Categories (3, object): ['UBERON:8410000', 'UBERON:0002116', 'UBERON:0001155']

---

# <ins>var and raw.var</ins>

In [69]:
adata.var

Unnamed: 0_level_0,n_cells,highly_variable,means,dispersions,dispersions_norm,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000238009,163,False,1.201170e-03,-0.515088,-0.191493,RP11-34P13.7,gene
ENSG00000239945,7,False,7.863549e-05,-0.088078,0.753295,RP11-34P13.8,gene
ENSG00000279928,3,False,1.000000e-12,,,FO538757.3,gene
ENSG00000279457,20175,False,2.108105e-01,-0.289174,0.308356,FO538757.2,gene
ENSG00000228463,16981,False,1.145107e-01,-0.373800,0.121116,AP006222.2,gene
...,...,...,...,...,...,...,...
ENSG00000278817,3967,False,5.811303e-02,-0.388853,0.087810,AC007325.4,gene
ENSG00000277196,646,False,1.151530e-02,-0.473779,-0.100094,AC007325.2,gene
ENSG00000278384,106,False,1.361509e-03,-0.592318,-0.362370,AL354822.1,gene
ENSG00000276345,15487,False,2.281273e-01,-0.240747,0.415504,AC004556.1,gene


## feature_biotype

Adjusted at the beginning!

In [70]:
#adata2.var['feature_biotype'] = ['gene'] * len(adata2.var.index)

## feature_id

Adjusted at the beginning!

In [71]:
#adata2.var_names = adata2.var["gene_ids"].copy()
#adata2.var_names.name = "feature_id"
#adata2.raw.var_names = adata2.raw.var["gene_ids"].copy()
#adata2.raw.var_names.name = "feature_id"

## feature_is_filtered

`True` if the feature was filtered out of the final matrix (X).  
Problem: this should be annotated in `var`, not in `raw.var`. __How can a gene be annotated if it has already been filtered out of the final matrix?__

In [72]:
# X and raw were subset to the same gene list above, so no feature is marked as filtered.
adata.var['feature_is_filtered'] = False

In [73]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000238009    False
ENSG00000239945    False
ENSG00000279928    False
ENSG00000279457    False
ENSG00000228463    False
                   ...  
ENSG00000278817    False
ENSG00000277196    False
ENSG00000278384    False
ENSG00000276345    False
ENSG00000271254    False
Name: feature_is_filtered, Length: 26328, dtype: bool

# <ins>obsm</ins>

In [78]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

# <ins>uns</ins>

In [79]:
# Version of the schema this curation targets.
adata.uns['schema_version'] = '2.0.0'

In [80]:
# Dataset title stored alongside the data.
adata.uns['title'] = 'Fetal Human Gut (6-11 PCW)'

In [81]:
# Free-text description of how X was normalised.
# NOTE(review): 'Scanpy' names the tool rather than the method — confirm this is the intended value.
adata.uns['X_normalization'] = 'Scanpy'

In [82]:
# Default embedding; 'X_umap' is one of the two obsm keys listed above.
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [83]:
adata

AnnData object with n_obs × n_vars = 62849 × 26328
    obs: 'CRL', 'Enrichment_fraction', 'PCW', 'Donor_nb', 'Donor_id', 'Purification', 'Organ', 'Sample', 'doublet_scores_observed_cells', 'n_genes', 'percent_mito', 'n_counts', 'cell_type_group', 'cell_name', 'cell_name_detailed', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'cell_name_detailed_colors', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_pca', 'X_umap'

In [91]:
# Reuse the 'cell_name_detailed' palette under the key 'cell_type_colors'
# (both keys reference the same object until the file is written).
# NOTE(review): there is no 'cell_type' obs column in this notebook, and the
# palette presumably has one colour per author label (28) while several labels
# share a CL term — confirm the palette length matches the categories it will
# be applied to.
# NOTE(review): this cell's execution count (91) is out of sequence with its
# neighbours — confirm the notebook still runs top to bottom.
adata.uns['cell_type_colors'] = adata.uns['cell_name_detailed_colors']

In [84]:
adata.obs

Unnamed: 0_level_0,CRL,Enrichment_fraction,PCW,Donor_nb,Donor_id,Purification,Organ,Sample,doublet_scores_observed_cells,n_genes,...,cell_name_detailed,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGAGCTGCAAG-1-4918STDY7321513,17,pos,F6.1,1,2029,none,duojejunum,4918STDY7321513,0.025319,2241,...,Mesoderm 1,EFO:0009899,CL:0000222,HsapDv:0000024,PATO:0000461,unknown,True,NCBITaxon:9606,unknown,UBERON:8410000
AAACCTGCACTGAAGG-1-4918STDY7321513,17,pos,F6.1,1,2029,none,duojejunum,4918STDY7321513,0.029722,1563,...,WNT4+ fibroblasts,EFO:0009899,CL:0000057,HsapDv:0000024,PATO:0000461,unknown,True,NCBITaxon:9606,unknown,UBERON:8410000
AAACCTGCATCCCACT-1-4918STDY7321513,17,pos,F6.1,1,2029,none,duojejunum,4918STDY7321513,0.061127,2562,...,Mesoderm 1,EFO:0009899,CL:0000222,HsapDv:0000024,PATO:0000461,unknown,True,NCBITaxon:9606,unknown,UBERON:8410000
AAACGGGAGAGTACAT-1-4918STDY7321513,17,pos,F6.1,1,2029,none,duojejunum,4918STDY7321513,0.048913,2297,...,Mesoderm 1,EFO:0009899,CL:0000222,HsapDv:0000024,PATO:0000461,unknown,True,NCBITaxon:9606,unknown,UBERON:8410000
AAACGGGAGAGTTGGC-1-4918STDY7321513,17,pos,F6.1,1,2029,none,duojejunum,4918STDY7321513,0.076267,3645,...,Immune cells,EFO:0009899,CL:0000988,HsapDv:0000024,PATO:0000461,unknown,True,NCBITaxon:9606,unknown,UBERON:8410000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTCCTCTCACAGGCC-1-4918STDY7718977,47,pos,F10,9,2134,MACS,colon,4918STDY7718977,0.066214,3244,...,Mesoderm 2,EFO:0009899,CL:0000222,HsapDv:0000030,PATO:0000461,unknown,True,NCBITaxon:9606,unknown,UBERON:0001155
TTTGCGCTCTCTGTCG-1-4918STDY7718977,47,pos,F10,9,2134,MACS,colon,4918STDY7718977,0.030959,2759,...,Colonocytes,EFO:0009899,CL:0011108,HsapDv:0000030,PATO:0000461,unknown,True,NCBITaxon:9606,unknown,UBERON:0001155
TTTGGTTCACTTGGAT-1-4918STDY7718977,47,pos,F10,9,2134,MACS,colon,4918STDY7718977,0.109415,3011,...,Enteroendocrine,EFO:0009899,CL:0000164,HsapDv:0000030,PATO:0000461,unknown,True,NCBITaxon:9606,unknown,UBERON:0001155
TTTGTCAAGAGCTGGT-1-4918STDY7718977,47,pos,F10,9,2134,MACS,colon,4918STDY7718977,0.045616,3113,...,Colonocytes,EFO:0009899,CL:0011108,HsapDv:0000030,PATO:0000461,unknown,True,NCBITaxon:9606,unknown,UBERON:0001155


In [85]:
adata.obs.dtypes

CRL                                      int64
Enrichment_fraction                   category
PCW                                   category
Donor_nb                                 int64
Donor_id                              category
Purification                          category
Organ                                 category
Sample                                category
doublet_scores_observed_cells          float64
n_genes                                  int64
percent_mito                           float32
n_counts                               float32
cell_type_group                       category
cell_name                             category
cell_name_detailed                    category
assay_ontology_term_id                  object
cell_type_ontology_term_id              object
development_stage_ontology_term_id      object
disease_ontology_term_id                object
ethnicity_ontology_term_id              object
is_primary_data                           bool
organism_onto

In [86]:
def _downcast_64bit_columns(frame, label):
    """Convert the float64/int64 columns of ``frame`` to 32-bit dtypes in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to modify (``adata.obs`` or ``adata.var``).
    label : str
        Heading printed before the per-column messages.

    Integer columns whose values do not fit into int32 are left unchanged,
    because ``astype('int32')`` would wrap them silently.
    """
    print(label)
    # Snapshot the dtypes first so columns converted below are not revisited.
    original_dtypes = frame.dtypes
    int32_range = np.iinfo('int32')
    for source, target in (('float64', 'float32'), ('int64', 'int32')):
        for column in original_dtypes[original_dtypes == source].index.values:
            fits = frame[column].between(int32_range.min, int32_range.max).all()
            if target == 'int32' and not fits:
                print(f"kept {column} as int64 (values exceed the int32 range)")
                continue
            frame[column] = frame[column].astype(target)
            print(f"changed {column} from {source} to {target}")


# Same conversion for the cell-level and the gene-level annotation tables.
_downcast_64bit_columns(adata.obs, 'obs')
_downcast_64bit_columns(adata.var, 'var')

obs
changed doublet_scores_observed_cells from float64 to float32
changed CRL from int64 to int32
changed Donor_nb from int64 to int32
changed n_genes from int64 to int32
var
changed n_cells from int64 to int32


In [87]:
# Treat CRL and Donor_nb as categorical labels rather than numbers; this
# replaces the int32 dtype assigned to both columns by the previous cell.
adata.obs['CRL'] = adata.obs['CRL'].astype('category')
adata.obs['Donor_nb'] = adata.obs['Donor_nb'].astype('category')

In [92]:
adata

AnnData object with n_obs × n_vars = 62849 × 26328
    obs: 'CRL', 'Enrichment_fraction', 'PCW', 'Donor_nb', 'Donor_id', 'Purification', 'Organ', 'Sample', 'doublet_scores_observed_cells', 'n_genes', 'percent_mito', 'n_counts', 'cell_type_group', 'cell_name', 'cell_name_detailed', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'cell_name_detailed_colors', 'schema_version', 'title', 'X_normalization', 'default_embedding', 'cell_type_colors'
    obsm: 'X_pca', 'X_umap'

In [93]:
# Write the curated object, gzip-compressed, to the current working directory.
# The log below shows the remaining string columns being stored as categoricals on write.
adata.write('gca_fetal.h5ad', compression = 'gzip')

... storing 'assay_ontology_term_id' as categorical
... storing 'cell_type_ontology_term_id' as categorical
... storing 'development_stage_ontology_term_id' as categorical
... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'sex_ontology_term_id' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical


---