# **Curating `fetal-lung` dataset:**

### Myeloid dataset

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

In [2]:
adata = sc.read_h5ad('/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/fetal-lung/C2filtered.h5ad')

In [3]:
adata.var

Unnamed: 0,gene_ids,n_cells,highly_variable5891STDY8062349,highly_variable5891STDY8062350,highly_variable5891STDY8062351,highly_variable5891STDY8062352,highly_variable5891STDY8062353,highly_variable5891STDY8062354,highly_variable5891STDY8062355,highly_variable5891STDY8062356,...,Deep_WSSS_F_LNG8713191,Deep_5891STDY9030806,Deep_5891STDY9030807,Deep_5891STDY9030808,Deep_5891STDY9030809,Deep_5891STDY9030810,Deep_n,means,dispersions,dispersions_norm
MIR1302-2HG,ENSG00000243485,4,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0.000716,0.837703,-0.592373
AL627309.1,ENSG00000238009,21,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,1,0.003444,0.963300,-0.379956
AL669831.5,ENSG00000237491,263,False,False,True,False,True,False,False,False,...,False,False,False,False,False,False,0,0.044037,1.154678,-0.056287
FAM87B,ENSG00000177757,21,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0.003756,1.399611,0.357959
LINC00115,ENSG00000225880,402,False,True,True,False,True,False,False,False,...,False,False,False,False,False,False,1,0.069900,1.260772,0.123146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC007325.2,ENSG00000277196,32,True,True,False,False,False,True,True,True,...,False,False,False,True,False,False,9,0.007644,2.006535,1.384423
AL354822.1,ENSG00000278384,11,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0.002443,1.105285,-0.139823
AC004556.1,ENSG00000276345,3052,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0.544092,1.298267,-0.382608
AC233755.1,ENSG00000275063,3,True,False,False,False,True,True,False,False,...,False,False,False,False,False,False,2,0.000397,0.471942,-1.210969


In [4]:
len(adata.var.columns)

66

In [5]:
adata.var = adata.var[['gene_ids', 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm']]

In [6]:
adata

AnnData object with n_obs × n_vars = 6947 × 20142
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C2_leiden', 'C2_leiden_R', 'new_celltype'
    var: 'gene_ids', 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm'
    uns: 'C2_leiden_R_colors', 'batch_colors', 'new_celltype_colors', 'rank_genes_groups', 'rank_genes_groups_filtered'
    obsm: 'X_C2filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [7]:
def CalculateRaw(adata,scaling_factor=10000):
    #update by Polanski in Feb 2022
    #The object must contain a log-transformed matrix
    #This function returns an integer-count object
    #The normalization constant is assumed to be 10000
    #return anndata.AnnData(X=sparse.csr_matrix(np.rint(np.array(np.expm1(adata.X).todense().transpose())*(adata.obs['n_counts'].values).transpose() / scaling_factor).transpose()),\
    #              obs=adata.obs,var=adata.var,obsm=adata.obsm,varm=adata.varm)
    X = np.expm1(adata.X)
    scaling_vector = adata.obs['n_counts'].values / scaling_factor
    #.indptr[i]:.indptr[i+1] provides the .data coordinates where the i'th row of the data resides in CSR
    #which happens to be a cell, which happens to be what we have a unique entry in scaling_vector for
    for i in np.arange(X.shape[0]):
        X.data[X.indptr[i]:X.indptr[i+1]] = X.data[X.indptr[i]:X.indptr[i+1]] * scaling_vector[i]
    return sc.AnnData(X=np.rint(X),obs=adata.obs,var=adata.var,obsm=adata.obsm,varm=adata.varm)

In [8]:
raw = CalculateRaw(adata)

In [9]:
del raw.var

In [10]:
geno = pd.read_csv('~/genes_homo_sapiens.csv', header = None, names = ['ENSEMBLid', 'gena','no'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [11]:
geno

Unnamed: 0,ENSEMBLid,gena,no
0,ENSG00000223972,DDX11L1,5
1,ENST00000456328,DDX11L1-202,2
2,ENST00000450305,DDX11L1-201,2
3,ENSG00000227232,WASH7P,5
4,ENST00000488147,WASH7P-201,1
...,...,...,...
297782,ENST00000621424,ENSG110105435.4-001,4
297783,ENSG00000278625,U6,1
297784,ENST00000616830,U6.88-201,1
297785,ENSG00000277374,U1,1


In [12]:
ens_ids = adata.var['gene_ids']

In [13]:
adata.var

Unnamed: 0,gene_ids,n_cells,Deep_n,means,dispersions,dispersions_norm
MIR1302-2HG,ENSG00000243485,4,0,0.000716,0.837703,-0.592373
AL627309.1,ENSG00000238009,21,1,0.003444,0.963300,-0.379956
AL669831.5,ENSG00000237491,263,0,0.044037,1.154678,-0.056287
FAM87B,ENSG00000177757,21,0,0.003756,1.399611,0.357959
LINC00115,ENSG00000225880,402,1,0.069900,1.260772,0.123146
...,...,...,...,...,...,...
AC007325.2,ENSG00000277196,32,9,0.007644,2.006535,1.384423
AL354822.1,ENSG00000278384,11,0,0.002443,1.105285,-0.139823
AC004556.1,ENSG00000276345,3052,0,0.544092,1.298267,-0.382608
AC233755.1,ENSG00000275063,3,2,0.000397,0.471942,-1.210969


In [14]:
genedict = {key: 1 for key in list(geno.ENSEMBLid)}

In [15]:
genedict

{'ENSG00000223972': 1,
 'ENST00000456328': 1,
 'ENST00000450305': 1,
 'ENSG00000227232': 1,
 'ENST00000488147': 1,
 'ENSG00000278267': 1,
 'ENST00000619216': 1,
 'ENSG00000243485': 1,
 'ENST00000473358': 1,
 'ENST00000469289': 1,
 'ENSG00000284332': 1,
 'ENST00000607096': 1,
 'ENSG00000237613': 1,
 'ENST00000417324': 1,
 'ENST00000461467': 1,
 'ENSG00000268020': 1,
 'ENST00000606857': 1,
 'ENSG00000240361': 1,
 'ENST00000642116': 1,
 'ENST00000492842': 1,
 'ENSG00000186092': 1,
 'ENST00000641515': 1,
 'ENSG00000238009': 1,
 'ENST00000466430': 1,
 'ENST00000477740': 1,
 'ENST00000471248': 1,
 'ENST00000610542': 1,
 'ENST00000453576': 1,
 'ENSG00000239945': 1,
 'ENST00000495576': 1,
 'ENSG00000233750': 1,
 'ENST00000442987': 1,
 'ENSG00000268903': 1,
 'ENST00000494149': 1,
 'ENSG00000269981': 1,
 'ENST00000595919': 1,
 'ENSG00000239906': 1,
 'ENST00000493797': 1,
 'ENSG00000241860': 1,
 'ENST00000484859': 1,
 'ENST00000490997': 1,
 'ENST00000466557': 1,
 'ENST00000662089': 1,
 'ENST00000

In [16]:
ens_ids2 = [x for x in ens_ids if (x in genedict or x.startswith('SARSCoV'))]

In [17]:
ens_ids2

['ENSG00000243485',
 'ENSG00000238009',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000230699',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000131591',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',
 'ENSG00000169962',
 'ENSG00000107404',
 'ENSG00000162576',
 'ENSG00000175756',
 'ENSG00000221978',
 'ENSG00000242485',
 'ENSG00000272455',
 'ENSG00000235098',
 'ENSG00000225905',
 'ENSG00000225285',
 'ENSG00000179403',
 'ENSG00000215915',
 'ENSG00000160072',
 'ENSG00000197785',
 'ENSG00000205090',


In [18]:
len(ens_ids2)

20014

In [19]:
adata.var['gene_symbols'] = adata.var_names
adata.var_names = adata.var['gene_ids']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.var['gene_symbols'] = adata.var_names


In [20]:
raw.var['gene_symbols'] = adata.var_names
raw.var_names = adata.var['gene_ids']

In [21]:
del adata.var['gene_ids']

In [22]:
del raw.var['gene_symbols']

In [23]:
adata = adata[:, ens_ids2].copy()

  res = method(*args, **kwargs)


In [24]:
raw = raw[:, ens_ids2].copy()

In [25]:
adata

AnnData object with n_obs × n_vars = 6947 × 20014
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C2_leiden', 'C2_leiden_R', 'new_celltype'
    var: 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols'
    uns: 'C2_leiden_R_colors', 'batch_colors', 'new_celltype_colors', 'rank_genes_groups', 'rank_genes_groups_filtered'
    obsm: 'X_C2filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [26]:
raw

AnnData object with n_obs × n_vars = 6947 × 20014
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C2_leiden', 'C2_leiden_R', 'new_celltype'
    obsm: 'X_C2filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [27]:
del raw.obs
del raw.obsm

In [28]:
adata.var['feature_biotype'] = ["gene"] * len(adata.var)

In [29]:
raw.var['feature_biotype'] = ["gene"] * len(adata.var)

In [30]:
adata.raw = raw

## 2. Pre-prepare before schema adjustments

In [31]:
adata.var

Unnamed: 0_level_0,n_cells,Deep_n,means,dispersions,dispersions_norm,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000243485,4,0,0.000716,0.837703,-0.592373,MIR1302-2HG,gene
ENSG00000238009,21,1,0.003444,0.963300,-0.379956,AL627309.1,gene
ENSG00000237491,263,0,0.044037,1.154678,-0.056287,AL669831.5,gene
ENSG00000177757,21,0,0.003756,1.399611,0.357959,FAM87B,gene
ENSG00000225880,402,1,0.069900,1.260772,0.123146,LINC00115,gene
...,...,...,...,...,...,...,...
ENSG00000277196,32,9,0.007644,2.006535,1.384423,AC007325.2,gene
ENSG00000278384,11,0,0.002443,1.105285,-0.139823,AL354822.1,gene
ENSG00000276345,3052,0,0.544092,1.298267,-0.382608,AC004556.1,gene
ENSG00000275063,3,2,0.000397,0.471942,-1.210969,AC233755.1,gene


---

---

## 3. Apply the schema requirements:

# <ins>X

Raw and normal count matrices are in the object, no additional adjustment is needed!

# <ins>obs

In [32]:
adata.obs.columns

Index(['batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts',
       'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score',
       'G2M_score', 'C2_leiden', 'C2_leiden_R', 'new_celltype'],
      dtype='object')

In [33]:
adata.obs

Unnamed: 0,batch,stage,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,S_score,G2M_score,C2_leiden,C2_leiden_R,new_celltype
AAACCTGTCAATCACG-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.054201,2949.0,1385,0.039062,0.999862,2,G1,-0.147739,-0.136545,0,0,SPP1+ MΦ
AAACCTGTCATTGCCC-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.049873,30024.0,5380,0.160350,0.027939,2,S,0.661255,0.028148,11,111,Cycling DC
AAACGGGTCATGGTCA-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.115864,4258.0,1964,0.057011,0.999862,2,G1,-0.121608,-0.132354,0,0,SPP1+ MΦ
AAAGCAAAGCCACCTG-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.043150,8226.0,2561,0.066890,0.826769,2,G1,-0.171818,-0.143412,8,8,S100A12-lo cla. mono.
AACCATGAGGTAGCCA-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.058095,9890.0,2956,0.051245,0.826769,2,G1,-0.118939,-0.233848,5,500,Non-cla. mono.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGCGTCGTGGTCCGT-5891STDY9030810,5891STDY9030810,9.0,Whole-lung,Normal,0.020080,976.0,543,0.102088,0.895486,2,G1,-0.043927,-0.085867,2,2,S100A12-hi cla. mono.
TTGTAGGCATCCCATC-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.039915,9477.0,2423,0.080160,0.895486,2,G1,-0.076753,-0.115796,5,500,Non-cla. mono.
TTTACTGGTCGAGATG-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.045369,3556.0,1656,0.020619,0.954489,2,G1,-0.043132,-0.129695,9,9,SPP1+ MΦ
TTTACTGGTCTTGCGG-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.040926,5179.0,1986,0.048951,0.954489,2,G1,-0.053600,-0.096857,4,4,SPP1+ MΦ


## assay_ontology_term_id

Assay used in this study is 10x 5' v1. Publication

10x 3' v2	"EFO:0009899"  
10x 3' v3	"EFO:0009922"  
10x 5' v1	"EFO:0011025"  
Smart-seq	"EFO:0008930"  
Smart-seq2	"EFO:0008931"

> Chromium Single Cell 5’ V(D)J Reagent Kits (V1.0 chemistry) were used for scRNAseq library
construction.

In [34]:
adata.obs['assay_ontology_term_id'] = ['EFO:0011025'] * len(adata.obs)

In [35]:
adata.obs['assay_ontology_term_id']

AAACCTGTCAATCACG-5891STDY8062349    EFO:0011025
AAACCTGTCATTGCCC-5891STDY8062349    EFO:0011025
AAACGGGTCATGGTCA-5891STDY8062349    EFO:0011025
AAAGCAAAGCCACCTG-5891STDY8062349    EFO:0011025
AACCATGAGGTAGCCA-5891STDY8062349    EFO:0011025
                                       ...     
TTGCGTCGTGGTCCGT-5891STDY9030810    EFO:0011025
TTGTAGGCATCCCATC-5891STDY9030810    EFO:0011025
TTTACTGGTCGAGATG-5891STDY9030810    EFO:0011025
TTTACTGGTCTTGCGG-5891STDY9030810    EFO:0011025
TTTGTCAAGACGCTTT-5891STDY9030810    EFO:0011025
Name: assay_ontology_term_id, Length: 6947, dtype: object

## cell_type_ontology_term_id

In [36]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['new_celltype']

In [37]:
len(list(adata.obs['cell_type_ontology_term_id'].unique()))

29

In [38]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['SPP1+ MΦ',
 'Cycling DC',
 'S100A12-lo cla. mono.',
 'Non-cla. mono.',
 'DC2',
 'S100A12-hi cla. mono.',
 'Promonocyte-like',
 'CX3CR1+ MΦ',
 'DC1',
 'HSC',
 'DC3',
 'Neutrophil',
 'CXCL9+ MΦ',
 'Megakaryocyte',
 'Promyelocyte-like',
 'pDC',
 'Eosinophil',
 'aDC 2',
 'pre-pDC/DC5',
 'Basophil',
 'MEP',
 'Platelet',
 'aDC 1',
 'HSC/ELP',
 'GMP',
 'Myelocyte-like',
 'APOE+ MΦ2',
 'CMP',
 'APOE+ MΦ1']

In [39]:
mapping = {'SPP1+ MΦ': 'CL:0000235',
 'Cycling DC': 'CL:0000451',
 'S100A12-lo cla. mono.': 'CL:0000860', # not s100a12lo
 'Non-cla. mono.': 'CL:0000875',
 'DC2': 'CL:0000451', # dendritic cell?
 'S100A12-hi cla. mono.': 'CL:0000860', # not s100a12hi
 'Promonocyte-like': 'CL:0000559', # promonocyte?
 'CX3CR1+ MΦ': 'CL:0000235', # not cx3cr1+
 'DC1': 'CL:0000451',
 'HSC': 'CL:0000037',
 'DC3': 'CL:0001056',
 'Neutrophil': 'CL:0000775',
 'CXCL9+ MΦ': 'CL:0000235', # not cxcl9+
 'Megakaryocyte': 'CL:0000556',
 'Promyelocyte-like': 'CL:0000836',
 'pDC': 'CL:0000784',
 'Eosinophil': 'CL:0000771',
 'aDC 2': 'CL:0000451', # not activated
 'pre-pDC/DC5': 'CL:0000451', # dendritic
 'Basophil': 'CL:0000767',
 'MEP': 'CL:0000050',
 'Platelet': 'CL:0000233',
 'aDC 1': 'CL:0000451', # not activated
 'HSC/ELP': 'CL:0000936',
 'GMP': 'CL:0000557',
 'Myelocyte-like': 'CL:0002092', # myelocyto
 'APOE+ MΦ2': 'CL:0000235', # not apoe and 2
 'CMP': 'CL:0000049',
 'APOE+ MΦ1': 'CL:0000235'} # not apoe and 1

In [40]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

## development_stage_ontology_term_id

In [41]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['stage']

In [42]:
list(adata.obs['stage'].unique())

[18.0, 22.0, 15.0, 20.0, 6.86, 5.0, 9.0, 11.0, nan]

In [44]:
mapping = {18.0: 'HsapDv:0000055', 
           22.0: 'HsapDv:0000059', 
           15.0: 'HsapDv:0000052', 
           20.0: 'HsapDv:0000057', 
           6.86: 'HsapDv:0000026', 
           5.0: 'HsapDv:0000023', 
           9.0: 'HsapDv:0000046', 
           11.0: 'HsapDv:0000048',
           np.nan: 'unknown'}

In [45]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [46]:
adata.obs['development_stage_ontology_term_id']

AAACCTGTCAATCACG-5891STDY8062349    HsapDv:0000055
AAACCTGTCATTGCCC-5891STDY8062349    HsapDv:0000055
AAACGGGTCATGGTCA-5891STDY8062349    HsapDv:0000055
AAAGCAAAGCCACCTG-5891STDY8062349    HsapDv:0000055
AACCATGAGGTAGCCA-5891STDY8062349    HsapDv:0000055
                                         ...      
TTGCGTCGTGGTCCGT-5891STDY9030810    HsapDv:0000046
TTGTAGGCATCCCATC-5891STDY9030810    HsapDv:0000048
TTTACTGGTCGAGATG-5891STDY9030810    HsapDv:0000048
TTTACTGGTCTTGCGG-5891STDY9030810    HsapDv:0000048
TTTGTCAAGACGCTTT-5891STDY9030810    HsapDv:0000048
Name: development_stage_ontology_term_id, Length: 6947, dtype: object

In [47]:
adata.obs['development_stage_ontology_term_id'].unique()

array(['HsapDv:0000055', 'HsapDv:0000059', 'HsapDv:0000052',
       'HsapDv:0000057', 'HsapDv:0000026', 'HsapDv:0000023',
       'HsapDv:0000046', 'HsapDv:0000048', 'unknown'], dtype=object)

## disease_ontology_term_id

In [48]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461'] * len(adata.obs)

In [49]:
adata.obs['disease_ontology_term_id']

AAACCTGTCAATCACG-5891STDY8062349    PATO:0000461
AAACCTGTCATTGCCC-5891STDY8062349    PATO:0000461
AAACGGGTCATGGTCA-5891STDY8062349    PATO:0000461
AAAGCAAAGCCACCTG-5891STDY8062349    PATO:0000461
AACCATGAGGTAGCCA-5891STDY8062349    PATO:0000461
                                        ...     
TTGCGTCGTGGTCCGT-5891STDY9030810    PATO:0000461
TTGTAGGCATCCCATC-5891STDY9030810    PATO:0000461
TTTACTGGTCGAGATG-5891STDY9030810    PATO:0000461
TTTACTGGTCTTGCGG-5891STDY9030810    PATO:0000461
TTTGTCAAGACGCTTT-5891STDY9030810    PATO:0000461
Name: disease_ontology_term_id, Length: 6947, dtype: object

In [50]:
adata.obs['disease_ontology_term_id']

AAACCTGTCAATCACG-5891STDY8062349    PATO:0000461
AAACCTGTCATTGCCC-5891STDY8062349    PATO:0000461
AAACGGGTCATGGTCA-5891STDY8062349    PATO:0000461
AAAGCAAAGCCACCTG-5891STDY8062349    PATO:0000461
AACCATGAGGTAGCCA-5891STDY8062349    PATO:0000461
                                        ...     
TTGCGTCGTGGTCCGT-5891STDY9030810    PATO:0000461
TTGTAGGCATCCCATC-5891STDY9030810    PATO:0000461
TTTACTGGTCGAGATG-5891STDY9030810    PATO:0000461
TTTACTGGTCTTGCGG-5891STDY9030810    PATO:0000461
TTTGTCAAGACGCTTT-5891STDY9030810    PATO:0000461
Name: disease_ontology_term_id, Length: 6947, dtype: object

## ethnicity_ontology_term_id

In [51]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [52]:
adata.obs['ethnicity_ontology_term_id']

AAACCTGTCAATCACG-5891STDY8062349    unknown
AAACCTGTCATTGCCC-5891STDY8062349    unknown
AAACGGGTCATGGTCA-5891STDY8062349    unknown
AAAGCAAAGCCACCTG-5891STDY8062349    unknown
AACCATGAGGTAGCCA-5891STDY8062349    unknown
                                     ...   
TTGCGTCGTGGTCCGT-5891STDY9030810    unknown
TTGTAGGCATCCCATC-5891STDY9030810    unknown
TTTACTGGTCGAGATG-5891STDY9030810    unknown
TTTACTGGTCTTGCGG-5891STDY9030810    unknown
TTTGTCAAGACGCTTT-5891STDY9030810    unknown
Name: ethnicity_ontology_term_id, Length: 6947, dtype: object

## is_primary_data

In [53]:
allcells = pd.read_csv('/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/fetal-lung/cellnames_all.csv', index_col=0)

In [54]:
allcells

Unnamed: 0,batch,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,...,broad_celltype,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
AAACCTGAGACAGGCT-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.042116,8711.0,3101,0.060127,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAACCTGCACCGAATT-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.042811,8228.0,2992,0.131980,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGATGTCGTGACAT-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.052119,3601.0,1714,0.051245,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGCAACAAACGTGG-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.069157,5707.0,2280,0.083019,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGCAACATACTACG-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.065710,7095.0,2639,0.070568,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,Proximal,Normal,0.039884,9581.0,3133,0.330578,0.826283,8,G1,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,Distal,Normal,0.034237,2706.0,1305,0.394231,0.122739,8,G1,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,Distal,Normal,0.040634,7625.0,2735,0.330578,0.122739,8,G1,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,Distal,Normal,0.027229,13147.0,3721,0.245161,0.122739,8,S,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048


In [55]:
adata.obs['is_primary_data'] = [False] * len(adata.obs)

In [59]:
len(set(adata.obs_names).intersection(set(allcells.index)))

6943

In [60]:
adata.obs.loc[set(adata.obs_names).difference(set(allcells.index)),'is_primary_data'] = True

In [61]:
adata.obs['is_primary_data']

AAACCTGTCAATCACG-5891STDY8062349    False
AAACCTGTCATTGCCC-5891STDY8062349    False
AAACGGGTCATGGTCA-5891STDY8062349    False
AAAGCAAAGCCACCTG-5891STDY8062349    False
AACCATGAGGTAGCCA-5891STDY8062349    False
                                    ...  
TTGCGTCGTGGTCCGT-5891STDY9030810    False
TTGTAGGCATCCCATC-5891STDY9030810    False
TTTACTGGTCGAGATG-5891STDY9030810    False
TTTACTGGTCTTGCGG-5891STDY9030810    False
TTTGTCAAGACGCTTT-5891STDY9030810    False
Name: is_primary_data, Length: 6947, dtype: bool

In [62]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [63]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [64]:
adata.obs['organism_ontology_term_id']

AAACCTGTCAATCACG-5891STDY8062349    NCBITaxon:9606
AAACCTGTCATTGCCC-5891STDY8062349    NCBITaxon:9606
AAACGGGTCATGGTCA-5891STDY8062349    NCBITaxon:9606
AAAGCAAAGCCACCTG-5891STDY8062349    NCBITaxon:9606
AACCATGAGGTAGCCA-5891STDY8062349    NCBITaxon:9606
                                         ...      
TTGCGTCGTGGTCCGT-5891STDY9030810    NCBITaxon:9606
TTGTAGGCATCCCATC-5891STDY9030810    NCBITaxon:9606
TTTACTGGTCGAGATG-5891STDY9030810    NCBITaxon:9606
TTTACTGGTCTTGCGG-5891STDY9030810    NCBITaxon:9606
TTTGTCAAGACGCTTT-5891STDY9030810    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 6947, dtype: object

In [65]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [66]:
adata.obs['organism_ontology_term_id']

AAACCTGTCAATCACG-5891STDY8062349    NCBITaxon:9606
AAACCTGTCATTGCCC-5891STDY8062349    NCBITaxon:9606
AAACGGGTCATGGTCA-5891STDY8062349    NCBITaxon:9606
AAAGCAAAGCCACCTG-5891STDY8062349    NCBITaxon:9606
AACCATGAGGTAGCCA-5891STDY8062349    NCBITaxon:9606
                                         ...      
TTGCGTCGTGGTCCGT-5891STDY9030810    NCBITaxon:9606
TTGTAGGCATCCCATC-5891STDY9030810    NCBITaxon:9606
TTTACTGGTCGAGATG-5891STDY9030810    NCBITaxon:9606
TTTACTGGTCTTGCGG-5891STDY9030810    NCBITaxon:9606
TTTGTCAAGACGCTTT-5891STDY9030810    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 6947, dtype: category
Categories (1, object): ['NCBITaxon:9606']

In [67]:
pd.set_option('display.max_columns', 500)

In [68]:
adata.obs

Unnamed: 0,batch,stage,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,S_score,G2M_score,C2_leiden,C2_leiden_R,new_celltype,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id
AAACCTGTCAATCACG-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.054201,2949.0,1385,0.039062,0.999862,2,G1,-0.147739,-0.136545,0,0,SPP1+ MΦ,EFO:0011025,CL:0000235,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
AAACCTGTCATTGCCC-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.049873,30024.0,5380,0.160350,0.027939,2,S,0.661255,0.028148,11,111,Cycling DC,EFO:0011025,CL:0000451,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
AAACGGGTCATGGTCA-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.115864,4258.0,1964,0.057011,0.999862,2,G1,-0.121608,-0.132354,0,0,SPP1+ MΦ,EFO:0011025,CL:0000235,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
AAAGCAAAGCCACCTG-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.043150,8226.0,2561,0.066890,0.826769,2,G1,-0.171818,-0.143412,8,8,S100A12-lo cla. mono.,EFO:0011025,CL:0000860,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
AACCATGAGGTAGCCA-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.058095,9890.0,2956,0.051245,0.826769,2,G1,-0.118939,-0.233848,5,500,Non-cla. mono.,EFO:0011025,CL:0000875,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGCGTCGTGGTCCGT-5891STDY9030810,5891STDY9030810,9.0,Whole-lung,Normal,0.020080,976.0,543,0.102088,0.895486,2,G1,-0.043927,-0.085867,2,2,S100A12-hi cla. mono.,EFO:0011025,CL:0000860,HsapDv:0000046,PATO:0000461,unknown,False,NCBITaxon:9606
TTGTAGGCATCCCATC-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.039915,9477.0,2423,0.080160,0.895486,2,G1,-0.076753,-0.115796,5,500,Non-cla. mono.,EFO:0011025,CL:0000875,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606
TTTACTGGTCGAGATG-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.045369,3556.0,1656,0.020619,0.954489,2,G1,-0.043132,-0.129695,9,9,SPP1+ MΦ,EFO:0011025,CL:0000235,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606
TTTACTGGTCTTGCGG-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.040926,5179.0,1986,0.048951,0.954489,2,G1,-0.053600,-0.096857,4,4,SPP1+ MΦ,EFO:0011025,CL:0000235,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606


## sex_ontology_term_id

**PATO_0000384 for male  
PATO_0000383 for female**

In [70]:
adata.obs['sex_ontology_term_id'] = ['x'] * len(adata.obs)

In [71]:
allcells.loc[list(set(allcells.index).intersection(set(adata.obs_names))), 'sex_ontology_term_id']

CCTTCCCCAGTCTTCC-WSSS_F_LNG8713187    PATO:0000384
TCAGATGCAATTGCTG-WSSS_F_LNG8713186    PATO:0000384
TAAGCGTGTACGCTGC-5891STDY8062351      PATO:0000384
CGCGGTACAGACAAAT-5891STDY8062352      PATO:0000384
AAAGCAAAGCCACCTG-5891STDY8062349      PATO:0000384
                                          ...     
GAATAAGCACCGTTGG-5891STDY8062356      PATO:0000383
CTCGTCAGTTTGTTGG-5891STDY8062350      PATO:0000384
GCGCCAATCATTATCC-WSSS_F_LNG8713178    PATO:0000384
AATCGGTGTACATGTC-5891STDY9030810      PATO:0000384
GGGACCTGTCCAGTAT-WSSS_F_LNG8713187    PATO:0000384
Name: sex_ontology_term_id, Length: 6943, dtype: object

In [72]:
adata.obs.loc[list(set(adata.obs_names).difference(set(allcells.index))),'sex_ontology_term_id'] = 'unknown'

In [73]:
adata.obs.loc[list(set(allcells.index).intersection(set(adata.obs_names))),'sex_ontology_term_id'] = allcells.loc[list(set(allcells.index).intersection(set(adata.obs_names))), 'sex_ontology_term_id']

In [74]:
adata.obs['sex_ontology_term_id']

AAACCTGTCAATCACG-5891STDY8062349    PATO:0000384
AAACCTGTCATTGCCC-5891STDY8062349    PATO:0000384
AAACGGGTCATGGTCA-5891STDY8062349    PATO:0000384
AAAGCAAAGCCACCTG-5891STDY8062349    PATO:0000384
AACCATGAGGTAGCCA-5891STDY8062349    PATO:0000384
                                        ...     
TTGCGTCGTGGTCCGT-5891STDY9030810    PATO:0000383
TTGTAGGCATCCCATC-5891STDY9030810    PATO:0000384
TTTACTGGTCGAGATG-5891STDY9030810    PATO:0000384
TTTACTGGTCTTGCGG-5891STDY9030810    PATO:0000384
TTTGTCAAGACGCTTT-5891STDY9030810    PATO:0000384
Name: sex_ontology_term_id, Length: 6947, dtype: object

## tissue_ontology_term_id

Lung: UBERON:0002048

In [75]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0002048'] * len(adata.obs)

In [76]:
adata.obs['tissue_ontology_term_id']

AAACCTGTCAATCACG-5891STDY8062349    UBERON:0002048
AAACCTGTCATTGCCC-5891STDY8062349    UBERON:0002048
AAACGGGTCATGGTCA-5891STDY8062349    UBERON:0002048
AAAGCAAAGCCACCTG-5891STDY8062349    UBERON:0002048
AACCATGAGGTAGCCA-5891STDY8062349    UBERON:0002048
                                         ...      
TTGCGTCGTGGTCCGT-5891STDY9030810    UBERON:0002048
TTGTAGGCATCCCATC-5891STDY9030810    UBERON:0002048
TTTACTGGTCGAGATG-5891STDY9030810    UBERON:0002048
TTTACTGGTCTTGCGG-5891STDY9030810    UBERON:0002048
TTTGTCAAGACGCTTT-5891STDY9030810    UBERON:0002048
Name: tissue_ontology_term_id, Length: 6947, dtype: object

---

# <ins>var and raw.var

In [77]:
adata.var

Unnamed: 0_level_0,n_cells,Deep_n,means,dispersions,dispersions_norm,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000243485,4,0,0.000716,0.837703,-0.592373,MIR1302-2HG,gene
ENSG00000238009,21,1,0.003444,0.963300,-0.379956,AL627309.1,gene
ENSG00000237491,263,0,0.044037,1.154678,-0.056287,AL669831.5,gene
ENSG00000177757,21,0,0.003756,1.399611,0.357959,FAM87B,gene
ENSG00000225880,402,1,0.069900,1.260772,0.123146,LINC00115,gene
...,...,...,...,...,...,...,...
ENSG00000277196,32,9,0.007644,2.006535,1.384423,AC007325.2,gene
ENSG00000278384,11,0,0.002443,1.105285,-0.139823,AL354822.1,gene
ENSG00000276345,3052,0,0.544092,1.298267,-0.382608,AC004556.1,gene
ENSG00000275063,3,2,0.000397,0.471942,-1.210969,AC233755.1,gene


## feature_is_filtered

In [80]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [81]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000243485    False
ENSG00000238009    False
ENSG00000237491    False
ENSG00000177757    False
ENSG00000225880    False
                   ...  
ENSG00000277196    False
ENSG00000278384    False
ENSG00000276345    False
ENSG00000275063    False
ENSG00000271254    False
Name: feature_is_filtered, Length: 20014, dtype: bool

# <ins>obsm

In [86]:
adata.obsm

AxisArrays with keys: X_C2filteredumap, X_Clusterumap, X_pca, X_umap

# <ins>uns

In [87]:
adata.uns['schema_version'] = '2.0.0'

In [91]:
adata.uns['title'] = 'Myeloid'

In [92]:
adata.uns['X_normalization'] = 'Scanpy'

In [93]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [94]:
adata

AnnData object with n_obs × n_vars = 6947 × 20014
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C2_leiden', 'C2_leiden_R', 'new_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'C2_leiden_R_colors', 'batch_colors', 'new_celltype_colors', 'rank_genes_groups', 'rank_genes_groups_filtered', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_C2filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [95]:
adata.obs

Unnamed: 0,batch,stage,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,S_score,G2M_score,C2_leiden,C2_leiden_R,new_celltype,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
AAACCTGTCAATCACG-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.054201,2949.0,1385,0.039062,0.999862,2,G1,-0.147739,-0.136545,0,0,SPP1+ MΦ,EFO:0011025,CL:0000235,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAACCTGTCATTGCCC-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.049873,30024.0,5380,0.160350,0.027939,2,S,0.661255,0.028148,11,111,Cycling DC,EFO:0011025,CL:0000451,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAACGGGTCATGGTCA-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.115864,4258.0,1964,0.057011,0.999862,2,G1,-0.121608,-0.132354,0,0,SPP1+ MΦ,EFO:0011025,CL:0000235,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGCAAAGCCACCTG-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.043150,8226.0,2561,0.066890,0.826769,2,G1,-0.171818,-0.143412,8,8,S100A12-lo cla. mono.,EFO:0011025,CL:0000860,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AACCATGAGGTAGCCA-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.058095,9890.0,2956,0.051245,0.826769,2,G1,-0.118939,-0.233848,5,500,Non-cla. mono.,EFO:0011025,CL:0000875,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGCGTCGTGGTCCGT-5891STDY9030810,5891STDY9030810,9.0,Whole-lung,Normal,0.020080,976.0,543,0.102088,0.895486,2,G1,-0.043927,-0.085867,2,2,S100A12-hi cla. mono.,EFO:0011025,CL:0000860,HsapDv:0000046,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000383,UBERON:0002048
TTGTAGGCATCCCATC-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.039915,9477.0,2423,0.080160,0.895486,2,G1,-0.076753,-0.115796,5,500,Non-cla. mono.,EFO:0011025,CL:0000875,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
TTTACTGGTCGAGATG-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.045369,3556.0,1656,0.020619,0.954489,2,G1,-0.043132,-0.129695,9,9,SPP1+ MΦ,EFO:0011025,CL:0000235,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
TTTACTGGTCTTGCGG-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.040926,5179.0,1986,0.048951,0.954489,2,G1,-0.053600,-0.096857,4,4,SPP1+ MΦ,EFO:0011025,CL:0000235,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048


In [96]:
adata.obs.dtypes

batch                                 category
stage                                  float64
dissection                            category
chemistry                             category
percent_mito                           float32
n_counts                               float32
n_genes                                  int64
doublet_scores                         float64
bh_pval                                float64
leiden                                category
phase                                 category
S_score                                float32
G2M_score                              float32
C2_leiden                             category
C2_leiden_R                           category
new_celltype                          category
assay_ontology_term_id                  object
cell_type_ontology_term_id              object
development_stage_ontology_term_id      object
disease_ontology_term_id                object
ethnicity_ontology_term_id              object
is_primary_da

In [97]:
del adata.obs['stage']
del adata.obs['bh_pval']
del adata.obs['doublet_scores']
del adata.obs['C2_leiden']
del adata.obs['C2_leiden_R']

In [98]:
adata.obs.dtypes

batch                                 category
dissection                            category
chemistry                             category
percent_mito                           float32
n_counts                               float32
n_genes                                  int64
leiden                                category
phase                                 category
S_score                                float32
G2M_score                              float32
new_celltype                          category
assay_ontology_term_id                  object
cell_type_ontology_term_id              object
development_stage_ontology_term_id      object
disease_ontology_term_id                object
ethnicity_ontology_term_id              object
is_primary_data                           bool
organism_ontology_term_id             category
sex_ontology_term_id                    object
tissue_ontology_term_id                 object
dtype: object

In [99]:
adata.var.dtypes

n_cells                  int64
Deep_n                   int32
means                  float64
dispersions            float64
dispersions_norm       float32
gene_symbols            object
feature_biotype         object
feature_is_filtered       bool
dtype: object

In [100]:
# adata.obs
print('obs')
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

# adata.var
print('var')
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

obs
changed n_genes from int64 to int32
var
changed means from float64 to float32
changed dispersions from float64 to float32
changed n_cells from int64 to int32


In [101]:
adata.write('/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/fetal-lung/final/fl_mye.h5ad', compression = 'gzip')

... storing 'assay_ontology_term_id' as categorical
... storing 'cell_type_ontology_term_id' as categorical
... storing 'development_stage_ontology_term_id' as categorical
... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'sex_ontology_term_id' as categorical
... storing 'tissue_ontology_term_id' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical
