# **Curating `fetal-lung` dataset:**

### PNS dataset

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

In [2]:
# Load the source AnnData object. NOTE(review): hard-coded absolute cluster path — adjust when running elsewhere.
adata = sc.read_h5ad('/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/fetal-lung/C7filtered.h5ad')

In [3]:
adata.var

Unnamed: 0,gene_ids,n_cells,highly_variable5891STDY8062349,highly_variable5891STDY8062350,highly_variable5891STDY8062351,highly_variable5891STDY8062352,highly_variable5891STDY8062353,highly_variable5891STDY8062354,highly_variable5891STDY8062355,highly_variable5891STDY8062356,...,Deep_WSSS_F_LNG8713191,Deep_5891STDY9030806,Deep_5891STDY9030807,Deep_5891STDY9030808,Deep_5891STDY9030809,Deep_5891STDY9030810,Deep_n,means,dispersions,dispersions_norm
MIR1302-2HG,ENSG00000243485,3,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0.003591,-0.107463,-1.354816
AL669831.5,ENSG00000237491,20,False,False,True,False,True,False,False,False,...,False,False,False,False,False,False,0,0.044867,1.259747,1.223400
LINC00115,ENSG00000225880,29,False,True,True,False,True,False,False,False,...,False,False,False,False,False,False,1,0.074161,1.497555,1.671846
FAM41C,ENSG00000230368,3,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,14,0.011604,1.684297,2.023994
AL645608.7,ENSG00000272438,5,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0.010806,0.726696,0.218199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AL592183.1,ENSG00000273748,7,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,1,0.008129,0.102937,-0.958054
AC007325.4,ENSG00000278817,108,True,True,True,True,True,False,False,False,...,False,False,False,False,False,False,5,0.169911,0.488998,-0.230041
AC007325.2,ENSG00000277196,7,True,True,False,False,False,True,True,True,...,False,False,False,True,False,False,9,0.017246,1.249787,1.204617
AC004556.1,ENSG00000276345,306,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0.482869,0.690596,-0.351638


In [4]:
len(adata.var.columns)

66

In [5]:
# Keep only the per-gene summary columns (drops the per-sample highly_variable*/Deep_* flags).
# .copy() makes the subset an independent DataFrame, so later column assignments on
# adata.var do not raise pandas' SettingWithCopyWarning.
adata.var = adata.var[['gene_ids', 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm']].copy()

In [6]:
adata

AnnData object with n_obs × n_vars = 649 × 16182
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C7_leiden', 'C7_leiden_R', 'new_celltype'
    var: 'gene_ids', 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm'
    uns: 'C7_leiden_R_colors', 'batch_colors', 'new_celltype_colors', 'old_celltype_colors', 'rank_genes_groups', 'rank_genes_groups_filtered'
    obsm: 'X_C7filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [7]:
def CalculateRaw(adata,scaling_factor=10000):
    """Reconstruct integer counts from a log-transformed, per-cell normalised matrix.

    (update by Polanski in Feb 2022)

    The log transform is undone with ``expm1``; every row (cell) is then multiplied by
    ``n_counts / scaling_factor`` and rounded to the nearest integer.

    Parameters
    ----------
    adata
        Object whose ``.X`` contains the log-transformed matrix (rows are cells) and
        whose ``.obs['n_counts']`` holds one total-count value per cell.
    scaling_factor
        Normalisation constant the data were scaled to; assumed to be 10000.

    Returns
    -------
    A new ``sc.AnnData`` with the rounded matrix as ``X`` and the input's ``obs``,
    ``var``, ``obsm`` and ``varm``.
    """
    from scipy import sparse  # local import: the notebook only imports the top-level package

    # np.expm1 returns a new matrix, so adata.X is left untouched. Forcing CSR guarantees
    # that X.indptr delimits rows (cells); with CSC input the scaling would otherwise be
    # applied per gene without any error.
    X = sparse.csr_matrix(np.expm1(adata.X))
    scaling_vector = adata.obs['n_counts'].values / scaling_factor
    # np.diff(X.indptr) is the number of stored values in each row, so the repeat expands
    # the per-cell factor to one factor per stored value, in the same order as X.data.
    X.data *= np.repeat(scaling_vector, np.diff(X.indptr))
    return sc.AnnData(X=np.rint(X),obs=adata.obs,var=adata.var,obsm=adata.obsm,varm=adata.varm)

In [8]:
raw = CalculateRaw(adata)

In [9]:
# Drop the per-gene annotation columns that CalculateRaw copied over from adata.var.
del raw.var

In [10]:
# Reference table without a header row; the three columns are named explicitly here.
geno = pd.read_csv(
    '~/genes_homo_sapiens.csv',
    names=['ENSEMBLid', 'gena', 'no'],
    header=None,
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [11]:
geno

Unnamed: 0,ENSEMBLid,gena,no
0,ENSG00000223972,DDX11L1,5
1,ENST00000456328,DDX11L1-202,2
2,ENST00000450305,DDX11L1-201,2
3,ENSG00000227232,WASH7P,5
4,ENST00000488147,WASH7P-201,1
...,...,...,...
297782,ENST00000621424,ENSG110105435.4-001,4
297783,ENSG00000278625,U6,1
297784,ENST00000616830,U6.88-201,1
297785,ENSG00000277374,U1,1


In [12]:
ens_ids = adata.var['gene_ids']

In [13]:
adata.var

Unnamed: 0,gene_ids,n_cells,Deep_n,means,dispersions,dispersions_norm
MIR1302-2HG,ENSG00000243485,3,0,0.003591,-0.107463,-1.354816
AL669831.5,ENSG00000237491,20,0,0.044867,1.259747,1.223400
LINC00115,ENSG00000225880,29,1,0.074161,1.497555,1.671846
FAM41C,ENSG00000230368,3,14,0.011604,1.684297,2.023994
AL645608.7,ENSG00000272438,5,0,0.010806,0.726696,0.218199
...,...,...,...,...,...,...
AL592183.1,ENSG00000273748,7,1,0.008129,0.102937,-0.958054
AC007325.4,ENSG00000278817,108,5,0.169911,0.488998,-0.230041
AC007325.2,ENSG00000277196,7,9,0.017246,1.249787,1.204617
AC004556.1,ENSG00000276345,306,0,0.482869,0.690596,-0.351638


In [14]:
# Lookup keyed by every identifier in the reference table; the value 1 is only a placeholder.
genedict = dict.fromkeys(geno.ENSEMBLid, 1)

In [15]:
genedict

{'ENSG00000223972': 1,
 'ENST00000456328': 1,
 'ENST00000450305': 1,
 'ENSG00000227232': 1,
 'ENST00000488147': 1,
 'ENSG00000278267': 1,
 'ENST00000619216': 1,
 'ENSG00000243485': 1,
 'ENST00000473358': 1,
 'ENST00000469289': 1,
 'ENSG00000284332': 1,
 'ENST00000607096': 1,
 'ENSG00000237613': 1,
 'ENST00000417324': 1,
 'ENST00000461467': 1,
 'ENSG00000268020': 1,
 'ENST00000606857': 1,
 'ENSG00000240361': 1,
 'ENST00000642116': 1,
 'ENST00000492842': 1,
 'ENSG00000186092': 1,
 'ENST00000641515': 1,
 'ENSG00000238009': 1,
 'ENST00000466430': 1,
 'ENST00000477740': 1,
 'ENST00000471248': 1,
 'ENST00000610542': 1,
 'ENST00000453576': 1,
 'ENSG00000239945': 1,
 'ENST00000495576': 1,
 'ENSG00000233750': 1,
 'ENST00000442987': 1,
 'ENSG00000268903': 1,
 'ENST00000494149': 1,
 'ENSG00000269981': 1,
 'ENST00000595919': 1,
 'ENSG00000239906': 1,
 'ENST00000493797': 1,
 'ENSG00000241860': 1,
 'ENST00000484859': 1,
 'ENST00000490997': 1,
 'ENST00000466557': 1,
 'ENST00000662089': 1,
 'ENST00000

In [16]:
# Keep identifiers that are present in the reference lookup or that carry the 'SARSCoV' prefix.
ens_ids2 = [
    feature_id
    for feature_id in ens_ids
    if (feature_id in genedict or feature_id.startswith('SARSCoV'))
]

In [17]:
ens_ids2

['ENSG00000243485',
 'ENSG00000237491',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000188157',
 'ENSG00000131591',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000160087',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',
 'ENSG00000107404',
 'ENSG00000162576',
 'ENSG00000175756',
 'ENSG00000221978',
 'ENSG00000242485',
 'ENSG00000272455',
 'ENSG00000235098',
 'ENSG00000225905',
 'ENSG00000225285',
 'ENSG00000179403',
 'ENSG00000160072',
 'ENSG00000197785',
 'ENSG00000205090',
 'ENSG00000160075',
 'ENSG00000215014',
 'ENSG00000228594',
 'ENSG00000272106',
 'ENSG00000197530',
 'ENSG00000189409',
 'ENSG00000248333',
 'ENSG00000272004',
 'ENSG00000189339',
 'ENSG00000008128',
 'ENSG00000215790',
 'ENSG00000008130',
 'ENSG00000078369',
 'ENSG00000178821',
 'ENSG00000067606',


In [18]:
len(ens_ids2)

16122

In [19]:
# Preserve the current index (gene symbols) as a column before the index is replaced.
adata.var['gene_symbols'] = adata.var_names
# Re-index the genes by the Ensembl IDs stored in the 'gene_ids' column.
adata.var_names = adata.var['gene_ids']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.var['gene_symbols'] = adata.var_names


In [20]:
# adata.var_names already holds Ensembl IDs at this point, so the symbols have to come from
# the 'gene_symbols' column saved on adata.var. .to_numpy() assigns them by position;
# raw was built from adata, so both objects list the genes in the same order.
raw.var['gene_symbols'] = adata.var['gene_symbols'].to_numpy()
# Index raw by the same Ensembl IDs as adata.
raw.var_names = adata.var['gene_ids']

In [21]:
del adata.var['gene_ids']

In [22]:
del raw.var['gene_symbols']

In [23]:
adata = adata[:, ens_ids2].copy()

  res = method(*args, **kwargs)


In [24]:
raw = raw[:, ens_ids2].copy()

In [25]:
adata

AnnData object with n_obs × n_vars = 649 × 16122
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C7_leiden', 'C7_leiden_R', 'new_celltype'
    var: 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols'
    uns: 'C7_leiden_R_colors', 'batch_colors', 'new_celltype_colors', 'old_celltype_colors', 'rank_genes_groups', 'rank_genes_groups_filtered'
    obsm: 'X_C7filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [26]:
raw

AnnData object with n_obs × n_vars = 649 × 16122
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C7_leiden', 'C7_leiden_R', 'new_celltype'
    obsm: 'X_C7filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [27]:
# Strip the cell-level annotations and embeddings from raw before it is attached as adata.raw.
del raw.obs
del raw.obsm

In [28]:
# Every feature is annotated as a gene; the scalar is broadcast to all rows of adata.var.
adata.var['feature_biotype'] = "gene"

In [29]:
# Size the column from raw.var itself rather than from adata.var, so this cell stays
# correct even if the two objects ever hold a different number of genes.
raw.var['feature_biotype'] = ["gene"] * len(raw.var)

In [30]:
adata.raw = raw

## 2. Pre-prepare before schema adjustments

In [31]:
adata.var

Unnamed: 0_level_0,n_cells,Deep_n,means,dispersions,dispersions_norm,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000243485,3,0,0.003591,-0.107463,-1.354816,MIR1302-2HG,gene
ENSG00000237491,20,0,0.044867,1.259747,1.223400,AL669831.5,gene
ENSG00000225880,29,1,0.074161,1.497555,1.671846,LINC00115,gene
ENSG00000230368,3,14,0.011604,1.684297,2.023994,FAM41C,gene
ENSG00000272438,5,0,0.010806,0.726696,0.218199,AL645608.7,gene
...,...,...,...,...,...,...,...
ENSG00000273748,7,1,0.008129,0.102937,-0.958054,AL592183.1,gene
ENSG00000278817,108,5,0.169911,0.488998,-0.230041,AC007325.4,gene
ENSG00000277196,7,9,0.017246,1.249787,1.204617,AC007325.2,gene
ENSG00000276345,306,0,0.482869,0.690596,-0.351638,AC004556.1,gene


---

---

## 3. Apply the schema requirements:

# <ins>X</ins>

Both the raw and the normalized count matrices are already in the object, so no additional adjustment is needed!

# <ins>obs</ins>

In [32]:
adata.obs.columns

Index(['batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts',
       'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score',
       'G2M_score', 'C7_leiden', 'C7_leiden_R', 'new_celltype'],
      dtype='object')

In [33]:
adata.obs

Unnamed: 0,batch,stage,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,S_score,G2M_score,C7_leiden,C7_leiden_R,new_celltype
GCGCAGTGTCCAGTTA-5891STDY8062349,5891STDY8062349,18.00,Proximal,Normal,0.053423,5794.0,2262,0.140584,0.999862,7,G1,-0.042786,-0.117230,8,8,COL20A1+ Schwann
GGATGTTTCTTTACAC-5891STDY8062353,5891STDY8062353,15.00,Proximal,Normal,0.192780,2102.0,1378,0.160350,0.000021,7,S,0.002162,-0.161572,7,7,Late Schwann
TGTCCCAGTGGAAAGA-5891STDY8062353,5891STDY8062353,15.00,Proximal,Normal,0.064559,14765.0,4308,0.171779,0.000021,7,G1,-0.107704,-0.123366,7,7,Late Schwann
AAACCTGCACCTCGGA-WSSS8012016,WSSS8012016,6.86,Whole-lung,Normal,0.009378,5176.0,1809,0.163569,0.893066,7,G1,-0.004359,-0.081952,10,10,Early Schwann
ACATCAGAGTCGTACT-WSSS8012016,WSSS8012016,6.86,Whole-lung,Normal,0.035617,8123.0,3058,0.061184,0.921305,7,G1,-0.085392,-0.100867,4,41,PCP4+ neuron
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTGAAGTAGTCAAGCG-5891STDY9030810,5891STDY9030810,11.00,Whole-lung,Normal,0.051788,8221.0,3349,0.190647,0.796614,7,G1,-0.102093,-0.082692,11,11,KCNIP4+ neuron
GAATGAAGTATAAACG-5891STDY9030810,5891STDY9030810,9.00,Whole-lung,Normal,0.103537,3853.0,1816,0.090323,0.796614,7,G1,-0.046438,-0.208117,6,60,Mid Schwann
TGGCCAGGTATAGTAG-5891STDY9030810,5891STDY9030810,11.00,Whole-lung,Normal,0.060677,21703.0,5670,0.127860,0.796614,7,G1,-0.157894,-0.221724,6,60,Mid Schwann
TTCTCCTTCATGTCTT-5891STDY9030810,5891STDY9030810,11.00,Whole-lung,Normal,0.047309,22813.0,5970,0.258581,0.796614,7,S,0.015327,-0.130961,8,8,COL20A1+ Schwann


## assay_ontology_term_id

The assay used in this study is 10x 5' v1 (see the excerpt from the publication below).

10x 3' v2	"EFO:0009899"  
10x 3' v3	"EFO:0009922"  
10x 5' v1	"EFO:0011025"  
Smart-seq	"EFO:0008930"  
Smart-seq2	"EFO:0008931"

> Chromium Single Cell 5’ V(D)J Reagent Kits (V1.0 chemistry) were used for scRNAseq library
construction.

In [34]:
# Same assay term for every cell; the scalar is broadcast over all rows of adata.obs.
adata.obs['assay_ontology_term_id'] = 'EFO:0011025'

In [35]:
adata.obs['assay_ontology_term_id']

GCGCAGTGTCCAGTTA-5891STDY8062349    EFO:0011025
GGATGTTTCTTTACAC-5891STDY8062353    EFO:0011025
TGTCCCAGTGGAAAGA-5891STDY8062353    EFO:0011025
AAACCTGCACCTCGGA-WSSS8012016        EFO:0011025
ACATCAGAGTCGTACT-WSSS8012016        EFO:0011025
                                       ...     
CTGAAGTAGTCAAGCG-5891STDY9030810    EFO:0011025
GAATGAAGTATAAACG-5891STDY9030810    EFO:0011025
TGGCCAGGTATAGTAG-5891STDY9030810    EFO:0011025
TTCTCCTTCATGTCTT-5891STDY9030810    EFO:0011025
TTCTTAGGTCGGATCC-5891STDY9030810    EFO:0011025
Name: assay_ontology_term_id, Length: 649, dtype: object

## cell_type_ontology_term_id

In [36]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['new_celltype']

In [37]:
len(list(adata.obs['cell_type_ontology_term_id'].unique()))

13

In [38]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['COL20A1+ Schwann',
 'Late Schwann',
 'Early Schwann',
 'PCP4+ neuron',
 'MFNG+ DBH+ neuron',
 'Schwann precursor',
 'TM4SF4+ CHODL+ neuron',
 'SST+ neuron',
 'KCNIP4+ neuron',
 'TM4SF4+ PENK+ neuron',
 'FGFBP2+ Neural progenitor',
 'Mid Schwann',
 'Proliferating Schwann']

In [39]:
# Author cell-type labels -> Cell Ontology term IDs, grouped by target term.
mapping = {
    **dict.fromkeys(
        ['COL20A1+ Schwann', 'Late Schwann', 'Mid Schwann', 'Proliferating Schwann'],
        'CL:0002573',
    ),
    'Early Schwann': 'CL:0002377',  # immature
    'Schwann precursor': 'CL:0002375',
    **dict.fromkeys(
        [
            'PCP4+ neuron',
            'MFNG+ DBH+ neuron',
            'TM4SF4+ CHODL+ neuron',
            'SST+ neuron',
            'KCNIP4+ neuron',
            'TM4SF4+ PENK+ neuron',
        ],
        'CL:0000540',
    ),
    'FGFBP2+ Neural progenitor': 'CL:0011020',
}

In [40]:
# Map from the source labels rather than from the target column itself, so re-running this
# cell reproduces the same result instead of turning already-mapped IDs into NaN.
adata.obs['cell_type_ontology_term_id'] = adata.obs['new_celltype'].map(mapping)
# Fail loudly if any label has no ontology term instead of silently leaving NaN behind.
assert adata.obs['cell_type_ontology_term_id'].notna().all(), 'unmapped labels in new_celltype'

## development_stage_ontology_term_id

In [41]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['stage']

In [42]:
list(adata.obs['stage'].unique())

[18.0, 15.0, 6.86, 5.0, 20.0, 22.0, 9.0, 11.0]

In [43]:
# Values of the 'stage' column -> HsapDv term IDs, listed in ascending stage order.
mapping = {
    5.0: 'HsapDv:0000023',
    6.86: 'HsapDv:0000026',
    9.0: 'HsapDv:0000046',
    11.0: 'HsapDv:0000048',
    15.0: 'HsapDv:0000052',
    18.0: 'HsapDv:0000055',
    20.0: 'HsapDv:0000057',
    22.0: 'HsapDv:0000059',
}

In [44]:
# Map from the source 'stage' values rather than from the target column itself, so
# re-running this cell reproduces the same result instead of mapping term IDs to NaN.
adata.obs['development_stage_ontology_term_id'] = adata.obs['stage'].map(mapping)
# Fail loudly if any stage value has no ontology term instead of silently leaving NaN behind.
assert adata.obs['development_stage_ontology_term_id'].notna().all(), 'unmapped values in stage'

In [45]:
adata.obs['development_stage_ontology_term_id']

GCGCAGTGTCCAGTTA-5891STDY8062349    HsapDv:0000055
GGATGTTTCTTTACAC-5891STDY8062353    HsapDv:0000052
TGTCCCAGTGGAAAGA-5891STDY8062353    HsapDv:0000052
AAACCTGCACCTCGGA-WSSS8012016        HsapDv:0000026
ACATCAGAGTCGTACT-WSSS8012016        HsapDv:0000026
                                         ...      
CTGAAGTAGTCAAGCG-5891STDY9030810    HsapDv:0000048
GAATGAAGTATAAACG-5891STDY9030810    HsapDv:0000046
TGGCCAGGTATAGTAG-5891STDY9030810    HsapDv:0000048
TTCTCCTTCATGTCTT-5891STDY9030810    HsapDv:0000048
TTCTTAGGTCGGATCC-5891STDY9030810    HsapDv:0000046
Name: development_stage_ontology_term_id, Length: 649, dtype: object

In [46]:
adata.obs['development_stage_ontology_term_id'].unique()

array(['HsapDv:0000055', 'HsapDv:0000052', 'HsapDv:0000026',
       'HsapDv:0000023', 'HsapDv:0000057', 'HsapDv:0000059',
       'HsapDv:0000046', 'HsapDv:0000048'], dtype=object)

## disease_ontology_term_id

In [47]:
# Same disease term for every cell; the scalar is broadcast over all rows of adata.obs.
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [48]:
adata.obs['disease_ontology_term_id']

GCGCAGTGTCCAGTTA-5891STDY8062349    PATO:0000461
GGATGTTTCTTTACAC-5891STDY8062353    PATO:0000461
TGTCCCAGTGGAAAGA-5891STDY8062353    PATO:0000461
AAACCTGCACCTCGGA-WSSS8012016        PATO:0000461
ACATCAGAGTCGTACT-WSSS8012016        PATO:0000461
                                        ...     
CTGAAGTAGTCAAGCG-5891STDY9030810    PATO:0000461
GAATGAAGTATAAACG-5891STDY9030810    PATO:0000461
TGGCCAGGTATAGTAG-5891STDY9030810    PATO:0000461
TTCTCCTTCATGTCTT-5891STDY9030810    PATO:0000461
TTCTTAGGTCGGATCC-5891STDY9030810    PATO:0000461
Name: disease_ontology_term_id, Length: 649, dtype: object

In [49]:
adata.obs['disease_ontology_term_id']

GCGCAGTGTCCAGTTA-5891STDY8062349    PATO:0000461
GGATGTTTCTTTACAC-5891STDY8062353    PATO:0000461
TGTCCCAGTGGAAAGA-5891STDY8062353    PATO:0000461
AAACCTGCACCTCGGA-WSSS8012016        PATO:0000461
ACATCAGAGTCGTACT-WSSS8012016        PATO:0000461
                                        ...     
CTGAAGTAGTCAAGCG-5891STDY9030810    PATO:0000461
GAATGAAGTATAAACG-5891STDY9030810    PATO:0000461
TGGCCAGGTATAGTAG-5891STDY9030810    PATO:0000461
TTCTCCTTCATGTCTT-5891STDY9030810    PATO:0000461
TTCTTAGGTCGGATCC-5891STDY9030810    PATO:0000461
Name: disease_ontology_term_id, Length: 649, dtype: object

## ethnicity_ontology_term_id

In [50]:
# Ethnicity is recorded as 'unknown' for every cell (scalar broadcast over all rows).
adata.obs['ethnicity_ontology_term_id'] = 'unknown'

In [51]:
adata.obs['ethnicity_ontology_term_id']

GCGCAGTGTCCAGTTA-5891STDY8062349    unknown
GGATGTTTCTTTACAC-5891STDY8062353    unknown
TGTCCCAGTGGAAAGA-5891STDY8062353    unknown
AAACCTGCACCTCGGA-WSSS8012016        unknown
ACATCAGAGTCGTACT-WSSS8012016        unknown
                                     ...   
CTGAAGTAGTCAAGCG-5891STDY9030810    unknown
GAATGAAGTATAAACG-5891STDY9030810    unknown
TGGCCAGGTATAGTAG-5891STDY9030810    unknown
TTCTCCTTCATGTCTT-5891STDY9030810    unknown
TTCTTAGGTCGGATCC-5891STDY9030810    unknown
Name: ethnicity_ontology_term_id, Length: 649, dtype: object

## is_primary_data

In [52]:
# Per-cell metadata table indexed by its first column (cell barcodes).
# NOTE(review): hard-coded absolute cluster path — adjust when running elsewhere.
allcells = pd.read_csv('/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/fetal-lung/cellnames_all.csv', index_col=0)

In [53]:
allcells

Unnamed: 0,batch,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,...,broad_celltype,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
AAACCTGAGACAGGCT-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.042116,8711.0,3101,0.060127,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAACCTGCACCGAATT-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.042811,8228.0,2992,0.131980,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGATGTCGTGACAT-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.052119,3601.0,1714,0.051245,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGCAACAAACGTGG-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.069157,5707.0,2280,0.083019,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGCAACATACTACG-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.065710,7095.0,2639,0.070568,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,Proximal,Normal,0.039884,9581.0,3133,0.330578,0.826283,8,G1,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,Distal,Normal,0.034237,2706.0,1305,0.394231,0.122739,8,G1,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,Distal,Normal,0.040634,7625.0,2735,0.330578,0.122739,8,G1,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,Distal,Normal,0.027229,13147.0,3721,0.245161,0.122739,8,S,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048


In [54]:
# Default every cell to non-primary; cells missing from `allcells` are switched to True in a later cell.
adata.obs['is_primary_data'] = False

In [55]:
len(set(adata.obs_names).intersection(set(allcells.index)))

649

In [58]:
# Flag as primary only the cells that are absent from the `allcells` table. A boolean mask
# is used because a set is not a supported .loc indexer in current pandas.
adata.obs.loc[~adata.obs_names.isin(allcells.index), 'is_primary_data'] = True

In [59]:
adata.obs['is_primary_data']

GCGCAGTGTCCAGTTA-5891STDY8062349    False
GGATGTTTCTTTACAC-5891STDY8062353    False
TGTCCCAGTGGAAAGA-5891STDY8062353    False
AAACCTGCACCTCGGA-WSSS8012016        False
ACATCAGAGTCGTACT-WSSS8012016        False
                                    ...  
CTGAAGTAGTCAAGCG-5891STDY9030810    False
GAATGAAGTATAAACG-5891STDY9030810    False
TGGCCAGGTATAGTAG-5891STDY9030810    False
TTCTCCTTCATGTCTT-5891STDY9030810    False
TTCTTAGGTCGGATCC-5891STDY9030810    False
Name: is_primary_data, Length: 649, dtype: bool

In [60]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [61]:
# Same organism term for every cell; the scalar is broadcast over all rows of adata.obs.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [62]:
adata.obs['organism_ontology_term_id']

GCGCAGTGTCCAGTTA-5891STDY8062349    NCBITaxon:9606
GGATGTTTCTTTACAC-5891STDY8062353    NCBITaxon:9606
TGTCCCAGTGGAAAGA-5891STDY8062353    NCBITaxon:9606
AAACCTGCACCTCGGA-WSSS8012016        NCBITaxon:9606
ACATCAGAGTCGTACT-WSSS8012016        NCBITaxon:9606
                                         ...      
CTGAAGTAGTCAAGCG-5891STDY9030810    NCBITaxon:9606
GAATGAAGTATAAACG-5891STDY9030810    NCBITaxon:9606
TGGCCAGGTATAGTAG-5891STDY9030810    NCBITaxon:9606
TTCTCCTTCATGTCTT-5891STDY9030810    NCBITaxon:9606
TTCTTAGGTCGGATCC-5891STDY9030810    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 649, dtype: object

In [63]:
# Store the organism term as a categorical column.
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [64]:
adata.obs['organism_ontology_term_id']

GCGCAGTGTCCAGTTA-5891STDY8062349    NCBITaxon:9606
GGATGTTTCTTTACAC-5891STDY8062353    NCBITaxon:9606
TGTCCCAGTGGAAAGA-5891STDY8062353    NCBITaxon:9606
AAACCTGCACCTCGGA-WSSS8012016        NCBITaxon:9606
ACATCAGAGTCGTACT-WSSS8012016        NCBITaxon:9606
                                         ...      
CTGAAGTAGTCAAGCG-5891STDY9030810    NCBITaxon:9606
GAATGAAGTATAAACG-5891STDY9030810    NCBITaxon:9606
TGGCCAGGTATAGTAG-5891STDY9030810    NCBITaxon:9606
TTCTCCTTCATGTCTT-5891STDY9030810    NCBITaxon:9606
TTCTTAGGTCGGATCC-5891STDY9030810    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 649, dtype: category
Categories (1, object): ['NCBITaxon:9606']

In [65]:
pd.set_option('display.max_columns', 500)

In [66]:
adata.obs

Unnamed: 0,batch,stage,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,S_score,G2M_score,C7_leiden,C7_leiden_R,new_celltype,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id
GCGCAGTGTCCAGTTA-5891STDY8062349,5891STDY8062349,18.00,Proximal,Normal,0.053423,5794.0,2262,0.140584,0.999862,7,G1,-0.042786,-0.117230,8,8,COL20A1+ Schwann,EFO:0011025,CL:0002573,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
GGATGTTTCTTTACAC-5891STDY8062353,5891STDY8062353,15.00,Proximal,Normal,0.192780,2102.0,1378,0.160350,0.000021,7,S,0.002162,-0.161572,7,7,Late Schwann,EFO:0011025,CL:0002573,HsapDv:0000052,PATO:0000461,unknown,False,NCBITaxon:9606
TGTCCCAGTGGAAAGA-5891STDY8062353,5891STDY8062353,15.00,Proximal,Normal,0.064559,14765.0,4308,0.171779,0.000021,7,G1,-0.107704,-0.123366,7,7,Late Schwann,EFO:0011025,CL:0002573,HsapDv:0000052,PATO:0000461,unknown,False,NCBITaxon:9606
AAACCTGCACCTCGGA-WSSS8012016,WSSS8012016,6.86,Whole-lung,Normal,0.009378,5176.0,1809,0.163569,0.893066,7,G1,-0.004359,-0.081952,10,10,Early Schwann,EFO:0011025,CL:0002377,HsapDv:0000026,PATO:0000461,unknown,False,NCBITaxon:9606
ACATCAGAGTCGTACT-WSSS8012016,WSSS8012016,6.86,Whole-lung,Normal,0.035617,8123.0,3058,0.061184,0.921305,7,G1,-0.085392,-0.100867,4,41,PCP4+ neuron,EFO:0011025,CL:0000540,HsapDv:0000026,PATO:0000461,unknown,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTGAAGTAGTCAAGCG-5891STDY9030810,5891STDY9030810,11.00,Whole-lung,Normal,0.051788,8221.0,3349,0.190647,0.796614,7,G1,-0.102093,-0.082692,11,11,KCNIP4+ neuron,EFO:0011025,CL:0000540,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606
GAATGAAGTATAAACG-5891STDY9030810,5891STDY9030810,9.00,Whole-lung,Normal,0.103537,3853.0,1816,0.090323,0.796614,7,G1,-0.046438,-0.208117,6,60,Mid Schwann,EFO:0011025,CL:0002573,HsapDv:0000046,PATO:0000461,unknown,False,NCBITaxon:9606
TGGCCAGGTATAGTAG-5891STDY9030810,5891STDY9030810,11.00,Whole-lung,Normal,0.060677,21703.0,5670,0.127860,0.796614,7,G1,-0.157894,-0.221724,6,60,Mid Schwann,EFO:0011025,CL:0002573,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606
TTCTCCTTCATGTCTT-5891STDY9030810,5891STDY9030810,11.00,Whole-lung,Normal,0.047309,22813.0,5970,0.258581,0.796614,7,S,0.015327,-0.130961,8,8,COL20A1+ Schwann,EFO:0011025,CL:0002573,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606


## sex_ontology_term_id

**PATO:0000384 for male  
PATO:0000383 for female**

In [67]:
# Create the column with a placeholder value 'x'; the following cells overwrite it.
adata.obs['sex_ontology_term_id'] = 'x'

In [68]:
allcells.loc[list(set(allcells.index).intersection(set(adata.obs_names))), 'sex_ontology_term_id']

GTCCTCAGTGCGCTTG-WSSS8011222          PATO:0000383
GGATTACGTTCATGGT-WSSS_F_LNG8713178    PATO:0000384
TACGGATAGTAACCCT-WSSS8012016          PATO:0000383
GGATGTTTCTTTACAC-5891STDY8062353      PATO:0000383
CATGCCTTCAGCTCGG-WSSS8011222          PATO:0000383
                                          ...     
GTTCTCGTCGTATCAG-WSSS8011222          PATO:0000383
TGTGTTTTCAACCAAC-5891STDY9030809      PATO:0000384
CGTAGGCGTATTCTCT-5891STDY9030808      PATO:0000383
TAGGCATGTACCGCTG-WSSS8011222          PATO:0000383
GACCTGGGTGTCTGAT-5891STDY9030809      PATO:0000384
Name: sex_ontology_term_id, Length: 649, dtype: object

In [69]:
# Cells that are not listed in `allcells` get 'unknown' as their sex term.
adata.obs.loc[~adata.obs_names.isin(allcells.index), 'sex_ontology_term_id'] = 'unknown'

In [70]:
# Copy the sex term from `allcells` for every cell present in both tables; the assignment
# is aligned on the cell barcode index.
shared_cells = list(set(allcells.index) & set(adata.obs_names))
adata.obs.loc[shared_cells, 'sex_ontology_term_id'] = allcells.loc[shared_cells, 'sex_ontology_term_id']

In [71]:
adata.obs['sex_ontology_term_id']

GCGCAGTGTCCAGTTA-5891STDY8062349    PATO:0000384
GGATGTTTCTTTACAC-5891STDY8062353    PATO:0000383
TGTCCCAGTGGAAAGA-5891STDY8062353    PATO:0000383
AAACCTGCACCTCGGA-WSSS8012016        PATO:0000383
ACATCAGAGTCGTACT-WSSS8012016        PATO:0000383
                                        ...     
CTGAAGTAGTCAAGCG-5891STDY9030810    PATO:0000384
GAATGAAGTATAAACG-5891STDY9030810    PATO:0000383
TGGCCAGGTATAGTAG-5891STDY9030810    PATO:0000384
TTCTCCTTCATGTCTT-5891STDY9030810    PATO:0000384
TTCTTAGGTCGGATCC-5891STDY9030810    PATO:0000383
Name: sex_ontology_term_id, Length: 649, dtype: object

## tissue_ontology_term_id

Lung: UBERON:0002048

In [72]:
# Same tissue term (lung) for every cell; the scalar is broadcast over all rows of adata.obs.
adata.obs['tissue_ontology_term_id'] = 'UBERON:0002048'

In [73]:
adata.obs['tissue_ontology_term_id']

GCGCAGTGTCCAGTTA-5891STDY8062349    UBERON:0002048
GGATGTTTCTTTACAC-5891STDY8062353    UBERON:0002048
TGTCCCAGTGGAAAGA-5891STDY8062353    UBERON:0002048
AAACCTGCACCTCGGA-WSSS8012016        UBERON:0002048
ACATCAGAGTCGTACT-WSSS8012016        UBERON:0002048
                                         ...      
CTGAAGTAGTCAAGCG-5891STDY9030810    UBERON:0002048
GAATGAAGTATAAACG-5891STDY9030810    UBERON:0002048
TGGCCAGGTATAGTAG-5891STDY9030810    UBERON:0002048
TTCTCCTTCATGTCTT-5891STDY9030810    UBERON:0002048
TTCTTAGGTCGGATCC-5891STDY9030810    UBERON:0002048
Name: tissue_ontology_term_id, Length: 649, dtype: object

---

# <ins>var and raw.var</ins>

In [74]:
adata.var

Unnamed: 0_level_0,n_cells,Deep_n,means,dispersions,dispersions_norm,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000243485,3,0,0.003591,-0.107463,-1.354816,MIR1302-2HG,gene
ENSG00000237491,20,0,0.044867,1.259747,1.223400,AL669831.5,gene
ENSG00000225880,29,1,0.074161,1.497555,1.671846,LINC00115,gene
ENSG00000230368,3,14,0.011604,1.684297,2.023994,FAM41C,gene
ENSG00000272438,5,0,0.010806,0.726696,0.218199,AL645608.7,gene
...,...,...,...,...,...,...,...
ENSG00000273748,7,1,0.008129,0.102937,-0.958054,AL592183.1,gene
ENSG00000278817,108,5,0.169911,0.488998,-0.230041,AC007325.4,gene
ENSG00000277196,7,9,0.017246,1.249787,1.204617,AC007325.2,gene
ENSG00000276345,306,0,0.482869,0.690596,-0.351638,AC004556.1,gene


## feature_is_filtered

In [77]:
# No feature is marked as filtered; the scalar False is broadcast over all rows of adata.var.
adata.var['feature_is_filtered'] = False

In [78]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000243485    False
ENSG00000237491    False
ENSG00000225880    False
ENSG00000230368    False
ENSG00000272438    False
                   ...  
ENSG00000273748    False
ENSG00000278817    False
ENSG00000277196    False
ENSG00000276345    False
ENSG00000271254    False
Name: feature_is_filtered, Length: 16122, dtype: bool

# <ins>obsm</ins>

In [83]:
adata.obsm

AxisArrays with keys: X_C7filteredumap, X_Clusterumap, X_pca, X_umap

# <ins>uns</ins>

In [84]:
adata.uns['schema_version'] = '2.0.0'

In [85]:
adata.uns['title'] = 'PNS'

In [86]:
adata.uns['X_normalization'] = 'Scanpy'

In [87]:
adata.uns['default_embedding'] = 'X_umap'

---

---

## 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [88]:
adata

AnnData object with n_obs × n_vars = 649 × 16122
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C7_leiden', 'C7_leiden_R', 'new_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'C7_leiden_R_colors', 'batch_colors', 'new_celltype_colors', 'old_celltype_colors', 'rank_genes_groups', 'rank_genes_groups_filtered', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_C7filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [89]:
adata.obs

Unnamed: 0,batch,stage,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,S_score,G2M_score,C7_leiden,C7_leiden_R,new_celltype,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
GCGCAGTGTCCAGTTA-5891STDY8062349,5891STDY8062349,18.00,Proximal,Normal,0.053423,5794.0,2262,0.140584,0.999862,7,G1,-0.042786,-0.117230,8,8,COL20A1+ Schwann,EFO:0011025,CL:0002573,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
GGATGTTTCTTTACAC-5891STDY8062353,5891STDY8062353,15.00,Proximal,Normal,0.192780,2102.0,1378,0.160350,0.000021,7,S,0.002162,-0.161572,7,7,Late Schwann,EFO:0011025,CL:0002573,HsapDv:0000052,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000383,UBERON:0002048
TGTCCCAGTGGAAAGA-5891STDY8062353,5891STDY8062353,15.00,Proximal,Normal,0.064559,14765.0,4308,0.171779,0.000021,7,G1,-0.107704,-0.123366,7,7,Late Schwann,EFO:0011025,CL:0002573,HsapDv:0000052,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000383,UBERON:0002048
AAACCTGCACCTCGGA-WSSS8012016,WSSS8012016,6.86,Whole-lung,Normal,0.009378,5176.0,1809,0.163569,0.893066,7,G1,-0.004359,-0.081952,10,10,Early Schwann,EFO:0011025,CL:0002377,HsapDv:0000026,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000383,UBERON:0002048
ACATCAGAGTCGTACT-WSSS8012016,WSSS8012016,6.86,Whole-lung,Normal,0.035617,8123.0,3058,0.061184,0.921305,7,G1,-0.085392,-0.100867,4,41,PCP4+ neuron,EFO:0011025,CL:0000540,HsapDv:0000026,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000383,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTGAAGTAGTCAAGCG-5891STDY9030810,5891STDY9030810,11.00,Whole-lung,Normal,0.051788,8221.0,3349,0.190647,0.796614,7,G1,-0.102093,-0.082692,11,11,KCNIP4+ neuron,EFO:0011025,CL:0000540,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
GAATGAAGTATAAACG-5891STDY9030810,5891STDY9030810,9.00,Whole-lung,Normal,0.103537,3853.0,1816,0.090323,0.796614,7,G1,-0.046438,-0.208117,6,60,Mid Schwann,EFO:0011025,CL:0002573,HsapDv:0000046,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000383,UBERON:0002048
TGGCCAGGTATAGTAG-5891STDY9030810,5891STDY9030810,11.00,Whole-lung,Normal,0.060677,21703.0,5670,0.127860,0.796614,7,G1,-0.157894,-0.221724,6,60,Mid Schwann,EFO:0011025,CL:0002573,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
TTCTCCTTCATGTCTT-5891STDY9030810,5891STDY9030810,11.00,Whole-lung,Normal,0.047309,22813.0,5970,0.258581,0.796614,7,S,0.015327,-0.130961,8,8,COL20A1+ Schwann,EFO:0011025,CL:0002573,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048


In [90]:
# List the dtype of every obs column ahead of the column removal and downcasting below.
adata.obs.dtypes

batch                                 category
stage                                  float64
dissection                            category
chemistry                             category
percent_mito                           float32
n_counts                               float32
n_genes                                  int64
doublet_scores                         float64
bh_pval                                float64
leiden                                category
phase                                 category
S_score                                float32
G2M_score                              float32
C7_leiden                             category
C7_leiden_R                           category
new_celltype                          category
assay_ontology_term_id                  object
cell_type_ontology_term_id              object
development_stage_ontology_term_id      object
disease_ontology_term_id                object
ethnicity_ontology_term_id              object
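is_primary_data                           bool
organism_ontology_term_id             category
sex_ontology_term_id                    object
tissue_ontology_term_id                 object
dtype: object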

In [91]:
# Remove the obs columns that are not kept in the final object, one at a time
# and in a fixed order, so a missing column still raises KeyError.
for obsolete_column in ('stage', 'bh_pval', 'doublet_scores', 'C7_leiden', 'C7_leiden_R'):
    del adata.obs[obsolete_column]

In [92]:
# Re-check the obs dtypes after removing the columns above.
adata.obs.dtypes

batch                                 category
dissection                            category
chemistry                             category
percent_mito                           float32
n_counts                               float32
n_genes                                  int64
leiden                                category
phase                                 category
S_score                                float32
G2M_score                              float32
new_celltype                          category
assay_ontology_term_id                  object
cell_type_ontology_term_id              object
development_stage_ontology_term_id      object
disease_ontology_term_id                object
ethnicity_ontology_term_id              object
is_primary_data                           bool
organism_ontology_term_id             category
sex_ontology_term_id                    object
tissue_ontology_term_id                 object
dtype: object

In [93]:
# List the dtype of every var column; the 64-bit ones are downcast in the next cell.
adata.var.dtypes

n_cells                  int64
Deep_n                   int32
means                  float64
dispersions            float64
dispersions_norm       float32
gene_symbols            object
feature_biotype         object
feature_is_filtered       bool
dtype: object

In [94]:
def downcast_64bit_columns(frame, label):
    """Downcast the 64-bit numeric columns of ``frame`` to 32-bit, in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose columns are modified (here ``adata.obs`` or ``adata.var``).
    label : str
        Heading printed before the per-column messages.

    Notes
    -----
    float64 columns are cast to float32. int64 columns are cast to int32 only
    when every value fits in the int32 range: ``astype('int32')`` would
    otherwise wrap out-of-range values silently, so such a column is left
    untouched and reported instead.
    """
    print(label)

    # Snapshot both column lists before any column is replaced, so the set of
    # columns to convert is fixed up front.
    column_dtypes = frame.dtypes
    float64_columns = [name for name, dtype in column_dtypes.items() if dtype == 'float64']
    int64_columns = [name for name, dtype in column_dtypes.items() if dtype == 'int64']

    for name in float64_columns:
        frame[name] = frame[name].astype('float32')
        print(f"changed {name} from float64 to float32")

    int32_bounds = np.iinfo(np.int32)
    for name in int64_columns:
        if not frame[name].between(int32_bounds.min, int32_bounds.max).all():
            print(f"kept {name} as int64: values do not fit in int32")
            continue
        frame[name] = frame[name].astype('int32')
        print(f"changed {name} from int64 to int32")


# Apply the same downcasting to the per-cell and per-gene tables.
downcast_64bit_columns(adata.obs, 'obs')
downcast_64bit_columns(adata.var, 'var')

obs
changed n_genes from int64 to int32
var
changed means from float64 to float32
changed dispersions from float64 to float32
changed n_cells from int64 to int32


In [95]:
# Write the curated object to its final location as a gzip-compressed .h5ad file.
adata.write_h5ad(
    '/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/fetal-lung/final/fl_pns.h5ad',
    compression='gzip',
)

... storing 'assay_ontology_term_id' as categorical
... storing 'cell_type_ontology_term_id' as categorical
... storing 'development_stage_ontology_term_id' as categorical
... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'sex_ontology_term_id' as categorical
... storing 'tissue_ontology_term_id' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical
