# **Curating `fetal-lung` dataset:**

### T/NK/ILC dataset

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [3]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

In [4]:
# Load the fetal-lung T/NK/ILC object from disk.
# NOTE(review): absolute cluster path -- adjust when running in another environment.
adata = sc.read_h5ad(
    '/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/fetal-lung/C4filtered.h5ad'
)

In [5]:
adata.var

Unnamed: 0,gene_ids,n_cells,highly_variable5891STDY8062349,highly_variable5891STDY8062350,highly_variable5891STDY8062351,highly_variable5891STDY8062352,highly_variable5891STDY8062353,highly_variable5891STDY8062354,highly_variable5891STDY8062355,highly_variable5891STDY8062356,...,Deep_WSSS_F_LNG8713191,Deep_5891STDY9030806,Deep_5891STDY9030807,Deep_5891STDY9030808,Deep_5891STDY9030809,Deep_5891STDY9030810,Deep_n,means,dispersions,dispersions_norm
AL627309.1,ENSG00000238009,12,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,1,0.007779,2.167017,1.588299
AL669831.5,ENSG00000237491,116,False,False,True,False,True,False,False,False,...,False,False,False,False,False,False,0,0.052239,1.765612,0.840349
FAM87B,ENSG00000177757,4,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0.001574,0.800999,-0.957045
LINC00115,ENSG00000225880,117,False,True,True,False,True,False,False,False,...,False,False,False,False,False,False,1,0.063205,1.779426,0.866089
FAM41C,ENSG00000230368,13,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,14,0.004869,0.877874,-0.813801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC007325.4,ENSG00000278817,98,True,True,True,True,True,False,False,False,...,False,False,False,False,False,False,5,0.030571,1.182253,-0.246642
AC007325.2,ENSG00000277196,3,True,True,False,False,False,True,True,True,...,False,False,False,True,False,False,9,0.001751,1.412020,0.181490
AL354822.1,ENSG00000278384,6,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0.001440,0.424227,-1.659097
AC004556.1,ENSG00000276345,1123,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0.353873,0.891707,-1.426738


In [6]:
len(adata.var.columns)

66

In [7]:
# Keep only the var columns used downstream (the loaded object carries 66).
# .copy() detaches the selection from the original frame; without it the later
# `adata.var['gene_symbols'] = ...` assignment raises pandas' SettingWithCopyWarning.
adata.var = adata.var[
    ['gene_ids', 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm']
].copy()

In [8]:
adata

AnnData object with n_obs × n_vars = 4778 × 17937
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C4_leiden', 'C4_leiden_R', 'new_celltype', 'status_summary', 'productive_summary', 'isotype_summary', 'receptor_type', 'receptor_subtype'
    var: 'gene_ids', 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm'
    uns: 'batch_colors', 'new_celltype_colors', 'old_celltype_colors', 'rank_genes_groups', 'rank_genes_groups_filtered'
    obsm: 'X_C4filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [9]:
def CalculateRaw(adata,scaling_factor=10000):
    """Rebuild an integer-count AnnData from a log-transformed one.

    (update by Polanski in Feb 2022)

    ``adata.X`` must hold a log1p-transformed matrix (cells x genes) that is
    assumed to have been normalised to ``scaling_factor`` counts per cell, with
    each cell's original total stored in ``adata.obs['n_counts']``. The counts
    are recovered as ``rint(expm1(X) * n_counts / scaling_factor)``.

    Parameters
    ----------
    adata : AnnData
        Object carrying the log-transformed matrix and ``obs['n_counts']``.
    scaling_factor : number, default 10000
        Normalisation constant that was used when ``X`` was produced.

    Returns
    -------
    AnnData
        New object with the rounded counts as ``X`` and the input's
        ``obs``, ``var``, ``obsm`` and ``varm``.
    """
    from scipy import sparse

    # Coerce to CSR so that rows of the index structure are guaranteed to be cells;
    # expm1() returns a new matrix, so adata.X itself is never modified.
    X = sparse.csr_matrix(adata.X).expm1()
    scaling_vector = np.asarray(adata.obs['n_counts'].values, dtype=float) / scaling_factor
    # In CSR, np.diff(indptr) is the number of stored values per row (cell).
    # Repeating each cell's factor that many times lines it up with X.data,
    # which scales every row in one vectorised step.
    X.data *= np.repeat(scaling_vector, np.diff(X.indptr))
    return sc.AnnData(X=X.rint(),obs=adata.obs,var=adata.var,obsm=adata.obsm,varm=adata.varm)

In [10]:
# Reconstruct integer counts, stating the normalisation constant explicitly.
raw = CalculateRaw(adata, scaling_factor=10000)

In [11]:
# Drop the var annotation columns that raw received from adata.
del raw.var

In [12]:
# Reference table of Ensembl identifiers (headerless CSV, three columns).
# NOTE(review): the file lives in the user's home directory -- confirm the location before re-running.
geno = pd.read_csv(
    '~/genes_homo_sapiens.csv',
    names=['ENSEMBLid', 'gena', 'no'],
    header=None,
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [13]:
geno

Unnamed: 0,ENSEMBLid,gena,no
0,ENSG00000223972,DDX11L1,5
1,ENST00000456328,DDX11L1-202,2
2,ENST00000450305,DDX11L1-201,2
3,ENSG00000227232,WASH7P,5
4,ENST00000488147,WASH7P-201,1
...,...,...,...
297782,ENST00000621424,ENSG110105435.4-001,4
297783,ENSG00000278625,U6,1
297784,ENST00000616830,U6.88-201,1
297785,ENSG00000277374,U1,1


In [14]:
# Ensembl IDs of all genes currently in the object.
ens_ids = adata.var.loc[:, 'gene_ids']

In [15]:
adata.var

Unnamed: 0,gene_ids,n_cells,Deep_n,means,dispersions,dispersions_norm
AL627309.1,ENSG00000238009,12,1,0.007779,2.167017,1.588299
AL669831.5,ENSG00000237491,116,0,0.052239,1.765612,0.840349
FAM87B,ENSG00000177757,4,0,0.001574,0.800999,-0.957045
LINC00115,ENSG00000225880,117,1,0.063205,1.779426,0.866089
FAM41C,ENSG00000230368,13,14,0.004869,0.877874,-0.813801
...,...,...,...,...,...,...
AC007325.4,ENSG00000278817,98,5,0.030571,1.182253,-0.246642
AC007325.2,ENSG00000277196,3,9,0.001751,1.412020,0.181490
AL354822.1,ENSG00000278384,6,0,0.001440,0.424227,-1.659097
AC004556.1,ENSG00000276345,1123,0,0.353873,0.891707,-1.426738


In [16]:
# Lookup of every identifier in the reference table (the value 1 is only a placeholder).
genedict = dict.fromkeys(geno.ENSEMBLid, 1)

In [17]:
genedict

{'ENSG00000223972': 1,
 'ENST00000456328': 1,
 'ENST00000450305': 1,
 'ENSG00000227232': 1,
 'ENST00000488147': 1,
 'ENSG00000278267': 1,
 'ENST00000619216': 1,
 'ENSG00000243485': 1,
 'ENST00000473358': 1,
 'ENST00000469289': 1,
 'ENSG00000284332': 1,
 'ENST00000607096': 1,
 'ENSG00000237613': 1,
 'ENST00000417324': 1,
 'ENST00000461467': 1,
 'ENSG00000268020': 1,
 'ENST00000606857': 1,
 'ENSG00000240361': 1,
 'ENST00000642116': 1,
 'ENST00000492842': 1,
 'ENSG00000186092': 1,
 'ENST00000641515': 1,
 'ENSG00000238009': 1,
 'ENST00000466430': 1,
 'ENST00000477740': 1,
 'ENST00000471248': 1,
 'ENST00000610542': 1,
 'ENST00000453576': 1,
 'ENSG00000239945': 1,
 'ENST00000495576': 1,
 'ENSG00000233750': 1,
 'ENST00000442987': 1,
 'ENSG00000268903': 1,
 'ENST00000494149': 1,
 'ENSG00000269981': 1,
 'ENST00000595919': 1,
 'ENSG00000239906': 1,
 'ENST00000493797': 1,
 'ENSG00000241860': 1,
 'ENST00000484859': 1,
 'ENST00000490997': 1,
 'ENST00000466557': 1,
 'ENST00000662089': 1,
 'ENST00000

In [18]:
# Keep the IDs present in the reference table, plus any SARSCoV-prefixed features.
ens_ids2 = list(
    filter(
        lambda gene_id: gene_id in genedict or gene_id.startswith('SARSCoV'),
        ens_ids,
    )
)

In [19]:
ens_ids2

['ENSG00000238009',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000131591',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',
 'ENSG00000107404',
 'ENSG00000162576',
 'ENSG00000175756',
 'ENSG00000221978',
 'ENSG00000242485',
 'ENSG00000272455',
 'ENSG00000235098',
 'ENSG00000179403',
 'ENSG00000215915',
 'ENSG00000160072',
 'ENSG00000197785',
 'ENSG00000160075',
 'ENSG00000215014',
 'ENSG00000228594',
 'ENSG00000272106',
 'ENSG00000197530',
 'ENSG00000189409',
 'ENSG00000248333',
 'ENSG00000272004',
 'ENSG00000189339',
 'ENSG00000008128',


In [20]:
len(ens_ids2)

17851

In [21]:
# Keep the current index (gene symbols) as a column before replacing it.
adata.var['gene_symbols'] = adata.var_names
# From here on var_names are the Ensembl IDs taken from the 'gene_ids' column.
adata.var_names = adata.var['gene_ids']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.var['gene_symbols'] = adata.var_names


In [22]:
# adata.var_names already holds Ensembl IDs at this point, so the symbols must come from
# adata.var['gene_symbols']. The copy is positional: raw was built from adata without
# reordering genes, and the two objects no longer share index labels.
raw.var['gene_symbols'] = adata.var['gene_symbols'].values
# Re-index raw by Ensembl ID so that it matches adata.
raw.var_names = adata.var['gene_ids']

In [23]:
# The Ensembl IDs are now the var index, so the duplicate column is removed.
del adata.var['gene_ids']

In [24]:
# Remove the temporary symbol column from raw.var again.
del raw.var['gene_symbols']

In [25]:
# Restrict adata to the genes listed in ens_ids2 (17937 -> 17851 genes) and
# materialise the subset as an independent object.
adata = adata[:, ens_ids2].copy()

  res = method(*args, **kwargs)


In [26]:
# Apply the same gene selection to raw so that both objects keep identical genes.
raw = raw[:, ens_ids2].copy()

In [27]:
adata

AnnData object with n_obs × n_vars = 4778 × 17851
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C4_leiden', 'C4_leiden_R', 'new_celltype', 'status_summary', 'productive_summary', 'isotype_summary', 'receptor_type', 'receptor_subtype'
    var: 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols'
    uns: 'batch_colors', 'new_celltype_colors', 'old_celltype_colors', 'rank_genes_groups', 'rank_genes_groups_filtered'
    obsm: 'X_C4filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [28]:
raw

AnnData object with n_obs × n_vars = 4778 × 17851
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C4_leiden', 'C4_leiden_R', 'new_celltype', 'status_summary', 'productive_summary', 'isotype_summary', 'receptor_type', 'receptor_subtype'
    obsm: 'X_C4filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [29]:
# Clear the cell annotations and embeddings on raw; adata carries its own obs and obsm.
del raw.obs
del raw.obsm

In [30]:
# Every feature is a gene; a scalar is broadcast over all rows.
adata.var['feature_biotype'] = "gene"

In [31]:
# Same biotype label for raw.var; a scalar is broadcast over all rows.
raw.var['feature_biotype'] = "gene"

In [32]:
# Attach the reconstructed count object as adata.raw.
adata.raw = raw

## 2. Preparation before schema adjustments

In [33]:
adata.var

Unnamed: 0_level_0,n_cells,Deep_n,means,dispersions,dispersions_norm,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000238009,12,1,0.007779,2.167017,1.588299,AL627309.1,gene
ENSG00000237491,116,0,0.052239,1.765612,0.840349,AL669831.5,gene
ENSG00000177757,4,0,0.001574,0.800999,-0.957045,FAM87B,gene
ENSG00000225880,117,1,0.063205,1.779426,0.866089,LINC00115,gene
ENSG00000230368,13,14,0.004869,0.877874,-0.813801,FAM41C,gene
...,...,...,...,...,...,...,...
ENSG00000278817,98,5,0.030571,1.182253,-0.246642,AC007325.4,gene
ENSG00000277196,3,9,0.001751,1.412020,0.181490,AC007325.2,gene
ENSG00000278384,6,0,0.001440,0.424227,-1.659097,AL354822.1,gene
ENSG00000276345,1123,0,0.353873,0.891707,-1.426738,AC004556.1,gene


---

---

## 3. Apply the schema requirements:

# <ins>X</ins>

Raw and normalized count matrices are already in the object; no additional adjustment is needed.

# <ins>obs</ins>

In [34]:
adata.obs.columns

Index(['batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts',
       'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score',
       'G2M_score', 'C4_leiden', 'C4_leiden_R', 'new_celltype',
       'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype'],
      dtype='object')

In [35]:
adata.obs

Unnamed: 0,batch,stage,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,...,S_score,G2M_score,C4_leiden,C4_leiden_R,new_celltype,status_summary,productive_summary,isotype_summary,receptor_type,receptor_subtype
AAACGGGGTAGGGTAC-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.044039,9790.0,2360,0.098121,0.999862,4,...,-0.072387,-0.057476,1,1,CD4 T,TRB + TRA,T + T,unassigned,TCR,TRA+TRB
AAAGCAAGTAAGAGAG-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.080093,4330.0,1921,0.031100,0.999862,4,...,-0.166618,-0.095826,0,0,CD16+ NK,,,,,
AACACGTAGTTGAGAT-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.043467,3829.0,1684,0.031100,0.999862,4,...,-0.033295,-0.007830,2,2,CD56bright NK,,,,,
AACTCCCGTAGCCTAT-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.042411,10996.0,2828,0.063415,0.999862,4,...,-0.035172,-0.094765,1,1,CD4 T,TRB + TRA,Multi,unassigned,TCR,TRA+TRB
AACTCTTGTCTAACGT-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.069772,4333.0,1849,0.017189,0.999862,4,...,-0.145187,-0.190998,3,3,Intermediate NK,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TACGGTATCTCTGAGA-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.049737,13717.0,4028,0.235669,0.796614,4,...,0.592496,-0.017360,4,41,ILC3,,,,,
TGACAACCAATCAGAA-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.050541,23330.0,5200,0.206897,0.796614,4,...,0.712578,0.073712,16,160,ILCP,,,,,
TGACTAGGTACAAGTA-5891STDY9030810,5891STDY9030810,9.0,Whole-lung,Normal,0.044490,10223.0,3191,0.190647,0.796614,4,...,-0.002027,-0.126401,4,41,ILC3,,,,,
TGGTTAGCAGGGCATA-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.184981,1921.0,1317,0.169687,0.796614,4,...,-0.065463,-0.046523,16,160,ILCP,,,,,


## assay_ontology_term_id

The assay used in this study is 10x 5' v1, as stated in the publication (quoted below).

10x 3' v2	"EFO:0009899"  
10x 3' v3	"EFO:0009922"  
10x 5' v1	"EFO:0011025"  
Smart-seq	"EFO:0008930"  
Smart-seq2	"EFO:0008931"

> Chromium Single Cell 5’ V(D)J Reagent Kits (V1.0 chemistry) were used for scRNAseq library
construction.

In [36]:
# 10x 5' v1 for every cell; a scalar is broadcast over all rows.
adata.obs['assay_ontology_term_id'] = 'EFO:0011025'

In [37]:
adata.obs['assay_ontology_term_id']

AAACGGGGTAGGGTAC-5891STDY8062349    EFO:0011025
AAAGCAAGTAAGAGAG-5891STDY8062349    EFO:0011025
AACACGTAGTTGAGAT-5891STDY8062349    EFO:0011025
AACTCCCGTAGCCTAT-5891STDY8062349    EFO:0011025
AACTCTTGTCTAACGT-5891STDY8062349    EFO:0011025
                                       ...     
TACGGTATCTCTGAGA-5891STDY9030810    EFO:0011025
TGACAACCAATCAGAA-5891STDY9030810    EFO:0011025
TGACTAGGTACAAGTA-5891STDY9030810    EFO:0011025
TGGTTAGCAGGGCATA-5891STDY9030810    EFO:0011025
TTCCCAGTCCTACAGA-5891STDY9030810    EFO:0011025
Name: assay_ontology_term_id, Length: 4778, dtype: object

## cell_type_ontology_term_id

In [38]:
# Start from the author labels; they are translated to CL ids by the mapping cell further down.
adata.obs['cell_type_ontology_term_id'] = adata.obs['new_celltype']

In [39]:
len(list(adata.obs['cell_type_ontology_term_id'].unique()))

16

In [40]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['CD4 T',
 'CD16+ NK',
 'CD56bright NK',
 'Intermediate NK',
 'Th17',
 'Treg',
 'CD8 T',
 'Cycling T',
 'NKT2',
 'NKT1',
 'ILC3',
 'Cycling NK',
 'ILC2',
 'Activated NK',
 'ILCP',
 'Tαβ_Entry']

In [41]:
# Author cell-type label -> Cell Ontology (CL) term id.
# Entries tagged "broader term" use an id that does not capture the qualifier in the label.
mapping = {
    # T cells
    # NOTE(review): verify this id against CL -- CL:0000624 is the general
    # CD4-positive, alpha-beta T cell term.
    'CD4 T': 'CL:0000897',
    'CD8 T': 'CL:0000625',
    'Th17': 'CL:0000899',
    'Treg': 'CL:0000815',
    'Cycling T': 'CL:0000084',
    'Tαβ_Entry': 'CL:0000893',        # broader term: "entry" is not captured
    # NKT cells
    'NKT1': 'CL:0000921',
    'NKT2': 'CL:0000922',
    # NK cells
    'CD16+ NK': 'CL:0000939',
    'CD56bright NK': 'CL:0000938',
    'Intermediate NK': 'CL:0000623',  # broader term: "intermediate" is not captured
    'Cycling NK': 'CL:0000623',       # broader term: "cycling" is not captured
    'Activated NK': 'CL:0000623',     # broader term: "activated" is not captured
    # Innate lymphoid cells
    'ILC2': 'CL:0001069',
    'ILC3': 'CL:0001071',
    'ILCP': 'CL:0001074',
}

In [42]:
# Map from the untouched 'new_celltype' labels instead of remapping the column in place,
# so this cell can be re-run safely (a second in-place .map() would turn every CL id into NaN).
adata.obs['cell_type_ontology_term_id'] = adata.obs['new_celltype'].map(mapping)
# .map() silently yields NaN for labels missing from the dictionary; fail loudly instead.
assert adata.obs['cell_type_ontology_term_id'].notna().all(), \
    'some new_celltype labels have no entry in the cell-type mapping'

## development_stage_ontology_term_id

In [43]:
# Start from the numeric 'stage' values; they are translated to HsapDv ids by the mapping cell further down.
adata.obs['development_stage_ontology_term_id'] = adata.obs['stage']

In [44]:
list(adata.obs['stage'].unique())

[18.0, 22.0, 15.0, 20.0, 6.86, 9.0, 11.0]

In [45]:
# 'stage' value -> HsapDv term id, listed in ascending stage order.
# NOTE(review): stage units are presumably post-conception weeks -- confirm with the data providers.
# 5.0 is not among the stages present in this object but is kept in the table.
mapping = {
    5.0: 'HsapDv:0000023',
    6.86: 'HsapDv:0000026',
    9.0: 'HsapDv:0000046',
    11.0: 'HsapDv:0000048',
    15.0: 'HsapDv:0000052',
    18.0: 'HsapDv:0000055',
    20.0: 'HsapDv:0000057',
    22.0: 'HsapDv:0000059',
}

In [46]:
# Map from the untouched 'stage' values instead of remapping the column in place,
# so this cell can be re-run safely (a second in-place .map() would turn every id into NaN).
adata.obs['development_stage_ontology_term_id'] = adata.obs['stage'].map(mapping)
# .map() silently yields NaN for stages missing from the dictionary; fail loudly instead.
assert adata.obs['development_stage_ontology_term_id'].notna().all(), \
    'some stage values have no entry in the development-stage mapping'

In [47]:
adata.obs['development_stage_ontology_term_id']

AAACGGGGTAGGGTAC-5891STDY8062349    HsapDv:0000055
AAAGCAAGTAAGAGAG-5891STDY8062349    HsapDv:0000055
AACACGTAGTTGAGAT-5891STDY8062349    HsapDv:0000055
AACTCCCGTAGCCTAT-5891STDY8062349    HsapDv:0000055
AACTCTTGTCTAACGT-5891STDY8062349    HsapDv:0000055
                                         ...      
TACGGTATCTCTGAGA-5891STDY9030810    HsapDv:0000048
TGACAACCAATCAGAA-5891STDY9030810    HsapDv:0000048
TGACTAGGTACAAGTA-5891STDY9030810    HsapDv:0000046
TGGTTAGCAGGGCATA-5891STDY9030810    HsapDv:0000048
TTCCCAGTCCTACAGA-5891STDY9030810    HsapDv:0000048
Name: development_stage_ontology_term_id, Length: 4778, dtype: object

In [48]:
adata.obs['development_stage_ontology_term_id'].unique()

array(['HsapDv:0000055', 'HsapDv:0000059', 'HsapDv:0000052',
       'HsapDv:0000057', 'HsapDv:0000026', 'HsapDv:0000046',
       'HsapDv:0000048'], dtype=object)

## disease_ontology_term_id

In [49]:
# Same disease term for every cell; a scalar is broadcast over all rows.
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [50]:
adata.obs['disease_ontology_term_id']

AAACGGGGTAGGGTAC-5891STDY8062349    PATO:0000461
AAAGCAAGTAAGAGAG-5891STDY8062349    PATO:0000461
AACACGTAGTTGAGAT-5891STDY8062349    PATO:0000461
AACTCCCGTAGCCTAT-5891STDY8062349    PATO:0000461
AACTCTTGTCTAACGT-5891STDY8062349    PATO:0000461
                                        ...     
TACGGTATCTCTGAGA-5891STDY9030810    PATO:0000461
TGACAACCAATCAGAA-5891STDY9030810    PATO:0000461
TGACTAGGTACAAGTA-5891STDY9030810    PATO:0000461
TGGTTAGCAGGGCATA-5891STDY9030810    PATO:0000461
TTCCCAGTCCTACAGA-5891STDY9030810    PATO:0000461
Name: disease_ontology_term_id, Length: 4778, dtype: object

In [51]:
adata.obs['disease_ontology_term_id']

AAACGGGGTAGGGTAC-5891STDY8062349    PATO:0000461
AAAGCAAGTAAGAGAG-5891STDY8062349    PATO:0000461
AACACGTAGTTGAGAT-5891STDY8062349    PATO:0000461
AACTCCCGTAGCCTAT-5891STDY8062349    PATO:0000461
AACTCTTGTCTAACGT-5891STDY8062349    PATO:0000461
                                        ...     
TACGGTATCTCTGAGA-5891STDY9030810    PATO:0000461
TGACAACCAATCAGAA-5891STDY9030810    PATO:0000461
TGACTAGGTACAAGTA-5891STDY9030810    PATO:0000461
TGGTTAGCAGGGCATA-5891STDY9030810    PATO:0000461
TTCCCAGTCCTACAGA-5891STDY9030810    PATO:0000461
Name: disease_ontology_term_id, Length: 4778, dtype: object

## ethnicity_ontology_term_id

In [52]:
# Ethnicity is recorded as 'unknown' for every cell; a scalar is broadcast over all rows.
adata.obs['ethnicity_ontology_term_id'] = 'unknown'

In [53]:
adata.obs['ethnicity_ontology_term_id']

AAACGGGGTAGGGTAC-5891STDY8062349    unknown
AAAGCAAGTAAGAGAG-5891STDY8062349    unknown
AACACGTAGTTGAGAT-5891STDY8062349    unknown
AACTCCCGTAGCCTAT-5891STDY8062349    unknown
AACTCTTGTCTAACGT-5891STDY8062349    unknown
                                     ...   
TACGGTATCTCTGAGA-5891STDY9030810    unknown
TGACAACCAATCAGAA-5891STDY9030810    unknown
TGACTAGGTACAAGTA-5891STDY9030810    unknown
TGGTTAGCAGGGCATA-5891STDY9030810    unknown
TTCCCAGTCCTACAGA-5891STDY9030810    unknown
Name: ethnicity_ontology_term_id, Length: 4778, dtype: object

## is_primary_data

In [54]:
# Per-cell metadata of the all-cells object, indexed by cell barcode (first CSV column).
# NOTE(review): absolute cluster path -- adjust when running in another environment.
allcells = pd.read_csv(
    '/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/fetal-lung/cellnames_all.csv',
    index_col=0,
)

In [55]:
allcells

Unnamed: 0,batch,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,...,broad_celltype,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
AAACCTGAGACAGGCT-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.042116,8711.0,3101,0.060127,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAACCTGCACCGAATT-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.042811,8228.0,2992,0.131980,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGATGTCGTGACAT-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.052119,3601.0,1714,0.051245,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGCAACAAACGTGG-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.069157,5707.0,2280,0.083019,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGCAACATACTACG-5891STDY8062349,5891STDY8062349,Proximal,Normal,0.065710,7095.0,2639,0.070568,0.826769,0,G1,...,Fibroblast,EFO:0011025,CL:4028004,HsapDv:0000055,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,Proximal,Normal,0.039884,9581.0,3133,0.330578,0.826283,8,G1,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,Distal,Normal,0.034237,2706.0,1305,0.394231,0.122739,8,G1,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,Distal,Normal,0.040634,7625.0,2735,0.330578,0.122739,8,G1,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,Distal,Normal,0.027229,13147.0,3721,0.245161,0.122739,8,S,...,Other myeloid,EFO:0011025,CL:0000097,HsapDv:0000052,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002048


In [56]:
# Default every cell to non-primary; a scalar is broadcast over all rows.
adata.obs['is_primary_data'] = False

In [57]:
len(set(adata.obs_names).intersection(set(allcells.index)))

4778

In [58]:
# Cells that are not found in the all-cells table are flagged as primary here.
# A boolean mask replaces the original set-valued .loc indexer, which pandas
# deprecated in 1.5 and rejects with a TypeError from 2.0 on.
adata.obs.loc[~adata.obs_names.isin(allcells.index), 'is_primary_data'] = True

In [59]:
adata.obs['is_primary_data']

AAACGGGGTAGGGTAC-5891STDY8062349    False
AAAGCAAGTAAGAGAG-5891STDY8062349    False
AACACGTAGTTGAGAT-5891STDY8062349    False
AACTCCCGTAGCCTAT-5891STDY8062349    False
AACTCTTGTCTAACGT-5891STDY8062349    False
                                    ...  
TACGGTATCTCTGAGA-5891STDY9030810    False
TGACAACCAATCAGAA-5891STDY9030810    False
TGACTAGGTACAAGTA-5891STDY9030810    False
TGGTTAGCAGGGCATA-5891STDY9030810    False
TTCCCAGTCCTACAGA-5891STDY9030810    False
Name: is_primary_data, Length: 4778, dtype: bool

In [60]:
# Make sure the column is stored with a plain boolean dtype.
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [61]:
# Human for every cell; a scalar is broadcast over all rows.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [62]:
adata.obs['organism_ontology_term_id']

AAACGGGGTAGGGTAC-5891STDY8062349    NCBITaxon:9606
AAAGCAAGTAAGAGAG-5891STDY8062349    NCBITaxon:9606
AACACGTAGTTGAGAT-5891STDY8062349    NCBITaxon:9606
AACTCCCGTAGCCTAT-5891STDY8062349    NCBITaxon:9606
AACTCTTGTCTAACGT-5891STDY8062349    NCBITaxon:9606
                                         ...      
TACGGTATCTCTGAGA-5891STDY9030810    NCBITaxon:9606
TGACAACCAATCAGAA-5891STDY9030810    NCBITaxon:9606
TGACTAGGTACAAGTA-5891STDY9030810    NCBITaxon:9606
TGGTTAGCAGGGCATA-5891STDY9030810    NCBITaxon:9606
TTCCCAGTCCTACAGA-5891STDY9030810    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 4778, dtype: object

In [63]:
# Store the organism term as a categorical column.
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [64]:
adata.obs['organism_ontology_term_id']

AAACGGGGTAGGGTAC-5891STDY8062349    NCBITaxon:9606
AAAGCAAGTAAGAGAG-5891STDY8062349    NCBITaxon:9606
AACACGTAGTTGAGAT-5891STDY8062349    NCBITaxon:9606
AACTCCCGTAGCCTAT-5891STDY8062349    NCBITaxon:9606
AACTCTTGTCTAACGT-5891STDY8062349    NCBITaxon:9606
                                         ...      
TACGGTATCTCTGAGA-5891STDY9030810    NCBITaxon:9606
TGACAACCAATCAGAA-5891STDY9030810    NCBITaxon:9606
TGACTAGGTACAAGTA-5891STDY9030810    NCBITaxon:9606
TGGTTAGCAGGGCATA-5891STDY9030810    NCBITaxon:9606
TTCCCAGTCCTACAGA-5891STDY9030810    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 4778, dtype: category
Categories (1, object): ['NCBITaxon:9606']

In [65]:
# Show up to 500 columns when displaying wide frames such as adata.obs.
pd.options.display.max_columns = 500

In [66]:
adata.obs

Unnamed: 0,batch,stage,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,S_score,G2M_score,C4_leiden,C4_leiden_R,new_celltype,status_summary,productive_summary,isotype_summary,receptor_type,receptor_subtype,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id
AAACGGGGTAGGGTAC-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.044039,9790.0,2360,0.098121,0.999862,4,G1,-0.072387,-0.057476,1,1,CD4 T,TRB + TRA,T + T,unassigned,TCR,TRA+TRB,EFO:0011025,CL:0000897,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
AAAGCAAGTAAGAGAG-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.080093,4330.0,1921,0.031100,0.999862,4,G1,-0.166618,-0.095826,0,0,CD16+ NK,,,,,,EFO:0011025,CL:0000939,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
AACACGTAGTTGAGAT-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.043467,3829.0,1684,0.031100,0.999862,4,G1,-0.033295,-0.007830,2,2,CD56bright NK,,,,,,EFO:0011025,CL:0000938,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
AACTCCCGTAGCCTAT-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.042411,10996.0,2828,0.063415,0.999862,4,G1,-0.035172,-0.094765,1,1,CD4 T,TRB + TRA,Multi,unassigned,TCR,TRA+TRB,EFO:0011025,CL:0000897,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
AACTCTTGTCTAACGT-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.069772,4333.0,1849,0.017189,0.999862,4,G1,-0.145187,-0.190998,3,3,Intermediate NK,,,,,,EFO:0011025,CL:0000623,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TACGGTATCTCTGAGA-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.049737,13717.0,4028,0.235669,0.796614,4,S,0.592496,-0.017360,4,41,ILC3,,,,,,EFO:0011025,CL:0001071,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606
TGACAACCAATCAGAA-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.050541,23330.0,5200,0.206897,0.796614,4,S,0.712578,0.073712,16,160,ILCP,,,,,,EFO:0011025,CL:0001074,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606
TGACTAGGTACAAGTA-5891STDY9030810,5891STDY9030810,9.0,Whole-lung,Normal,0.044490,10223.0,3191,0.190647,0.796614,4,G1,-0.002027,-0.126401,4,41,ILC3,,,,,,EFO:0011025,CL:0001071,HsapDv:0000046,PATO:0000461,unknown,False,NCBITaxon:9606
TGGTTAGCAGGGCATA-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.184981,1921.0,1317,0.169687,0.796614,4,G1,-0.065463,-0.046523,16,160,ILCP,,,,,,EFO:0011025,CL:0001074,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606


## sex_ontology_term_id

**PATO:0000384 for male  
PATO:0000383 for female**

In [67]:
# Placeholder value; the following cells overwrite it for every cell.
adata.obs['sex_ontology_term_id'] = 'x'

In [68]:
allcells.loc[list(set(allcells.index).intersection(set(adata.obs_names))), 'sex_ontology_term_id']

CTGATAGGTTACCAGT-WSSS_F_LNG8713177    PATO:0000384
TGTCCCAGTCTAGCGC-5891STDY8062355      PATO:0000383
AGCTCCTGTACGACCC-WSSS_F_LNG8713179    PATO:0000384
CAGTAACCAGCTTCGG-WSSS_F_LNG8713179    PATO:0000384
GGTATTGTCTAGCACA-5891STDY8062350      PATO:0000384
                                          ...     
AGGCCGTGTTCGCGAC-WSSS_F_LNG8713181    PATO:0000384
ACTGCTCCACCAGATT-5891STDY8062355      PATO:0000383
CGTCACTTCAAGCCTA-5891STDY8062351      PATO:0000384
CTCGAGGGTCCATGAT-WSSS_F_LNG8713187    PATO:0000384
TATGCCCGTCAACTGT-WSSS_F_LNG8713187    PATO:0000384
Name: sex_ontology_term_id, Length: 4778, dtype: object

In [69]:
# Cells without an entry in the all-cells table get 'unknown'.
adata.obs.loc[~adata.obs_names.isin(allcells.index), 'sex_ontology_term_id'] = 'unknown'

In [70]:
# Copy the sex term from the all-cells table for every barcode present in both objects.
# The assignment aligns on the cell barcode, so row order does not matter.
shared_cells = adata.obs_names[adata.obs_names.isin(allcells.index)]
adata.obs.loc[shared_cells, 'sex_ontology_term_id'] = allcells.loc[shared_cells, 'sex_ontology_term_id']

In [71]:
adata.obs['sex_ontology_term_id']

AAACGGGGTAGGGTAC-5891STDY8062349    PATO:0000384
AAAGCAAGTAAGAGAG-5891STDY8062349    PATO:0000384
AACACGTAGTTGAGAT-5891STDY8062349    PATO:0000384
AACTCCCGTAGCCTAT-5891STDY8062349    PATO:0000384
AACTCTTGTCTAACGT-5891STDY8062349    PATO:0000384
                                        ...     
TACGGTATCTCTGAGA-5891STDY9030810    PATO:0000384
TGACAACCAATCAGAA-5891STDY9030810    PATO:0000384
TGACTAGGTACAAGTA-5891STDY9030810    PATO:0000383
TGGTTAGCAGGGCATA-5891STDY9030810    PATO:0000384
TTCCCAGTCCTACAGA-5891STDY9030810    PATO:0000384
Name: sex_ontology_term_id, Length: 4778, dtype: object

## tissue_ontology_term_id

Lung: UBERON:0002048

In [72]:
# Lung for every cell; a scalar is broadcast over all rows.
adata.obs['tissue_ontology_term_id'] = 'UBERON:0002048'

In [73]:
adata.obs['tissue_ontology_term_id']

AAACGGGGTAGGGTAC-5891STDY8062349    UBERON:0002048
AAAGCAAGTAAGAGAG-5891STDY8062349    UBERON:0002048
AACACGTAGTTGAGAT-5891STDY8062349    UBERON:0002048
AACTCCCGTAGCCTAT-5891STDY8062349    UBERON:0002048
AACTCTTGTCTAACGT-5891STDY8062349    UBERON:0002048
                                         ...      
TACGGTATCTCTGAGA-5891STDY9030810    UBERON:0002048
TGACAACCAATCAGAA-5891STDY9030810    UBERON:0002048
TGACTAGGTACAAGTA-5891STDY9030810    UBERON:0002048
TGGTTAGCAGGGCATA-5891STDY9030810    UBERON:0002048
TTCCCAGTCCTACAGA-5891STDY9030810    UBERON:0002048
Name: tissue_ontology_term_id, Length: 4778, dtype: object

---

# <ins>var and raw.var</ins>

In [74]:
adata.var

Unnamed: 0_level_0,n_cells,Deep_n,means,dispersions,dispersions_norm,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000238009,12,1,0.007779,2.167017,1.588299,AL627309.1,gene
ENSG00000237491,116,0,0.052239,1.765612,0.840349,AL669831.5,gene
ENSG00000177757,4,0,0.001574,0.800999,-0.957045,FAM87B,gene
ENSG00000225880,117,1,0.063205,1.779426,0.866089,LINC00115,gene
ENSG00000230368,13,14,0.004869,0.877874,-0.813801,FAM41C,gene
...,...,...,...,...,...,...,...
ENSG00000278817,98,5,0.030571,1.182253,-0.246642,AC007325.4,gene
ENSG00000277196,3,9,0.001751,1.412020,0.181490,AC007325.2,gene
ENSG00000278384,6,0,0.001440,0.424227,-1.659097,AL354822.1,gene
ENSG00000276345,1123,0,0.353873,0.891707,-1.426738,AC004556.1,gene


## feature_is_filtered

In [77]:
# No feature is marked as filtered; a scalar is broadcast over all rows.
adata.var['feature_is_filtered'] = False

In [78]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000238009    False
ENSG00000237491    False
ENSG00000177757    False
ENSG00000225880    False
ENSG00000230368    False
                   ...  
ENSG00000278817    False
ENSG00000277196    False
ENSG00000278384    False
ENSG00000276345    False
ENSG00000271254    False
Name: feature_is_filtered, Length: 17851, dtype: bool

# <ins>obsm</ins>

In [83]:
adata.obsm

AxisArrays with keys: X_C4filteredumap, X_Clusterumap, X_pca, X_umap

# <ins>uns</ins>

In [84]:
# Schema version recorded in the object's metadata.
adata.uns['schema_version'] = '2.0.0'

In [85]:
# Title for this T/NK/ILC subset of the fetal-lung data.
adata.uns['title'] = 'T, NK and ILC'

In [86]:
# Free-text note on how X was normalised.
# NOTE(review): 'Scanpy' names the tool rather than the method -- confirm this is specific enough.
adata.uns['X_normalization'] = 'Scanpy'

In [87]:
# Embedding to use by default; 'X_umap' is one of the keys listed in adata.obsm.
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [88]:
adata

AnnData object with n_obs × n_vars = 4778 × 17851
    obs: 'batch', 'stage', 'dissection', 'chemistry', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'C4_leiden', 'C4_leiden_R', 'new_celltype', 'status_summary', 'productive_summary', 'isotype_summary', 'receptor_type', 'receptor_subtype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'n_cells', 'Deep_n', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'batch_colors', 'new_celltype_colors', 'old_celltype_colors', 'rank_genes_groups', 'rank_genes_groups_filtered', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_C4filteredumap', 'X_Clusterumap', 'X_pca', 'X_umap'

In [89]:
adata.obs

Unnamed: 0,batch,stage,dissection,chemistry,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,S_score,G2M_score,C4_leiden,C4_leiden_R,new_celltype,status_summary,productive_summary,isotype_summary,receptor_type,receptor_subtype,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
AAACGGGGTAGGGTAC-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.044039,9790.0,2360,0.098121,0.999862,4,G1,-0.072387,-0.057476,1,1,CD4 T,TRB + TRA,T + T,unassigned,TCR,TRA+TRB,EFO:0011025,CL:0000897,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AAAGCAAGTAAGAGAG-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.080093,4330.0,1921,0.031100,0.999862,4,G1,-0.166618,-0.095826,0,0,CD16+ NK,,,,,,EFO:0011025,CL:0000939,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AACACGTAGTTGAGAT-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.043467,3829.0,1684,0.031100,0.999862,4,G1,-0.033295,-0.007830,2,2,CD56bright NK,,,,,,EFO:0011025,CL:0000938,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AACTCCCGTAGCCTAT-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.042411,10996.0,2828,0.063415,0.999862,4,G1,-0.035172,-0.094765,1,1,CD4 T,TRB + TRA,Multi,unassigned,TCR,TRA+TRB,EFO:0011025,CL:0000897,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
AACTCTTGTCTAACGT-5891STDY8062349,5891STDY8062349,18.0,Proximal,Normal,0.069772,4333.0,1849,0.017189,0.999862,4,G1,-0.145187,-0.190998,3,3,Intermediate NK,,,,,,EFO:0011025,CL:0000623,HsapDv:0000055,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TACGGTATCTCTGAGA-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.049737,13717.0,4028,0.235669,0.796614,4,S,0.592496,-0.017360,4,41,ILC3,,,,,,EFO:0011025,CL:0001071,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
TGACAACCAATCAGAA-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.050541,23330.0,5200,0.206897,0.796614,4,S,0.712578,0.073712,16,160,ILCP,,,,,,EFO:0011025,CL:0001074,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048
TGACTAGGTACAAGTA-5891STDY9030810,5891STDY9030810,9.0,Whole-lung,Normal,0.044490,10223.0,3191,0.190647,0.796614,4,G1,-0.002027,-0.126401,4,41,ILC3,,,,,,EFO:0011025,CL:0001071,HsapDv:0000046,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000383,UBERON:0002048
TGGTTAGCAGGGCATA-5891STDY9030810,5891STDY9030810,11.0,Whole-lung,Normal,0.184981,1921.0,1317,0.169687,0.796614,4,G1,-0.065463,-0.046523,16,160,ILCP,,,,,,EFO:0011025,CL:0001074,HsapDv:0000048,PATO:0000461,unknown,False,NCBITaxon:9606,PATO:0000384,UBERON:0002048


In [90]:
# List the dtype of every obs column before the column deletions below.
adata.obs.dtypes

batch                                 category
stage                                  float64
dissection                            category
chemistry                             category
percent_mito                           float32
n_counts                               float32
n_genes                                  int64
doublet_scores                         float64
bh_pval                                float64
leiden                                category
phase                                 category
S_score                                float32
G2M_score                              float32
C4_leiden                             category
C4_leiden_R                           category
new_celltype                          category
status_summary                        category
productive_summary                    category
isotype_summary                       category
receptor_type                         category
receptor_subtype                      category
assay_ontolog

In [91]:
# Remove the obs columns that are not kept in the final object. Each column is
# deleted in place, in the order listed.
for obs_column in (
    'stage',
    'bh_pval',
    'doublet_scores',
    'C4_leiden',
    'C4_leiden_R',
    'status_summary',
    'productive_summary',
    'isotype_summary',
    'receptor_type',
    'receptor_subtype',
):
    del adata.obs[obs_column]

In [92]:
# Re-check the obs dtypes now that the columns above have been deleted.
adata.obs.dtypes

batch                                 category
dissection                            category
chemistry                             category
percent_mito                           float32
n_counts                               float32
n_genes                                  int64
leiden                                category
phase                                 category
S_score                                float32
G2M_score                              float32
new_celltype                          category
assay_ontology_term_id                  object
cell_type_ontology_term_id              object
development_stage_ontology_term_id      object
disease_ontology_term_id                object
ethnicity_ontology_term_id              object
is_primary_data                           bool
organism_ontology_term_id             category
sex_ontology_term_id                    object
tissue_ontology_term_id                 object
dtype: object

In [93]:
# List the dtype of every var column.
adata.var.dtypes

n_cells                  int64
Deep_n                   int32
means                  float64
dispersions            float64
dispersions_norm       float32
gene_symbols            object
feature_biotype         object
feature_is_filtered       bool
dtype: object

In [94]:
def _downcast_64bit_columns(annotated, slot):
    """Downcast the 64-bit numeric columns of one annotation table in place.

    Columns whose dtype is float64 are cast to float32 and columns whose dtype
    is int64 are cast to int32; every other column is left untouched. The slot
    name is printed first, followed by one line per converted column.

    Parameters
    ----------
    annotated
        Object exposing the annotation DataFrame as an attribute (here `adata`).
    slot
        Name of the attribute that holds the DataFrame: 'obs' or 'var'.
    """
    print(slot)
    # Read the dtypes once, before any conversion, so the set of columns to
    # convert is fixed up front (float64 columns first, then int64 columns).
    column_dtypes = getattr(annotated, slot).dtypes
    for wide, narrow in (('float64', 'float32'), ('int64', 'int32')):
        for column in column_dtypes[column_dtypes == wide].index:
            # Re-fetch the table for every assignment, as the attribute access
            # `adata.obs[...] = ...` would.
            frame = getattr(annotated, slot)
            frame[column] = frame[column].astype(narrow)
            print(f"changed {column} from {wide} to {narrow}")


# Apply the same downcast to both annotation tables.
_downcast_64bit_columns(adata, 'obs')
_downcast_64bit_columns(adata, 'var')

obs
changed n_genes from int64 to int32
var
changed means from float64 to float32
changed dispersions from float64 to float32
changed n_cells from int64 to int32


In [95]:
# Write the curated object to the final h5ad file with gzip compression.
adata.write_h5ad(
    '/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/fetal-lung/final/fl_tnkilc.h5ad',
    compression='gzip',
)

... storing 'assay_ontology_term_id' as categorical
... storing 'cell_type_ontology_term_id' as categorical
... storing 'development_stage_ontology_term_id' as categorical
... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'sex_ontology_term_id' as categorical
... storing 'tissue_ontology_term_id' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical
