# `cellarium-ml` highly variable genes

2024.12.02

Stephen Fleming

Prerequisite:

Run `onepass_mean_var_std` first, using a command like the one below; the checkpoint it writes is the input to this notebook:

```bash
(cellarium) $ cellarium-ml onepass_mean_var_std fit -c onepass_config.yaml
```

In [12]:
# Fill this in: path to the checkpoint (.ckpt) written by the
# `onepass_mean_var_std` fit run described at the top of this notebook.
# NOTE(review): this is an absolute path on the author's machine; point it at
# your own checkpoint before running.

onepass_checkpoint = '/home/sfleming/nmf/lightning_logs/version_0/checkpoints/epoch=0-step=2425.ckpt'

In [13]:
from cellarium.ml.preprocessing import get_highly_variable_genes
from cellarium.ml import CellariumModule

In [14]:
# Restore the fitted module from the checkpoint. The trailing bare expression
# displays its repr (the pipeline stages) as the cell output.
module = CellariumModule.load_from_checkpoint(onepass_checkpoint)
module

CellariumModule(pipeline = CellariumPipeline(
  (0): NormalizeTotal(target_count=10000, eps=1e-06)
  (1): Log1p()
  (2): OnePassMeanVarStd()
))

In [22]:
# Gene identifiers stored on the model (an object-dtype array of Ensembl IDs).
module.model.var_names_g

array(['ENSG00000148156', 'ENSG00000147081', 'ENSG00000163283', ...,
       'ENSG00000211917', 'ENSG00000253709', 'ENSG00000254220'],
      dtype=object)

In [23]:
# Mean statistic held by the loaded model (a tensor on the GPU).
# Presumably one value per gene in var_names_g, given the `_g` suffix; confirm.
module.model.mean_g

tensor([5.2306e-05, 3.0378e-05, 8.3388e-05,  ..., 1.1868e-07, 2.0085e-07,
        3.3028e-07], device='cuda:0')

In [24]:
# Variance statistic held by the loaded model (a tensor on the GPU).
# Presumably one value per gene in var_names_g, given the `_g` suffix; confirm.
module.model.var_g

tensor([3.6543e-05, 2.1972e-05, 6.1024e-05,  ..., 3.4973e-08, 1.0018e-07,
        2.7088e-07], device='cuda:0')

In [29]:
# Build the highly-variable-gene table from the statistics stored in the checkpoint.
# NOTE(review): the exact selection criteria live in get_highly_variable_genes;
# see its documentation.
hvg_df = get_highly_variable_genes(
    # var_names_g is an object-dtype array, so cast it to plain str
    gene_names=module.model.var_names_g.astype(str),
    # the statistics are tensors on cuda:0; detach and copy them to the CPU first
    mean=module.model.mean_g.detach().cpu(),
    var=module.model.var_g.detach().cpu(),
    # presumably the number of genes flagged highly_variable=True; confirm
    n_top_genes=3000,
)

In [30]:
# Show the result table, one row per gene (pandas truncates the display).
hvg_df

Unnamed: 0,means,dispersions,mean_bin,dispersions_norm,highly_variable
ENSG00000148156,5.230453e-05,-0.358630,"(-0.0019, 0.095]",0.005601,False
ENSG00000147081,3.037773e-05,-0.323947,"(-0.0019, 0.095]",0.116389,False
ENSG00000163283,8.338425e-05,-0.312237,"(-0.0019, 0.095]",0.153795,False
ENSG00000237763,4.751006e-05,-0.474944,"(-0.0019, 0.095]",-0.365940,False
ENSG00000174876,8.168856e-06,-0.463825,"(-0.0019, 0.095]",-0.330422,False
...,...,...,...,...,...
ENSG00000211825,8.412861e-07,0.736703,"(-0.0019, 0.095]",3.504417,True
ENSG00000182776,8.550716e-08,-1.549629,"(-0.0019, 0.095]",-3.798798,False
ENSG00000211917,1.186767e-07,-1.221826,"(-0.0019, 0.095]",-2.751700,False
ENSG00000253709,2.008544e-07,-0.695649,"(-0.0019, 0.095]",-1.070936,False


In [31]:
# Keep only the rows flagged as highly variable.
hvg_df.loc[hvg_df['highly_variable']]

Unnamed: 0,means,dispersions,mean_bin,dispersions_norm,highly_variable
ENSG00000230539,9.082461e-05,0.297573,"(-0.0019, 0.095]",2.101706,True
ENSG00000164047,2.179779e-05,0.147694,"(-0.0019, 0.095]",1.622950,True
ENSG00000276409,8.754482e-05,0.269194,"(-0.0019, 0.095]",2.011055,True
ENSG00000108700,3.136903e-04,0.770400,"(-0.0019, 0.095]",3.612054,True
ENSG00000255521,1.799595e-04,0.137865,"(-0.0019, 0.095]",1.591551,True
...,...,...,...,...,...
ENSG00000258453,6.063096e-07,0.294720,"(-0.0019, 0.095]",2.092593,True
ENSG00000211766,6.274480e-07,0.443432,"(-0.0019, 0.095]",2.567623,True
ENSG00000185926,5.686792e-07,0.345088,"(-0.0019, 0.095]",2.253483,True
ENSG00000211790,3.100428e-06,0.993262,"(-0.0019, 0.095]",4.323941,True


In [32]:
# Copy the index (the Ensembl gene IDs) into a regular column so it can be
# selected and exported by name.
hvg_df['ensembl_id'] = hvg_df.index.copy()

In [33]:
# Re-display the highly variable rows, which now carry the `ensembl_id` column.
hvg_df.loc[hvg_df['highly_variable']]

Unnamed: 0,means,dispersions,mean_bin,dispersions_norm,highly_variable,ensembl_id
ENSG00000230539,9.082461e-05,0.297573,"(-0.0019, 0.095]",2.101706,True,ENSG00000230539
ENSG00000164047,2.179779e-05,0.147694,"(-0.0019, 0.095]",1.622950,True,ENSG00000164047
ENSG00000276409,8.754482e-05,0.269194,"(-0.0019, 0.095]",2.011055,True,ENSG00000276409
ENSG00000108700,3.136903e-04,0.770400,"(-0.0019, 0.095]",3.612054,True,ENSG00000108700
ENSG00000255521,1.799595e-04,0.137865,"(-0.0019, 0.095]",1.591551,True,ENSG00000255521
...,...,...,...,...,...,...
ENSG00000258453,6.063096e-07,0.294720,"(-0.0019, 0.095]",2.092593,True,ENSG00000258453
ENSG00000211766,6.274480e-07,0.443432,"(-0.0019, 0.095]",2.567623,True,ENSG00000211766
ENSG00000185926,5.686792e-07,0.345088,"(-0.0019, 0.095]",2.253483,True,ENSG00000185926
ENSG00000211790,3.100428e-06,0.993262,"(-0.0019, 0.095]",4.323941,True,ENSG00000211790


In [None]:
# Export only the Ensembl IDs of the highly variable genes: one ID per line
# under an `ensembl_id` header, with no index column.
hvg_df.loc[hvg_df['highly_variable'], 'ensembl_id'].to_csv('hvg.csv', index=False)

In [None]:
# Peek at the first lines of the exported file (header row plus IDs).
!head hvg.csv

ensembl_id
ENSG00000230539
ENSG00000164047
ENSG00000276409
ENSG00000108700
ENSG00000255521
ENSG00000254693
ENSG00000138755
ENSG00000181552
ENSG00000197665


In [37]:
!wc -l /home/sfleming/nmf/hvg.csv

3001 /home/sfleming/nmf/hvg.csv
