# Getting started with Starling (ST)

In [1]:
import os
# Show the working directory: the relative paths used later in this notebook
# ("../../", 'sample_input.h5ad', 'log', 'model.pt') are resolved against it.
os.getcwd()

'/home/campbell/yulee/project/st/starling-main/docs/tutorial'

In [2]:
import sys
import pandas as pd
import anndata as ad

import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping #ModelCheckpoint

sys.path.append("../../") ## change this path according to the current working directory
#sys.path.append("../../github/starling/")
from starling import utility
from starling import starling

In [3]:
# Display the current working directory again after the imports.
os.getcwd()

'/home/campbell/yulee/project/st/starling-main/docs/tutorial'

## Setting seed for everything

In [4]:
## Use the top-level `pl.seed_everything`: the `pl.utilities.seed` import path
## is deprecated in Lightning 1.8 and removed in 2.0, while the top-level name
## works across versions. workers=True also seeds dataloader worker processes.
pl.seed_everything(10, workers=True)

Global seed set to 10


10

## Loading annData objects

The example below reads the annData object from "sample_input.h5ad" and runs Kmeans (KM) with 10 clusters on it.

In [5]:
## read the sample annData object, then run the KM (Kmeans) setup with k = 10
sample_adata = ad.read_h5ad('sample_input.h5ad')
adata = utility.init_clustering(sample_adata, 'KM', k=10)

utility.py provides an easy setup of GMM, KM (Kmeans) or PG (PhenoGraph) initializations; k can be omitted when PG is used.

## Setting initializations

The example below uses the default parameter settings, which are based on benchmarking results (more details in the manuscript).

In [6]:
## build the Starling (ST) model from the initialized annData, leaving every
## other constructor argument at its default value
st = starling.ST(adata)

The parameters are listed below:
- adata: annData object of the sample
- dist_option (default: 'T'): T for Student-T (df=2) and N for Normal (Gaussian)
- the proportion of anticipated segmentation-error-free cells (default: 0.6)
- model_cell_size (default: 'Y'): Y for incorporating cell size in the model and N otherwise
- cell_size_col_name (default: 'area'): area is the column name in the anndata.obs dataframe
- model_zplane_overlap (default: 'Y'): Y for modeling z-plane overlap when cell size is modelled and N otherwise
Note: if the user sets model_cell_size = 'N', then model_zplane_overlap is ignored
- model_regularizer (default: 1): Regularizer term imposed on the synthetic doublet loss (BCE)
- learning_rate (default: 1e-3): The learning rate of the ADAM optimizer for STARLING

Equivalent to the example above (arguments given in the order listed):
st = starling.ST(adata, 'T', 0.6, 'Y', 'area', 'Y', 1, 1e-3)

## Setting up the training log

Once training starts, a new directory 'log' will be created.

In [7]:
## TensorBoard logger for the training run; save_dir is the 'log' directory
log_tb = TensorBoardLogger(save_dir='log')

The training information can be viewed with TensorBoard (e.g. `tensorboard --logdir log`). Please refer to the PyTorch Lightning documentation (https://lightning.ai/docs/pytorch/stable/api_references.html#loggers) for other possible loggers.

## Setting early stopping criterion

In [8]:
## early-stopping callback: watches 'train_loss' and treats lower as better
cb_early_stopping = EarlyStopping(
    monitor='train_loss',
    mode='min',
    verbose=False,
)

Training loss is monitored. 

## Training Starling

In [9]:
## train ST: at most 100 epochs, accelerator/devices chosen automatically,
## with the early-stopping callback and the TensorBoard logger attached
trainer = pl.Trainer(
    max_epochs=100,
    accelerator='auto',
    devices='auto',
    deterministic=True,
    callbacks=[cb_early_stopping],
    logger=[log_tb],
)
trainer.fit(st)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type | Params
------------------------------
------------------------------
0         Trainable params
0         Non-trainable params
0         Total params
0.000     Total estimated model params size (MB)
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Epoch 0:  15%|█▍        | 4/27 [00:00<00:03,  6.04it/s, loss=88.2, v_num=5, train_loss_step=88.10]

  ccmax = torch.combinations(psi).max(1).values


Epoch 99: 100%|██████████| 27/27 [00:01<00:00, 24.90it/s, loss=58.4, v_num=5, train_loss_step=58.00, train_loss_epoch=58.40]


## Retrieving Starling results

In [10]:
## retrieve starling results; they are appended to the st.adata annData object
st.result()

## The information can be retrieved from the annData object.
   - st.adata.varm['init_exp_centroids']      -- initial expression cluster centroids (P x C matrix)
   - st.adata.varm['st_exp_centroids']        -- ST expression cluster centroids (P x C matrix)
   - st.adata.uns['init_cell_size_centroids'] -- initial cell size centroids if STARLING models cell size
   - st.adata.uns['st_cell_size_centroids']   -- initial & ST cell size centroids if ST models cell size
   - st.adata.obsm['assignment_prob_matrix']  -- cell assignment probability (N x C matrix)
   - st.adata.obsm['gamma_assignment_prob_matrix'] -- gamma probability of two cells (N x C x C matrix)
   - st.adata.obs['doublet']                  -- doublet indicator
   - st.adata.obs['doublet_prob']             -- doublet probabilities
   - st.adata.obs['init_label']               -- initial assignments
   - st.adata.obs['st_label']                 -- ST assignments
   - st.adata.obs['max_assign_prob']          -- ST max probabilities of assignments
      - N: # of cells; C: # of clusters; P: # of proteins

## Saving the model

In [11]:
## the whole st object (not just its parameters) is serialized to 'model.pt'
## NOTE(review): loading it back presumably requires the starling package to be
## importable, since the object is pickled by reference to its class -- confirm
torch.save(st, 'model.pt')

model.pt will be saved in the same directory as this notebook.

## Starling results are appended to the annData object

In [12]:
## display the annData summary, which now includes the Starling outputs
st.adata

AnnData object with n_obs × n_vars = 13685 × 24
    obs: 'sample', 'id', 'x', 'y', 'area', 'area_convex', 'neighbor', 'init_label', 'st_label', 'doublet_prob', 'doublet', 'max_assign_prob'
    uns: 'init_cell_size_centroids', 'init_cell_size_variances', 'st_cell_size_centroids'
    obsm: 'assignment_prob_matrix', 'gamma_assignment_prob_matrix'
    varm: 'init_exp_centroids', 'init_exp_variances', 'st_exp_centroids'

One could easily perform further analyses, such as co-occurrence or enrichment analysis.

In [13]:
## per-cell table: initial and ST labels, doublet call, doublet probability
## and maximum assignment probability alongside the original columns
st.adata.obs

Unnamed: 0,sample,id,x,y,area,area_convex,neighbor,init_label,st_label,doublet_prob,doublet,max_assign_prob
4_1,4,1,0.785714,7.785714,14,14,0,7,3,0.107680,0,0.892312
4_2,4,2,0.823529,22.294117,17,17,0,8,3,0.475209,0,0.418412
4_3,4,3,0.875000,79.500000,16,16,1,5,3,0.107942,0,0.892058
4_4,4,4,0.666667,270.500000,12,12,0,4,3,0.140917,0,0.856819
4_5,4,5,0.823529,279.294130,17,17,1,2,2,0.578934,1,0.356235
...,...,...,...,...,...,...,...,...,...,...,...,...
4_13681,4,13681,997.769200,754.500000,26,26,0,9,9,0.058607,0,0.941282
4_13682,4,13682,998.153900,127.615390,13,13,0,4,4,0.091396,0,0.908604
4_13683,4,13683,998.153900,160.000000,13,13,1,7,3,0.128414,0,0.871540
4_13684,4,13684,997.580600,242.580640,31,33,1,9,9,0.136119,0,0.860448


For each cell, Starling provides a doublet probability and the cluster the cell would be assigned to if it were a singlet.

## Showing initial expression centroids:

In [14]:
## initial expression centroids (P x C matrix), rows indexed by var_names
init_exp_centroids = st.adata.varm['init_exp_centroids']
pd.DataFrame(init_exp_centroids, index=st.adata.var_names)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
SMA,0.720967,0.585027,0.801933,2.13225,0.492805,2.310254,1.535946,1.150474,1.693778,0.61403
ECadherin,0.877221,0.99957,1.463762,0.918666,5.105359,0.978423,0.888575,0.941698,0.886874,2.994359
Cytokeratin,9.933142,12.609383,14.818891,7.949757,63.817917,7.390986,8.085617,8.824631,8.109798,34.635605
HLADR,109.319489,99.347565,30.853996,20.089088,11.319593,17.104588,26.659348,22.432001,25.211506,19.767342
Vimentin,215.87114,65.165985,145.056137,591.789062,31.197891,834.506287,338.02887,243.142197,443.985291,149.811188
CD28,0.402611,0.431232,0.407054,0.194037,0.282477,0.129055,0.340027,0.387506,0.28124,0.162825
CD15,0.842164,2.414609,4.767142,11.917,10.161815,8.31009,2.395725,2.642576,3.612238,157.945465
CD45RA,25.086103,20.061749,10.642691,6.086351,3.428966,4.584605,8.464054,8.87518,7.588144,5.590945
CD66b,0.290807,0.260315,0.388225,0.912257,0.646158,0.923219,0.395315,0.345415,0.506907,6.663888
CD20,48.42778,66.686539,14.879524,6.222989,5.205252,4.316024,8.634953,8.083755,7.648183,10.075335


There are 10 centroids since we set Kmeans (KM) as k = 10 earlier.

## Showing Starling expression centroids:

In [15]:
## Starling expression centroids (P x C matrix), rows indexed by var_names
st_exp_centroids = st.adata.varm['st_exp_centroids']
pd.DataFrame(st_exp_centroids, index=st.adata.var_names)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
SMA,0.630584,0.625222,0.646329,0.925917,0.449863,2.793317,2.356341,1.781954,2.149641,0.532924
ECadherin,0.60278,0.684089,0.780165,0.797225,5.494092,1.108318,0.864519,1.710122,0.597689,3.270387
Cytokeratin,9.02243,9.63251,7.601309,6.805786,69.495041,8.927615,7.453179,7.341437,5.320751,35.020645
HLADR,57.801815,88.580734,19.923067,10.716701,8.128937,14.212717,21.098606,12.688202,13.49973,33.77515
Vimentin,193.36586,108.276184,294.296844,369.279358,29.04899,853.547791,377.023956,335.995605,268.646179,133.570007
CD28,0.243396,0.351867,0.419079,0.052022,0.24422,0.093331,0.274716,0.245859,0.164856,0.142304
CD15,0.417759,0.394474,0.551968,3.617295,7.30036,12.508798,1.981807,1.466862,2.84859,104.11483
CD45RA,15.071248,23.635565,7.468481,3.171324,1.456866,3.126213,6.561466,8.161564,4.676317,7.831636
CD66b,0.226182,0.196237,0.293509,0.311583,0.220635,0.996837,0.347522,0.310293,0.258991,4.287268
CD20,27.397152,58.756229,4.724949,2.36685,1.946522,2.971781,6.54257,5.003287,4.631355,13.386323


From here one could easily annotate cluster centroids with cell types.

## Showing Assignment Distributions:

In [16]:
## assignment distributions (N x C matrix), rows indexed like st.adata.obs
assignment_probs = st.adata.obsm['assignment_prob_matrix']
pd.DataFrame(assignment_probs, index=st.adata.obs.index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
4_1,3.783097e-10,1.870313e-08,7.454336e-06,8.923120e-01,1.673693e-07,3.145988e-17,4.585031e-16,1.625775e-15,3.954680e-11,2.633352e-12
4_2,2.253198e-09,8.602157e-05,1.062930e-01,4.184124e-01,2.296793e-08,3.368046e-15,9.210434e-12,3.308265e-11,5.122466e-10,6.382439e-10
4_3,1.567647e-15,6.495662e-16,1.818553e-10,8.920578e-01,1.923190e-11,1.889954e-08,2.455403e-15,1.051328e-13,5.568968e-11,3.372012e-12
4_4,3.224672e-09,3.656452e-07,1.963687e-03,8.568190e-01,2.996252e-04,1.629633e-14,1.802143e-14,3.459486e-15,3.711590e-10,3.037263e-10
4_5,2.961560e-06,6.342213e-08,3.562353e-01,6.477993e-02,4.144526e-07,1.328070e-12,6.870071e-09,1.043632e-09,4.769265e-05,4.274003e-13
...,...,...,...,...,...,...,...,...,...,...
4_13681,2.557147e-18,3.785219e-15,2.366307e-13,4.329301e-08,1.110778e-04,4.569230e-16,4.551192e-18,3.977166e-16,1.793794e-14,9.412823e-01
4_13682,1.904857e-13,3.979871e-13,1.681163e-10,2.300984e-10,9.086042e-01,7.093535e-21,9.250489e-19,8.101208e-17,3.967741e-17,8.297649e-09
4_13683,3.569051e-12,7.494605e-08,4.569505e-05,8.715396e-01,2.190433e-07,7.166176e-16,1.738358e-16,1.070137e-17,3.586017e-12,7.725170e-12
4_13684,1.515786e-15,2.098635e-13,6.944415e-10,1.426281e-08,3.433115e-03,2.658963e-17,1.186033e-18,1.115595e-17,1.341295e-14,8.604477e-01


Currently, we assign each cell the label of the cluster with the maximum assignment probability. However, a cell could be mislabeled when its highest and second-highest probabilities are very close, so users may be interested in inspecting the full assignment distribution for such cells.