# Pseudotime analysis pipeline

In [1]:
# Step 1: Import necessary libraries
import os
import numpy as np
import matplotlib.pyplot as plt
from omegaconf import DictConfig, OmegaConf
from core.model.inference import run_inference
from core.data.cell import CellBatch
from core.pseduotime.dimensionality_reduction import DimensionalityReduction
from core.pseduotime.clustering import Clustering
from core.pseduotime.pseudotime import SlingshotMethod

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  @hydra.main(config_path="../../config/inference", config_name="main")
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 2: Define and Load Configuration
# All pipeline settings live in one nested mapping; extend the sections below
# to match your actual model parameters.

# Root locations. Every path in the configuration is derived from these two
# values, so pointing the notebook at another project checkout or training run
# is a single edit instead of three copies that must be kept in sync.
PROJECT_ROOT = "/Users/zhangjiahao/Project/tic"
RUN_DIR = f"{PROJECT_ROOT}/results/train/2025-02-26_14-56-11"

# Build the DictConfig directly from the literal so that `cfg` is never bound
# to a plain dict first (one name, one type).
cfg = OmegaConf.create({
    "inference": {
        "checkpoint_path": f"{RUN_DIR}/checkpoints/checkpoint_31_5000.pth",
        "batch_size": 64,
        "dataset_root": f"{PROJECT_ROOT}/data",
        "node_features": ["center_coord", "SIZE", "cell_type", "biomarker_expression"],  # Node features to use
        "output_dir": f"{RUN_DIR}/cells",
        "device": "cpu",  # Use "cuda" for GPU or "cpu" for CPU
    },
    "model": {
        "num_layer": 3,
        "emb_dim": 512,
        "gnn_type": "gin",
        "dropout": 0.25,
    },
    "clustering": {
        "method": "kmeans",
        "n_clusters": 5,
    },
    "dimensionality_reduction": {
        "method": "PCA",  # Use "PCA" or "UMAP" for dimensionality reduction
        "n_components": 2,
    },
    "pseudotime_analysis": {
        "start_node": 0,
        # Output directory for pseudotime analysis visualization.
        # The "pseduotime" spelling is kept as-is so the path stays unchanged.
        "output_dir": f"{RUN_DIR}/pseduotime",
    },
})

# Show the resolved configuration as YAML for a quick visual check.
print(OmegaConf.to_yaml(cfg))

inference:
  checkpoint_path: /Users/zhangjiahao/Project/tic/results/train/2025-02-26_14-56-11/checkpoints/checkpoint_31_5000.pth
  batch_size: 64
  dataset_root: /Users/zhangjiahao/Project/tic/data
  node_features:
  - center_coord
  - SIZE
  - cell_type
  - biomarker_expression
  output_dir: /Users/zhangjiahao/Project/tic/results/train/2025-02-26_14-56-11/cells
  device: cpu
model:
  num_layer: 3
  emb_dim: 512
  gnn_type: gin
  dropout: 0.25
clustering:
  method: kmeans
  n_clusters: 5
dimensionality_reduction:
  method: PCA
  n_components: 2
pseudotime_analysis:
  start_node: 0
  output_dir: /Users/zhangjiahao/Project/tic/results/train/2025-02-26_14-56-11/pseduotime



## Run the GNN model (or use your own method) to extract embeddings

In [None]:
# Optional step: uncomment the call below to run the model and produce the embeddings.
# The call uses cfg.inference and cfg.model to run inference.
# NOTE(review): presumably left disabled because the embeddings already exist on disk — confirm.
# run_inference(cfg)

## Load Cells into a CellBatch object
After running the GNN or another embedding method, load the resulting embeddings as a batch of cells.
Update `cfg.inference.output_dir` so that it points to the directory where the embeddings are stored.

In [3]:
# Build the CellBatch from the directory configured as the inference output.
cell_dir = cfg.inference.output_dir
cells = CellBatch(pkl_dir=cell_dir)

# Report what was loaded: the batch summary first, then the embedding matrix shape.
print(cells)
embedding_shape = cells.get_embeddings().shape
print(f"Shape of embeddings in cells: {embedding_shape}")

CellBatch(num_cells=2867)
Shape of embeddings in cells: (2867, 512)


In [None]:
# Inspect a single entry of the batch as a sanity check.
first_cell = cells.cells[0]
print(first_cell)

## Perform Dimensionality Reduction (e.g., PCA or UMAP)

In [None]:
# Reduce the embeddings with the method and target dimensionality taken from
# the configuration.
reduction_method = cfg.dimensionality_reduction.method
dimension_reducer = DimensionalityReduction(method=reduction_method,
                                            n_components=cfg.dimensionality_reduction.n_components)

embeddings = cells.get_embeddings()
# The variable keeps its historical name `umap_embeddings` because later cells
# read it, but it holds the output of whichever method is configured.
umap_embeddings = dimension_reducer.reduce(embeddings)
# Report the method actually used instead of a hard-coded "UMAP", which was
# misleading whenever the configuration selects another method such as PCA.
print(f"Shape of {reduction_method} embeddings: {umap_embeddings.shape}")

# Add the reduced embeddings back into the cell batch for further analysis.
cells.add_umap_embeddings(umap_embeddings)

## Perform Clustering (e.g., KMeans)

In [None]:
# Pull the clustering settings from the configuration in one step.
cluster_method, n_clusters = cfg.clustering.method, cfg.clustering.n_clusters

# Instantiate the clusterer and assign a label to every reduced embedding.
cluster = Clustering(n_clusters=n_clusters, method=cluster_method)
cluster_labels = cluster.cluster(umap_embeddings)

## Perform Pseudotime Analysis using Slingshot

In [None]:
# Read the pseudotime settings from the configuration.
start_node = cfg.pseudotime_analysis.start_node
output_dir = cfg.pseudotime_analysis.output_dir

# Create the output directory up front so a fresh run does not depend on it
# already existing; exist_ok=True makes re-running this cell harmless.
# NOTE(review): analyze() may already create the directory itself — confirm.
os.makedirs(output_dir, exist_ok=True)

# Run the Slingshot analysis on the cluster labels and the reduced embeddings.
analysiser = SlingshotMethod(start_node=start_node)
pseduotime = analysiser.analyze(cluster_labels, umap_embeddings, output_dir=output_dir)

# Add the pseudotime values to the cell batch
cells.add_pseudotimes(pseduotime)

In [None]:
# Persist the updated batch into the directory configured as the inference output.
save_dir = cfg.inference.output_dir
cells.save(output_dir=save_dir)
print(f"Saved the cell batch with pseudotime values at {save_dir}")