# 2.assess-heterogeneity 

This section of the notebook uses buscar's clustering module to assess single-cell heterogeneity. We focus on three datasets: **CFReT**, **MitoCheck**, and **CPJUMP1 (CRISPR)**. The goal is to use our clustering algorithms to identify cellular heterogeneity at the single-cell level.

A key advantage of using these datasets is that they include ground-truth labels. This allows us to evaluate whether our clustering algorithms are identifying biologically meaningful groups in a data-driven way, and to assess the accuracy of our approach.

In [1]:
import sys
import pathlib


sys.path.append("../../")
from utils.heterogeneity import optimized_clustering, cluster_profiles
from utils.io_utils import load_profiles

  from pkg_resources import get_distribution, DistributionNotFound


Defining the parameter search space used by the optimized clustering

In [2]:
# Search space handed to the optimized clustering. Each entry maps a parameter
# name to a spec dict: a "type" plus either "low"/"high" bounds or a list of
# "choices".
cluster_search_param_grid = {
    # how granular the clusters should be
    "cluster_resolution": {
        "type": "float",
        "low": 0.1,
        "high": 3.0,
    },
    # neighbor count used when building the graph
    "n_neighbors": {
        "type": "int",
        "low": 5,
        "high": 50,
    },
    # which clustering algorithm to run
    "cluster_method": {
        "type": "categorical",
        "choices": ["leiden", "louvain"],
    },
    # distance metric used for the neighbor computation
    "neighbor_distance_metric": {
        "type": "categorical",
        "choices": ["euclidean", "cosine", "manhattan"],
    },
    # dimensionality reduction approach
    "dim_reduction": {
        "type": "categorical",
        "choices": ["PCA", "raw"],
    },
}

Setting paths

In [3]:
# Locate the download module and the single-cell profiles directory inside it.
# strict=True makes resolve() raise right here if a path does not exist.
download_module_path = pathlib.Path("../0.download-data/").resolve(strict=True)
sc_profiles_path = download_module_path.joinpath("data", "sc-profiles").resolve(
    strict=True
)

# One parquet file of profiles per dataset.
cfret_profiles_path = sc_profiles_path.joinpath(
    "cfret", "localhost230405150001_sc_feature_selected.parquet"
).resolve(strict=True)
cpjump1_trt_crispr_profiles_path = sc_profiles_path.joinpath(
    "cpjump1", "trt-profiles", "cpjump1_crispr_trt_profiles.parquet"
).resolve(strict=True)
mitocheck_trt_profiles_path = sc_profiles_path.joinpath(
    "mitocheck", "mitocheck_concat_profiles.parquet"
).resolve(strict=True)

# Results directory (./results/cluster-labels); created, along with any
# missing parents, if it does not exist yet.
results_dir = pathlib.Path("./results/cluster-labels").resolve()
results_dir.mkdir(parents=True, exist_ok=True)

Loading datasets

In [8]:
# Read the three profile tables (MitoCheck, CFReT, CPJUMP1 CRISPR), in that
# order, from the parquet paths resolved above.
mitocheck_profiles_df, cfret_profiles_df, cpjump1_crispr_profiles_df = (
    load_profiles(profiles_path)
    for profiles_path in (
        mitocheck_trt_profiles_path,
        cfret_profiles_path,
        cpjump1_trt_crispr_profiles_path,
    )
)

## Clustering profiles

### Assessing heterogeneity for MitoCheck data

In [5]:
# Split the MitoCheck columns into metadata and morphology features.
# `mito_meta` lists the columns treated as metadata; every remaining column
# is treated as a feature.
mito_meta = [
    "index",
    "Mitocheck_Phenotypic_Class",
    "Cell_UUID",
    "Metadata_treatment_type",
    "Metadata_cell_id",
    "Location_Center_X",
    "Location_Center_Y",
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Frame",
    "Metadata_Site",
    "Metadata_Plate_Map_Name",
    "Metadata_DNA",
    "Metadata_Gene",
    "Metadata_Gene_Replicate",
]

# Feature names = columns left after dropping the metadata columns.
# NOTE(review): presumably a polars DataFrame, where drop() takes column
# names -- confirm against load_profiles.
mito_features = mitocheck_profiles_df.drop(mito_meta).columns

In [None]:
# FIXME: per the original author's note, this call does not work yet -- it
# fails with an error about handling sparse matrices. The cause cannot be
# determined from this notebook alone.
# NOTE(review): the return value is not assigned, so a successful run would
# only show the result as the cell output.
cluster_profiles(
    profiles=mitocheck_profiles_df,
    meta_features=mito_meta,
    morph_features=mito_features,
    treatment_col="Metadata_Gene",
    pca_n_components_to_capture_variance=100,
)

### Assessing heterogeneity for CFReT data

In [12]:
# Split the CFReT columns into metadata and morphology features.
# `cfret_meta` lists the columns treated as metadata; every remaining column
# is treated as a feature.
cfret_meta = [
    "Metadata_WellRow",
    "Metadata_WellCol",
    "Metadata_heart_number",
    "Metadata_cell_type",
    "Metadata_heart_failure_type",
    "Metadata_treatment",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
    "Metadata_Cells_Location_Center_X",
    "Metadata_Cells_Location_Center_Y",
    "Metadata_Image_Count_Cells",
    "Metadata_ImageNumber",
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Cells_Number_Object_Number",
    "Metadata_Cytoplasm_Parent_Cells",
    "Metadata_Cytoplasm_Parent_Nuclei",
    "Metadata_Nuclei_Number_Object_Number",
    "Metadata_Site",
]

# Feature names = columns left after dropping the metadata columns.
# NOTE(review): presumably a polars DataFrame, where drop() takes column
# names -- confirm against load_profiles.
cfret_feats = cfret_profiles_df.drop(cfret_meta).columns

In [None]:
# Run the optimized clustering on the CFReT profiles, searching the shared
# parameter grid for 200 trials.
cfret_cluster_results = optimized_clustering(
    profiles=cfret_profiles_df,
    meta_features=cfret_meta,
    morph_features=cfret_feats,
    treatment_col="Metadata_treatment",
    param_grid=cluster_search_param_grid,
    n_trials=200,
)

# Bare last expression so the result is rendered as the cell output.
cfret_cluster_results

[I 2025-10-03 10:49:34,076] A new study created in memory with name: cluster_optimization_0
[I 2025-10-03 10:49:45,603] Trial 0 finished with value: 0.49540420217147246 and parameters: {'cluster_resolution': 1.6915591613892418, 'n_neighbors': 37, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine', 'dim_reduction': 'raw'}. Best is trial 0 with value: 0.49540420217147246.
[I 2025-10-03 10:49:57,073] Trial 1 finished with value: 0.5021723535853716 and parameters: {'cluster_resolution': 1.2119804045947553, 'n_neighbors': 41, 'cluster_method': 'louvain', 'neighbor_distance_metric': 'euclidean', 'dim_reduction': 'raw'}. Best is trial 1 with value: 0.5021723535853716.
  warn(
[I 2025-10-03 10:51:27,857] Trial 2 finished with value: -0.16579425225099012 and parameters: {'cluster_resolution': 2.356654577754566, 'n_neighbors': 45, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine', 'dim_reduction': 'PCA'}. Best is trial 1 with value: 0.5021723535853716.
[I 2025-10-03 

(shape: (20_865, 681)
 ┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
 │ Metadata_ ┆ Metadata_ ┆ Metadata_ ┆ Metadata_ ┆ … ┆ Metadata_ ┆ Metadata_ ┆ Metadata_ ┆ Metadata │
 │ WellRow   ┆ WellCol   ┆ heart_num ┆ cell_type ┆   ┆ cluster_i ┆ cluster_n ┆ treatment ┆ _cluster │
 │ ---       ┆ ---       ┆ ber       ┆ ---       ┆   ┆ d         ┆ _cells    ┆ _n_cells  ┆ _ratio   │
 │ str       ┆ i64       ┆ ---       ┆ str       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
 │           ┆           ┆ i64       ┆           ┆   ┆ cat       ┆ u32       ┆ u32       ┆ f64      │
 ╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
 │ B         ┆ 2         ┆ 9         ┆ failing   ┆ … ┆ DMSO_leid ┆ 126       ┆ 10519     ┆ 1.197832 │
 │           ┆           ┆           ┆           ┆   ┆ en_41     ┆           ┆           ┆          │
 │ B         ┆ 2         ┆ 9         ┆ failing   ┆ … ┆ DMSO_

### Assessing heterogeneity for CPJUMP1 CRISPR data

In [14]:
# Split the CPJUMP1 CRISPR columns into metadata and morphology features.
# `cpjump1_meta` lists the columns treated as metadata; every remaining
# column is treated as a feature.
cpjump1_meta = [
    "index",
    "Metadata_broad_sample",
    "Metadata_ImageNumber",
    "Metadata_Plate",
    "Metadata_Site",
    "Metadata_Well",
    "Metadata_TableNumber",
    "Metadata_ObjectNumber_cytoplasm",
    "Metadata_Cytoplasm_Parent_Cells",
    "Metadata_Cytoplasm_Parent_Nuclei",
    "Metadata_ObjectNumber_cells",
    "Metadata_ObjectNumber",
    "Metadata_gene",
    "Metadata_pert_type",
    "Metadata_control_type",
    "Metadata_target_sequence",
    "Metadata_negcon_control_type",
    "__index_level_0__",
]

# Feature names = columns left after dropping the metadata columns.
# NOTE(review): presumably a polars DataFrame, where drop() takes column
# names -- confirm against load_profiles.
cpjump1_feats = cpjump1_crispr_profiles_df.drop(cpjump1_meta).columns

In [17]:
# Run the optimized clustering on the CPJUMP1 CRISPR profiles, searching the
# shared parameter grid for 30 trials.
cpjump1_cluster_results = optimized_clustering(
    profiles=cpjump1_crispr_profiles_df,
    morph_features=cpjump1_feats,
    meta_features=cpjump1_meta,
    treatment_col="Metadata_gene",
    n_trials=30,
    param_grid=cluster_search_param_grid,
)

[I 2025-10-06 09:36:13,825] A new study created in memory with name: cluster_optimization_0
[I 2025-10-06 10:54:32,232] Trial 0 finished with value: 0.21206753315840984 and parameters: {'cluster_resolution': 1.6915591613892418, 'n_neighbors': 37, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine', 'dim_reduction': 'raw'}. Best is trial 0 with value: 0.21206753315840984.
[I 2025-10-06 12:15:45,314] Trial 1 finished with value: 0.2291905488765506 and parameters: {'cluster_resolution': 1.2119804045947553, 'n_neighbors': 41, 'cluster_method': 'louvain', 'neighbor_distance_metric': 'euclidean', 'dim_reduction': 'raw'}. Best is trial 1 with value: 0.2291905488765506.
  warn(
[I 2025-10-06 13:13:59,118] Trial 2 finished with value: -0.2963307000003131 and parameters: {'cluster_resolution': 2.356654577754566, 'n_neighbors': 45, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine', 'dim_reduction': 'PCA'}. Best is trial 1 with value: 0.2291905488765506.
[I 2025-10-06 1

: 