# 2.assess-heterogeneity 

This section of the notebook uses buscar's clustering module to assess single-cell heterogeneity. We'll focus on three specific datasets: **CFReT**, **MitoCheck**, and **CPJUMP1 (CRISPR)**. The goal is to use our clustering algorithms to identify cellular heterogeneity at the single-cell level.

A key advantage of using these datasets is that they include ground-truth labels. This allows us to evaluate whether our clustering algorithms are identifying biologically meaningful groups in a data-driven way, and to assess the accuracy of our approach.

In [1]:
import sys
import pathlib

import json

sys.path.append("../../")
from utils.heterogeneity import assess_heterogeneity
from utils.io_utils import load_profiles

  from pkg_resources import get_distribution, DistributionNotFound


Setting paths

In [2]:
# locate the download module and the single-cell profiles directory inside it;
# strict resolution makes the cell fail immediately if either is missing
download_module_path = pathlib.Path("../0.download-data/").resolve(strict=True)
sc_profiles_path = download_module_path.joinpath("data", "sc-profiles").resolve(
    strict=True
)

# per-dataset profile files (each must already exist on disk)
cfret_profiles_path = sc_profiles_path.joinpath(
    "cfret", "localhost230405150001_sc_feature_selected.parquet"
).resolve(strict=True)
cpjump1_trt_crispr_profiles_path = sc_profiles_path.joinpath(
    "cpjump1", "trt-profiles", "cpjump1_crispr_trt_profiles.parquet"
).resolve(strict=True)
# MitoCheck profiles are currently disabled
# mitocheck_trt_profiles_path = (
#     sc_profiles_path / "mitocheck" / "treated_mitocheck_cp_profiles.parquet"
# ).resolve(strict=True)

# directory where the cluster-label results are written (created if absent)
results_dir = pathlib.Path("./results/cluster-labels").resolve()
results_dir.mkdir(parents=True, exist_ok=True)

Loading datasets

In [4]:
# load the single-cell profiles of each dataset with the project helper
# `load_profiles`
# NOTE(review): the MitoCheck load below is commented out, so
# `mitocheck_trt_profile_df` is never defined; the MitoCheck cells further down
# will raise NameError on a fresh kernel until it is re-enabled.
# mitocheck_trt_profile_df = load_profiles(mitocheck_trt_profiles_path)
cfret_profile_df = load_profiles(cfret_profiles_path)
cpjump1_trt_crispr_df = load_profiles(cpjump1_trt_crispr_profiles_path)

## Clustering profiles

### Assessing heterogeneity for MitoCheck data

In [None]:
# split the MitoCheck columns into metadata and features
# `mito_meta` lists the metadata columns; the phenotypic class label is
# included among them
mito_meta = [
    "Mitocheck_Phenotypic_Class",
    "Cell_UUID",
    "Location_Center_X",
    "Location_Center_Y",
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Frame",
    "Metadata_Site",
    "Metadata_Plate_Map_Name",
    "Metadata_DNA",
    "Metadata_Gene",
    "Metadata_Gene_Replicate",
    "Metadata_Object_Outline",
]

# feature columns are whatever remains after dropping the metadata columns
# NOTE(review): `mitocheck_trt_profile_df` is not defined in this notebook while
# the MitoCheck loading lines are commented out - this cell fails on a fresh
# kernel until they are re-enabled.
# NOTE(review): `.drop(<list>)` is called without `columns=`; presumably the
# frame is a polars DataFrame, where this drops columns - TODO confirm.
mito_features = mitocheck_trt_profile_df.drop(mito_meta).columns

In [None]:
# run the clustering hyper-parameter search on the MitoCheck profiles
# (10 trials, single worker, fixed seed)
# NOTE(review): `mitocheck_trt_profile_df` only exists if the MitoCheck loading
# lines earlier in the notebook are re-enabled.
mitocheck_cluster_results = assess_heterogeneity(
    profiles=mitocheck_trt_profile_df,
    meta=mito_meta,
    features=mito_features,
    n_trials=10,
    n_jobs=1,
    study_name="mitocheck_heterogeneity",
    seed=0,
)

# The result dict contains an optuna Study object and a NumPy label array,
# neither of which `json.dump` can serialise, so dumping the dict directly
# raises TypeError. Export only the serialisable summary fields, mirroring the
# CFReT export cell (same key pattern, integer cluster labels, indent=4).
with open(results_dir / "mitocheck_heterogenic_clusters_results.json", "w") as f:
    json.dump(
        {
            "best_score": mitocheck_cluster_results["best_score"],
            "best_params": mitocheck_cluster_results["best_params"],
            "n_clusters": mitocheck_cluster_results["n_clusters"],
            "n_trials": mitocheck_cluster_results["n_trials"],
            # labels come back as an object array of digit strings; cast to int
            # and convert to a plain list so they are written as JSON numbers
            "mitocheck_cluster_results": mitocheck_cluster_results["cluster_labels"]
            .astype(int)
            .tolist(),
        },
        f,
        indent=4,
    )

### Assessing heterogeneity for CFReT data

In [5]:
# optional filter (disabled): keep only non-DMSO treatment profiles
# NOTE(review): the disabled line uses `pl`, which is not imported in this
# notebook; add the import before re-enabling it.
# cfret_trt = cfret_profile_df.filter(pl.col("Metadata_treatment") != "DMSO")

# split the CFReT columns into metadata and features
# `cfret_meta` lists every metadata column
cfret_meta = [
    "Metadata_WellRow",
    "Metadata_WellCol",
    "Metadata_heart_number",
    "Metadata_cell_type",
    "Metadata_heart_failure_type",
    "Metadata_treatment",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
    "Metadata_Cells_Location_Center_X",
    "Metadata_Cells_Location_Center_Y",
    "Metadata_Image_Count_Cells",
    "Metadata_ImageNumber",
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Cells_Number_Object_Number",
    "Metadata_Cytoplasm_Parent_Cells",
    "Metadata_Cytoplasm_Parent_Nuclei",
    "Metadata_Nuclei_Number_Object_Number",
    "Metadata_Site",
]

# feature columns are whatever remains after dropping the metadata columns
# NOTE(review): `.drop(<list>)` is called without `columns=`; presumably the
# frame is a polars DataFrame, where this drops columns - TODO confirm.
cfret_feats = cfret_profile_df.drop(cfret_meta).columns

In [12]:
# run the clustering hyper-parameter search on the CFReT profiles:
# 500 trials spread over 5 parallel workers, fixed seed
# NOTE(review): with n_jobs > 1 the trials finish out of order (visible in the
# captured log), so seed=0 may not make the search exactly reproducible -
# confirm against the helper's implementation.
cfret_cluster_results = assess_heterogeneity(
    profiles=cfret_profile_df,
    meta=cfret_meta,
    features=cfret_feats,
    n_trials=500,
    n_jobs=5,
    study_name="cfret_heterogeneity",
    seed=0,
)

[I 2025-09-11 19:31:08,926] A new study created in memory with name: cfret_heterogeneity


  0%|          | 0/500 [00:00<?, ?it/s]

  warn(
  warn(
  warn(


[I 2025-09-11 19:31:20,034] Trial 4 finished with value: -1.0 and parameters: {'n_neighbors': 29, 'dim_reduction': 'pca', 'pca_solver': 'randomized', 'pca_components': 79, 'resolution': 0.8644236208804086, 'method': 'leiden', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'leidenalg'}. Best is trial 4 with value: -1.0.


  warn(


[I 2025-09-11 19:31:24,799] Trial 1 finished with value: -1.0 and parameters: {'n_neighbors': 190, 'dim_reduction': 'raw', 'pca_solver': 'auto', 'pca_components': 74, 'resolution': 0.7216206574800512, 'method': 'leiden', 'dist_metric': 'cosine', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 4 with value: -1.0.
[I 2025-09-11 19:31:26,656] Trial 3 finished with value: -1.0 and parameters: {'n_neighbors': 184, 'dim_reduction': 'raw', 'pca_solver': 'arpack', 'pca_components': 90, 'resolution': 1.6912019168619274, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 4 with value: -1.0.


  sc.pp.pca(
  warn(


[I 2025-09-11 19:31:31,317] Trial 5 finished with value: -1.0 and parameters: {'n_neighbors': 192, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 22, 'resolution': 0.514506400641847, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'leidenalg'}. Best is trial 4 with value: -1.0.
[I 2025-09-11 19:31:35,875] Trial 7 finished with value: -1.0 and parameters: {'n_neighbors': 80, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 17, 'resolution': 1.7879206023305148, 'method': 'leiden', 'dist_metric': 'cosine', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'leidenalg'}. Best is trial 4 with value: -1.0.


  warn(


[I 2025-09-11 19:31:48,608] Trial 6 finished with value: -1.0 and parameters: {'n_neighbors': 195, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 84, 'resolution': 1.4609437513911498, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 4 with value: -1.0.
[I 2025-09-11 19:31:52,910] Trial 9 finished with value: -1.0 and parameters: {'n_neighbors': 152, 'dim_reduction': 'raw', 'pca_solver': 'arpack', 'pca_components': 85, 'resolution': 0.21382219931142857, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'leidenalg'}. Best is trial 4 with value: -1.0.
[I 2025-09-11 19:31:57,994] Trial 10 finished with value: -1.0 and parameters: {'n_neighbors': 145, 'dim_reduction': 'pca', 'pca_solver': 'randomized', 'pca_components': 35, 'resolution': 1.0678096919641347, 'method': 'leiden', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clus

`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 19:32:22,034] Trial 11 finished with value: 0.5048687061958449 and parameters: {'n_neighbors': 39, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 11, 'resolution': 0.6204250198862267, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 11 with value: 0.5048687061958449.
[I 2025-09-11 19:32:35,749] Trial 13 finished with value: 0.48025460447316304 and parameters: {'n_neighbors': 16, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 78, 'resolution': 0.853396075907053, 'method': 'leiden', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 11 with value: 0.5048687061958449.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 19:33:08,982] Trial 2 finished with value: 0.5162869816667686 and parameters: {'n_neighbors': 200, 'dim_reduction': 'raw', 'pca_solver': 'randomized', 'pca_components': 77, 'resolution': 0.5881046577677544, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'leidenalg'}. Best is trial 2 with value: 0.5162869816667686.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 19:33:20,030] Trial 8 finished with value: 0.5115144235416149 and parameters: {'n_neighbors': 176, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 96, 'resolution': 0.8688749817653099, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 2 with value: 0.5162869816667686.
[I 2025-09-11 19:33:21,738] Trial 14 finished with value: 0.47455823250114126 and parameters: {'n_neighbors': 13, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 49, 'resolution': 0.16504400438653954, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 2 with value: 0.5162869816667686.
[I 2025-09-11 19:33:26,832] Trial 12 finished with value: 0.487514443242759 and parameters: {'n_neighbors': 189, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 98, 'resolution': 1.6845245321242266, 'method': 'leiden', 'di

`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 19:34:05,544] Trial 16 finished with value: 0.5076375144754756 and parameters: {'n_neighbors': 115, 'dim_reduction': 'raw', 'pca_solver': 'randomized', 'pca_components': 55, 'resolution': 0.2616839631078705, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 2 with value: 0.5162869816667686.
[I 2025-09-11 19:35:04,437] Trial 19 finished with value: 0.5168488181452883 and parameters: {'n_neighbors': 131, 'dim_reduction': 'raw', 'pca_solver': 'randomized', 'pca_components': 62, 'resolution': 1.176351762048585, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'leidenalg'}. Best is trial 19 with value: 0.5168488181452883.
[I 2025-09-11 19:35:04,589] Trial 18 finished with value: 0.5087809709933446 and parameters: {'n_neighbors': 140, 'dim_reduction': 'raw', 'pca_solver': 'randomized', 'pca_components': 99, 'resolution': 1.1493783009536922, 'me

`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 19:44:22,892] Trial 45 finished with value: 0.5064362946910761 and parameters: {'n_neighbors': 127, 'dim_reduction': 'raw', 'pca_solver': 'randomized', 'pca_components': 58, 'resolution': 0.5127346643194762, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'leidenalg'}. Best is trial 19 with value: 0.5168488181452883.
[I 2025-09-11 19:44:25,799] Trial 46 finished with value: 0.5100748158163738 and parameters: {'n_neighbors': 199, 'dim_reduction': 'raw', 'pca_solver': 'randomized', 'pca_components': 58, 'resolution': 0.47299019266384423, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'leidenalg'}. Best is trial 19 with value: 0.5168488181452883.
[I 2025-09-11 19:44:30,263] Trial 47 finished with value: 0.5193532231214614 and parameters: {'n_neighbors': 196, 'dim_reduction': 'raw', 'pca_solver': 'randomized', 'pca_components': 59, 'resolution': 0.241481583578249

`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 19:44:51,826] Trial 52 finished with value: -1.0 and parameters: {'n_neighbors': 152, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 84, 'resolution': 0.26723382264351636, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 47 with value: 0.5193532231214614.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 19:44:56,392] Trial 54 finished with value: -1.0 and parameters: {'n_neighbors': 200, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 85, 'resolution': 0.23260684428189124, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 47 with value: 0.5193532231214614.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 19:45:00,579] Trial 53 finished with value: -1.0 and parameters: {'n_neighbors': 155, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 84, 'resolution': 0.8358907785333507, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 47 with value: 0.5193532231214614.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 19:45:44,000] Trial 51 finished with value: 0.4949571804165428 and parameters: {'n_neighbors': 154, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 83, 'resolution': 0.849441957546779, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'leidenalg'}. Best is trial 47 with value: 0.5193532231214614.
[I 2025-09-11 19:46:21,298] Trial 55 finished with value: 0.4971122241641983 and parameters: {'n_neighbors': 186, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 75, 'resolution': 0.21396828963784575, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 47 with value: 0.5193532231214614.
[I 2025-09-11 19:46:46,182] Trial 56 finished with value: 0.5227365421986392 and parameters: {'n_neighbors': 193, 'dim_reduction': 'raw', 'pca_solver': 'auto', 'pca_components': 75, 'resolution': 0.19641679441049398, 'method': 'louva

`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 19:59:56,142] Trial 101 finished with value: -1.0 and parameters: {'n_neighbors': 159, 'dim_reduction': 'raw', 'pca_solver': 'auto', 'pca_components': 50, 'resolution': 0.5714008992979998, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 92 with value: 0.5237995240646663.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 20:00:01,080] Trial 102 finished with value: -1.0 and parameters: {'n_neighbors': 180, 'dim_reduction': 'raw', 'pca_solver': 'auto', 'pca_components': 50, 'resolution': 0.5743480410457402, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 92 with value: 0.5237995240646663.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 20:00:07,145] Trial 103 finished with value: -1.0 and parameters: {'n_neighbors': 181, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 45, 'resolution': 0.7323964949609963, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 92 with value: 0.5237995240646663.
[I 2025-09-11 20:00:08,368] Trial 99 finished with value: 0.5198143987358433 and parameters: {'n_neighbors': 162, 'dim_reduction': 'raw', 'pca_solver': 'auto', 'pca_components': 71, 'resolution': 0.5635587077961162, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 92 with value: 0.5237995240646663.
[I 2025-09-11 20:00:57,757] Trial 100 finished with value: 0.5179261777448978 and parameters: {'n_neighbors': 161, 'dim_reduction': 'raw', 'pca_solver': 'auto', 'pca_components': 59, 'resolution': 0.5769065028931142, 'method': 'louvain', 'dist_metric': '

`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 20:23:53,986] Trial 162 finished with value: -1.0 and parameters: {'n_neighbors': 197, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 11, 'resolution': 0.29280108808332517, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 138 with value: 0.5450384826304095.
[I 2025-09-11 20:23:59,921] Trial 160 finished with value: 0.5235406430532232 and parameters: {'n_neighbors': 196, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 26, 'resolution': 0.2863049931057955, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 138 with value: 0.5450384826304095.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 20:24:05,168] Trial 163 finished with value: -1.0 and parameters: {'n_neighbors': 200, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 26, 'resolution': 0.28795095960100553, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 138 with value: 0.5450384826304095.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 20:24:10,850] Trial 164 finished with value: -1.0 and parameters: {'n_neighbors': 200, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 25, 'resolution': 0.28692421808082325, 'method': 'louvain', 'dist_metric': 'cosine', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 138 with value: 0.5450384826304095.
[I 2025-09-11 20:24:10,930] Trial 161 finished with value: 0.5202510688121982 and parameters: {'n_neighbors': 200, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 26, 'resolution': 0.2991243996166103, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 138 with value: 0.5450384826304095.
[I 2025-09-11 20:25:48,349] Trial 165 finished with value: 0.5188175707839661 and parameters: {'n_neighbors': 197, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 10, 'resolution': 0.2774451425377867, 'method': 'louvain', 'dist_

`resolution` parameter has no effect for flavor "igraph"
`resolution` parameter has no effect for flavor "igraph"
`resolution` parameter has no effect for flavor "igraph"
`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 21:00:59,474] Trial 254 finished with value: 0.5217598406575202 and parameters: {'n_neighbors': 185, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 10, 'resolution': 0.14445317736880384, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 21:01:00,868] Trial 255 finished with value: 0.5021423472728689 and parameters: {'n_neighbors': 186, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 10, 'resolution': 0.14552971064842668, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 21:01:06,023] Trial 256 finished with value: 0.503770461191206 and parameters: {'n_neighbors': 186, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 14, 'resolution': 0.14687708112269446, 'method': '

`resolution` parameter has no effect for flavor "igraph"
`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 21:27:10,800] Trial 325 finished with value: 0.5193762031055451 and parameters: {'n_neighbors': 171, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 17, 'resolution': 0.10188935445890612, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 21:27:14,798] Trial 324 finished with value: 0.5185273000916868 and parameters: {'n_neighbors': 183, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 12, 'resolution': 0.10132987724291216, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 21:27:23,585] Trial 328 finished with value: 0.49667396986311696 and parameters: {'n_neighbors': 177, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 10, 'resolution': 0.132369391448073, 'method': 'louv

`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 21:28:22,096] Trial 329 finished with value: 0.5038341411542591 and parameters: {'n_neighbors': 188, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 10, 'resolution': 0.10100667374249128, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 21:28:46,500] Trial 330 finished with value: 0.5221541462491333 and parameters: {'n_neighbors': 174, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 10, 'resolution': 0.13574921440950075, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 21:29:20,146] Trial 333 finished with value: 0.49476871531136124 and parameters: {'n_neighbors': 188, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 17, 'resolution': 0.14498952736944196, 'method':

`resolution` parameter has no effect for flavor "igraph"
`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 21:46:34,724] Trial 381 finished with value: 0.5059288703008121 and parameters: {'n_neighbors': 180, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 18, 'resolution': 1.3272578615489425, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 21:46:53,747] Trial 382 finished with value: 0.5152726371438864 and parameters: {'n_neighbors': 180, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 16, 'resolution': 0.18200583195278708, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 21:46:57,321] Trial 383 finished with value: 0.49352027247648833 and parameters: {'n_neighbors': 180, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 17, 'resolution': 0.21910384879695316, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 21:46:59,735] Trial 384 finished with value: 0.506668249446707 and parameters: {'n_neighbors': 178, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 92, 'resolution': 0.2006569221617458, 'method': 'louvai

`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 21:47:41,019] Trial 387 finished with value: 0.512666483985924 and parameters: {'n_neighbors': 188, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 12, 'resolution': 0.22456931519232837, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.


`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 21:49:08,626] Trial 389 finished with value: 0.49108206458434617 and parameters: {'n_neighbors': 188, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 12, 'resolution': 1.9937418148745798, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 21:49:11,378] Trial 390 finished with value: 0.49053855499937676 and parameters: {'n_neighbors': 188, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 12, 'resolution': 0.1343038457062479, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 21:49:11,756] Trial 388 finished with value: 0.49344688286678595 and parameters: {'n_neighbors': 188, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 12, 'resolution': 0.21780062422866836, 'method': 'louv

`resolution` parameter has no effect for flavor "igraph"
`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 22:19:32,944] Trial 475 finished with value: 0.5191351102055484 and parameters: {'n_neighbors': 178, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 11, 'resolution': 0.10125465607130474, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 22:19:35,803] Trial 474 finished with value: 0.4876910482679408 and parameters: {'n_neighbors': 177, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 13, 'resolution': 0.13217703938957898, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 22:19:37,236] Trial 473 finished with value: 0.5175508758978188 and parameters: {'n_neighbors': 191, 'dim_reduction': 'pca', 'pca_solver': 'arpack', 'pca_components': 13, 'resolution': 0.13676519994170383, 'method': 

`resolution` parameter has no effect for flavor "igraph"


[I 2025-09-11 22:20:06,344] Trial 480 finished with value: 0.2901266167800889 and parameters: {'n_neighbors': 6, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 17, 'resolution': 0.10064759684336262, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'vtraag', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 22:20:13,604] Trial 478 finished with value: 0.4961359528340423 and parameters: {'n_neighbors': 59, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 15, 'resolution': 0.1345374844248876, 'method': 'louvain', 'dist_metric': 'euclidean', 'louv_clustering_imp': 'igraph', 'leid_clustering_imp': 'igraph'}. Best is trial 190 with value: 0.547811394867538.
[I 2025-09-11 22:20:17,373] Trial 479 finished with value: 0.5094768250595711 and parameters: {'n_neighbors': 63, 'dim_reduction': 'pca', 'pca_solver': 'auto', 'pca_components': 17, 'resolution': 0.16603833906960974, 'method': 'louvain', 

In [None]:
# the search returns cluster labels as an object array of digit strings
# (e.g. '1', '0', '5'); cast them to integers in place so the JSON export
# below writes them as numbers
cfret_cluster_results["cluster_labels"] = cfret_cluster_results[
    "cluster_labels"
].astype(int)

{'study': <optuna.study.study.Study at 0x76d74c949eb0>,
 'cluster_labels': array(['1', '1', '0', ..., '0', '5', '0'], shape=(20865,), dtype=object),
 'best_score': 0.547811394867538,
 'best_params': {'n_neighbors': 196,
  'dim_reduction': 'pca',
  'pca_solver': 'arpack',
  'pca_components': 14,
  'resolution': 0.10058072197144338,
  'method': 'louvain',
  'dist_metric': 'euclidean',
  'louv_clustering_imp': 'vtraag',
  'leid_clustering_imp': 'igraph'},
 'n_clusters': 6,
 'n_trials': 500}

In [16]:
# write the CFReT search summary to disk; the `study` entry is left out and the
# label array is converted to a plain list so the payload is JSON-serialisable
cfret_results_file = results_dir / "cfret_heterogenic_clusters_results.json"
with cfret_results_file.open("w") as f:
    # scalar / dict fields are copied over unchanged, in this order
    cfret_summary = {
        key: cfret_cluster_results[key]
        for key in ("best_score", "best_params", "n_clusters", "n_trials")
    }
    # per-cell cluster labels are stored last, under the dataset-specific key
    cfret_summary["cfret_cluster_results"] = cfret_cluster_results[
        "cluster_labels"
    ].tolist()
    json.dump(cfret_summary, f, indent=4)

### Assessing heterogeneity for CPJUMP1 CRISPR data

In [None]:
# split the CPJUMP1 CRISPR columns into metadata and features
# `cpjump1_meta` lists every metadata column, including the index columns
# ("index", "__index_level_0__") carried in the profile table
cpjump1_meta = [
    "index",
    "Metadata_broad_sample",
    "Metadata_ImageNumber",
    "Metadata_Plate",
    "Metadata_Site",
    "Metadata_Well",
    "Metadata_TableNumber",
    "Metadata_ObjectNumber_cytoplasm",
    "Metadata_Cytoplasm_Parent_Cells",
    "Metadata_Cytoplasm_Parent_Nuclei",
    "Metadata_ObjectNumber_cells",
    "Metadata_ObjectNumber",
    "Metadata_gene",
    "Metadata_pert_type",
    "Metadata_control_type",
    "Metadata_target_sequence",
    "Metadata_negcon_control_type",
    "__index_level_0__",
]

# feature columns are whatever remains after dropping the metadata columns
# NOTE(review): `.drop(<list>)` is called without `columns=`; presumably the
# frame is a polars DataFrame, where this drops columns - TODO confirm.
cpjump1_feats = cpjump1_trt_crispr_df.drop(cpjump1_meta).columns

In [None]:
# run the clustering hyper-parameter search on the CPJUMP1 CRISPR profiles
# (10 trials, single worker, fixed seed)
cpjump1_cluster_results = assess_heterogeneity(
    profiles=cpjump1_trt_crispr_df,
    meta=cpjump1_meta,
    features=cpjump1_feats,
    n_trials=10,
    n_jobs=1,
    study_name="cpjump1_heterogeneity",
    seed=0,
)

# The result dict contains an optuna Study object and a NumPy label array,
# neither of which `json.dump` can serialise, so dumping the dict directly
# raises TypeError. Export only the serialisable summary fields, mirroring the
# CFReT export cell (same key pattern, integer cluster labels, indent=4).
with open(results_dir / "cpjump1_heterogenic_clusters_results.json", "w") as f:
    json.dump(
        {
            "best_score": cpjump1_cluster_results["best_score"],
            "best_params": cpjump1_cluster_results["best_params"],
            "n_clusters": cpjump1_cluster_results["n_clusters"],
            "n_trials": cpjump1_cluster_results["n_trials"],
            # labels come back as an object array of digit strings; cast to int
            # and convert to a plain list so they are written as JSON numbers
            "cpjump1_cluster_results": cpjump1_cluster_results["cluster_labels"]
            .astype(int)
            .tolist(),
        },
        f,
        indent=4,
    )