# 2.assess-heterogeneity 

This section of the notebook uses buscar's clustering module to assess single-cell heterogeneity. We'll focus on three specific datasets: **CFReT**, **MitoCheck**, and **CPJUMP (crispir)**. The goal is to use our clustering algorithms to identify cellular heterogeneity at the single-cell level. 

A key advantage of using these datasets is that they include ground-truth labels. This allows us to evaluate whether our clustering algorithms are identifying biologically meaningful groups in a data-driven way, and to assess the accuracy of our approach.

In [15]:
import sys
import pathlib


sys.path.append("../../")
from utils.heterogeneity import cluster_profiles
from utils.io_utils import load_profiles

Setting parameter grid search for the optimized clustering 

In [2]:
cluster_search_param_grid = {
    # Clustering resolution: how granular the clusters should be
    "cluster_resolution": {"type": "float", "low": 0.1, "high": 3.0},
    # Number of neighbors for graph construction
    "n_neighbors": {"type": "int", "low": 5, "high": 50},
    # Clustering algorithm
    "cluster_method": {"type": "categorical", "choices": ["leiden", "louvain"]},
    # Distance metric for neighbor computation
    "neighbor_distance_metric": {
        "type": "categorical",
        "choices": ["euclidean", "cosine", "manhattan"],
    },
    # Dimensionality reduction approach
    "dim_reduction": {"type": "categorical", "choices": ["PCA", "raw"]},
}

Setting paths

In [3]:
# set module and data directory paths
download_module_path = pathlib.Path("../0.download-data/").resolve(strict=True)
sc_profiles_path = (download_module_path / "data" / "sc-profiles").resolve(strict=True)


# setting profiles paths
cfret_profiles_path = (
    sc_profiles_path / "cfret" / "localhost230405150001_sc_feature_selected.parquet"
).resolve(strict=True)
cpjump1_trt_crispr_profiles_path = (
    sc_profiles_path
    / "cpjump1"
    / "trt-profiles"
    / "cpjump1_crispr_trt_profiles.parquet"
).resolve(strict=True)
mitocheck_trt_profiles_path = (
    sc_profiles_path / "mitocheck" / "mitocheck_concat_profiles.parquet"
).resolve(strict=True)

# create signature output paths
results_dir = pathlib.Path("./results/cluster-labels").resolve()
results_dir.mkdir(exist_ok=True, parents=True)

Loading datasets

In [4]:
# load all profiles
mitocheck_trt_profile_df = load_profiles(mitocheck_trt_profiles_path)
cfret_profile_df = load_profiles(cfret_profiles_path)
cpjump1_trt_crispr_df = load_profiles(cpjump1_trt_crispr_profiles_path)

## Clustering profiles

### Assessing heterogeneity for MitoCheck data

In [26]:
# separate metadata based on phenotypic class
# split metadata and features
mito_meta = [
    "index",
    "Mitocheck_Phenotypic_Class",
    "Cell_UUID",
    "Metadata_cell_id",
    "Location_Center_X",
    "Location_Center_Y",
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Frame",
    "Metadata_Site",
    "Metadata_Plate_Map_Name",
    "Metadata_DNA",
    "Metadata_Gene",
    "Metadata_Gene_Replicate",
]

mito_features = mitocheck_trt_profile_df.drop(mito_meta).columns

In [28]:
cluster_profiles(
    profiles=mitocheck_trt_profile_df,
    meta_features=mito_meta,
    morph_features=mito_features,
    treatment_col="Metadata_Gene",
)

ValueError: could not convert string to float: 'trt'

### Assessing heterogeneity for CFReT data

In [29]:
# split metadata and features for cfret
cfret_meta = [
    "Metadata_WellRow",
    "Metadata_WellCol",
    "Metadata_heart_number",
    "Metadata_cell_type",
    "Metadata_heart_failure_type",
    "Metadata_treatment",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
    "Metadata_Cells_Location_Center_X",
    "Metadata_Cells_Location_Center_Y",
    "Metadata_Image_Count_Cells",
    "Metadata_ImageNumber",
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Cells_Number_Object_Number",
    "Metadata_Cytoplasm_Parent_Cells",
    "Metadata_Cytoplasm_Parent_Nuclei",
    "Metadata_Nuclei_Number_Object_Number",
    "Metadata_Site",
]

cfret_feats = cfret_profile_df.drop(cfret_meta).columns

In [30]:
cfret_cluster_results = cluster_profiles(
    profiles=cfret_profile_df,
    morph_features=cfret_feats,
    meta_features=cfret_meta,
    treatment_col="Metadata_treatment",
)
cfret_cluster_results


 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(


Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Nuclei_Number_Object_Number,Metadata_Site,Metadata_cell_id,Cytoplasm_AreaShape_BoundingBoxMinimum_X,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MeanRadius,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Perimeter,Cytoplasm_AreaShape_Solidity,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,Cytoplasm_AreaShape_Zernike_2_0,Cytoplasm_AreaShape_Zernike_2_2,Cytoplasm_AreaShape_Zernike_3_1,Cytoplasm_AreaShape_Zernike_4_0,Cytoplasm_AreaShape_Zernike_4_2,…,Nuclei_Texture_InfoMeas1_ER_3_00_256,Nuclei_Texture_InfoMeas1_ER_3_01_256,Nuclei_Texture_InfoMeas1_ER_3_02_256,Nuclei_Texture_InfoMeas1_ER_3_03_256,Nuclei_Texture_InfoMeas1_Hoechst_3_00_256,Nuclei_Texture_InfoMeas1_Hoechst_3_01_256,Nuclei_Texture_InfoMeas1_Hoechst_3_02_256,Nuclei_Texture_InfoMeas1_Hoechst_3_03_256,Nuclei_Texture_InfoMeas1_Mitochondria_3_00_256,Nuclei_Texture_InfoMeas1_Mitochondria_3_01_256,Nuclei_Texture_InfoMeas1_Mitochondria_3_02_256,Nuclei_Texture_InfoMeas1_Mitochondria_3_03_256,Nuclei_Texture_InfoMeas1_PM_3_00_256,Nuclei_Texture_InfoMeas1_PM_3_01_256,Nuclei_Texture_InfoMeas1_PM_3_02_256,Nuclei_Texture_InfoMeas1_PM_3_03_256,Nuclei_Texture_InfoMeas2_ER_3_01_256,Nuclei_Texture_InfoMeas2_ER_3_03_256,Nuclei_Texture_InfoMeas2_Hoechst_3_01_256,Nuclei_Texture_InfoMeas2_Hoechst_3_03_256,Nuclei_Texture_InfoMeas2_PM_3_01_256,Nuclei_Texture_InfoMeas2_PM_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_03_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumEntropy_PM_3_01_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_01_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_cluster_id,Metadata_cluster_n_cells,Metadata_treatment_n_cells,Metadata_cluster_ratio
str,i64,i64,str,str,str,f64,f64,f64,f64,i64,i64,str,str,i64,i64,i64,i64,str,u64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,cat,u32,u32,f64
"""B""",2,9,"""failing""","""rejected""","""DMSO""",221.046761,137.115493,246.6028,109.285755,40,1,"""localhost230405150001""","""B02""",1,1,6,6,"""f00""",3741745246391810767,-1.35494,0.841229,0.648883,-0.850138,-1.045214,1.298358,0.376165,0.935101,1.530228,-0.983617,-0.261031,-0.299817,-0.721977,0.944725,0.161074,0.532329,1.845864,…,-0.173336,0.300041,0.217945,-0.039774,0.488531,0.472164,0.28659,0.464359,0.501649,0.507623,1.076663,0.741941,-0.696022,-0.178762,0.186741,0.158222,0.341595,0.50487,-0.440604,-0.426966,0.194372,-0.035117,0.400021,-0.619206,-0.393448,0.961214,0.406068,0.374039,-0.280532,-0.158967,-0.344804,-0.263653,-0.305486,"""DMSO_leiden_84""",60,10519,0.570396
"""B""",2,9,"""failing""","""rejected""","""DMSO""",690.596142,183.067828,716.170091,177.132195,40,1,"""localhost230405150001""","""B02""",2,2,7,7,"""f00""",2742388086301918299,0.657107,-0.850399,-0.584931,2.090925,1.263259,-0.021031,1.627957,0.944161,-0.085511,1.475345,2.164761,-0.688462,1.215015,1.499086,-0.770667,1.012721,0.6791,…,0.134835,0.263514,-0.124309,0.634517,0.968512,0.859562,0.351144,0.914468,-2.508508,-2.389124,-1.80698,-2.121536,-0.231231,-0.763949,-1.055166,-0.258152,0.282319,0.048807,-0.981164,-1.0743,0.612996,0.290339,0.030854,-0.421502,-0.61852,-0.050925,0.424753,0.323462,-0.096856,-0.218001,-0.359297,2.621455,-0.175679,"""DMSO_leiden_55""",71,10519,0.674969
"""B""",2,9,"""failing""","""rejected""","""DMSO""",626.56149,206.923698,623.94374,199.90644,40,1,"""localhost230405150001""","""B02""",3,3,8,8,"""f00""",8052448581702921839,0.384287,-0.727344,0.399813,0.699568,0.778991,-0.192578,-0.166121,-0.185078,-0.620564,0.385325,0.41953,-1.35377,-0.189027,1.88019,-0.198823,0.77826,2.084304,…,-0.126239,0.315114,-0.682006,-0.952994,0.534521,0.448969,-0.512213,0.68761,-0.333052,-1.116806,-0.671374,-0.085583,0.565659,0.117809,-0.035232,0.340022,0.392109,0.906171,-0.637012,-0.912759,-0.139719,-0.319312,-0.119514,-0.62708,-0.213998,0.492022,0.783465,0.531513,-0.515924,-0.090464,-0.381751,-0.23489,-0.312005,"""DMSO_leiden_58""",70,10519,0.665462
"""B""",2,9,"""failing""","""rejected""","""DMSO""",559.448583,220.68816,528.646623,196.955552,40,1,"""localhost230405150001""","""B02""",4,4,9,9,"""f00""",14147989784803112122,-0.08178,-0.31057,-1.984463,0.923396,-0.152527,-0.454748,0.485672,0.978143,0.075853,0.333035,1.036702,2.124015,-0.11271,-1.276017,0.663499,1.351768,-2.07981,…,-0.285138,0.187411,-0.401472,-1.323716,0.216479,0.694455,0.22334,0.272893,-1.610123,-1.983535,-1.990444,-1.759351,-0.667021,-1.511134,-1.70973,-1.025608,0.38988,0.970785,-0.723812,-0.240465,1.02861,0.817875,0.731123,-0.410279,0.066951,0.233985,0.697668,0.3868,0.216837,-0.078625,-0.345897,-0.148249,-0.205381,"""DMSO_leiden_107""",53,10519,0.50385
"""B""",2,9,"""failing""","""rejected""","""DMSO""",909.019946,247.69434,897.965996,253.621836,40,1,"""localhost230405150001""","""B02""",5,5,10,10,"""f00""",4803356350846732868,1.384627,-0.236857,0.651571,-0.525561,-0.256208,-0.352022,-0.51073,-0.650514,-0.61187,-0.390602,-0.915644,0.274757,-0.807468,-0.263914,1.012877,0.333081,0.457026,…,0.560328,0.087048,0.500935,1.024688,0.682356,0.703425,-0.559919,0.535412,-0.446346,-0.250839,-0.325067,-0.220781,0.135176,-0.068065,-1.328074,-0.471597,-0.313553,-1.011855,-0.921082,-0.718369,-0.1701,0.076669,0.151063,0.78411,0.796587,-0.833035,0.971781,0.96971,-0.859995,-0.437968,-0.375427,0.054053,-0.346036,"""DMSO_leiden_40""",78,10519,0.741515
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""G""",10,9,"""failing""","""rejected""","""DMSO""",417.553262,678.670439,411.069742,701.062876,20,680,"""localhost230405150001""","""G10""",16,16,21,21,"""f15""",1828074344285244675,-0.411438,-0.550025,0.933272,0.159549,0.288706,0.000636,-0.337034,-0.783618,-0.546877,-0.099205,-0.353632,-0.727437,-0.141679,1.124212,-1.315817,-0.152297,1.113884,…,0.027805,0.1628,0.467093,0.150966,0.389177,0.061158,-0.186975,-0.483708,0.062713,-0.236416,-0.99515,-0.389831,-0.11007,0.167719,-1.285632,-1.442444,0.235799,0.235442,0.310009,0.747407,0.351582,1.123192,0.387158,0.046973,-0.009479,0.323142,-0.684759,0.097293,0.554241,-0.318752,-0.199681,-0.225785,0.028462,"""DMSO_leiden_87""",59,10519,0.56089
"""G""",10,9,"""failing""","""rejected""","""DMSO""",498.481236,693.440765,520.422886,716.795365,20,680,"""localhost230405150001""","""G10""",17,17,22,22,"""f15""",17996227433841145301,-0.070413,-0.879821,-1.853292,1.637607,1.405004,-0.384347,1.287084,1.054404,-0.290838,1.289462,2.013421,-0.172146,1.370837,-0.761592,-0.284166,0.507502,-1.569647,…,-0.353139,-1.029642,-1.874687,-1.043998,0.232443,0.168135,-1.206198,-0.183575,-0.867178,-1.448457,-2.28079,-1.267928,0.01632,-0.405172,-0.720056,-0.313263,1.08923,1.094201,0.397685,0.707845,1.018167,0.96371,0.833534,-1.45509,-1.22603,-1.051954,-1.508309,-1.71505,1.542973,0.817171,-0.005211,0.323479,0.822093,"""DMSO_leiden_120""",49,10519,0.465824
"""G""",10,9,"""failing""","""rejected""","""DMSO""",318.200661,728.801228,315.515137,756.658659,20,680,"""localhost230405150001""","""G10""",18,18,23,23,"""f15""",6564083374445386651,-0.885084,2.54069,0.386993,-1.984288,-1.446532,0.577869,-1.269813,0.676251,0.859093,-2.593615,-1.700356,-0.408731,-1.631464,-0.853618,-0.597128,-0.849872,-0.652601,…,0.293678,-0.103463,-0.569333,0.757699,0.179876,0.208219,-0.162802,0.130081,0.545801,0.198027,0.031118,0.645746,-0.460905,-0.349052,-1.305133,-0.598384,-0.065092,-0.650356,0.256878,0.33604,0.326594,0.471925,0.525841,0.861197,0.453347,0.591169,0.444195,0.553663,-0.179696,-0.408678,-0.093233,-0.259144,-0.266575,"""DMSO_leiden_11""",97,10519,0.922141
"""G""",10,9,"""failing""","""rejected""","""DMSO""",491.591029,876.989886,498.0537,851.270108,20,680,"""localhost230405150001""","""G10""",19,19,26,26,"""f15""",573771430041384564,-0.430384,0.709102,0.657286,-0.69934,-0.988828,1.013925,0.457867,0.645007,1.280872,-0.433404,-0.759422,1.370071,-1.353948,-1.339713,1.7348,1.408307,-1.057471,…,-0.709953,-0.441521,-0.749642,-0.716573,0.393876,-0.00311,0.264086,0.394655,0.67071,0.725018,0.829167,1.067858,-1.204221,-1.454792,-1.629882,-1.260699,0.277148,0.40219,0.416932,0.022673,0.751658,0.666525,0.553082,0.47027,0.667388,0.808673,0.988244,0.985433,-0.412572,-0.368709,-0.134434,-0.265111,-0.312917,"""DMSO_leiden_46""",75,10519,0.712996


### Assessing heterogeneity for CPJUMP1 CRISPR data

In [31]:
# split metadata and features for cpjump1
cpjump1_meta = [
    "index",
    "Metadata_broad_sample",
    "Metadata_ImageNumber",
    "Metadata_Plate",
    "Metadata_Site",
    "Metadata_Well",
    "Metadata_TableNumber",
    "Metadata_ObjectNumber_cytoplasm",
    "Metadata_Cytoplasm_Parent_Cells",
    "Metadata_Cytoplasm_Parent_Nuclei",
    "Metadata_ObjectNumber_cells",
    "Metadata_ObjectNumber",
    "Metadata_gene",
    "Metadata_pert_type",
    "Metadata_control_type",
    "Metadata_target_sequence",
    "Metadata_negcon_control_type",
    "__index_level_0__",
]

# split metadata and features for cpjump1
cpjump1_feats = cpjump1_trt_crispr_df.drop(cpjump1_meta).columns

In [33]:
cpjump1_cluster_results = cluster_profiles(
    profiles=cpjump1_trt_crispr_df,
    meta_features=cpjump1_meta,
    morph_features=cpjump1_feats,
    treatment_col="Metadata_gene",
)