# CFReT BUSCAR analysis


This notebook demonstrates applying the BUSCAR pipeline to the CFReT pilot Cell Painting dataset.
It walks through data loading, signature extraction, clustering, measuring phenotypic activity
(between a reference control and experimental treatments) and ranking treatments by effect.

Data & references
- Data source: CFReT pilot dataset (see paper: https://www.ahajournals.org/doi/full/10.1161/CIRCULATIONAHA.124.071956)
- Original data repo: https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis

In [1]:
import sys
import json
import pathlib

import polars as pl

sys.path.append("../../")
from utils.io_utils import load_profiles

# from utils.metrics import measure_phenotypic_activity
from utils.data_utils import split_meta_and_features
from utils.signatures import get_signatures
from utils.heterogeneity import optimized_clustering
from utils.metrics import measure_phenotypic_activity
from utils.identify_hits import identify_compound_hit

  from pkg_resources import get_distribution, DistributionNotFound


Setting parameters

In [2]:
# Metadata columns used to group cells throughout the notebook:
# one per treatment, one per treatment-and-heart combination
treatment_col = "Metadata_treatment"
treatment_heart_col = "Metadata_treatment_and_heart"


# Search space handed to the clustering optimization (one entry per parameter)
cfret_cluster_param_grid = dict(
    # how granular the clusters should be
    cluster_resolution=dict(type="float", low=0.1, high=2.2),
    # neighbors used when constructing the graph
    n_neighbors=dict(type="int", low=5, high=100),
    # candidate clustering algorithms
    cluster_method=dict(type="categorical", choices=["leiden", "louvain"]),
    # candidate distance metrics for the neighbor computation
    neighbor_distance_metric=dict(
        type="categorical",
        choices=["euclidean", "cosine", "manhattan"],
    ),
    # candidate dimensionality reduction approaches
    dim_reduction=dict(type="categorical", choices=["PCA", "raw"]),
)

Setting input and output paths

In [3]:
# locate the directory holding the CFReT single-cell profiles;
# strict resolution raises if the directory does not exist
cfret_data_dir = pathlib.Path("../0.download-data/data/sc-profiles/cfret/")
cfret_data_dir = cfret_data_dir.resolve(strict=True)

# input files inside the data directory (each must already exist)
cfret_profiles_path = cfret_data_dir.joinpath(
    "localhost230405150001_sc_feature_selected.parquet"
).resolve(strict=True)
cfret_feature_space_path = cfret_data_dir.joinpath(
    "cfret_feature_space_configs.json"
).resolve(strict=True)

# directory where this notebook writes its results (created when absent)
results_dir = pathlib.Path("./results/cfret-pilot").resolve()
results_dir.mkdir(parents=True, exist_ok=True)

## Data preprocessing

In [4]:
# loading profiles
cfret_df = load_profiles(cfret_profiles_path)

# Add a metadata column that combines the treatment and the heart number
# (e.g. "DMSO_heart_9"). Column names come from the parameters cell
# (treatment_col / treatment_heart_col) so preprocessing stays in sync with
# the clustering and scoring cells that use the same variables.
cfret_df = cfret_df.with_columns(
    (
        pl.col(treatment_col).cast(pl.Utf8)
        + "_heart_"
        + pl.col("Metadata_heart_number").cast(pl.Utf8)
    ).alias(treatment_heart_col)
)

# Relabel the DMSO controls per heart so the treatment column distinguishes
# the reference from the other groups. In this example heart 11 + DMSO is the
# reference (healthy heart + DMSO); heart 9 + DMSO gets its own label too.
# The two conditions are mutually exclusive, so a single chained
# when/then is equivalent to relabeling in two separate passes.
is_dmso = pl.col(treatment_col) == "DMSO"
heart_number = pl.col("Metadata_heart_number")
cfret_df = cfret_df.with_columns(
    pl.when(is_dmso & (heart_number == 11))
    .then(pl.lit("DMSO_heart_11"))
    .when(is_dmso & (heart_number == 9))
    .then(pl.lit("DMSO_heart_9"))
    .otherwise(pl.col(treatment_col))
    .alias(treatment_col)
)

# split features
cfret_meta, cfret_feats = split_meta_and_features(cfret_df)


# Display data
cfret_df.head()

Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Nuclei_Number_Object_Number,Metadata_Site,Cytoplasm_AreaShape_BoundingBoxMinimum_X,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MeanRadius,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Perimeter,Cytoplasm_AreaShape_Solidity,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,Cytoplasm_AreaShape_Zernike_2_0,Cytoplasm_AreaShape_Zernike_2_2,Cytoplasm_AreaShape_Zernike_3_1,Cytoplasm_AreaShape_Zernike_4_0,Cytoplasm_AreaShape_Zernike_4_2,Cytoplasm_AreaShape_Zernike_5_1,…,Nuclei_Texture_DifferenceVariance_Actin_3_01_256,Nuclei_Texture_DifferenceVariance_Mitochondria_3_03_256,Nuclei_Texture_DifferenceVariance_PM_3_03_256,Nuclei_Texture_InfoMeas1_ER_3_00_256,Nuclei_Texture_InfoMeas1_ER_3_01_256,Nuclei_Texture_InfoMeas1_ER_3_02_256,Nuclei_Texture_InfoMeas1_ER_3_03_256,Nuclei_Texture_InfoMeas1_Hoechst_3_00_256,Nuclei_Texture_InfoMeas1_Hoechst_3_01_256,Nuclei_Texture_InfoMeas1_Hoechst_3_02_256,Nuclei_Texture_InfoMeas1_Hoechst_3_03_256,Nuclei_Texture_InfoMeas1_Mitochondria_3_00_256,Nuclei_Texture_InfoMeas1_Mitochondria_3_01_256,Nuclei_Texture_InfoMeas1_Mitochondria_3_02_256,Nuclei_Texture_InfoMeas1_Mitochondria_3_03_256,Nuclei_Texture_InfoMeas1_PM_3_00_256,Nuclei_Texture_InfoMeas1_PM_3_01_256,Nuclei_Texture_InfoMeas1_PM_3_02_256,Nuclei_Texture_InfoMeas1_PM_3_03_256,Nuclei_Texture_InfoMeas2_ER_3_01_256,Nuclei_Texture_InfoMeas2_ER_3_03_256,Nuclei_Texture_InfoMeas2_Hoechst_3_01_256,Nuclei_Textu
re_InfoMeas2_Hoechst_3_03_256,Nuclei_Texture_InfoMeas2_PM_3_01_256,Nuclei_Texture_InfoMeas2_PM_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_03_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumEntropy_PM_3_01_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_01_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_treatment_and_heart
str,i64,i64,str,str,str,f64,f64,f64,f64,i64,i64,str,str,i64,i64,i64,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
"""B""",2,9,"""failing""","""rejected""","""DMSO_heart_9""",221.046761,137.115493,246.6028,109.285755,40,1,"""localhost230405150001""","""B02""",1,1,6,6,"""f00""",-1.35494,0.841229,0.648883,-0.850138,-1.045214,1.298358,0.376165,0.935101,1.530228,-0.983617,-0.261031,-0.299817,-0.721977,0.944725,0.161074,0.532329,1.845864,-1.418634,…,-0.052719,0.797095,0.359081,-0.173336,0.300041,0.217945,-0.039774,0.488531,0.472164,0.28659,0.464359,0.501649,0.507623,1.076663,0.741941,-0.696022,-0.178762,0.186741,0.158222,0.341595,0.50487,-0.440604,-0.426966,0.194372,-0.035117,0.400021,-0.619206,-0.393448,0.961214,0.406068,0.374039,-0.280532,-0.158967,-0.344804,-0.263653,-0.305486,"""DMSO_heart_9"""
"""B""",2,9,"""failing""","""rejected""","""DMSO_heart_9""",690.596142,183.067828,716.170091,177.132195,40,1,"""localhost230405150001""","""B02""",2,2,7,7,"""f00""",0.657107,-0.850399,-0.584931,2.090925,1.263259,-0.021031,1.627957,0.944161,-0.085511,1.475345,2.164761,-0.688462,1.215015,1.499086,-0.770667,1.012721,0.6791,0.429311,…,-0.318777,-1.154168,-0.66473,0.134835,0.263514,-0.124309,0.634517,0.968512,0.859562,0.351144,0.914468,-2.508508,-2.389124,-1.80698,-2.121536,-0.231231,-0.763949,-1.055166,-0.258152,0.282319,0.048807,-0.981164,-1.0743,0.612996,0.290339,0.030854,-0.421502,-0.61852,-0.050925,0.424753,0.323462,-0.096856,-0.218001,-0.359297,2.621455,-0.175679,"""DMSO_heart_9"""
"""B""",2,9,"""failing""","""rejected""","""DMSO_heart_9""",626.56149,206.923698,623.94374,199.90644,40,1,"""localhost230405150001""","""B02""",3,3,8,8,"""f00""",0.384287,-0.727344,0.399813,0.699568,0.778991,-0.192578,-0.166121,-0.185078,-0.620564,0.385325,0.41953,-1.35377,-0.189027,1.88019,-0.198823,0.77826,2.084304,0.866506,…,-0.437225,0.097014,0.148712,-0.126239,0.315114,-0.682006,-0.952994,0.534521,0.448969,-0.512213,0.68761,-0.333052,-1.116806,-0.671374,-0.085583,0.565659,0.117809,-0.035232,0.340022,0.392109,0.906171,-0.637012,-0.912759,-0.139719,-0.319312,-0.119514,-0.62708,-0.213998,0.492022,0.783465,0.531513,-0.515924,-0.090464,-0.381751,-0.23489,-0.312005,"""DMSO_heart_9"""
"""B""",2,9,"""failing""","""rejected""","""DMSO_heart_9""",559.448583,220.68816,528.646623,196.955552,40,1,"""localhost230405150001""","""B02""",4,4,9,9,"""f00""",-0.08178,-0.31057,-1.984463,0.923396,-0.152527,-0.454748,0.485672,0.978143,0.075853,0.333035,1.036702,2.124015,-0.11271,-1.276017,0.663499,1.351768,-2.07981,2.000062,…,-0.180273,0.154455,0.355861,-0.285138,0.187411,-0.401472,-1.323716,0.216479,0.694455,0.22334,0.272893,-1.610123,-1.983535,-1.990444,-1.759351,-0.667021,-1.511134,-1.70973,-1.025608,0.38988,0.970785,-0.723812,-0.240465,1.02861,0.817875,0.731123,-0.410279,0.066951,0.233985,0.697668,0.3868,0.216837,-0.078625,-0.345897,-0.148249,-0.205381,"""DMSO_heart_9"""
"""B""",2,9,"""failing""","""rejected""","""DMSO_heart_9""",909.019946,247.69434,897.965996,253.621836,40,1,"""localhost230405150001""","""B02""",5,5,10,10,"""f00""",1.384627,-0.236857,0.651571,-0.525561,-0.256208,-0.352022,-0.51073,-0.650514,-0.61187,-0.390602,-0.915644,0.274757,-0.807468,-0.263914,1.012877,0.333081,0.457026,-0.320432,…,-0.235359,-0.874322,1.036752,0.560328,0.087048,0.500935,1.024688,0.682356,0.703425,-0.559919,0.535412,-0.446346,-0.250839,-0.325067,-0.220781,0.135176,-0.068065,-1.328074,-0.471597,-0.313553,-1.011855,-0.921082,-0.718369,-0.1701,0.076669,0.151063,0.78411,0.796587,-0.833035,0.971781,0.96971,-0.859995,-0.437968,-0.375427,0.054053,-0.346036,"""DMSO_heart_9"""


## BUSCAR pipeline

Creating on and off morphology signatures

In [5]:
# setting output paths
signatures_outpath = (results_dir / "cfret_pilot_signatures.json").resolve()

if signatures_outpath.exists():
    # reuse the cached signatures instead of recomputing them
    print("Signatures already exist, skipping this step.")
    with open(signatures_outpath, "r") as f:
        sigs = json.load(f)
    on_sigs = sigs["on"]
    off_sigs = sigs["off"]
else:
    # once the data is loaded, separate the controls
    # (treatment_col is the column relabeled during preprocessing)
    negcon_df = cfret_df.filter(pl.col(treatment_col) == "DMSO_heart_9")
    poscon_df = cfret_df.filter(pl.col(treatment_col) == "DMSO_heart_11")

    # creating signatures
    on_sigs, off_sigs, _ = get_signatures(
        ref_profiles=negcon_df,
        exp_profiles=poscon_df,
        morph_feats=cfret_feats,
        test_method="mann_whitney_u",
    )

    # Serialize before opening the file: if serialization fails, no truncated
    # JSON is left behind to be mistaken for a valid cache on the next run.
    sigs_json = json.dumps({"on": on_sigs, "off": off_sigs}, indent=4)

    # Save signatures as json file
    with open(signatures_outpath, "w") as f:
        f.write(sigs_json)

Signatures already exist, skipping this step.


Assess heterogeneity

In [6]:
# setting best params outputs
treatment_best_params_outpath = (
    results_dir / "cfret_treatment_clustering_params.json"
).resolve()
cfret_treatment_cluster_df_outpath = (
    results_dir / "cfret_treatment_clustered.parquet"
).resolve()
cfret_treatment_heart_cluster_df_outpath = (
    results_dir / "cfret_treatment_heart_clustered.parquet"
).resolve()
treatment_heart_best_params_outpath = (
    results_dir / "cfret_treatment_heart_clustering_params.json"
).resolve()


def _load_or_optimize_clustering(
    clustered_outpath, best_params_outpath, group_col, n_trials=200, n_jobs=1
):
    """Return the clustered profiles and best parameters for one grouping.

    When both cache files exist they are loaded and clustering is skipped.
    Otherwise `optimized_clustering` is run on the notebook-level `cfret_df`,
    `cfret_meta`, `cfret_feats` and `cfret_cluster_param_grid`, and its two
    results are written to the given paths before being returned.

    Parameters
    ----------
    clustered_outpath : pathlib.Path
        Parquet file holding the clustered profiles.
    best_params_outpath : pathlib.Path
        JSON file holding the best clustering parameters.
    group_col : str
        Metadata column passed to `optimized_clustering` as `treatment_col`.
    n_trials : int
        Forwarded to `optimized_clustering`.
    n_jobs : int
        Forwarded to `optimized_clustering`.

    Returns
    -------
    tuple
        (clustered profiles, best parameters). On the cached path the
        parameters are whatever the JSON file contains.
        NOTE(review): JSON round-tripping may change the parameter types
        (e.g. non-string keys become strings) - confirm if they are consumed.
    """
    # check if the files exist, if they do skip the optimization
    if clustered_outpath.exists() and best_params_outpath.exists():
        with open(best_params_outpath, "r") as f:
            best_params = json.load(f)
        return pl.read_parquet(clustered_outpath), best_params

    clustered_df, best_params = optimized_clustering(
        profiles=cfret_df,
        meta_features=cfret_meta,
        morph_features=cfret_feats,
        treatment_col=group_col,
        param_grid=cfret_cluster_param_grid,
        n_trials=n_trials,
        n_jobs=n_jobs,
    )

    # save best params as json and dataframe as parquet
    clustered_df.write_parquet(clustered_outpath)
    with open(best_params_outpath, "w") as f:
        json.dump(best_params, f, indent=4)

    return clustered_df, best_params


# here we are clustering each treatment regardless of heart
# this will allow us to see how each treatment affects the population as a whole
cfret_treatment_clustered_df, cfret_treatment_clustered_best_params = (
    _load_or_optimize_clustering(
        clustered_outpath=cfret_treatment_cluster_df_outpath,
        best_params_outpath=treatment_best_params_outpath,
        group_col=treatment_col,
    )
)

# here we are clustering each treatment-heart combination
# this will allow us to see how each heart responds to each treatment
cfret_treatment_heart_clustered_df, cfret_treatment_heart_clustered_best_params = (
    _load_or_optimize_clustering(
        clustered_outpath=cfret_treatment_heart_cluster_df_outpath,
        best_params_outpath=treatment_heart_best_params_outpath,
        group_col=treatment_heart_col,
    )
)

Measure phenotypic activity between clusters

In [7]:
# setting output paths
treatment_dist_scores_outpath = (
    results_dir / "treatment_phenotypic_dist_scores.csv"
).resolve()
treatment_heart_dist_scores_outpath = (
    results_dir / "treatment_heart_dist_scores.csv"
).resolve()

# reuse cached scores only when both files are present
if all(
    path.exists()
    for path in [
        treatment_dist_scores_outpath,
        treatment_heart_dist_scores_outpath,
    ]
):
    print("Distance scores already exist, skipping this step.")

    # load the distance scores
    treatment_dist_scores = pl.read_csv(treatment_dist_scores_outpath)
    treatment_heart_dist_scores = pl.read_csv(treatment_heart_dist_scores_outpath)

else:
    # measuring phenotypic activity against the DMSO_heart_11 reference.
    # treatment_col is passed explicitly in both calls so each score is
    # computed on the same grouping column its clustering was optimized on.
    # NOTE(review): presumably "Metadata_treatment" was already the helper's
    # default for the first call - confirm against utils.metrics.
    treatment_dist_scores = measure_phenotypic_activity(
        profiles=cfret_treatment_clustered_df,
        on_signature=on_sigs,
        off_signature=off_sigs,
        ref_treatment="DMSO_heart_11",
        treatment_col=treatment_col,
    )

    treatment_heart_dist_scores = measure_phenotypic_activity(
        profiles=cfret_treatment_heart_clustered_df,
        on_signature=on_sigs,
        off_signature=off_sigs,
        ref_treatment="DMSO_heart_11",
        treatment_col=treatment_heart_col,
    )

    # save those as csv files
    treatment_dist_scores.write_csv(treatment_dist_scores_outpath)
    treatment_heart_dist_scores.write_csv(treatment_heart_dist_scores_outpath)

Distance scores already exist, skipping this step.


Rank treatments

In [8]:
# setting output paths
treatment_rankings_outpath = results_dir.joinpath("treatment_rankings.csv").resolve()
treatment_heart_rankings_outpath = results_dir.joinpath(
    "treatment_heart_rankings.csv"
).resolve()

# reuse cached rankings only when both files are present
if treatment_rankings_outpath.exists() and treatment_heart_rankings_outpath.exists():
    print("Rankings already exist, skipping this step.")

    # load the rankings
    treatment_rankings = pl.read_csv(treatment_rankings_outpath)
    treatment_heart_rankings = pl.read_csv(treatment_heart_rankings_outpath)
else:
    # rank from the treatment-level scores, then from the
    # treatment-and-heart-level scores
    treatment_rankings = identify_compound_hit(
        distance_df=treatment_dist_scores,
        method="weighted_sum",
    )
    treatment_heart_rankings = identify_compound_hit(
        distance_df=treatment_heart_dist_scores,
        method="weighted_sum",
    )

    # save as csv files
    treatment_rankings.write_csv(treatment_rankings_outpath)
    treatment_heart_rankings.write_csv(treatment_heart_rankings_outpath)

Rankings already exist, skipping this step.
