In [1]:
import sys
import pathlib
import json
import logging
from pprint import pprint

import polars as pl
from tqdm import tqdm

sys.path.append("../../")
from utils.signatures import get_signatures
from utils.identify_hits import identify_compound_hit
from utils.metrics import measure_phenotypic_activity
from utils.io_utils import load_profiles, load_configs
from utils.data_utils import shuffle_profiles, split_meta_and_features

Notebook parameters

In [2]:
# set to True for debugging purposes, to run the notebook faster with a subset of the data
subet_data = False
subet_fraction = 0.1

Setting input and output paths

In [2]:
# setting data directory
data_dir = pathlib.Path("../0.download-data/data/sc-profiles/").resolve(strict=True)
results_module_dir = pathlib.Path("./results").resolve(strict=True)

# setting cpjump1_dataset
cpjump1_profiles_path = (
    data_dir / "cpjump1/cpjump1_compound_concat_profiles.parquet"
).resolve(strict=True)

# get experimental dataset
cpjump1_experimental_metadata_path = (
    data_dir / "cpjump1/CPJUMP1-experimental-metadata.csv"
).resolve(strict=True)

# get shared feature space
shared_feature_space = (
    data_dir / "cpjump1/feature_selected_sc_qc_features.json"
).resolve(strict=True)

# moa config
cpjump1_compounds_moa = (data_dir / "cpjump1/cpjump1_compound_moa.tsv").resolve(
    strict=True
)

# set cluster labels dirctory
u2os_cluster_labels_path = (
    results_module_dir / "clusters/cpjump1_u2os_clusters.parquet"
).resolve(strict=True)

# create MoA analysis output directory
moa_analysis_output_dir = (results_module_dir / "moa_analysis").resolve()
moa_analysis_output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# loading shared features
shared_features = load_configs(shared_feature_space)["shared-features"]

# loading experimental and moa metadata
cpjump1_moa_df = pl.read_csv(cpjump1_compounds_moa, separator="\t")
cpjump1_experimental_data = pl.read_csv(cpjump1_experimental_metadata_path)

# Cluster labels
cluster_labels_df = pl.read_parquet(u2os_cluster_labels_path)

# load profiles
cpjump1_df = load_profiles(cpjump1_profiles_path)
cpjump1_meta, cpjump1_feats = split_meta_and_features(cpjump1_df)

# replace treatments where the MoA is 'null' to 'unknown'
cpjump1_moa_df = cpjump1_moa_df.with_columns(
    pl.when(pl.col("Metadata_moa").is_null())
    .then(pl.lit("unknown"))
    .otherwise(pl.col("Metadata_moa"))
    .alias("Metadata_moa")
)

# displaying dataframe information
print(f"Dataframe shape: {cpjump1_df.shape}")
print(
    "Number of unique treatments",
    cpjump1_df["Metadata_pert_iname"].n_unique(),
)

cpjump1_df.head()

Dataframe shape: (6505782, 323)
Number of unique treatments 303


Metadata_cell_id,Metadata_broad_sample,Metadata_solvent,Metadata_ImageNumber,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_TableNumber,Metadata_ObjectNumber_cytoplasm,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_ObjectNumber_cells,Metadata_ObjectNumber,Metadata_InChIKey,Metadata_pert_iname,Metadata_pubchem_cid,Metadata_target,Metadata_target_list,Metadata_pert_type,Metadata_control_type,Metadata_smiles,__index_level_0__,Nuclei_Texture_InverseDifferenceMoment_ER_5_01_256,Cytoplasm_AreaShape_Zernike_4_2,Cytoplasm_AreaShape_Zernike_9_3,Nuclei_RadialDistribution_RadialCV_AGP_2of4,Nuclei_Correlation_Correlation_DNA_HighZBF,Cells_Texture_Correlation_HighZBF_5_00_256,Cells_AreaShape_Solidity,Nuclei_RadialDistribution_MeanFrac_HighZBF_4of4,Nuclei_AreaShape_Orientation,Nuclei_Texture_Correlation_ER_5_02_256,Cytoplasm_Correlation_Correlation_DNA_LowZBF,Cytoplasm_Texture_InfoMeas2_DNA_10_01_256,Cells_RadialDistribution_FracAtD_DNA_2of4,Cells_RadialDistribution_MeanFrac_HighZBF_1of4,Nuclei_RadialDistribution_MeanFrac_Mito_4of4,…,Nuclei_AreaShape_MinFeretDiameter,Nuclei_AreaShape_Zernike_5_3,Nuclei_AreaShape_Zernike_4_2,Nuclei_RadialDistribution_MeanFrac_HighZBF_3of4,Cytoplasm_Granularity_1_Brightfield,Nuclei_Correlation_Correlation_ER_Mito,Nuclei_AreaShape_Zernike_6_0,Cytoplasm_AreaShape_Solidity,Nuclei_RadialDistribution_FracAtD_DNA_3of4,Nuclei_AreaShape_Zernike_8_4,Nuclei_Intensity_MassDisplacement_HighZBF,Cytoplasm_Texture_Correlation_LowZBF_3_03_256,Cells_RadialDistribution_RadialCV_Mito_1of4,Nuclei_Intensity_MassDisplacement_AGP,Nuclei_Correlation_Correlation_AGP_Mito,Nuclei_RadialDistribution_MeanFrac_HighZBF_1of4,Cells_RadialDistribution_RadialCV_DNA_2of4,Nuclei_RadialDistribution_RadialCV_RNA_3of4,Cells_AreaShape_Zernike_7_3,Nuclei_RadialDistribution_MeanFrac_ER_1of4,Cytoplasm_Correlation_Overlap_ER_RNA,Cells_Texture_Correlation_LowZBF_5_01_256,Cytoplasm_Texture_Correlation_HighZBF_3_01_256,Nuclei_Texture_Correlation_LowZBF_3_02_256,Cells_RadialDistribution_RadialCV_AGP_3of4,Nuclei_RadialDistribution_MeanFrac_AGP_1of4,Cells_RadialDistribution_MeanFrac_Brightfield_1of4,Cytoplasm_Correlation_Correlation_DNA_HighZBF,Nuclei_Intensity_MassDisplacement_DNA,Cytoplasm_RadialDistribution_MeanFrac_DNA_4of4,Cells_Correlation_Correlation_AGP_DNA,Cytoplasm_Texture_InfoMeas2_RNA_3_01_256,Cells_RadialDistribution_RadialCV_DNA_4of4,Nuclei_Correlation_Correlation_DNA_LowZBF,Cytoplasm_Texture_Correlation_LowZBF_5_00_256,Cytoplasm_RadialDistribution_RadialCV_HighZBF_2of4,Nuclei_RadialDistribution_MeanFrac_Brightfield_1of4
str,str,str,i64,str,i64,str,str,i64,f64,f64,i64,i64,str,str,f64,str,str,str,str,str,i64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""be1d797513962cc53138bf40c17bfd…","""BRD-A86665761-001-01-1""","""DMSO""",1,"""BR00117012""",1,"""A01""","""321887486565336862673698806220…",1,1.0,1.0,1,1,"""TZDUHAJSIBHXDL-UHFFFAOYSA-N""","""gabapentin-enacarbil""",9883933.0,"""CACNB4""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""trt""",,"""CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O…",0,0.102111,-0.512725,-1.374218,0.245471,-0.370283,-0.374921,0.805692,-0.175325,-0.6419,0.459446,-0.297342,-1.828568,0.373612,0.107422,-0.396583,…,0.853844,-0.935811,-0.923693,0.294904,-0.127464,-0.177744,-0.726597,0.918014,-0.6842,-0.988241,-0.404599,0.421584,-0.38808,-0.819568,0.585497,0.378222,0.94035,-0.410238,-0.280105,0.698818,1.182724,0.14401,-0.127775,-0.135088,0.410226,0.249449,-0.608221,0.273933,-1.00028,-0.563395,1.199018,0.038707,-1.540164,-0.244568,-0.175755,-0.510789,-0.149818
"""c5e19078f768719a36c7233564defa…","""BRD-A86665761-001-01-1""","""DMSO""",1,"""BR00117012""",1,"""A01""","""321887486565336862673698806220…",2,2.0,2.0,2,2,"""TZDUHAJSIBHXDL-UHFFFAOYSA-N""","""gabapentin-enacarbil""",9883933.0,"""CACNB4""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""trt""",,"""CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O…",1,-0.197785,0.08208,1.36711,0.457317,-1.16733,1.583236,-0.217591,1.016596,1.682456,-0.366287,-0.979124,-1.76991,0.42688,-1.209187,0.087925,…,-0.673173,-0.591105,-0.089322,-1.354509,0.20838,0.670662,1.360484,0.102141,0.684594,1.031678,0.252068,-0.641819,-0.457965,0.867251,0.352346,0.844115,0.864975,2.093067,0.468929,-1.400653,-1.838547,-0.558138,-0.244386,0.208226,1.104397,0.027561,-0.319604,2.198975,-0.561141,-0.810842,0.797657,0.403821,-0.906756,0.584218,0.122659,0.298723,-0.295509
"""bb62f3e7834491d8e92bd12706d756…","""BRD-A86665761-001-01-1""","""DMSO""",1,"""BR00117012""",1,"""A01""","""321887486565336862673698806220…",3,3.0,3.0,3,3,"""TZDUHAJSIBHXDL-UHFFFAOYSA-N""","""gabapentin-enacarbil""",9883933.0,"""CACNB4""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""trt""",,"""CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O…",2,-1.583619,-0.926826,-1.593539,-0.326316,0.490516,-0.613725,-0.830433,0.032733,0.75598,-0.340149,-0.012428,0.745208,-1.669817,0.387462,-0.066588,…,0.041006,0.085795,0.484288,-0.525111,-0.450547,-1.325638,-0.18717,-0.523474,0.716498,0.099048,-0.464612,-0.786093,-0.491761,0.871847,-0.826041,0.895621,0.619553,-0.506535,-1.463842,0.827307,-0.561091,-0.573665,-0.236381,-1.098042,3.261606,1.454035,0.003784,-0.522931,1.15661,2.028431,1.533132,0.841716,0.826675,0.000598,0.133839,-0.186927,-0.21589
"""969d9a333283209ca25162bd78080c…","""BRD-A86665761-001-01-1""","""DMSO""",1,"""BR00117012""",1,"""A01""","""321887486565336862673698806220…",4,4.0,4.0,4,4,"""TZDUHAJSIBHXDL-UHFFFAOYSA-N""","""gabapentin-enacarbil""",9883933.0,"""CACNB4""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""trt""",,"""CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O…",3,1.408502,-0.261008,-0.362499,-0.696021,0.898733,-0.391842,0.319063,-0.324957,0.181359,-0.468413,0.726744,-0.254443,-0.452433,0.228761,-0.166874,…,0.495386,0.960028,-0.877688,0.181397,-0.222116,0.354555,1.091996,0.514927,-0.242439,0.033132,0.150035,0.549874,-0.018698,0.155669,0.193774,-0.364043,1.373037,-0.582662,-1.103498,-0.320097,0.284473,0.28714,-0.878138,1.296824,1.968544,0.292888,-0.106583,-0.118385,0.780071,-0.400939,0.395153,0.386699,-0.327482,-0.254597,0.623077,-0.485477,-0.17874
"""399590b2ed040a3193dbffd2d97673…","""BRD-A86665761-001-01-1""","""DMSO""",1,"""BR00117012""",1,"""A01""","""321887486565336862673698806220…",5,5.0,5.0,5,5,"""TZDUHAJSIBHXDL-UHFFFAOYSA-N""","""gabapentin-enacarbil""",9883933.0,"""CACNB4""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""trt""",,"""CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O…",4,2.836335,0.344793,0.155254,-0.513563,-1.655844,0.768565,-0.170601,0.696569,1.120323,-0.669434,-1.138388,-0.003439,-2.059624,0.830904,-0.186996,…,-1.042888,-0.969448,2.343198,-0.693298,-1.831954,-0.239624,-1.086604,0.402171,0.134366,-0.16589,-0.986735,0.303698,0.000713,-0.227783,0.492811,0.016042,-1.210011,-0.58153,0.535249,-0.466614,0.995592,0.682557,0.065037,-0.018444,-0.880599,-0.201976,0.234135,0.668182,0.415021,0.007404,0.820523,0.261967,0.952071,-0.079721,0.087565,-0.350473,0.444416


Identify plates that contains U2OS and A549 cells

In [4]:
# Split the dataset by cell type and treatment duration
# Filter U2OS cells (all records)
cpjump1_u2os_exp_metadata = cpjump1_experimental_data.filter(
    pl.col("Cell_type") == "U2OS"
)

# Filter A549 cells with density of 100 for consistency
cpjump1_a549_exp_metadata = cpjump1_experimental_data.filter(
    (pl.col("Cell_type") == "A549") & (pl.col("Density") == 100)
)

# Extract plate identifiers for each cell type
u2os_plates = cpjump1_u2os_exp_metadata["Assay_Plate_Barcode"].unique().to_list()
a549_plates = cpjump1_a549_exp_metadata["Assay_Plate_Barcode"].unique().to_list()

# Display the extracted plates for verification
print(f"U2OS plates: {u2os_plates}")
print(f"A549 plates: {a549_plates}")

U2OS plates: ['BR00117010', 'BR00117012', 'BR00117011', 'BR00117013']
A549 plates: ['BR00117019', 'BR00117015', 'BR00117017', 'BR00117016']


Add the MoA data into the profiles 

In [5]:
# merge moa data (join on Metadata_pert_iname)
cpjump1_df = cpjump1_df.filter(pl.col("Metadata_Plate").is_in(u2os_plates))
cpjump1_df = cpjump1_df.join(cpjump1_moa_df, how="inner", on="Metadata_pert_iname")

# Join cluster labels on Metadata_cell_id
cpjump1_df = cpjump1_df.join(
    cluster_labels_df,
    on="Metadata_cell_id",
    how="inner",  # Use inner join to keep only cells with cluster assignments
)
print(f"After joining cluster labels: {cpjump1_df.height} rows")

# Verify all required columns exist
required_cols = [
    "Metadata_cluster_id",
    "Metadata_cluster_ratio",
    "Metadata_control_type",
    "Metadata_pert_iname",
    "Metadata_moa",
]
missing_cols = [col for col in required_cols if col not in cpjump1_df.columns]
if missing_cols:
    raise ValueError(f"Missing required columns: {missing_cols}")

# print dataframe information
# displaying dataframe information
print("CPJUMP1 U2OS dataset")
print(f"Dataframe shape: {cpjump1_df.shape}")
print(
    "Number of poscon_cp",
    cpjump1_df.filter(pl.col("Metadata_control_type") == "poscon_cp")[
        "Metadata_pert_iname"
    ].n_unique(),
)
print(
    "Number of unique treatments that are not controls",
    cpjump1_df.filter(pl.col("Metadata_pert_type") == "trt")
    .select("Metadata_pert_iname")
    .n_unique(),
)

cpjump1_df.head()

After joining cluster labels: 1135692 rows
CPJUMP1 U2OS dataset
Dataframe shape: (1135692, 332)
Number of poscon_cp 26
Number of unique treatments that are not controls 256


Metadata_cell_id,Metadata_broad_sample,Metadata_solvent,Metadata_ImageNumber,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_TableNumber,Metadata_ObjectNumber_cytoplasm,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_ObjectNumber_cells,Metadata_ObjectNumber,Metadata_InChIKey,Metadata_pert_iname,Metadata_pubchem_cid,Metadata_target,Metadata_target_list,Metadata_pert_type,Metadata_control_type,Metadata_smiles,__index_level_0__,Nuclei_Texture_InverseDifferenceMoment_ER_5_01_256,Cytoplasm_AreaShape_Zernike_4_2,Cytoplasm_AreaShape_Zernike_9_3,Nuclei_RadialDistribution_RadialCV_AGP_2of4,Nuclei_Correlation_Correlation_DNA_HighZBF,Cells_Texture_Correlation_HighZBF_5_00_256,Cells_AreaShape_Solidity,Nuclei_RadialDistribution_MeanFrac_HighZBF_4of4,Nuclei_AreaShape_Orientation,Nuclei_Texture_Correlation_ER_5_02_256,Cytoplasm_Correlation_Correlation_DNA_LowZBF,Cytoplasm_Texture_InfoMeas2_DNA_10_01_256,Cells_RadialDistribution_FracAtD_DNA_2of4,Cells_RadialDistribution_MeanFrac_HighZBF_1of4,Nuclei_RadialDistribution_MeanFrac_Mito_4of4,…,Nuclei_AreaShape_Zernike_8_4,Nuclei_Intensity_MassDisplacement_HighZBF,Cytoplasm_Texture_Correlation_LowZBF_3_03_256,Cells_RadialDistribution_RadialCV_Mito_1of4,Nuclei_Intensity_MassDisplacement_AGP,Nuclei_Correlation_Correlation_AGP_Mito,Nuclei_RadialDistribution_MeanFrac_HighZBF_1of4,Cells_RadialDistribution_RadialCV_DNA_2of4,Nuclei_RadialDistribution_RadialCV_RNA_3of4,Cells_AreaShape_Zernike_7_3,Nuclei_RadialDistribution_MeanFrac_ER_1of4,Cytoplasm_Correlation_Overlap_ER_RNA,Cells_Texture_Correlation_LowZBF_5_01_256,Cytoplasm_Texture_Correlation_HighZBF_3_01_256,Nuclei_Texture_Correlation_LowZBF_3_02_256,Cells_RadialDistribution_RadialCV_AGP_3of4,Nuclei_RadialDistribution_MeanFrac_AGP_1of4,Cells_RadialDistribution_MeanFrac_Brightfield_1of4,Cytoplasm_Correlation_Correlation_DNA_HighZBF,Nuclei_Intensity_MassDisplacement_DNA,Cytoplasm_RadialDistribution_MeanFrac_DNA_4of4,Cells_Correlation_Correlation_AGP_DNA,Cytoplasm_Texture_InfoMeas2_RNA_3_01_256,Cells_RadialDistribution_RadialCV_DNA_4of4,Nuclei_Correlation_Correlation_DNA_LowZBF,Cytoplasm_Texture_Correlation_LowZBF_5_00_256,Cytoplasm_RadialDistribution_RadialCV_HighZBF_2of4,Nuclei_RadialDistribution_MeanFrac_Brightfield_1of4,Metadata_clinical_phase,Metadata_moa,Metadata_target_right,Metadata_disease_area,Metadata_indication,Metadata_cluster_id,Metadata_cluster_n_cells,Metadata_treatment_n_cells,Metadata_cluster_ratio
str,str,str,i64,str,i64,str,str,i64,f64,f64,i64,i64,str,str,f64,str,str,str,str,str,i64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str,cat,u32,u32,f64
"""be1d797513962cc53138bf40c17bfd…","""BRD-A86665761-001-01-1""","""DMSO""",1,"""BR00117012""",1,"""A01""","""321887486565336862673698806220…",1,1.0,1.0,1,1,"""TZDUHAJSIBHXDL-UHFFFAOYSA-N""","""gabapentin-enacarbil""",9883933.0,"""CACNB4""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""trt""",,"""CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O…",0,0.102111,-0.512725,-1.374218,0.245471,-0.370283,-0.374921,0.805692,-0.175325,-0.6419,0.459446,-0.297342,-1.828568,0.373612,0.107422,-0.396583,…,-0.988241,-0.404599,0.421584,-0.38808,-0.819568,0.585497,0.378222,0.94035,-0.410238,-0.280105,0.698818,1.182724,0.14401,-0.127775,-0.135088,0.410226,0.249449,-0.608221,0.273933,-1.00028,-0.563395,1.199018,0.038707,-1.540164,-0.244568,-0.175755,-0.510789,-0.149818,"""Launched""","""adrenergic receptor agonist""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""neurology/psychiatry""","""restless leg syndrome|postherp…","""gabapentin-enacarbil_leiden_0""",2748,2857,0.961848
"""c5e19078f768719a36c7233564defa…","""BRD-A86665761-001-01-1""","""DMSO""",1,"""BR00117012""",1,"""A01""","""321887486565336862673698806220…",2,2.0,2.0,2,2,"""TZDUHAJSIBHXDL-UHFFFAOYSA-N""","""gabapentin-enacarbil""",9883933.0,"""CACNB4""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""trt""",,"""CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O…",1,-0.197785,0.08208,1.36711,0.457317,-1.16733,1.583236,-0.217591,1.016596,1.682456,-0.366287,-0.979124,-1.76991,0.42688,-1.209187,0.087925,…,1.031678,0.252068,-0.641819,-0.457965,0.867251,0.352346,0.844115,0.864975,2.093067,0.468929,-1.400653,-1.838547,-0.558138,-0.244386,0.208226,1.104397,0.027561,-0.319604,2.198975,-0.561141,-0.810842,0.797657,0.403821,-0.906756,0.584218,0.122659,0.298723,-0.295509,"""Launched""","""adrenergic receptor agonist""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""neurology/psychiatry""","""restless leg syndrome|postherp…","""gabapentin-enacarbil_leiden_0""",2748,2857,0.961848
"""bb62f3e7834491d8e92bd12706d756…","""BRD-A86665761-001-01-1""","""DMSO""",1,"""BR00117012""",1,"""A01""","""321887486565336862673698806220…",3,3.0,3.0,3,3,"""TZDUHAJSIBHXDL-UHFFFAOYSA-N""","""gabapentin-enacarbil""",9883933.0,"""CACNB4""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""trt""",,"""CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O…",2,-1.583619,-0.926826,-1.593539,-0.326316,0.490516,-0.613725,-0.830433,0.032733,0.75598,-0.340149,-0.012428,0.745208,-1.669817,0.387462,-0.066588,…,0.099048,-0.464612,-0.786093,-0.491761,0.871847,-0.826041,0.895621,0.619553,-0.506535,-1.463842,0.827307,-0.561091,-0.573665,-0.236381,-1.098042,3.261606,1.454035,0.003784,-0.522931,1.15661,2.028431,1.533132,0.841716,0.826675,0.000598,0.133839,-0.186927,-0.21589,"""Launched""","""adrenergic receptor agonist""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""neurology/psychiatry""","""restless leg syndrome|postherp…","""gabapentin-enacarbil_leiden_0""",2748,2857,0.961848
"""969d9a333283209ca25162bd78080c…","""BRD-A86665761-001-01-1""","""DMSO""",1,"""BR00117012""",1,"""A01""","""321887486565336862673698806220…",4,4.0,4.0,4,4,"""TZDUHAJSIBHXDL-UHFFFAOYSA-N""","""gabapentin-enacarbil""",9883933.0,"""CACNB4""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""trt""",,"""CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O…",3,1.408502,-0.261008,-0.362499,-0.696021,0.898733,-0.391842,0.319063,-0.324957,0.181359,-0.468413,0.726744,-0.254443,-0.452433,0.228761,-0.166874,…,0.033132,0.150035,0.549874,-0.018698,0.155669,0.193774,-0.364043,1.373037,-0.582662,-1.103498,-0.320097,0.284473,0.28714,-0.878138,1.296824,1.968544,0.292888,-0.106583,-0.118385,0.780071,-0.400939,0.395153,0.386699,-0.327482,-0.254597,0.623077,-0.485477,-0.17874,"""Launched""","""adrenergic receptor agonist""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""neurology/psychiatry""","""restless leg syndrome|postherp…","""gabapentin-enacarbil_leiden_0""",2748,2857,0.961848
"""399590b2ed040a3193dbffd2d97673…","""BRD-A86665761-001-01-1""","""DMSO""",1,"""BR00117012""",1,"""A01""","""321887486565336862673698806220…",5,5.0,5.0,5,5,"""TZDUHAJSIBHXDL-UHFFFAOYSA-N""","""gabapentin-enacarbil""",9883933.0,"""CACNB4""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""trt""",,"""CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O…",4,2.836335,0.344793,0.155254,-0.513563,-1.655844,0.768565,-0.170601,0.696569,1.120323,-0.669434,-1.138388,-0.003439,-2.059624,0.830904,-0.186996,…,-0.16589,-0.986735,0.303698,0.000713,-0.227783,0.492811,0.016042,-1.210011,-0.58153,0.535249,-0.466614,0.995592,0.682557,0.065037,-0.018444,-0.880599,-0.201976,0.234135,0.668182,0.415021,0.007404,0.820523,0.261967,0.952071,-0.079721,0.087565,-0.350473,0.444416,"""Launched""","""adrenergic receptor agonist""","""CACNA1A|CACNA1B|CACNA1C|CACNA1…","""neurology/psychiatry""","""restless leg syndrome|postherp…","""gabapentin-enacarbil_leiden_0""",2748,2857,0.961848


Generate a shuffled baseline dataset 

In [None]:
# create a subsetted dataframe for faster testing (optional)
if subet_data:
    print("Subsetting data for testing purposes...")
    print("subsetting fraction:", subet_fraction)
    print("original dataframe shape:", cpjump1_df.shape)
    cpjump1_df = (
        cpjump1_df.group_by(["Metadata_Plate", "Metadata_Well"])
        .agg(pl.all().sample(fraction=subet_fraction, seed=0))
        .explode(pl.all().exclude(["Metadata_Plate", "Metadata_Well"]))
    )
    print(f"New dataframe shape: {cpjump1_df.shape}")

# Create the shuffled baseline dataset
cpjump1_df_shuffled = shuffle_profiles(cpjump1_df, shared_features, seed=42)

In [7]:
# Parameters
# negcon_sub_sample (int) - fraction of negative controls to sub-sample
# n_same_moa_treatments (int) - minimum number of treatments sharing the same MoA
negcon_sub_sample = 0.25
n_same_moa_treatments = 3
n_iterations = 5

In [8]:
# counts number of treatments that have the same MoA
moa_counts = (
    (
        cpjump1_df.group_by("Metadata_moa").agg(
            pl.col("Metadata_pert_iname").n_unique().alias("treatment_count")
        )
    )
    .sort("treatment_count", descending=True)
    .filter(pl.col("treatment_count") >= n_same_moa_treatments)
)

# get all treatments with MoAs that have that passes the threshold of n_same_moa_treatments
selected_treatments_df = (
    cpjump1_df.filter(
        pl.col("Metadata_moa").is_in(moa_counts["Metadata_moa"].implode())
    )
    .select("Metadata_pert_iname")
    .unique()
    .to_series()
    .to_list()
) + ["DMSO"]

# display results
pprint(
    f"Number of MoAs with at least {n_same_moa_treatments} treatments: {moa_counts.height}"
)
pprint(f"The treatments are: {selected_treatments_df}")
print(
    f"total amount of treatments to be analyzed: {moa_counts['treatment_count'].sum()}"
)
moa_counts

'Number of MoAs with at least 3 treatments: 19'
("The treatments are: ['TUG-891', 'NSC-663284', 'amlodipine', 'picrotoxinin', "
 "'RGB-286638', 'cilostazol', 'ozanimod', 'geldanamycin', 'GSK-37647', "
 "'picrotin', 'TCN201', 'arcyriaflavin-a', 'salicylic-acid', 'UNC1999', "
 "'RGFP966', 'SRC-kinase-inhibitor-I', 'ceritinib', 'sodium-butyrate', "
 "'sulfasalazine', 'CYM-5442', 'flupirtine', 'carzenide', 'puromycin', "
 "'BRL-50481', 'nilvadipine', 'niflumic-acid', 'PP-2', 'zaprinast', "
 "'RX-821002', 'purvalanol-a', 'amiloride', 'ganetespib', 'EI1', 'L-Cystine', "
 "'ibutilide', 'VU591', 'bufexamac', 'glutamine-(l)', 'dalfampridine', "
 "'trometamol', 'SB-505124', 'pidolic-acid', 'benzamil', 'homoharringtonine', "
 "'felodipine', 'PP-1', 'NSC-625987', 'terazosin', 'romidepsin', "
 "'cycloheximide', 'efaroxan', 'KH-CB19', 'NVP-HSP990', 'WH-4-023', "
 "'saclofen', 'NSC-95397', 'Ro-20-1724', 'bepridil', 'guanidine', 'ryuvidine', "
 "'nimodipine', 'droxinostat', 'PHA-793887', 'lercanidipin

Metadata_moa,treatment_count
str,u32
"""calcium channel blocker""",7
"""unknown""",6
"""CDK inhibitor""",5
"""cyclooxygenase inhibitor""",4
"""HSP inhibitor""",4
…,…
"""adrenergic receptor antagonist""",3
"""CDC inhibitor""",3
"""glutamate receptor antagonist""",3
"""free fatty acid receptor agoni…",3


In [9]:
# reduce the profiles to only the treatments with MoAs that have at least n_same_moa_treatments
cpjump1_df = cpjump1_df.filter(
    pl.col("Metadata_pert_iname").is_in(selected_treatments_df)
)

# displaying dataframe information
print("CPJUMP1 U2OS dataset after filtering treatments by MoA counts")
print("Dataframe shape: {cpjump1_df.shape}")
print(f"Numbero of treatment: {cpjump1_df['Metadata_pert_iname'].n_unique()}")
cpjump1_df.head()

CPJUMP1 U2OS dataset after filtering treatments by MoA counts
Dataframe shape: {cpjump1_df.shape}
Numbero of treatment: 74


Metadata_cell_id,Metadata_broad_sample,Metadata_solvent,Metadata_ImageNumber,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_TableNumber,Metadata_ObjectNumber_cytoplasm,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_ObjectNumber_cells,Metadata_ObjectNumber,Metadata_InChIKey,Metadata_pert_iname,Metadata_pubchem_cid,Metadata_target,Metadata_target_list,Metadata_pert_type,Metadata_control_type,Metadata_smiles,__index_level_0__,Nuclei_Texture_InverseDifferenceMoment_ER_5_01_256,Cytoplasm_AreaShape_Zernike_4_2,Cytoplasm_AreaShape_Zernike_9_3,Nuclei_RadialDistribution_RadialCV_AGP_2of4,Nuclei_Correlation_Correlation_DNA_HighZBF,Cells_Texture_Correlation_HighZBF_5_00_256,Cells_AreaShape_Solidity,Nuclei_RadialDistribution_MeanFrac_HighZBF_4of4,Nuclei_AreaShape_Orientation,Nuclei_Texture_Correlation_ER_5_02_256,Cytoplasm_Correlation_Correlation_DNA_LowZBF,Cytoplasm_Texture_InfoMeas2_DNA_10_01_256,Cells_RadialDistribution_FracAtD_DNA_2of4,Cells_RadialDistribution_MeanFrac_HighZBF_1of4,Nuclei_RadialDistribution_MeanFrac_Mito_4of4,…,Nuclei_AreaShape_Zernike_8_4,Nuclei_Intensity_MassDisplacement_HighZBF,Cytoplasm_Texture_Correlation_LowZBF_3_03_256,Cells_RadialDistribution_RadialCV_Mito_1of4,Nuclei_Intensity_MassDisplacement_AGP,Nuclei_Correlation_Correlation_AGP_Mito,Nuclei_RadialDistribution_MeanFrac_HighZBF_1of4,Cells_RadialDistribution_RadialCV_DNA_2of4,Nuclei_RadialDistribution_RadialCV_RNA_3of4,Cells_AreaShape_Zernike_7_3,Nuclei_RadialDistribution_MeanFrac_ER_1of4,Cytoplasm_Correlation_Overlap_ER_RNA,Cells_Texture_Correlation_LowZBF_5_01_256,Cytoplasm_Texture_Correlation_HighZBF_3_01_256,Nuclei_Texture_Correlation_LowZBF_3_02_256,Cells_RadialDistribution_RadialCV_AGP_3of4,Nuclei_RadialDistribution_MeanFrac_AGP_1of4,Cells_RadialDistribution_MeanFrac_Brightfield_1of4,Cytoplasm_Correlation_Correlation_DNA_HighZBF,Nuclei_Intensity_MassDisplacement_DNA,Cytoplasm_RadialDistribution_MeanFrac_DNA_4of4,Cells_Correlation_Correlation_AGP_DNA,Cytoplasm_Texture_InfoMeas2_RNA_3_01_256,Cells_RadialDistribution_RadialCV_DNA_4of4,Nuclei_Correlation_Correlation_DNA_LowZBF,Cytoplasm_Texture_Correlation_LowZBF_5_00_256,Cytoplasm_RadialDistribution_RadialCV_HighZBF_2of4,Nuclei_RadialDistribution_MeanFrac_Brightfield_1of4,Metadata_clinical_phase,Metadata_moa,Metadata_target_right,Metadata_disease_area,Metadata_indication,Metadata_cluster_id,Metadata_cluster_n_cells,Metadata_treatment_n_cells,Metadata_cluster_ratio
str,str,str,i64,str,i64,str,str,i64,f64,f64,i64,i64,str,str,f64,str,str,str,str,str,i64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str,cat,u32,u32,f64
"""7298ad0509d001614b055c7dfb4f4a…","""DMSO""","""DMSO""",10,"""BR00117012""",1,"""A02""","""143147995035741631744310007116…",1,1.0,1.0,1,1,"""IAZDPXIOMUYVGZ-UHFFFAOYSA-N""","""DMSO""",679.0,,,"""control""","""negcon""","""CS(=O)C""",713,1.734613,0.542837,-1.394495,-0.969604,0.693384,-0.589467,1.269071,-0.107634,1.352044,-0.286131,-0.160643,-0.249854,0.448648,-0.598451,-0.646375,…,-0.062655,-0.505713,-0.221562,-0.356353,-0.710083,0.589759,-1.332601,-0.227788,-0.875401,-0.730959,1.240754,1.068012,0.354253,-0.670786,-0.089834,1.210498,1.148646,-0.085105,1.115576,-1.090245,0.996511,1.645057,0.502425,0.122073,-0.984534,0.097818,-0.556657,-0.160223,"""Preclinical""","""control vehicle""",,,,"""DMSO_leiden_0""",225142,234693,0.959304
"""0eee816252da97d794f077da28f301…","""DMSO""","""DMSO""",10,"""BR00117012""",1,"""A02""","""143147995035741631744310007116…",2,2.0,2.0,2,2,"""IAZDPXIOMUYVGZ-UHFFFAOYSA-N""","""DMSO""",679.0,,,"""control""","""negcon""","""CS(=O)C""",714,1.866354,0.506305,-0.584838,-0.440934,0.006335,-0.718454,-0.212663,0.411589,-0.391941,0.747447,-1.05704,-1.194432,-1.344456,0.42839,0.562783,…,0.503826,-0.832291,0.04575,-0.306696,-0.35219,-1.870773,1.007983,1.088879,-0.587474,1.252216,-0.373649,0.656792,-0.108094,-0.652431,0.424492,1.223142,0.260406,-0.409743,1.243554,-1.00293,0.343701,1.509223,-1.462865,-0.072684,0.155631,-0.581478,-0.47483,0.284149,"""Preclinical""","""control vehicle""",,,,"""DMSO_leiden_0""",225142,234693,0.959304
"""257c13bd74b87cd1637e9e8ce15a6d…","""DMSO""","""DMSO""",10,"""BR00117012""",1,"""A02""","""143147995035741631744310007116…",3,3.0,3.0,3,3,"""IAZDPXIOMUYVGZ-UHFFFAOYSA-N""","""DMSO""",679.0,,,"""control""","""negcon""","""CS(=O)C""",715,1.62325,-0.971284,-1.661881,-1.049034,-0.107834,0.386149,-0.016588,0.312091,1.493204,-0.71494,-1.141462,-0.048884,0.61215,0.398546,0.173963,…,1.49043,0.041274,0.434001,-0.51928,-0.465372,-2.07815,0.89507,-0.664626,-1.102551,-1.495699,-0.364727,0.380825,0.054393,-0.431918,-0.06945,-0.496317,0.795016,0.115405,0.164059,-0.063953,0.267798,1.179484,0.570631,-0.317791,0.433424,-0.156294,-0.441902,0.260629,"""Preclinical""","""control vehicle""",,,,"""DMSO_leiden_0""",225142,234693,0.959304
"""a304b885903f995d532928fbfe4724…","""DMSO""","""DMSO""",10,"""BR00117012""",1,"""A02""","""143147995035741631744310007116…",4,4.0,4.0,4,4,"""IAZDPXIOMUYVGZ-UHFFFAOYSA-N""","""DMSO""",679.0,,,"""control""","""negcon""","""CS(=O)C""",716,2.126168,0.594688,-1.656958,0.160354,-0.020018,-0.101951,1.197911,0.568908,1.46611,0.656041,0.048091,-0.767642,0.671693,-0.005902,-0.315933,…,-0.696557,-0.060115,2.343548,-0.461924,-0.055153,-0.096089,0.100264,-0.300446,-0.228885,-0.668027,0.015134,-1.796884,0.049699,-1.960362,-0.69919,0.120686,0.314893,0.043373,0.390911,1.058276,0.34943,-2.949508,-0.440201,0.509058,-0.679422,-2.595646,-0.366495,-0.175393,"""Preclinical""","""control vehicle""",,,,"""DMSO_leiden_0""",225142,234693,0.959304
"""0a78aaab38ae98a30b4e6153be0782…","""DMSO""","""DMSO""",10,"""BR00117012""",1,"""A02""","""143147995035741631744310007116…",5,5.0,5.0,5,5,"""IAZDPXIOMUYVGZ-UHFFFAOYSA-N""","""DMSO""",679.0,,,"""control""","""negcon""","""CS(=O)C""",717,1.752162,1.28243,-0.179017,-0.787229,-0.011925,-0.275825,0.216054,-0.049795,-0.006867,1.249061,-1.931058,-0.658054,0.625903,0.177177,-0.612976,…,1.092979,-0.397298,0.026527,-0.319551,0.078047,1.413776,0.571626,0.399945,-0.360052,0.382469,-0.654585,1.55724,-0.38141,-0.095769,0.60304,1.371904,0.531176,0.061174,0.231458,0.031687,-0.57396,1.325172,0.278011,-0.702196,0.699876,0.264131,-0.459986,0.057226,"""Preclinical""","""control vehicle""",,,,"""DMSO_leiden_0""",225142,234693,0.959304


In [10]:
# make an MoA look up dictionary {"treatment_name": "MoA"}
moa_lookup = dict(
    zip(cpjump1_moa_df["Metadata_pert_iname"], cpjump1_moa_df["Metadata_moa"])
)
pprint(moa_lookup)

{'1-EBIO': 'potassium channel activator',
 '1-octanol': 'unknown',
 '2,5-furandimethanol': 'hemoglobin modulator',
 '2-Oleoylglycerol': 'glucose dependent insulinotropic receptor ligand',
 '4-CMTB': 'free fatty acid receptor agonist',
 '4-methylhistamine': 'histamine receptor agonist',
 '7-hydroxystaurosporine': 'CDK inhibitor|CHK inhibitor|PKC inhibitor',
 'A-987306': 'histamine receptor antagonist',
 'A205804': 'ICAM1 expression inhibitor',
 'AC-710': 'PDGFR tyrosine kinase receptor inhibitor',
 'AK-7': 'SIRT inhibitor',
 'AMG900': 'Aurora kinase inhibitor',
 'ANR-94': 'A1 adenosine receptor antagonist',
 'AR-12': 'phosphoinositide dependent kinase inhibitor',
 'AVL-292': "Bruton's tyrosine kinase (BTK) inhibitor",
 'AZ191': 'DYRK inhibitor',
 'AZD1283': 'purinergic receptor antagonist',
 'AZD7762': 'CHK inhibitor',
 'AZD9668': 'elastase inhibitor',
 'BAM7': 'BAX activator',
 'BAN-ORL-24': 'nociceptin/orphanin FQ receptor antagonist',
 'BAX-channel-blocker': 'cytochrome C release inh

In [11]:
# Set up a logger to track the process below
logger = logging.getLogger("buscar_moa_analysis")
logger.setLevel(logging.INFO)

# Create file handler which logs even debug messages
log_file_path = moa_analysis_output_dir / "buscar_moa_analysis.log"
fh = logging.FileHandler(log_file_path)
fh.setLevel(logging.INFO)

# Create formatter and add it to the handlers
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
fh.setFormatter(formatter)

# Add the handlers to the logger
if not logger.hasHandlers():
    logger.addHandler(fh)
else:
    # Avoid duplicate handlers in Jupyter
    logger.handlers.clear()
    logger.addHandler(fh)

logger.info("Logger initialized for Buscar MoA analysis.")

In [12]:
# Run Buscar analysis for each treatment in both original and shuffled datasets
scores = {
    "original": {},
    "shuffled": {},
}  # store results here with dataset_type as top-level key
for dataset_type, cpjump1_df_to_use in {
    # "original": cpjump1_df,
    "shuffled": cpjump1_df_shuffled,
}.items():
    logger.info(f"Starting analysis for dataset: {dataset_type}")
    for treatment in (
        pbar := tqdm(selected_treatments_df, desc=f"{dataset_type} treatments")
    ):
        # skip DMSO treatment
        if treatment == "DMSO":
            continue

        # getting current iteration for progress tracking
        current_iter = pbar.n
        logger.info(
            f"Processing treatment: {treatment} in dataset: {dataset_type}. Progress: {current_iter + 1}/{len(selected_treatments_df)}"
        )

        for n_iter in range(n_iterations):
            logger.info(
                f"Iteration {n_iter} for treatment: {treatment} in dataset: {dataset_type}"
            )

            # Sample from negative controls
            negcon_df = cpjump1_df_to_use.filter(
                pl.col("Metadata_control_type") == "negcon"
            ).sample(fraction=0.025, seed=n_iter)

            # Make the selected treatment as the positive control
            poscon_df = cpjump1_df_to_use.filter(
                pl.col("Metadata_pert_iname") == treatment
            )

            # check the shape of negcon_df and poscon_df if 0 raise an error
            if negcon_df.height == 0 or poscon_df.height == 0:
                logger.error(
                    f"Empty dataframe encountered for treatment {treatment} in dataset {dataset_type} at iteration {n_iter}. "
                    f"negcon_df height: {negcon_df.height}, poscon_df height: {poscon_df.height}. Skipping iteration."
                )
                raise ValueError("Empty dataframe encountered.")

            logger.debug(
                f"Dataset: {dataset_type} | Treatment: {treatment} | Iteration: {n_iter}"
            )

            # Buscar step 1: identify on and off signatures
            on_signatures, off_signatures, _ = get_signatures(
                ref_profiles=negcon_df,
                exp_profiles=poscon_df,
                morph_feats=shared_features,
                test_method="mann_whitney_u",
                p_threshold=0.05,
                seed=n_iter,
            )

            # Skip if no on or off signatures were found
            if len(on_signatures) == 0 and len(off_signatures) == 0:
                logger.warning(
                    f"No on or off signatures found for treatment {treatment}. Skipping."
                )
                logger.debug(f"on_signatures: {len(on_signatures)}")
                logger.debug(f"off_signatures: {len(off_signatures)}")
                continue

            # Buscar step 2: measure phenotypic activity and rank treatments (lower is better)
            logger.debug("measuring phenotypic activity...")
            treatment_phenotypic_dist_scores = measure_phenotypic_activity(
                profiles=pl.concat(
                    [
                        negcon_df,
                        cpjump1_df_to_use.filter(
                            pl.col("Metadata_pert_iname") != "DMSO"
                        ),
                    ]
                ),
                on_signature=on_signatures,
                off_signature=off_signatures,
                ref_treatment=treatment,
                cluster_col="Metadata_cluster_id",
                treatment_col="Metadata_pert_iname",
            )

            # Skip if no treatment rankings were generated
            if treatment_phenotypic_dist_scores.height == 0:
                logger.warning("No treatment scores calculated.. skipping")
                continue

            # Buscar step 3: rank treatments based on phenotypic distance scores
            logger.debug("Ranking treatments...")
            treatment_rankings = identify_compound_hit(
                distance_df=treatment_phenotypic_dist_scores, method="weighted_sum"
            )
            logger.debug(f"Ranking columns: {treatment_rankings.columns}")

            # Skip if no treatment rankings were generated
            if treatment_rankings.height == 0:
                logger.warning("No treatment rankings computed. Skipping iteration...")
                continue

            # Prepare results for this iteration
            logger.debug("storing results for this iteration...")
            result = {
                "compound_scores": dict(
                    zip(
                        treatment_rankings["treatment"],
                        treatment_rankings["compound_score"],
                    )
                ),
                "ranks": dict(
                    zip(treatment_rankings["treatment"], treatment_rankings["rank"])
                ),
                "moa": moa_lookup[treatment],
            }

            # Store per treatment and per iteration under dataset_type
            if treatment not in scores[dataset_type]:
                scores[dataset_type][treatment] = {}

            iteration_key = f"iteration_{n_iter}"
            scores[dataset_type][treatment][iteration_key] = result

            # Save after each iteration
            with open(
                (moa_analysis_output_dir / "cpjump1_buscar_scores.json").resolve(
                    strict=False
                ),
                "w",
            ) as f:
                json.dump(scores, f, indent=4, default=str)
            logger.info(
                f"Saved results for treatment: {treatment}, iteration: {n_iter}, dataset: {dataset_type}"
            )

  check_result(result_code)
shuffled treatments: 100%|██████████| 74/74 [14:13:40<00:00, 692.17s/it]   
