# Applying JUMP dataset to pre-trained multi-class logistic regression model

In [1]:
import sys
import json
import pathlib

import joblib
import pandas as pd

# project module imports
sys.path.append("../../")  # noqa
from src.utils import (
    check_feature_order,
    generate_confusion_matrix_tl,
    split_meta_and_features,
)  # noqa

## Setting up file paths and parameters 

In [2]:
# input directories; `strict=True` makes a missing directory fail right here
results_dir = pathlib.Path("../../results")
data_split_dir = results_dir.joinpath("1.data_splits/").resolve(strict=True)
jump_data_dir = pathlib.Path("../../data/JUMP_data").resolve(strict=True)
modeling_dir = pathlib.Path("../../results/2.modeling").resolve(strict=True)

# JUMP profiles
jump_data_path = jump_data_dir.joinpath(
    "JUMP_all_plates_normalized_negcon.csv.gz"
).resolve(strict=True)

# cell injury metadata (after holdout)
cell_injury_metadata_path = data_split_dir.joinpath(
    "cell_injury_metadata_after_holdout.csv.gz"
).resolve(strict=True)

# model files (non-shuffled and shuffled)
multi_class_model_path = modeling_dir.joinpath("multi_class_model.joblib").resolve(
    strict=True
)
shuffled_multi_class_model_path = modeling_dir.joinpath(
    "shuffled_multi_class_model.joblib"
).resolve(strict=True)

# json files: feature columns (from feature selected profile) and injury codes
feature_col_names = data_split_dir.joinpath("feature_cols.json").resolve(strict=True)
injury_codes_path = data_split_dir.joinpath("injury_codes.json").resolve(strict=True)

# output directory of this notebook; created if it does not exist yet
jump_analysis_dir = results_dir.joinpath("3.jump_analysis").resolve()
jump_analysis_dir.mkdir(exist_ok=True)

## Loading Files

In [3]:
# read the negative-control normalized JUMP profiles and the cell injury metadata
jump_df = pd.read_csv(jump_data_path)
cell_injury_meta_df = pd.read_csv(cell_injury_metadata_path)

# json file holding the selected feature names
cell_injury_cp_feature_cols = json.loads(feature_col_names.read_text())

# json file holding the injury label encoder and decoder
injury_codes = json.loads(injury_codes_path.read_text())
injury_encoder = injury_codes["encoder"]
injury_decoder = injury_codes["decoder"]

# report the size of the JUMP data and preview its first rows
print("JUMP dataset size:", jump_df.shape)
jump_df.head()

JUMP dataset size: (38996, 5805)


Unnamed: 0,Metadata_broad_sample,Metadata_Plate,Metadata_Well,Metadata_gene,Metadata_pert_type,Metadata_control_type,Metadata_target_sequence,Metadata_negcon_control_type,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,Metadata_solvent,Metadata_InChIKey,Metadata_pert_iname,Metadata_pubchem_cid,Metadata_smiles
0,BRDN0001480888,BR00116997,A01,HIF1A,trt,,TATGTGTGAATTACGTTGTG,,1.3659,2.1598,...,-1.4483,-1.3863,-1.3688,-1.4264,-1.3239,,,,,
1,BRDN0001483495,BR00116997,A02,CATSPER4,trt,,CCGACCGTAGGACTCGTGAA,,-0.57723,-0.3286,...,-0.78766,-0.78295,-0.75735,-0.77082,-0.75818,,,,,
2,BRDN0001147364,BR00116997,A03,DDR2,trt,,CCGTGACAAACCGAGCACTG,,-0.34826,-0.077757,...,-0.61214,-0.60621,-0.58238,-0.6049,-0.5694,,,,,
3,BRDN0001490272,BR00116997,A04,OPRL1,trt,,AACGGGAACACCGACAACAG,,-0.076636,0.30289,...,0.001638,0.005891,0.029338,-0.014926,0.055581,,,,,
4,BRDN0001480510,BR00116997,A05,SLC7A11,trt,,GAAGAGATTCAAGTATTACG,,-0.28287,-0.064418,...,-0.43116,-0.40155,-0.37484,-0.42303,-0.34384,,,,,


## Finding overlapping Compounds

This notebook aims to identify overlapping compounds present in both the `cell_injury` and `JUMP` datasets. These overlapping compounds will be used for subsetting the `JUMP` dataset, which we'll consider as the ground truth for subsequent analyses.

## Approach
1. **Identifying Overlapping Compounds**: We compare the compounds present in both datasets to identify the overlapping ones.
2. **Subsetting the JUMP Dataset**: Once the overlapping compounds are identified, we subset the `JUMP` dataset to include only those compounds, forming our ground truth dataset.
3. **Save dataset**: The dataset is saved as `overlapping_treatments_jump_data.csv.gz` in the `./results/2.modeling` directory.

### Step 1: Identifying Overlapping Compounds
Here, we used the InChIKey, the hashed form of the International Chemical Identifier (InChI), to identify chemicals shared between the JUMP dataset and the Cell Injury dataset.

In [4]:
# get all InChI keys; missing keys (NaN) are dropped first, otherwise a NaN
# present in both datasets could be treated as a "shared compound" and match
# every JUMP well that has no compound annotation
cell_injury_InChI_keys = cell_injury_meta_df["Compound InChIKey"].dropna().tolist()
jump_InChI_keys = jump_df["Metadata_InChIKey"].dropna().tolist()

# identify common InChI Keys (sorted so the order is reproducible between runs)
common_compounds_inchikey = sorted(
    set(cell_injury_InChI_keys).intersection(jump_InChI_keys)
)

# cell injury metadata of the shared compounds; `.copy()` gives an independent
# frame, so the column added below never acts on a slice of the metadata
overlapping_compounds_df = cell_injury_meta_df.loc[
    cell_injury_meta_df["Compound InChIKey"].isin(common_compounds_inchikey)
].copy()

# inserting injury code (an injury type missing from the encoder raises KeyError)
overlapping_compounds_df.insert(
    0,
    "injury_code",
    overlapping_compounds_df["injury_type"].apply(lambda name: injury_encoder[name]),
)
unique_compound_names = overlapping_compounds_df["Compound Name"].unique().tolist()
print("Identified overlapping compounds:", ", ".join(unique_compound_names))


# reduce to one row per compound: injury code, injury type, name and InChIKey
overlapping_compounds_df = (
    overlapping_compounds_df[
        ["injury_code", "injury_type", "Compound Name", "Compound InChIKey"]
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)
overlapping_compounds_df

Identified overlapping compounds: DMSO, Colchicine, Cycloheximide, Menadione


Unnamed: 0,injury_code,injury_type,Compound Name,Compound InChIKey
0,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N
1,1,Cytoskeletal,Colchicine,IAKHMKGGTNLKSZ-INIZCTEOSA-N
2,5,Miscellaneous,Cycloheximide,YPHMISFOHDHNIV-FSZOTQKASA-N
3,6,Redox,Menadione,MJVAVZPDRWSRRC-UHFFFAOYSA-N


Once the common compounds and their associated cell injury types are identified, the next step is to select from the JUMP dataset only the wells that possess the common InChIKeys.

In [5]:
# keep the JUMP wells whose InChIKey is shared with the cell injury data and
# augment them with the injury labels of the matching compound
overlapping_jump_df = jump_df[
    jump_df["Metadata_InChIKey"].isin(common_compounds_inchikey)
].merge(
    overlapping_compounds_df,
    left_on="Metadata_InChIKey",
    right_on="Compound InChIKey",
)

print("shape: ", overlapping_jump_df.shape)
overlapping_jump_df.head()

shape:  (3186, 5809)


Unnamed: 0,Metadata_broad_sample,Metadata_Plate,Metadata_Well,Metadata_gene,Metadata_pert_type,Metadata_control_type,Metadata_target_sequence,Metadata_negcon_control_type,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_RNA_5_03_256,Metadata_solvent,Metadata_InChIKey,Metadata_pert_iname,Metadata_pubchem_cid,Metadata_smiles,injury_code,injury_type,Compound Name,Compound InChIKey
0,,BR00117008,A02,,control,negcon,,,1.318,1.1009,...,-2.1744,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,DMSO,679.0,CS(=O)C,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N
1,,BR00117008,A09,,control,negcon,,,3.1824,3.0804,...,-2.5559,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,DMSO,679.0,CS(=O)C,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N
2,,BR00117008,A17,,control,negcon,,,0.79105,0.6296,...,-1.56,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,DMSO,679.0,CS(=O)C,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N
3,,BR00117008,B03,,control,negcon,,,-1.2159,-1.2808,...,-0.65953,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,DMSO,679.0,CS(=O)C,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N
4,,BR00117008,B14,,control,negcon,,,-0.34084,-0.365,...,-0.65422,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,DMSO,679.0,CS(=O)C,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N


Now that we have identified the wells treated with overlapping treatments, we want to know how many wells each treatment has.

In [6]:
# number of wells per InChIKey, augmented with the injury_code, injury_type and
# compound name of the matching compound
well_counts_df = (
    overlapping_jump_df.groupby("Metadata_InChIKey")
    # one row per InChIKey holding its number of wells
    .size()
    .reset_index(name="n_wells")
    # attach the compound information based on InChIKey
    .merge(
        overlapping_compounds_df,
        left_on="Metadata_InChIKey",
        right_on="Compound InChIKey",
    )
    # the InChIKey is already stored in "Metadata_InChIKey"
    .drop(columns="Compound InChIKey")
    .rename(columns={"Compound Name": "compund_name"})
)
well_counts_df

Unnamed: 0,Metadata_InChIKey,n_wells,injury_code,injury_type,compund_name
0,IAKHMKGGTNLKSZ-INIZCTEOSA-N,48,1,Cytoskeletal,Colchicine
1,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,3044,0,Control,DMSO
2,MJVAVZPDRWSRRC-UHFFFAOYSA-N,46,6,Redox,Menadione
3,YPHMISFOHDHNIV-FSZOTQKASA-N,48,5,Miscellaneous,Cycloheximide


Next, we wanted to examine the distribution of treatments across plates.

In [7]:
# treatments reported per plate, in display order
treatment_order = ["DMSO", "Colchicine", "Menadione", "Cycloheximide"]

# for every plate: number of wells per treatment, keyed by compound name
n_well_treatments = {
    plate: {
        treatment_df["Compound Name"].unique()[0]: treatment_df.shape[0]
        for _, treatment_df in plate_df.groupby("Metadata_InChIKey")
    }
    for plate, plate_df in overlapping_jump_df.groupby("Metadata_Plate")
}

# one row per plate and one column per treatment; a treatment that is absent
# from a plate is counted as 0 wells
plate_treatments = (
    pd.DataFrame.from_dict(n_well_treatments, orient="columns")
    .T.loc[:, treatment_order]
    .fillna(0)
    .astype(int)
    .reset_index()
)
plate_treatments.columns = ["plate_id", *treatment_order]

# display
print(
    "Number of Plates that contain overlapping treatments:", plate_treatments.shape[0]
)
plate_treatments

Number of Plates that contain overlapping treatments: 24


Unnamed: 0,plate_id,DMSO,Colchicine,Menadione,Cycloheximide
0,BR00116991,128,2,2,2
1,BR00116992,128,2,2,2
2,BR00116993,128,2,2,2
3,BR00116994,128,2,2,2
4,BR00116995,100,2,0,2
5,BR00117008,128,2,2,2
6,BR00117009,128,2,2,2
7,BR00117010,128,2,2,2
8,BR00117011,128,2,2,2
9,BR00117012,128,2,2,2


Finally, we save `overlapping_jump_df` as a `csv.gz` file.

In [8]:
# write the JUMP wells treated with overlapping compounds to a gzipped csv
# NOTE(review): this file goes to `modeling_dir`, not `jump_analysis_dir` --
# confirm that this is the intended location
overlapping_jump_path = modeling_dir / "overlapping_treatments_jump_data.csv.gz"
overlapping_jump_df.to_csv(overlapping_jump_path, compression="gzip", index=False)

## Feature alignment

In this section, we are identifying the shared features present in both the cell injury and JUMP datasets. 
Once these features are identified, we update the JUMP dataset to include only those features that are shared between both profiles for our machine learning application

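First we identify the CellProfiler (CP) features present in the JUMP data. 
We accomplish this with the project helper `split_meta_and_features()` (imported from `src.utils`), which separates the metadata column names from the CP feature column names in the JUMP dataset; it presumably relies on `pycytominer`'s `infer_cp_features()`.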

In [9]:
# prefix carried by the metadata columns
metadata_prefix = "Metadata_"

# column names of the JUMP data, separated into metadata and features
jump_meta_cols, jump_feat_cols = split_meta_and_features(jump_df, metadata_tag=True)

# report the number of metadata columns and of features in both profiles
for label, count in (
    ("Number of Metadata Features:", len(jump_meta_cols)),
    (
        "Number of CP features that cell injury has",
        len(cell_injury_cp_feature_cols["feature_cols"]),
    ),
    ("Number of CP features that JUMP has:", len(jump_feat_cols)),
):
    print(label, count)

Number of Metadata Features: 13
Number of CP features that cell injury has 346
Number of CP features that JUMP has: 5792


Now that we have identified the features present in both datasets, the next step is to align them. This involves identifying the common features between both profiles and utilizing these features to update our JUMP dataset for our machine learning model.

In [10]:
# feature names of the cell injury profile
cell_injury_cp_features = cell_injury_cp_feature_cols["feature_cols"]

# features found in both profiles (set intersection, so the order is arbitrary)
aligned_features = list(set(cell_injury_cp_features).intersection(jump_feat_cols))

# report how many features both profiles share
print("Number of shared features of both profiles", len(aligned_features))

Number of shared features of both profiles 207


The objective of this step is to preserve the order of the feature space.

Since we have identified the shared feature space across both profiles, we still need to address those that are missing. 
Therefore, to maintain the feature space order, we used the cell injury feature space as our reference feature space order, since our multi-class model was trained on this specific order.

Next, we addressed features that were not found within the JUMP dataset.
This was done by including them in the alignment process, but defaulted their values to 0.

Ultimately, we generated a new profile called `aligned_jump_df`, which contains the correctly aligned and ordered feature space from the cell injury dataset.

In [11]:
# Build the JUMP profile in the feature order of the cell injury profile.
# Only the shared features are taken from the JUMP data; `reindex` then lays
# the columns out in the reference order and creates every feature that JUMP
# does not have as a constant 0.0 column. The rows are left untouched.
shared_features = set(aligned_features)
aligned_jump_df = jump_df[
    [feat for feat in cell_injury_cp_features if feat in shared_features]
].reindex(columns=cell_injury_cp_features, fill_value=0.0)

# sanity check: the alignment must not change the number of samples
assert (
    aligned_jump_df.shape[0] == jump_df.shape[0]
), "number of samples changed during feature alignment"

# sanity check: see if the feature order in the `cell_injury_cp_feature_cols` is the same with
# the newly generated aligned JUMP dataset
assert (
    cell_injury_cp_features == aligned_jump_df.columns.tolist()
), "feature space are not aligned"
assert check_feature_order(
    ref_feat_order=cell_injury_cp_features,
    input_feat_order=aligned_jump_df.columns.tolist(),
), "feature space do not follow the same order"

In [12]:
# put the JUMP metadata columns in front of the aligned features; rows are
# matched on the index
aligned_jump_df = pd.merge(
    jump_df[jump_meta_cols],
    aligned_jump_df,
    left_index=True,
    right_index=True,
)

# report the shape and preview the first rows
print("shape of aligned dataset", aligned_jump_df.shape)
aligned_jump_df.head()

shape of aligned dataset (38996, 359)


Unnamed: 0,Metadata_broad_sample,Metadata_Plate,Metadata_Well,Metadata_gene,Metadata_pert_type,Metadata_control_type,Metadata_target_sequence,Metadata_negcon_control_type,Metadata_solvent,Metadata_InChIKey,...,Nuclei_Texture_InverseDifferenceMoment_DNA_20_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumVariance_DNA_20_0
0,BRDN0001480888,BR00116997,A01,HIF1A,trt,,TATGTGTGAATTACGTTGTG,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BRDN0001483495,BR00116997,A02,CATSPER4,trt,,CCGACCGTAGGACTCGTGAA,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BRDN0001147364,BR00116997,A03,DDR2,trt,,CCGTGACAAACCGAGCACTG,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BRDN0001490272,BR00116997,A04,OPRL1,trt,,AACGGGAACACCGACAACAG,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BRDN0001480510,BR00116997,A05,SLC7A11,trt,,GAAGAGATTCAAGTATTACG,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Applying JUMP dataset to Multi-Class Logistic Regression Model

### Applying to Complete JUMP dataset

In [13]:
# separate the column names of the aligned profile and keep the feature
# columns as the model input
aligned_meta_cols, aligned_feature_cols = split_meta_and_features(aligned_jump_df)
X = aligned_jump_df.loc[:, aligned_feature_cols]

In [14]:
# Load the non-shuffled and the shuffled model from the paths that were
# resolved (and checked for existence) in the path setup cell.
# NOTE: joblib files are pickle based; only load model files from a trusted source
model = joblib.load(multi_class_model_path)
shuffled_model = joblib.load(shuffled_multi_class_model_path)

Here, we apply the JUMP dataset to the model to calculate the probabilities of each injury being present per well. These probabilities are then saved in a tidy long format suitable for plotting in R.

In [15]:
# Injury names of the classes of each model. `predict_proba` returns one column
# per entry of `classes_`, in that order, so these names are used as the column
# labels (labelling by column position would only be correct when the class
# codes happen to be exactly 0..n-1).
injury_classes = [injury_decoder[str(code)] for code in model.classes_.tolist()]
shuffled_injury_classes = [
    injury_decoder[str(code)] for code in shuffled_model.classes_.tolist()
]

# prediction probabilities on both non-shuffled and shuffled models
y_proba = model.predict_proba(X)
shuffled_y_proba = shuffled_model.predict_proba(X)

# convert to pandas dataframes with the injury type names as column names
y_proba_df = pd.DataFrame(y_proba, columns=injury_classes)
shuffled_y_proba_df = pd.DataFrame(shuffled_y_proba, columns=shuffled_injury_classes)

# adding a column of labels indicating if the prediction was done with a shuffled model
y_proba_df.insert(0, "shuffled_model", False)
shuffled_y_proba_df.insert(0, "shuffled_model", True)

# merge InChIKey based on index, since order is retained
inchikey_df = jump_df[["Metadata_InChIKey"]]
y_proba_df = pd.merge(
    inchikey_df,
    y_proba_df,
    left_index=True,
    right_index=True,
)
shuffled_y_proba_df = pd.merge(
    inchikey_df,
    shuffled_y_proba_df,
    left_index=True,
    right_index=True,
)

# concat all probabilities into one dataframe
all_probas_df = pd.concat([y_proba_df, shuffled_y_proba_df]).reset_index(drop=True)

# Add a column to indicate the most probable injury
# This is achieved by selecting the injury with the highest probability
all_probas_df.insert(
    2,
    "pred_injury",
    all_probas_df[injury_classes].idxmax(axis=1),
)

# next is to convert the probabilities dataframe into tidy long
all_probas_df_tl = pd.melt(
    all_probas_df,
    id_vars=["Metadata_InChIKey", "shuffled_model", "pred_injury"],
    value_vars=injury_classes,
    var_name="injury_type",
    value_name="proba",
)

# save probabilities in tidy long format
all_probas_df_tl.to_csv(jump_analysis_dir / "JUMP_injury_proba.csv.gz", index=False)

print("tidy long format probability shape", all_probas_df_tl.shape)

tidy long format probability shape (1169880, 5)


Now that the Metadata_InChIKey metadata has been added to the probabilities dataframe, we can select the overlapping treatments based on their InChIKeys.

In [16]:
# display the overlapping compounds together with their injury code and
# injury type, as a reminder of the labels used for the selection below
overlapping_compounds_df

Unnamed: 0,injury_code,injury_type,Compound Name,Compound InChIKey
0,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N
1,1,Cytoskeletal,Colchicine,IAKHMKGGTNLKSZ-INIZCTEOSA-N
2,5,Miscellaneous,Cycloheximide,YPHMISFOHDHNIV-FSZOTQKASA-N
3,6,Redox,Menadione,MJVAVZPDRWSRRC-UHFFFAOYSA-N


In [17]:
# probabilities of the wells treated with an overlapping compound, augmented
# with the injury code, injury type and name of that compound
overlapping_compounds_probas_df = overlapping_compounds_df.merge(
    all_probas_df[
        all_probas_df["Metadata_InChIKey"].isin(
            overlapping_compounds_df["Compound InChIKey"]
        )
    ],
    how="inner",
    left_on="Compound InChIKey",
    right_on="Metadata_InChIKey",
)
overlapping_compounds_probas_df

Unnamed: 0,injury_code,injury_type,Compound Name,Compound InChIKey,Metadata_InChIKey,shuffled_model,pred_injury,Control,Cytoskeletal,Hsp90,...,Miscellaneous,Redox,HDAC,mTOR,Proteasome,Saponin,Mitochondria,Ferroptosis,Tannin,Nonspecific reactive
0,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,False,Nonspecific reactive,1.654685e-10,1.760070e-14,3.265176e-12,...,1.578660e-09,1.308449e-10,1.035134e-13,4.946584e-13,2.421907e-17,1.707735e-10,4.662660e-05,6.308207e-02,2.373785e-25,9.368709e-01
1,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,False,Ferroptosis,5.923648e-07,9.227059e-13,6.106879e-08,...,4.754724e-08,1.047427e-08,3.622747e-10,2.648696e-07,1.134640e-10,3.106847e-05,5.194344e-05,7.765251e-01,8.992019e-22,2.051300e-01
2,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,False,Ferroptosis,9.098675e-07,1.487504e-10,3.430503e-09,...,3.250602e-08,1.649960e-08,1.626662e-09,2.899114e-08,3.576608e-10,7.011772e-06,6.285196e-09,7.644943e-01,1.840985e-16,2.142676e-01
3,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,False,Nonspecific reactive,1.115321e-03,2.984645e-07,2.624631e-04,...,5.119857e-05,1.255866e-07,1.289511e-06,8.082201e-10,3.583682e-07,2.271663e-08,4.250258e-06,1.108986e-04,2.018675e-07,9.976418e-01
4,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,False,Nonspecific reactive,3.825096e-01,1.051887e-06,7.619147e-05,...,8.231749e-05,6.310727e-06,5.179568e-04,6.650970e-04,1.954933e-05,2.187047e-04,9.021370e-06,7.606034e-05,2.593927e-06,5.927650e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6367,6,Redox,Menadione,MJVAVZPDRWSRRC-UHFFFAOYSA-N,MJVAVZPDRWSRRC-UHFFFAOYSA-N,True,Mitochondria,8.272263e-08,4.820186e-07,4.953569e-10,...,2.046271e-08,8.047517e-11,4.678518e-01,3.175436e-14,1.152614e-19,2.738970e-09,5.321469e-01,3.125266e-13,2.135073e-12,9.353584e-16
6368,6,Redox,Menadione,MJVAVZPDRWSRRC-UHFFFAOYSA-N,MJVAVZPDRWSRRC-UHFFFAOYSA-N,True,Proteasome,2.286481e-05,3.971284e-04,9.266292e-03,...,5.580060e-05,3.017964e-05,3.028319e-01,6.698544e-07,6.872077e-01,1.522940e-07,1.989463e-12,4.654492e-06,4.713430e-06,2.422277e-13
6369,6,Redox,Menadione,MJVAVZPDRWSRRC-UHFFFAOYSA-N,MJVAVZPDRWSRRC-UHFFFAOYSA-N,True,HDAC,9.272566e-07,5.422968e-06,5.062051e-07,...,3.038844e-07,6.585694e-08,6.945802e-01,1.268738e-11,1.396227e-11,7.693623e-10,3.054058e-01,7.078683e-14,1.158707e-09,9.681026e-14
6370,6,Redox,Menadione,MJVAVZPDRWSRRC-UHFFFAOYSA-N,MJVAVZPDRWSRRC-UHFFFAOYSA-N,True,HDAC,7.457521e-08,2.754116e-07,7.485642e-10,...,1.710217e-08,1.080905e-12,9.999975e-01,6.177160e-15,1.176971e-23,1.718137e-08,1.116178e-08,1.029927e-13,1.448327e-10,2.276926e-16


In [26]:
# show only the probabilities that were predicted with the shuffled model
overlapping_compounds_probas_df[overlapping_compounds_probas_df["shuffled_model"]]

Unnamed: 0,injury_code,injury_type,Compound Name,Compound InChIKey,Metadata_InChIKey,shuffled_model,pred_injury,Control,Cytoskeletal,Hsp90,...,Miscellaneous,Redox,HDAC,mTOR,Proteasome,Saponin,Mitochondria,Ferroptosis,Tannin,Nonspecific reactive
3044,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,True,Tannin,1.038979e-03,2.657515e-04,3.702994e-04,...,2.066025e-03,3.048640e-03,6.294210e-08,8.631925e-08,2.080159e-05,8.991291e-02,2.623379e-06,5.249671e-05,8.477849e-01,5.492927e-02
3045,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,True,Mitochondria,2.109182e-02,6.996999e-03,1.528216e-03,...,2.240245e-02,3.625591e-02,9.674714e-04,9.089425e-07,9.087432e-09,8.200017e-02,5.284452e-01,1.858944e-03,2.531944e-01,2.171942e-02
3046,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,True,Tannin,1.065373e-02,5.124011e-03,3.158147e-03,...,1.083860e-02,1.773735e-02,1.635590e-02,1.078686e-06,3.135629e-05,1.251091e-01,3.094041e-01,1.864201e-05,4.881041e-01,2.262352e-04
3047,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,True,Nonspecific reactive,2.201038e-02,1.333557e-02,2.887443e-02,...,3.810805e-02,4.512571e-02,2.487559e-04,2.226225e-02,1.766670e-01,1.027549e-02,2.587161e-02,3.350666e-05,4.730036e-03,5.674719e-01
3048,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,True,Mitochondria,1.048159e-02,8.902019e-03,7.471222e-03,...,1.068099e-02,8.305348e-02,1.386411e-02,8.068077e-02,1.835257e-03,8.048456e-02,6.758756e-01,2.789195e-04,9.803289e-05,6.559288e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6367,6,Redox,Menadione,MJVAVZPDRWSRRC-UHFFFAOYSA-N,MJVAVZPDRWSRRC-UHFFFAOYSA-N,True,Mitochondria,8.272263e-08,4.820186e-07,4.953569e-10,...,2.046271e-08,8.047517e-11,4.678518e-01,3.175436e-14,1.152614e-19,2.738970e-09,5.321469e-01,3.125266e-13,2.135073e-12,9.353584e-16
6368,6,Redox,Menadione,MJVAVZPDRWSRRC-UHFFFAOYSA-N,MJVAVZPDRWSRRC-UHFFFAOYSA-N,True,Proteasome,2.286481e-05,3.971284e-04,9.266292e-03,...,5.580060e-05,3.017964e-05,3.028319e-01,6.698544e-07,6.872077e-01,1.522940e-07,1.989463e-12,4.654492e-06,4.713430e-06,2.422277e-13
6369,6,Redox,Menadione,MJVAVZPDRWSRRC-UHFFFAOYSA-N,MJVAVZPDRWSRRC-UHFFFAOYSA-N,True,HDAC,9.272566e-07,5.422968e-06,5.062051e-07,...,3.038844e-07,6.585694e-08,6.945802e-01,1.268738e-11,1.396227e-11,7.693623e-10,3.054058e-01,7.078683e-14,1.158707e-09,9.681026e-14
6370,6,Redox,Menadione,MJVAVZPDRWSRRC-UHFFFAOYSA-N,MJVAVZPDRWSRRC-UHFFFAOYSA-N,True,HDAC,7.457521e-08,2.754116e-07,7.485642e-10,...,1.710217e-08,1.080905e-12,9.999975e-01,6.177160e-15,1.176971e-23,1.718137e-08,1.116178e-08,1.029927e-13,1.448327e-10,2.276926e-16


### Confusion Matrix with Overlapping Treatments

In [19]:
# aligned JUMP wells treated with one of the overlapping compounds
overlapp_df = aligned_jump_df[
    aligned_jump_df["Metadata_InChIKey"].isin(
        overlapping_compounds_df["Compound InChIKey"]
    )
]

# metadata and feature column names, taken before the compound labels are added
overlapp_meta, overlapp_feats = split_meta_and_features(overlapp_df)

# add the injury code, injury type and compound name of each well
overlapp_df = pd.merge(
    overlapping_compounds_df,
    overlapp_df,
    how="inner",
    left_on="Compound InChIKey",
    right_on="Metadata_InChIKey",
)
overlapp_df.head()

Unnamed: 0,injury_code,injury_type,Compound Name,Compound InChIKey,Metadata_broad_sample,Metadata_Plate,Metadata_Well,Metadata_gene,Metadata_pert_type,Metadata_control_type,...,Nuclei_Texture_InverseDifferenceMoment_DNA_20_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumVariance_DNA_20_0
0,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,,BR00117008,A02,,control,negcon,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,,BR00117008,A09,,control,negcon,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,,BR00117008,A17,,control,negcon,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,,BR00117008,B03,,control,negcon,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,Control,DMSO,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,,BR00117008,B14,,control,negcon,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# features (X) and injury code labels (y) of the overlapping wells
X, y = overlapp_df[overlapp_feats], overlapp_df["injury_code"]

In [21]:
# tidy long confusion matrices of the non-shuffled and the shuffled model on
# the overlapping wells; missing values are replaced with 0
jump_overlap_cm, shuffled_jump_overlap_cm = (
    generate_confusion_matrix_tl(
        clf, X, y, shuffled=is_shuffled, dataset_type="JUMP Overlap"
    ).fillna(0)
    for clf, is_shuffled in ((model, False), (shuffled_model, True))
)

In [22]:
# stack both confusion matrices and write them to one gzipped csv file
jump_overlap_cm_path = modeling_dir / "jump_overlap_confusion_matrix.csv.gz"
all_jump_overlap_cm = pd.concat([jump_overlap_cm, shuffled_jump_overlap_cm])
all_jump_overlap_cm.to_csv(jump_overlap_cm_path, compression="gzip", index=False)