## 1. Calculate mAP for phenotypic activity and consistency assessment

In [1]:
import numpy as np
import pandas as pd
from pycytominer import aggregate

from map_utils.map import calculate_map
from nelisa_utils import get_meta_features

In [2]:
def _load_profiles(path):
    """Read a normalized profile table and add the columns used for mAP pairing.

    Parameters
    ----------
    path : str
        Parquet file to read.

    Returns
    -------
    pandas.DataFrame
        The table with missing ``Metadata_broad_sample`` values replaced by the
        string "NA", plus a ``Metadata_control_index`` column holding the row's
        index label where ``Metadata_control_type == "negcon"`` and -1 elsewhere.
    """
    # Non-inplace fillna/assign keep this cell idempotent on re-run.
    profiles = pd.read_parquet(path).fillna({"Metadata_broad_sample": "NA"})
    return profiles.assign(
        Metadata_control_index=lambda df: np.where(
            df["Metadata_control_type"] == "negcon", df.index, -1
        )
    )


cp_df = _load_profiles("outputs/cellpainting_profiles_normalized.parquet")
ne_df = _load_profiles("outputs/nelisa_profiles_normalized.parquet")

# Both assays are compared row-for-row downstream, so the row counts must agree.
assert cp_df.shape[0] == ne_df.shape[0], (
    f"row count mismatch: cellpainting={cp_df.shape[0]}, nelisa={ne_df.shape[0]}"
)
print(cp_df.shape, ne_df.shape)

(1512, 631) (1512, 223)


### Phenotypic activity via retrieving replicates

In [3]:
# Pairing rules handed to calculate_map for the replicate-retrieval (activity) run.
# NOTE(review): the exact meaning of pos_/neg_ sameby/diffby and of "all"/"any"
# is defined by map_utils.map.calculate_map -- confirm there.
pair_config = dict(
    pos_sameby=dict(
        all=["Metadata_broad_sample", "Metadata_control_index"],
        any=[],
    ),
    pos_diffby=dict(all=[], any=[]),
    neg_sameby=dict(all=["Metadata_Plate"], any=[]),
    neg_diffby=dict(
        all=["Metadata_broad_sample", "Metadata_control_index"],
        any=[],
    ),
)

# mAP settings: results are grouped per Metadata_broad_sample.
# NOTE(review): null_size is presumably the size of the null distribution used
# for p-values -- confirm against calculate_map.
map_config = dict(
    null_size=100_000,
    groupby_columns=["Metadata_broad_sample"],
)

In [4]:
# Activity mAP on the Cell Painting profiles; rows whose Metadata_broad_sample is
# the "NA" placeholder are dropped before the table is saved and displayed.
cp_map_results = calculate_map(cp_df, pair_config, map_config).query(
    "Metadata_broad_sample != 'NA'"
)
cp_map_results.to_csv("outputs/cp_map_activity_results.csv", index=False)
cp_map_results

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  ap_scores = np.add.reduceat(pr_k * rel_k_list, cutoffs) / num_pos


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

Unnamed: 0,Metadata_broad_sample,mAP,p_value,corrected_p_value,below_p,p < 0.05,-log10(mAP p-value)
0,BRD-A00827783-001-24-6,0.432287,0.011260,0.017235,True,True,1.763601
1,BRD-A01078468-001-14-8,0.522666,0.003860,0.006617,True,True,2.179334
2,BRD-A07207424-001-14-0,0.573214,0.002600,0.004643,True,True,2.333219
3,BRD-A08187463-001-12-9,0.518849,0.004080,0.006954,True,True,2.157736
4,BRD-A09722536-002-18-0,0.211931,0.086609,0.096590,False,False,1.015067
...,...,...,...,...,...,...,...
295,BRD-K97091514-001-11-4,0.223748,0.075959,0.085347,False,False,1.068809
296,BRD-K97181089-003-24-7,0.979167,0.000030,0.000084,True,True,4.075146
297,BRD-K98357249-001-02-9,0.138874,0.176928,0.183663,False,False,0.735979
298,BRD-K98763141-001-30-8,0.545696,0.003130,0.005523,True,True,2.257788


In [5]:
# Activity mAP on the nELISA profiles; rows whose Metadata_broad_sample is the
# "NA" placeholder are dropped before the table is saved and displayed.
ne_map_results = calculate_map(ne_df, pair_config, map_config).query(
    "Metadata_broad_sample != 'NA'"
)
ne_map_results.to_csv("outputs/ne_map_activity_results.csv", index=False)
ne_map_results

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  ap_scores = np.add.reduceat(pr_k * rel_k_list, cutoffs) / num_pos


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

Unnamed: 0,Metadata_broad_sample,mAP,p_value,corrected_p_value,below_p,p < 0.05,-log10(mAP p-value)
0,BRD-A00827783-001-24-6,0.172251,0.124609,0.181853,False,False,0.740279
1,BRD-A01078468-001-14-8,0.473238,0.006410,0.018903,True,True,1.723476
2,BRD-A07207424-001-14-0,0.119185,0.218218,0.276225,False,False,0.558737
3,BRD-A08187463-001-12-9,0.171421,0.125479,0.181853,False,False,0.740279
4,BRD-A09722536-002-18-0,0.156462,0.142649,0.196368,False,False,0.706930
...,...,...,...,...,...,...,...
295,BRD-K97091514-001-11-4,0.074625,0.434676,0.464067,False,False,0.333420
296,BRD-K97181089-003-24-7,0.060519,0.578984,0.592484,False,False,0.227323
297,BRD-K98357249-001-02-9,0.094346,0.311017,0.349457,False,False,0.456606
298,BRD-K98763141-001-30-8,0.068006,0.495605,0.519865,False,False,0.284109


### Phenotypic consistency via same-target perturbation retrievability

In [6]:
# Pairing rules handed to calculate_map for the same-target (consistency) run.
# NOTE: this rebinds pair_config / map_config, which the activity cells above also
# read; re-running those cells after this one would pick up these settings.
# NOTE(review): key semantics (sameby/diffby, "all"/"any", multilabel_col) are
# defined by map_utils.map.calculate_map -- confirm there.
pair_config = dict(
    pos_sameby=dict(all=["Metadata_target"], any=[]),
    pos_diffby=dict(all=["Metadata_broad_sample"], any=[]),
    neg_sameby=dict(all=[], any=[]),
    neg_diffby=dict(all=["Metadata_target", "Metadata_broad_sample"], any=[]),
    multilabel_col="Metadata_target",
)

# mAP settings: results are grouped per Metadata_target.
map_config = dict(
    null_size=100_000,
    groupby_columns=["Metadata_target"],
)

In [7]:
# Reload the activity tables written by the earlier cells and show their shapes.
cp_map_results, ne_map_results = map(
    pd.read_csv,
    (
        "outputs/cp_map_activity_results.csv",
        "outputs/ne_map_activity_results.csv",
    ),
)

(cp_map_results.shape, ne_map_results.shape)

((300, 7), (300, 7))

In [8]:
# Only the second value returned by get_meta_features is used: the feature columns.
_, feature_cols = get_meta_features(cp_df)

# Aggregate Cell Painting profiles per (broad sample, target list), split the
# pipe-delimited target list into a per-row list, then drop the "NA" placeholder.
cp_all_df = (
    aggregate(
        cp_df,
        strata=["Metadata_broad_sample", "Metadata_target_list"],
        features=feature_cols,
    )
    .assign(
        Metadata_target=lambda agg: agg["Metadata_target_list"].str.split("|")
    )
    .query("Metadata_broad_sample != 'NA'")
)
cp_all_df.shape

(300, 618)

In [9]:
# Only the second value returned by get_meta_features is used: the feature columns.
_, feature_cols = get_meta_features(ne_df)

# Aggregate nELISA profiles per (broad sample, target list), split the
# pipe-delimited target list into a per-row list, then drop the "NA" placeholder.
ne_all_df = (
    aggregate(
        ne_df,
        strata=["Metadata_broad_sample", "Metadata_target_list"],
        features=feature_cols,
    )
    .assign(
        Metadata_target=lambda agg: agg["Metadata_target_list"].str.split("|")
    )
    .query("Metadata_broad_sample != 'NA'")
)
ne_all_df.shape

(300, 194)

In [10]:
# Consistency mAP with the current pair_config / map_config, Cell Painting first
# and nELISA second.
cp_all_map_results, ne_all_map_results = [
    calculate_map(profiles, pair_config, map_config)
    for profiles in (cp_all_df, ne_all_df)
]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/385 [00:00<?, ?it/s]

  0%|          | 0/413 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/385 [00:00<?, ?it/s]

  0%|          | 0/413 [00:00<?, ?it/s]

In [11]:
# Write both consistency tables to CSV without the index column.
for results, csv_path in (
    (cp_all_map_results, "outputs/cp_all_map_consistency_results.csv"),
    (ne_all_map_results, "outputs/ne_all_map_consistency_results.csv"),
):
    results.to_csv(csv_path, index=False)