In [1]:
%load_ext autoreload
%autoreload 2

import pickle
from pathlib import Path

from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import poisson, nbinom, lognorm

from cytominer_eval import evaluate_metrics
from cytominer_eval.transform import metric_melt, copairs_similarity
from cytominer_eval.utils.transform_utils import check_replicate_groups
from cytominer_eval.utils.operation_utils import assign_replicates, set_pair_ids

from cytominer_eval.operations import (
    replicate_reproducibility,
    precision_recall,
    grit,
    mp_value,
    enrichment,
    hitk,
)

from copairs.map import create_matcher, flatten_str_list, run_pipeline

from scripts.generate_utils import (
    generate_distribution_params,
    generate_distribution_params_differ,
    generate_features,
    aggregate_metrics_results,
    calculate_accuracy,
)

In [2]:
# Seed for the NumPy random generator that drives the simulation.
SEED = 42

# Label used in the perturbation column to mark negative-control wells.
negcon_pert_value = "negative_control"

# Names of the metadata columns referenced throughout the notebook.
pert_col = "Metadata_Perturbation"
well_col = "Metadata_Well"
plate_map_col = "Metadata_Plate_Map"
plate_col = "Metadata_Plate"

In [3]:
# # Example usage
# rng = np.random.default_rng(SEED)
# n_plates, n_wells, n_perts, n_controls, n_feats, n_plate_maps = 3, 5, 4, 1, 100, 2
# feature_proportions = {'gaussian': 1.0, 'lognormal': 0.0, 'poisson': 0.0, 'nbinom': 0.0}
# control_params = { "gaussian": (0, 1), "lognormal": (0, 1), "poisson": 3, "nbinom": (1, 0.5) }

# pert_params = generate_distribution_params(n_perts, rng)

# dframe, feats = generate_features(n_plates, n_wells, n_perts, n_controls, n_feats, n_plate_maps, feature_proportions, control_params, pert_params, rng)

# print(dframe)

In [4]:
# Single random generator shared by every cell that draws simulated data.
rng = np.random.default_rng(SEED)

# --- Simulated plate layout -------------------------------------------------
# n_wells is not fixed here: the generate_features calls pass
# n_perts + n_controls in that position (108 with the values below).
n_plates = 2
n_controls = 8
n_perts = 100  # constant
n_feats = 100  # constant
n_plate_maps = 1  # constant
n_perts_differ = 0

# --- Feature distributions --------------------------------------------------
# Fraction of features drawn from each distribution family.
feature_proportions = dict(gaussian=1.0, lognormal=0.0, poisson=0.0, nbinom=0.0)
# Per-family count of features that differ (passed as `features_differ`).
features_differ = dict(gaussian=1, lognormal=0, poisson=0, nbinom=0)

# Distribution parameters for controls, keyed by family.
control_params = dict(gaussian=(0, 1), lognormal=(0, 1), poisson=3, nbinom=(1, 0.5))
# Parameters for differing features: identical to the controls except for the
# gaussian entry, which is (1, 1) instead of (0, 1).
differ_params = {**control_params, "gaussian": (1, 1)}

# --- Pair-matching rules passed to evaluate_metrics --------------------------
replicate_groups = dict(
    pos_sameby={"all": [f"{pert_col} != '{negcon_pert_value}'"], "any": []},
    pos_diffby={"all": [], "any": []},
    neg_sameby={"all": [], "any": []},
    neg_diffby={"all": ["Metadata_Perturbation_Type"], "any": []},
)

# --- Metrics to evaluate ------------------------------------------------------
# Disabled entries are kept below for reference:
#   "precision_recall": {"k": 10, "groupby_columns": [pert_col]}
#   "enrichment": {"percentile": 0.99}
#   "hitk": {"groupby_columns": [pert_col], "percent_list": [2, 5, 10]}
#   "grit": {
#       "control_perts": ["DMSO"],
#       "profile_col": "Metadata_broad_sample",
#       "replicate_group_col": "Metadata_mg_per_ml",
#       "replicate_summary_method": "mean",
#   }
metrics_config = dict(
    replicate_reproducibility=dict(
        return_median_correlations=True,
        quantile_over_null=0.95,
        replicate_groups=[pert_col],
    ),
    mp_value=dict(
        control_perts=[negcon_pert_value],
        replicate_id=pert_col,
        rescale_pca=True,
        nb_permutations=1000,
    ),
    mean_ap=dict(
        null_size=1000,
        groupby_columns=[pert_col],
    ),
)

In [5]:
# Draw the per-perturbation distribution parameters for this run.
pert_params, differ_perts = generate_distribution_params_differ(
    n_perts, n_perts_differ, control_params, differ_params, rng
)

# Second positional argument of generate_features (called n_wells in the
# commented example earlier in this notebook): perturbations plus controls.
n_wells = n_perts + n_controls

# Simulate metadata (dframe) and feature values (feats).
dframe, feats = generate_features(
    n_plates,
    n_wells,
    n_perts,
    n_controls,
    n_feats,
    n_plate_maps,
    feature_proportions,
    control_params,
    pert_params,
    rng,
    features_differ=features_differ,
    differ_params=differ_params,
    resample_perts=False,
)

# Metadata and features side by side, as one profiles table.
profiles = pd.concat([dframe, feats], axis=1)

metrics_results = evaluate_metrics(
    profiles=profiles,
    features=feats.columns,
    meta_features=dframe.columns,
    replicate_groups=replicate_groups,
    metrics_config=metrics_config,
    use_copairs=True,
)

# Reduce the metric results to the accuracy summary shown further down.
acc = calculate_accuracy(metrics_results)


Calculating distances.
Pos pairs size: 100, Neg pairs size: 3200


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


Calculating metric: replicate_reproducibility

Calculating metric: mean_ap


  ap_scores = np.add.reduceat(pr_k * rel_k_list, cutoffs) / num_pos


  0%|          | 0/2 [00:00<?, ?it/s]


Calculating distances.
Pos pairs size: 100, Neg pairs size: 3200

Calculating metric: mp_value


Calculating mp-values:   0%|          | 0/100 [00:00<?, ?it/s]

Processing groups:   0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
# Display the raw results returned by evaluate_metrics (a dict keyed by metric name).
metrics_results

{'replicate_reproducibility': (0.06,
     Metadata_Perturbation  similarity_metric
  0                     c0           0.970454
  1                     c1           1.171977
  2                    c10           1.060499
  3                    c11           0.913356
  4                    c12           0.974992
  ..                   ...                ...
  95                   c95           1.014453
  96                   c96           0.970506
  97                   c97           0.866151
  98                   c98           0.936332
  99                   c99           1.018585
  
  [100 rows x 2 columns]),
 'mean_ap':    Metadata_Perturbation   mean_ap   p_value  n_pos_pairs  n_total_pairs
 0                     c0  0.155556  0.403097          1.0           17.0
 1                     c1  0.060662  0.966034          1.0           17.0
 2                    c10  0.069048  0.832667          1.0           17.0
 3                    c11  0.375000  0.176324          1.0           17.0


In [7]:
# Merge the per-metric results into a single table via aggregate_metrics_results,
# passing the perturbation column name, then display it.
# NOTE(review): presumably one row per perturbation, as the rendered output suggests — confirm.
merged_metrics = aggregate_metrics_results(metrics_results, pert_col)
merged_metrics

Unnamed: 0,index,Metadata_Perturbation,similarity_metric,mean_ap,p_value,n_pos_pairs,n_total_pairs,clique_id,mp_value,-log10(p_value),-log10(mp_value),percent_replicating
0,0,c0,0.029546,0.155556,0.403097,1.0,17.0,1,0.760240,0.394591,0.119049,0.06
1,1,c1,-0.171977,0.060662,0.966034,1.0,17.0,43,0.532468,0.015008,0.273707,0.06
2,2,c10,-0.060499,0.069048,0.832667,1.0,17.0,87,0.146853,0.079528,0.833117,0.06
3,3,c11,0.086644,0.375000,0.176324,1.0,17.0,4,0.080919,0.753689,1.091949,0.06
4,4,c12,0.025008,0.118056,0.491009,1.0,17.0,13,0.655345,0.308911,0.183530,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,c95,-0.014453,0.095455,0.611389,1.0,17.0,25,0.192807,0.213683,0.714877,0.06
96,96,c96,0.029494,0.170455,0.437063,1.0,17.0,15,0.501499,0.359456,0.299730,0.06
97,97,c97,0.133849,0.375000,0.176324,1.0,17.0,27,0.083916,0.753689,1.076155,0.06
98,98,c98,0.063668,0.154762,0.368132,1.0,17.0,61,0.183816,0.433997,0.735616,0.06


In [8]:
# Display the accuracy summary produced by calculate_accuracy for the single run above.
acc

{'replicate_reproducibility': 0.06,
 'mp_value': 0.13,
 'mean_ap': 0.02,
 'mean_ap_p_value': 0.0}

In [9]:
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue
# into the parameter-sweep cells below — confirm.
# A bare `raise` with no active exception also raises RuntimeError, but with the
# unhelpful message "No active exception to reraise"; raise it explicitly so the
# reason for the halt is visible in the traceback.
raise RuntimeError(
    "Stopping here on purpose: run the parameter-sweep cells below manually."
)

RuntimeError: No active exception to reraise

### Run Gaussian sweeps over parameters

In [None]:
# feature_proportions_config = [
#     {"gaussian": 1.0, "lognormal": 0.0, "poisson": 0.0, "nbinom": 0.0},
#     {"gaussian": 0.4, "lognormal": 0.4, "poisson": 0.1, "nbinom": 0.1},
#     {"gaussian": 0.3, "lognormal": 0.3, "poisson": 0.2, "nbinom": 0.2}
# ]
# print(feature_proportions_config)

In [None]:
# Values swept by the nested loop further down.
# Reduced alternatives for a quick run are kept as comments.
plate_nums = [2, 3, 4]
# plate_nums = [2]
control_nums = [4, 8, 16, 24]
# control_nums = [8]

# One parameter set per gaussian mean; the other families keep fixed values.
# Wider sweep: use range(1, 4) instead of range(1, 2).
differ_params_configs = [
    dict(gaussian=(mean, 1), lognormal=(0, 1), poisson=3, nbinom=(1, 0.5))
    for mean in range(1, 2)
]
print(differ_params_configs)

# Number of differing gaussian features doubles at each step: 1, 2, 4, 8, 16.
# Wider sweep: use range(7) instead of range(5).
features_differ_configs = [
    dict(gaussian=1 << exponent, lognormal=0, poisson=0, nbinom=0)
    for exponent in range(5)
]
print(features_differ_configs)

[{'gaussian': (1, 1), 'lognormal': (0, 1), 'poisson': 3, 'nbinom': (1, 0.5)}]
[{'gaussian': 1, 'lognormal': 0, 'poisson': 0, 'nbinom': 0}, {'gaussian': 2, 'lognormal': 0, 'poisson': 0, 'nbinom': 0}]


In [None]:
# Sweep over plate count, control count, differ-parameter sets and numbers of
# differing features; for each combination simulate data, evaluate the metrics
# and collect one accuracy record in gaussian_accuracy_results.
#
# Caveats:
# - The loop variables n_plates, n_controls, differ_params and features_differ
#   rebind the notebook-level names of the same name, so after this cell those
#   names hold the last swept values rather than the earlier configuration.
# - The same `rng` is passed to every call, so the draws depend on the
#   iteration order of the loops.
gaussian_accuracy_results = []

for n_plates in plate_nums:
    print(f"\nProcessing n_plates: {n_plates}")

    for n_controls in control_nums:
        print(f"\nProcessing n_controls: {n_controls}")

        for differ_params in differ_params_configs:
            print(f"\nProcessing differ_params: {differ_params}")

            # Drawn once per differ_params and reused for every features_differ
            # setting in the inner loop. differ_perts is not used in this cell.
            pert_params, differ_perts = generate_distribution_params_differ(
                n_perts, n_perts_differ, control_params, differ_params, rng
            )

            for features_differ in features_differ_configs:
                print(f"\nProcessing features_differ: {features_differ}")

                # Second positional argument is n_perts + n_controls, which
                # changes with the swept n_controls.
                dframe, feats = generate_features(
                    n_plates,
                    n_perts + n_controls,
                    n_perts,
                    n_controls,
                    n_feats,
                    n_plate_maps,
                    feature_proportions,
                    control_params,
                    pert_params,
                    rng,
                    features_differ=features_differ,
                    differ_params=differ_params,
                    resample_perts=False,
                )

                # Metadata and feature columns are concatenated side by side
                # to form the profiles table.
                metrics_results = evaluate_metrics(
                    profiles=pd.concat([dframe, feats], axis=1),
                    features=feats.columns,
                    meta_features=dframe.columns,
                    replicate_groups=replicate_groups,
                    metrics_config=metrics_config,
                    use_copairs=True,
                )

                # Tag the accuracy record with the swept settings; only the
                # gaussian entries of features_differ / differ_params are stored
                # (for differ_params, the first element of the gaussian tuple).
                acc = calculate_accuracy(metrics_results)
                acc.update(
                    {
                        "n_plates": n_plates,
                        "n_controls": n_controls,
                        "features_differ": features_differ["gaussian"],
                        "differ_params": differ_params["gaussian"][0],
                    }
                )
                gaussian_accuracy_results.append(acc)

# with open("gaussian_accuracy_results.pkl", "wb") as f:
#     pickle.dump(gaussian_accuracy_results, f)


Processing n_plates: 2

Processing n_controls: 8

Processing differ_params: {'gaussian': (1, 1), 'lognormal': (0, 1), 'poisson': 3, 'nbinom': (1, 0.5)}

Processing features_differ: {'gaussian': 1, 'lognormal': 0, 'poisson': 0, 'nbinom': 0}

Calculating distances.
Pos pairs size: 100, Neg pairs size: 3200


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


Calculating metric: replicate_reproducibility

Calculating metric: mean_ap


  ap_scores = np.add.reduceat(pr_k * rel_k_list, cutoffs) / num_pos


  0%|          | 0/2 [00:00<?, ?it/s]


Calculating distances.
Pos pairs size: 100, Neg pairs size: 3200

Calculating metric: mp_value


Calculating mp-values:   0%|          | 0/100 [00:00<?, ?it/s]

IOStream.flush timed out


Processing groups:   0%|          | 0/100 [00:01<?, ?it/s]


Processing features_differ: {'gaussian': 2, 'lognormal': 0, 'poisson': 0, 'nbinom': 0}

Calculating distances.
Pos pairs size: 100, Neg pairs size: 3200


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


Calculating metric: replicate_reproducibility

Calculating metric: mean_ap


  ap_scores = np.add.reduceat(pr_k * rel_k_list, cutoffs) / num_pos


  0%|          | 0/2 [00:00<?, ?it/s]


Calculating distances.
Pos pairs size: 100, Neg pairs size: 3200

Calculating metric: mp_value


Calculating mp-values:   0%|          | 0/100 [00:00<?, ?it/s]

Processing groups:   0%|          | 0/100 [00:02<?, ?it/s]

In [None]:
# Display the accuracy records collected by the sweep loop (one dict per parameter combination).
gaussian_accuracy_results

[{'replicate_reproducibility': 0.02,
  'mp_value': 0.14,
  'mean_ap': 0.03,
  'mean_ap_p_value': 0.06,
  'n_plates': 2,
  'n_controls': 8,
  'features_differ': 1,
  'differ_params': 1},
 {'replicate_reproducibility': 0.03,
  'mp_value': 0.01,
  'mean_ap': 0.03,
  'mean_ap_p_value': 0.08,
  'n_plates': 2,
  'n_controls': 8,
  'features_differ': 2,
  'differ_params': 1}]

In [None]:
# merged_metrics = aggregate_metrics_results(metrics_results, pert_col)

# sns.pairplot(
#     merged_metrics[
#         [
#             "similarity_metric",
#             "precision",
#             "recall",
#             "-log10(mp_value)",
#             "mean_ap",
#             "-log10(p_value)",
#         ]
#     ]
# )

In [None]:
# with open("gaussian_accuracy_results.pkl", "wb") as f:
#     pickle.dump(gaussian_accuracy_results, f)