# Replicability

In [1]:
import os

import numpy as np
import pandas as pd
from copairs.map import aggregate
from copairs.map import run_pipeline

In [2]:
# Name of the processing level to analyse; it selects the input folder below.
data_level = "normalized_feature_select"
data_path = f"output/processed/{data_level}/combined.parquet"

# Load the combined table for that level into a DataFrame.
df = pd.read_parquet(path=data_path)

In [3]:
# Preview ten randomly chosen rows, restricted to the "Metadata_"-prefixed columns.
# The fixed random_state makes the preview identical on every Restart & Run All.
metadata_preview_cols = [col for col in df.columns if col.startswith("Metadata_")]
df[metadata_preview_cols].sample(10, random_state=0)

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Plate_Map_Name,Metadata_cell_type,Metadata_line_ID,Metadata_line_condition,Metadata_line_source,Metadata_plating_density,Metadata_Object_Count,Metadata_Object_Count_inferred,Metadata_Site_Count
178,BR00127194,H11,BR00127194,progen,17,control,human,10000,2468.0,2649.012156,9
1370,BR_NCP_STEM_1,J03,BR_NCP_STEM_1,stem,38,deletion,human,10000,2941.02,2941.02,9
1699,Plate1_PE_PP96,F11,2022-08-07_batch_METADATA,astro,46,deletion,human,5000,603.0,640.836,7
1611,PE_PP_Plate2,G06,2022-08-07_batch_METADATA,astro,4,control,human,5000,341.0,937.552,7
1198,BR_NCP_STEM_1,B23,BR_NCP_STEM_1,stem,12,control,human,10000,389.43,389.43,9
835,BR00132673,C20,NCP_2022_03_03_METADATA,neuron,18,control,human,3500,326.0,377.982,3
1572,PE_PP_Plate2,D02,2022-08-07_batch_METADATA,astro,35,deletion,human,5000,41.0,67.6102,7
258,BR00127194,K19,BR00127194,progen,8,control,human,10000,1078.0,1332.951763,9
1273,BR_NCP_STEM_1,F02,BR_NCP_STEM_1,stem,27,deletion,human,10000,2387.88,2387.88,9
493,BR00132672,E14,NCP_2022_03_03_METADATA,neuron,14,control,human,3500,136.0,148.368,3


In [4]:
# Partition the columns by name: those starting with "Metadata" describe each
# row, every remaining column is passed on as a feature.
is_meta_col = df.columns.str.startswith("Metadata")
meta_cols = list(df.columns[is_meta_col])
feat_cols = list(df.columns[~is_meta_col])
meta = df[meta_cols]
feats = df[feat_cols].to_numpy()

# Positive pairs: the rows we want to match with each other.
# They must agree on every column listed in pos_sameby
# (e.g., same perturbation, same target) ...
pos_sameby = [
    "Metadata_Plate",
    "Metadata_cell_type",
    "Metadata_line_ID",
    "Metadata_line_source",
]
# ... and differ on every column listed in pos_diffby (e.g., different plate).
# Nothing is required to differ here.
pos_diffby = []

# Negative pairs: the rows we match against.
# neg_sameby is rarely needed; it restricts the comparison set, here to rows
# from the same plate and cell type.
neg_sameby = ["Metadata_Plate", "Metadata_cell_type"]
# neg_diffby lists what must differ (e.g., control vs. non control); here the
# two rows must come from different line IDs.
neg_diffby = ["Metadata_line_ID"]

# Passed to run_pipeline as null_size; presumably the number of samples drawn
# for the null distribution -- confirm against the copairs documentation.
null_size = 10000

tech_result = run_pipeline(
    meta,
    feats,
    pos_sameby,
    pos_diffby,
    neg_sameby,
    neg_diffby,
    null_size,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  ap_scores = np.add.reduceat(pr_k * rel_k_list, cutoffs) / num_pos


  0%|          | 0/5 [00:00<?, ?it/s]

  ap = (pr_k * rel_k).sum(axis=1) / num_pos


In [5]:
# Aggregate the scores in tech_result over the pos_sameby columns (plate, cell
# type, line ID, line source). 0.05 is passed as the threshold; presumably it is
# the cut-off behind the above_p_threshold / above_q_threshold flags -- confirm
# against the copairs documentation.
tech_result_agg = aggregate(tech_result, pos_sameby, threshold=0.05)

# Count how many aggregated rows fall into each combination of the two flags.
tech_result_agg.value_counts(subset=["above_p_threshold", "above_q_threshold"])

above_p_threshold  above_q_threshold
False              False                178
True               True                  61
                   False                 48
Name: count, dtype: int64

In [9]:
# Folder that receives this notebook's results for the selected data_level;
# it is created on demand so a fresh checkout can run the notebook end to end.
output_dir = f"output/analysis_results/{data_level}/replicability/"
os.makedirs(output_dir, exist_ok=True)

# Build the file paths with os.path.join so the trailing "/" of output_dir
# does not yield a doubled separator in the written paths.
tech_result_agg.to_csv(os.path.join(output_dir, "tech_result_agg.csv"))
tech_result.to_csv(os.path.join(output_dir, "tech_result.csv"))