# Replicability

In [1]:
import pandas as pd
import numpy as np
from copairs.map import aggregate
from copairs.map import run_pipeline

import sys
import os

# Resolve the project-local source folder (../ncp/src, relative to the
# notebook's working directory) so the helper module imported below can be found.
notebook_dir = os.path.abspath(os.getcwd())
ncp_src_path = os.path.abspath(os.path.join(notebook_dir, os.pardir, "ncp", "src"))

# Register the folder only once, so re-running this cell adds no duplicates.
if sys.path.count(ncp_src_path) == 0:
    sys.path.append(ncp_src_path)

from my_run_pipeline import my_run_pipeline

# Suppressing warnings for cleaner output
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Name of the processing level to analyse; it selects the input folder here
# and is reused for the results folder when the outputs are written.
data_level = "normalized_feature_select"

# Path of the combined table for that level (relative to the working directory).
data_path = f"output/processed/{data_level}/combined.parquet"

# Load the combined table into a DataFrame.
df = pd.read_parquet(data_path)

In [3]:
# Preview a few random rows of the metadata columns as a sanity check on the
# loaded table. The fixed seed keeps the preview identical across
# "Restart & Run All" executions, and the sample size is capped so that a
# table with fewer than 10 rows does not raise.
# To analyse a single plate only, filter `df` before this point
# (for example on Metadata_Plate == "BR00132673").
df[[col for col in df.columns if col.startswith("Metadata_")]].sample(
    n=min(10, len(df)), random_state=0
)

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Plate_Map_Name,Metadata_cell_type,Metadata_line_ID,Metadata_line_condition,Metadata_line_source,Metadata_plating_density,Metadata_Object_Count,Metadata_Object_Count_inferred,Metadata_Site_Count
1026,BR00132673,K19,NCP_2022_03_03_METADATA,neuron,8,control,human,3500,431.0,454.524,3
1316,BR_NCP_STEM_1,G21,BR_NCP_STEM_1,stem,45,deletion,human,10000,1873.98,1873.98,9
839,BR00132673,C24,NCP_2022_03_03_METADATA,neuron,29,deletion,human,3500,371.0,461.238,3
1110,BR00132673,O07,NCP_2022_03_03_METADATA,neuron,31,deletion,human,3500,582.0,588.348,3
1551,PE_PP_Plate2,B05,2022-08-07_batch_METADATA,astro,7,control,human,5000,145.0,192.808,7
964,BR00132673,I05,NCP_2022_03_03_METADATA,neuron,44,deletion,human,3500,313.0,320.61,3
1691,Plate1_PE_PP96,F03,2022-08-07_batch_METADATA,astro,41,deletion,human,5000,884.0,950.53,7
1489,BR_NCP_STEM_1,O02,BR_NCP_STEM_1,stem,10,control,human,10000,2039.04,2039.04,9
929,BR00132673,G18,NCP_2022_03_03_METADATA,neuron,31,deletion,human,3500,271.0,288.396,3
523,BR00132672,F20,NCP_2022_03_03_METADATA,neuron,22,control,human,3500,108.0,113.388,3


In [4]:
# Split the columns in a single pass: names starting with "Metadata" are
# metadata columns, every other column is treated as a feature.
meta_cols, feat_cols = [], []
for column in df.columns:
    (meta_cols if column.startswith("Metadata") else feat_cols).append(column)

meta = df[meta_cols]
feats = df[feat_cols].values

# Value passed to the pipeline as its `null_size` argument.
null_size = 10000

# Positive pairs ("pos") are what each sample is matched on.
# pos_sameby: columns whose values must be the same within a pair
# (e.g., same perturbation, same target).
pos_sameby = [
    "Metadata_Plate",
    "Metadata_cell_type",
    "Metadata_line_ID",
    "Metadata_line_source",
]
# pos_diffby: columns whose values must be different within a pair
# (e.g., different plate); none are required here.
pos_diffby = []

# Negative pairs ("neg") are what each sample is matched against.
# neg_sameby: rarely needed unless the pool of candidates should be limited,
# e.g., only against controls in the same plate.
neg_sameby = ["Metadata_Plate", "Metadata_cell_type"]
# neg_diffby: columns whose values must be different within a pair
# (e.g., control vs. non control).
neg_diffby = ["Metadata_line_ID"]

tech_result = my_run_pipeline(
    meta,
    feats,
    pos_sameby,
    pos_diffby,
    neg_sameby,
    neg_diffby,
    null_size,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# Combine the per-sample scores in `tech_result`, grouping by the `pos_sameby`
# columns (plate, cell type, line ID, line source) with a threshold of 0.05.
# NOTE(review): presumably the threshold is applied to the p- and q-values
# behind the two flag columns used below — confirm against the copairs docs.
tech_result_agg = aggregate(tech_result, pos_sameby, threshold=0.05)
# Count how many aggregated rows fall into each combination of the two flags.
tech_result_agg[["above_p_threshold", "above_q_threshold"]].value_counts()

above_p_threshold  above_q_threshold
False              False                178
True               True                  61
                   False                 48
Name: count, dtype: int64

In [6]:
# Folder that receives the replicability results for the current data level.
output_dir = f"output/analysis_results/{data_level}/replicability/"

# `os` is already imported in the first cell, so it is not re-imported here.
# Create the folder (and any missing parents); a no-op when it already exists.
os.makedirs(output_dir, exist_ok=True)

# Build the file paths with os.path.join: `output_dir` already ends with "/",
# so appending "/<name>" by hand would produce a double slash in the path.
# Aggregated scores.
tech_result_agg.to_csv(os.path.join(output_dir, "tech_result_agg.csv"), index=False)

# Un-aggregated pipeline output.
tech_result.to_csv(os.path.join(output_dir, "tech_result.csv"), index=False)