In [None]:
import pandas as pd
import numpy as np
import os
import sys
import sklearn.metrics

from tqdm import tqdm

sys.path.append("../profiling/")
import profiling

In [None]:
# Root directory of the dataset; the metadata CSV ("inputs/...") and the
# per-site feature files ("outputs/...") are both resolved relative to it.
PROJECT_ROOT = "/dgx1nas1/cellpainting-datasets/BBBC022/"
# Experiment folder name, inserted into the feature path as outputs/<EXP>/features/...
EXP = "cp_dataset_mixedval"
# Destination CSV for the whitened well-level profiles.
OUTPUT_FILE = "data/well_level_data_efn128combinedcellsout_conv6a_1e-2_e30.csv"
# Destination CSV for the tidy (long-format) cosine-similarity matrix.
MATRIX_FILE = "data/cos_efn128combinedcellsout_conv6a_1e-2_e30.csv"
# Value passed as reg_param to profiling.WhiteningNormalizer.
REG_PARAM = 1e-2

In [None]:
# Read the metadata table that drives the rest of the notebook
metadata_csv = os.path.join(PROJECT_ROOT, "inputs/metadata/index_after_qc_trimmed_maxconc.csv")
meta = pd.read_csv(metadata_csv)

In [None]:
# Treatment strings are split on "@"; the part before the first "@" becomes broad_sample.
treatment_parts = meta["Treatment"].str.split("@", expand=True)
meta["broad_sample"] = treatment_parts[0]

# 1. Load single-cell data

In [None]:
# One entry per metadata row, in metadata order: either an array of single-cell
# feature vectors (rows containing NaN removed) or an empty list when the
# site's .npz file does not exist.
path_template = PROJECT_ROOT + "outputs/" + EXP + "/features/{}/{}/{}.npz"
location_columns = ("Metadata_Plate", "Metadata_Well", "Metadata_Site")

features = []
for row_label in tqdm(meta.index):
    site_path = path_template.format(
        *(meta.loc[row_label, column] for column in location_columns)
    )
    if not os.path.isfile(site_path):
        # Keep a placeholder so positions in `features` stay aligned with `meta`.
        features.append([])
        continue
    with open(site_path, "rb") as handle:
        archive = np.load(handle)
        # Copy the array into memory before the file handle is closed.
        site_cells = np.array(archive["features"])
    rows_with_nan = np.isnan(site_cells).any(axis=1)
    features.append(site_cells[~rows_with_nan])

In [None]:
# Summarize what was loaded. Sites whose feature file was missing are stored as
# empty lists, so only the ndarray entries can be inspected for their shape.
loaded_sites = [site for site in features if isinstance(site, np.ndarray)]
if not loaded_sites:
    raise ValueError("No site-level feature files could be loaded; check PROJECT_ROOT and EXP.")

total_single_cells = sum(site.shape[0] for site in loaded_sites)

# Take the feature dimensionality from the first site that actually loaded,
# instead of features[0], which is a plain list when the first file is missing.
num_features = loaded_sites[0].shape[1]
print("Total images",len(features),loaded_sites[0].shape)
print("Total single cells:", total_single_cells)

# 2. Site-level profiles / Median Aggregation

In [None]:
# Build one median profile per imaged site, together with the plate / well /
# treatment annotations of the well it belongs to.
#
# Wells are selected with a boolean mask plus groupby rather than a
# string-formatted meta.query(): the quoted comparison in the query matches
# nothing when plate or well identifiers are not stored as strings, and it
# rescans the full metadata table once per well.
#
# NOTE: features[i] is looked up with meta's index labels, so this relies on
# meta keeping the default 0..N-1 index it gets from read_csv.
site_level_data = []
site_level_features = []
for plate in tqdm(meta["Metadata_Plate"].unique()):
    plate_meta = meta[meta["Metadata_Plate"] == plate]
    # sort=False keeps wells in order of first appearance within the plate.
    for well, well_meta in plate_meta.groupby("Metadata_Well", sort=False):
        # Sites with no loaded single-cell features are skipped entirely.
        site_ids = [i for i in well_meta.index if len(features[i]) > 0]
        if not site_ids:
            continue

        # Annotations are per well, so compute them once rather than per site.
        pert_name = well_meta["Treatment"].unique()
        replicate = well_meta["broad_sample_Replicate"].unique()
        if len(pert_name) > 1:
            # A well is expected to carry a single treatment; report the
            # conflicting labels (the first one is used below).
            print(pert_name)
        well_annotation = {
            "Plate": plate,
            "Well": well,
            "Treatment": pert_name[0],
            "Replicate": replicate[0],
            "broad_sample": pert_name[0].split("@")[0]
        }

        for i in site_ids:
            # Median across the site's cells, one value per feature.
            median_profile = np.median(features[i], axis=0)
            site_level_data.append(dict(well_annotation))
            site_level_features.append(median_profile)

In [None]:
# columns1: annotation columns; columns2: integer labels 0..num_features-1 for the feature columns.
columns1 = ["Plate", "Well", "Treatment", "Replicate", "broad_sample"]
columns2 = list(range(num_features))

# Annotations and feature vectors were collected in the same order, so they
# line up row by row when placed side by side.
site_annotations = pd.DataFrame(site_level_data, columns=columns1)
site_profiles = pd.DataFrame(site_level_features, columns=columns2)
sites = pd.concat([site_annotations, site_profiles], axis=1)

# 3. Well-level profiles / Mean Aggregation

In [None]:
# Collapse site-level profiles into one mean profile per well.
# numeric_only=True is needed on pandas >= 2.0, where averaging the string
# column "broad_sample" raises TypeError instead of being dropped silently.
wells = sites.groupby(["Plate", "Well", "Treatment"]).mean(numeric_only=True).reset_index()

# Look up broad_sample for each (plate, well, treatment) from the metadata.
# The "DNA" count only serves to collapse the metadata to one row per group;
# the count itself is discarded by the column selection below.
well_annotations = (
    meta.groupby(["Metadata_Plate", "Metadata_Well", "Treatment", "broad_sample"])["DNA"]
    .count()
    .reset_index()
)
wells = pd.merge(
    wells,
    well_annotations,
    how="left",
    left_on=["Plate", "Well", "Treatment"],
    right_on=["Metadata_Plate", "Metadata_Well", "Treatment"],
)

# Keep only the annotation columns and the feature columns.
wells = wells[columns1 + columns2]

# 4. Whitening

In [None]:
# Build the normalizer from the feature columns of the DMSO control wells only.
dmso_wells = wells["Treatment"] == "DMSO@0"
whN = profiling.WhiteningNormalizer(wells.loc[dmso_wells, columns2], reg_param=REG_PARAM)

In [None]:
# Pass the feature columns of every well through the normalizer
# (presumably applies the whitening fitted on the controls; see profiling.WhiteningNormalizer).
well_feature_table = wells[columns2]
whD = whN.normalize(well_feature_table)

In [None]:
# Save whitened profiles
# The feature columns of `wells` are overwritten in place with the normalized
# values, so from here on `wells` no longer holds the raw well-level means.
wells[columns2] = whD
# Written without the row index; annotation columns come first, then features.
wells.to_csv(OUTPUT_FILE, index=False)

# 5. Treatment-level profiles / Mean Aggregation

In [None]:
# Aggregate well-level profiles into one mean profile per treatment.
columns1 = ["Plate", "Well", "Treatment", "Replicate", "broad_sample"]
columns2 = [i for i in range(num_features)]
# numeric_only=True is needed on pandas >= 2.0: `wells` still carries string
# annotation columns (e.g. "broad_sample"), and averaging those raises
# TypeError there instead of silently dropping them.
profiles = wells.groupby("Treatment").mean(numeric_only=True).reset_index()

In [None]:
# Re-derive broad_sample for every well from the part of Treatment before the first "@".
well_treatment_parts = wells["Treatment"].str.split("@", expand=True)
wells["broad_sample"] = well_treatment_parts[0]

In [None]:
# Recover broad_sample column (cannot be used in groupby because it contains NaN values)
# The count is only a vehicle for getting one row per (Treatment, broad_sample) pair.
treatment_to_compound = (
    wells.groupby(["Treatment", "broad_sample"])["Replicate"]
    .count()
    .reset_index()
)
profiles = profiles.reset_index().merge(treatment_to_compound, on="Treatment", how="left")

In [None]:
# Keep only the two identifier columns followed by the feature columns.
profile_columns = ["Treatment", "broad_sample", *columns2]
profiles = profiles[profile_columns]

In [None]:
# Remove samples without MOA (according to [1])
# The default inner merge keeps only compounds whose broad_sample appears as Var1 in the MOA table.
Y = pd.read_csv("data/BBBC022_MOA_MATCHES_official.csv")
annotated_profiles = profiles.merge(Y, left_on="broad_sample", right_on="Var1")
annotated_columns = ["Treatment", "broad_sample", "Metadata_moa.x"] + columns2
profiles = annotated_profiles[annotated_columns].sort_values(by="broad_sample")

# 6. Similarity matrix

In [None]:
# Compute Cosine Similarities
# Every treatment profile is compared against every other one, giving a square matrix.
profile_matrix = profiles[columns2]
COS = sklearn.metrics.pairwise.cosine_similarity(profile_matrix, profile_matrix)

In [None]:
# Transform to tidy format
# Label both axes with broad_sample, then melt into (index, variable, value) rows.
compound_labels = list(profiles.broad_sample)
df = pd.DataFrame(data=COS, index=compound_labels, columns=compound_labels)
df = df.reset_index().melt(id_vars=["index"])

In [None]:
# Compound -> MOA lookup shared by both annotation passes.
moa_lookup = profiles[["broad_sample", "Metadata_moa.x"]]

# Annotate rows: attach the MOA of the compound named in "index".
df2 = df.merge(moa_lookup, how="left", left_on="index", right_on="broad_sample")
df2 = df2.drop("broad_sample", axis=1)

# Annotate columns: attach the MOA of the compound named in "variable".
# Both MOA columns share a name, so pandas disambiguates them with its default suffixes.
df2 = df2.merge(moa_lookup, how="left", left_on="variable", right_on="broad_sample")
df2 = df2.drop("broad_sample", axis=1)

In [None]:
# Rename columns and save
# Names are assigned positionally: row compound, column compound, similarity, row MOA, column MOA.
tidy_column_names = ["Var1", "Var2", "value", "Metadata_moa.x", "Metadata_moa.y"]
df2.columns = tidy_column_names
df2.to_csv(MATRIX_FILE, index=False)