In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics
import os
import sys

from tqdm import tqdm

sys.path.append("../profiling/")
import profiling

In [None]:
# --- Notebook configuration -------------------------------------------------
# Base directory the relative paths below are joined onto
# (an empty string leaves them relative to the working directory).
PROJECT_ROOT = ''
# Experiment name; used as a sub-directory when locating the feature files.
EXP = "cell_painting_CNN"

# Metadata index (CSV), joined onto PROJECT_ROOT when it is read.
df_path = 'Cell_Painting_data/enriched_index_max_concentration.csv'

# Second argument passed to profiling.WhiteningNormalizer.
REG_PARAM = 1e-5

# Destination folder and file name of the whitened well-level profiles.
output_folder = 'data'
output_file = "well_level_profiles_cpcnn_LINCS_1e-5_final.csv"

In [None]:
# Read the metadata index into a DataFrame; later cells look up the
# Metadata_Plate / Metadata_Well / Metadata_Site columns of each row.
metadata_csv = os.path.join(PROJECT_ROOT, df_path)
meta = pd.read_csv(metadata_csv)

# 1. Load single-cell data

In [None]:
# Load the per-site single-cell feature matrices.
#
# `features` ends up with one entry per row of `meta`, in the order of
# `meta.index`: the "features" array stored in that site's .npz file, or an
# empty list when the file does not exist (later cells skip those entries via
# `len(features[i]) == 0`).
features = []
missing_files = 0
# Not used in this cell; kept because other cells may read it.
channels = ['DNA','RNA', 'ER', 'AGP', 'Mito']

# Join path components instead of concatenating strings so that PROJECT_ROOT
# works with or without a trailing separator (same approach as the metadata
# path). The directory part is loop-invariant, so build it once.
features_dir = os.path.join(PROJECT_ROOT, "outputs", EXP, "features")

for i in tqdm(meta.index):
    filename = os.path.join(
        features_dir,
        str(meta.loc[i, "Metadata_Plate"]),
        str(meta.loc[i, "Metadata_Well"]),
        "{}.npz".format(meta.loc[i, "Metadata_Site"]),
    )
    if os.path.isfile(filename):
        # The context manager closes the archive once the array is read.
        with np.load(filename) as info:
            features.append(info["features"])
    else:
        # Placeholder keeps `features` positionally aligned with `meta`.
        features.append([])
        missing_files += 1

# Surface missing inputs instead of skipping them silently.
if missing_files:
    print("Missing feature files:", missing_files, "of", len(features))

In [None]:
# Summarise what was loaded: number of sites, feature-matrix shape and the
# total number of single cells (rows) across all loaded sites.
total_single_cells = sum(f.shape[0] for f in features if len(f) > 0)

# Sites whose file was missing are stored as an empty list, which has no
# `.shape`; take the dimensions from the first site that actually loaded
# rather than assuming that `features[0]` is an array.
first_loaded = next((f for f in features if len(f) > 0), None)
if first_loaded is None:
    raise ValueError(
        "No single-cell features were loaded; check PROJECT_ROOT, EXP and the metadata index."
    )

num_features = first_loaded.shape[1]
print("Total images",len(features),first_loaded.shape)
print("Total single cells:", total_single_cells)

# 2. Site-level profiles / Median Aggregation

In [None]:
# Build site-level profiles: one record per site that has features, holding
# the well's annotations plus the per-feature median over that site's cells.
# `site_level_data` and `site_level_features` are appended to in lockstep, so
# entry k of one describes entry k of the other.
site_level_data = []
site_level_features = []

# Group the metadata once instead of running a string-formatted query per
# well: this avoids rescanning the whole frame for every well and does not
# depend on plate/well values being quote-free strings.
# sort=False keeps plates and wells in order of first appearance.
plate_groups = meta.groupby("Metadata_Plate", sort=False)
for plate, plate_meta in tqdm(plate_groups, total=plate_groups.ngroups):
    for well, result in plate_meta.groupby("Metadata_Well", sort=False):
        # Annotations are taken from the first row of the well; they are
        # constant for all of its sites, so look them up once per well.
        treatment = result["Treatment"].iloc[0]
        broad_sample = result["Metadata_broad_sample"].iloc[0]
        for i in result.index:
            # `features` is indexed with the metadata row label; sites whose
            # feature file was missing hold an empty placeholder.
            if len(features[i]) == 0:
                continue
            median_profile = np.median(features[i], axis=0)
            site_level_data.append(
                {
                    "Plate": plate,
                    "Well": well,
                    "Treatment": treatment,
                    "broad_sample": broad_sample
                }
            )
            site_level_features.append(median_profile)

In [None]:
# Assemble the site-level table: annotation columns followed by one integer-
# named column per feature.

# Missing sites are stored as an empty list (no `.shape`), so read the feature
# count from the first site that actually loaded instead of `features[0]`.
first_loaded = next((f for f in features if len(f) > 0), None)
if first_loaded is None:
    raise ValueError(
        "No single-cell features were loaded; cannot build site-level profiles."
    )
num_features = first_loaded.shape[1]

columns1 = ["Plate", "Well", "Treatment", "broad_sample"]
columns2 = list(range(num_features))

sites1 = pd.DataFrame(columns=columns1, data=site_level_data)
sites2 = pd.DataFrame(columns=columns2, data=site_level_features)
# Both frames share the default positional index, so a column-wise concat
# lines each annotation row up with its feature row.
sites = pd.concat([sites1, sites2], axis=1)
sites.shape

# 3. Well-level profiles / Mean Aggregation

In [None]:
# Collapse site-level profiles to one row per well by averaging the feature
# columns over all sites that share the same annotation columns.
# NOTE(review): groupby drops rows whose key columns contain NaN by default;
# confirm that no well has a missing Treatment / broad_sample.
wells = sites.groupby(["Plate", "Well", "Treatment", "broad_sample"]).mean().reset_index()
wells = wells[columns1 + columns2]

# Make sure the destination exists so the export cannot fail after the
# expensive aggregation above.
os.makedirs(output_folder, exist_ok=True)
wells.to_csv(f'{output_folder}/Wells_Prewhitened_CPCNN_LINCS.csv', index=False)
wells.shape

# 4. Whitening

In [None]:
# Construct the whitening normalizer from the feature columns of the wells
# whose Treatment is "DMSO@NA" (presumably the negative controls; confirm),
# then normalize the feature columns of every well with it.
is_dmso = wells["Treatment"].isin(["DMSO@NA"])
dmso_profiles = wells.loc[is_dmso, columns2]
whN = profiling.WhiteningNormalizer(dmso_profiles, REG_PARAM)

all_profiles = wells[columns2]
whD = whN.normalize(all_profiles)

In [None]:
# Overwrite the feature columns of `wells` with their whitened values and
# export the resulting table as CSV (without the index column).
whitened_csv = f'{output_folder}/{output_file}'
wells[columns2] = whD
wells.to_csv(whitened_csv, index=False)