In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import os
import sys
import umap

from tqdm import tqdm
import matplotlib.pyplot as plt

sys.path.append("../profiling/")
import profiling

In [2]:
# Root directory of the project; the metadata index and feature files are read from under it
PROJECT_ROOT = "/raid/data/cellpainting/TAORF/"
# Experiment subfolder under outputs/ whose per-site feature files are loaded
EXP = "taorf_efn_128_newindex"
# Number of feature columns assumed when building treatment-level profiles
NUM_FEATURES = 6400
# Destination CSV for the whitened well-level profiles
OUTPUT_FILE = "data/well_level_data.csv"

In [3]:
# Read the metadata index table that drives every later step
metadata_index_path = os.path.join(PROJECT_ROOT, "inputs/metadata/index.csv")
meta = pd.read_csv(metadata_index_path)

# 1. Load single-cell data

In [4]:
# One entry per metadata row, in row order: the "features" array stored in the
# site's .npz file, or an empty list when that file does not exist.
feature_path_template = PROJECT_ROOT + "outputs/" + EXP + "/features/{}/{}_{}.npz"

features = []
for row_id in tqdm(meta.index):
    site_path = feature_path_template.format(
        meta.loc[row_id, "Metadata_Plate"],
        meta.loc[row_id, "Metadata_Well"],
        meta.loc[row_id, "Metadata_Site"],
    )
    if not os.path.isfile(site_path):
        # Keep a placeholder so positions in `features` stay aligned with `meta`
        features.append([])
        continue
    with np.load(site_path) as archive:
        features.append(archive["features"])

100%|██████████| 17227/17227 [03:12<00:00, 89.61it/s] 


In [5]:
# Count the single cells over every site that has a feature matrix
# (sites whose file was missing are stored as empty lists and are skipped).
total_single_cells = sum(f.shape[0] for f in features if len(f) > 0)

# Report the shape of the first site that actually has features.
# Indexing features[0] directly fails when the first site's file was missing
# (an empty list has no .shape) or when no site was loaded at all.
first_shape = next((f.shape for f in features if len(f) > 0), None)

print("Total images",len(features),first_shape)
print("Total single cells:", total_single_cells)

Total images 17227 (22, 6400)
Total single cells: 883098


# 2. Site-level profiles / Median Aggregation

In [6]:
# Build one median profile per imaged site, tagged with the annotations of its well.
#   site_level_data     -> list of dicts (Plate, Well, pert_name, Replicate, broad_sample)
#   site_level_features -> list of per-site feature medians, same order as site_level_data
# `num_features` is left set to the width of the last processed feature matrix;
# the following cell reads it to build the feature column labels.
site_level_data = []
site_level_features = []
for plate in tqdm(meta["Metadata_Plate"].unique()):
    # Restrict to this plate once, then split by well with boolean masks.
    # A string-formatted query breaks on non-string or quote-containing labels
    # and re-scans the whole metadata table for every single well.
    plate_meta = meta[meta["Metadata_Plate"] == plate]
    for well in plate_meta["Metadata_Well"].unique():
        result = plate_meta[plate_meta["Metadata_Well"] == well]

        # Annotations are constant across the sites of a well: compute them once.
        pert_name = result["pert_name"].unique()
        replicate = result["pert_name_replicate"].unique()
        broad_sample = result["broad_sample"].unique()
        if len(pert_name) > 1:
            # Diagnostic: a well should map to a single perturbation name
            print(pert_name)

        # result keeps the row labels of `meta`, which index into `features`
        for i in result.index:
            if len(features[i]) == 0:
                # No feature file was found for this site
                continue
            num_features = features[i].shape[1]
            # Median over the rows of the site's feature matrix
            median_profile = np.median(features[i], axis=0)
            site_level_data.append(
                {
                    "Plate": plate,
                    "Well": well,
                    "pert_name": pert_name[0],
                    "Replicate": replicate[0],
                    "broad_sample": broad_sample[0]
                }
            )
            site_level_features.append(median_profile)

100%|██████████| 5/5 [02:07<00:00, 25.52s/it]


In [7]:
# Column layout: annotation columns first, then one integer-labelled column per feature
columns1 = ["Plate", "Well", "pert_name", "Replicate", "broad_sample"]
columns2 = list(range(num_features))

# Assemble the annotation and feature tables, then place them side by side
sites1 = pd.DataFrame(site_level_data, columns=columns1)
sites2 = pd.DataFrame(site_level_features, columns=columns2)
sites = pd.concat([sites1, sites2], axis=1)

# 3. Well-level profiles / Mean Aggregation

In [8]:
# Collapse well data
wells = sites.groupby(["Plate", "Well", "pert_name"]).mean().reset_index()

tmp = meta.groupby(["Metadata_Plate", "Metadata_Well", "pert_name", "broad_sample"])["DNA"].count().reset_index()
wells = pd.merge(wells, tmp, how="left", left_on=["Plate", "Well", "pert_name"], right_on=["Metadata_Plate", "Metadata_Well", "pert_name"])

wells = wells[columns1 + columns2]

# 4. Whitening

In [9]:
# Fit the whitening normalizer on the feature columns of the wells labelled "EMPTY_"
is_empty_well = wells["pert_name"] == "EMPTY_"
whN = profiling.WhiteningNormalizer(wells.loc[is_empty_well, columns2])

(6400,) (6400, 6400)


In [10]:
# Run the fitted normalizer over the feature columns of every well
well_feature_table = wells.loc[:, columns2]
whD = whN.normalize(well_feature_table)

In [11]:
# Save whitened profiles
wells[columns2] = whD
wells.to_csv(OUTPUT_FILE, index=False)

# 5. Treatment-level profiles / Mean Aggregation

In [12]:
# Aggregate profiles: average the well-level profiles of each perturbation.
columns1 = ["Plate", "Well", "pert_name", "Replicate", "broad_sample"]
columns2 = list(range(NUM_FEATURES))
# numeric_only=True makes the exclusion of text columns (e.g. broad_sample) explicit;
# without it pandas >= 2.0 raises a TypeError instead of silently dropping them.
# broad_sample is therefore absent from `profiles` after this step.
profiles = wells.groupby("pert_name").mean(numeric_only=True).reset_index()

In [13]:
# Re-attach broad_sample to the treatment profiles. It is not a key of the
# aggregation above because it contains NaN values, so it is looked up per
# pert_name from the well table and joined back on.
sample_by_pert = (
    wells.groupby(["pert_name", "broad_sample"])["Replicate"]
    .count()
    .reset_index()
)
profiles_with_sample = profiles.reset_index().merge(
    sample_by_pert, on="pert_name", how="left"
)
profiles = profiles_with_sample[["pert_name", "broad_sample"] + columns2]

In [14]:
# Keep only samples with an MOA annotation (according to [1]): the default inner
# merge drops every profile whose broad_sample has no match in the MOA table.
Y = pd.read_csv("data/TAORF_MOA_MATCHES.csv")
annotated_profiles = profiles.merge(Y, left_on="broad_sample", right_on="Var1")
kept_columns = ["pert_name", "broad_sample", "Metadata_moa.x"] + columns2
profiles = annotated_profiles[kept_columns].sort_values(by="broad_sample")

# 6. Correlation matrix

In [15]:
# Pearson correlation between every pair of profiles (each row is one variable)
CRM = np.corrcoef(profiles[columns2])

# Label both axes with broad_sample, then unpivot to one row per (row, column) pair
sample_labels = list(profiles.broad_sample)
df = (
    pd.DataFrame(CRM, index=sample_labels, columns=sample_labels)
    .reset_index()
    .melt(id_vars=["index"])
)

In [16]:
# Attach the MOA label of each side of every pair: first for the matrix row
# ("index"), then for the matrix column ("variable"). The helper key
# broad_sample is dropped after each join.
moa_by_sample = profiles[["broad_sample", "Metadata_moa.x"]]

df2 = df
for pair_side in ("index", "variable"):
    df2 = pd.merge(
        df2,
        moa_by_sample,
        how="left",
        left_on=pair_side,
        right_on="broad_sample",
    ).drop(columns="broad_sample")

In [17]:
# Give the tidy table its final column names (assigned by position) and write it out
tidy_column_names = ["Var1", "Var2", "value", "Metadata_moa.x", "Metadata_moa.y"]
df2.columns = tidy_column_names
df2.to_csv("data/correlation_matrix.csv")