In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics
import os
import sys

sys.path.append("../profiling/")
import profiling

In [None]:
# Root directory of the TAORF Cell Painting data; input files are read from below it.
PROJECT_ROOT = "/raid/data/cellpainting/TAORF/"

# Second argument handed to profiling.WhiteningNormalizer in the whitening section.
REG_PARAM = 1e-2

# Output file names (bare names, so they are resolved against the working directory).
MATRIX_FILE = "cos_cellprofiler_1e-2.csv"  # tidy cosine-similarity table
OUTPUT_FILE = "well_level_data_cellprofiler_1e-2.csv"  # whitened well-level profiles

In [None]:
# Read the metadata index; later cells take the plate, well and
# pert_name_replicate columns from it.
metadata_path = os.path.join(PROJECT_ROOT, "inputs/metadata/index_taorf_minus2wells.csv")
meta = pd.read_csv(metadata_path)

In [None]:
# Load profiles
# Read the normalized CellProfiler table of each plate and stack them into a
# single well-level frame.
PLATE_IDS = [41744, 41754, 41755, 41756, 41757]
plate_profiles = [
    pd.read_csv(os.path.join(PROJECT_ROOT, "inputs/cp_features/{0}/{0}_normalized.csv".format(plate)))
    for plate in PLATE_IDS
]
# ignore_index gives every row a unique label right away, so the column-wise
# concat further down never has to align frames on repeated labels.
well_level_data = pd.concat(plate_profiles, ignore_index=True)

# Metadata columns that are not used downstream. drop() raises KeyError if any of
# them is absent from the loaded tables.
unused_metadata = [
    'Metadata_Assay_Plate_Barcode', 'Metadata_Plate_Map_Name', 'Metadata_gene_name',
    'Metadata_well_position', 'Metadata_cell_line', 'Metadata_ASSAY_WELL_ROLE',
    'Metadata_GeneID', 'Metadata_pert_id', 'Metadata_pert_mfc_id', 'Metadata_pert_well',
    'Metadata_pert_id_vendor', 'Metadata_cell_id', 'Metadata_broad_sample_type',
    'Metadata_pert_type',
]
well_level_data = well_level_data.drop(columns=unused_metadata)

# Identifier columns that are kept next to the features.
columns1 = ["Metadata_Plate", "Metadata_Well", "Metadata_pert_name", "Metadata_broad_sample"]
# Every other remaining column is treated as a feature. They are selected by name
# rather than by position, so the split stays correct even if the identifier
# columns are not the first four columns of the CSV files.
columns2 = well_level_data.columns[~well_level_data.columns.isin(columns1)]
part1 = well_level_data.loc[:, columns1]
# Discard every feature that has a missing value in at least one well. dropna
# returns a new frame, which avoids mutating a slice of well_level_data in place.
part2 = well_level_data.loc[:, columns2].dropna(axis='columns')

well_level_data = pd.concat([part1, part2], axis=1).reset_index(drop=True)
# Feature columns that survived the NaN filter; used by all later cells.
columns2 = part2.columns

In [None]:
# Remove the wells that failed QC: wells c01 and d01 of plate 41754.
on_failed_plate = well_level_data["Metadata_Plate"] == 41754
in_failed_wells = well_level_data["Metadata_Well"].isin(['c01', 'd01'])
failed_qc_rows = well_level_data.index[on_failed_plate & in_failed_wells]
# Drop by row label, then renumber the rows from zero.
well_level_data = well_level_data.drop(index=failed_qc_rows).reset_index(drop=True)

In [None]:
# Attach pert_name_replicate from the metadata index to every well, matching on
# plate and well.
merge_keys = ['Metadata_Plate', 'Metadata_Well']
meta1 = meta.loc[:, merge_keys + ['pert_name_replicate']]
# Columns stored with object dtype after the merge.
object_columns = ['pert_name_replicate', 'Metadata_Plate', 'Metadata_Well', 'Metadata_pert_name', 'Metadata_broad_sample']
well_level_data = (
    well_level_data
    .merge(meta1, how='left', left_on=merge_keys, right_on=merge_keys)
    .drop_duplicates()  # removes rows that are identical in every column
    .astype({column: 'object' for column in object_columns})
    .reset_index(drop=True)
)

# 4. Whitening

In [None]:
# Build the normalizer from the feature columns of the wells whose perturbation
# name is "EMPTY_" (presumably the negative controls -- confirm), with REG_PARAM
# as its second argument.
is_empty_well = well_level_data["Metadata_pert_name"].isin(["EMPTY_"])
empty_well_features = well_level_data.loc[is_empty_well, columns2]
whN = profiling.WhiteningNormalizer(empty_well_features, REG_PARAM)

In [None]:
# Run the normalizer over the feature columns of every well.
all_well_features = well_level_data[columns2]
whD = whN.normalize(all_well_features)

In [None]:
# Save whitened profiles
# Replace the feature columns with the output of whN.normalize; the other
# columns of well_level_data are left as they are.
well_level_data[columns2] = whD
# Write the table to OUTPUT_FILE without the row index.
well_level_data.to_csv(OUTPUT_FILE, index=False)

# 5. Treatment-level profiles / Mean Aggregation

In [None]:
# Aggregate profiles: one mean profile per perturbation name.
# Only the feature columns are averaged. well_level_data also carries
# object-typed metadata columns, and pandas >= 2.0 raises a TypeError when asked
# for the mean of such columns instead of silently leaving them out.
profiles = well_level_data.groupby("Metadata_pert_name")[list(columns2)].mean().reset_index()

In [None]:
# Bring Metadata_broad_sample back onto the aggregated profiles. It was not a
# groupby key during aggregation because it contains NaN values.
# Count wells per (pert_name, broad_sample) pair; only the pairing is needed.
pair_counts = well_level_data.groupby(["Metadata_pert_name", "Metadata_broad_sample"])["pert_name_replicate"].count()
broad_sample_lookup = pair_counts.reset_index()
# Left merge: every aggregated profile is kept; perturbations without an entry in
# the lookup get NaN as broad sample.
profiles = profiles.reset_index().merge(broad_sample_lookup, on="Metadata_pert_name", how="left")
profile_columns = ["Metadata_pert_name", "Metadata_broad_sample"] + list(columns2)
profiles = profiles[profile_columns]

In [None]:
# Remove samples without MOA (according to [1])
# The annotation file sits below PROJECT_ROOT, so the path is built from that
# constant instead of repeating the absolute location.
moa_matches_path = os.path.join(PROJECT_ROOT, "nikita_experiments/TAORF_MOA_MATCHES.csv")
Y = pd.read_csv(moa_matches_path)
# Inner join: only profiles whose broad sample occurs in column "Var1" of the
# annotation file are kept.
# NOTE(review): if "Var1" is not unique in that file, this join duplicates
# profiles -- confirm against the file.
profiles = pd.merge(profiles, Y, left_on="Metadata_broad_sample", right_on="Var1")
# Keep identifiers, the MOA annotation and the features, ordered by broad sample.
moa_profile_columns = ["Metadata_pert_name", "Metadata_broad_sample", "Metadata_moa.x"] + list(columns2)
profiles = profiles[moa_profile_columns].sort_values(by="Metadata_broad_sample")

# 6. Cosine similarity matrix

In [None]:
# Compute the pairwise cosine similarity between all treatment-level profiles
feature_matrix = profiles[columns2]
COS = sklearn.metrics.pairwise.cosine_similarity(feature_matrix, feature_matrix)

In [None]:
# Reshape the square similarity matrix into one row per (row sample, column sample)
# pair, with columns "index", "variable" and "value".
sample_labels = list(profiles.Metadata_broad_sample)
df = (
    pd.DataFrame(data=COS, index=sample_labels, columns=sample_labels)
    .reset_index()
    .melt(id_vars=["index"])
)

In [None]:
# Attach the MOA annotation twice: first for the sample in "index" (matrix rows),
# then for the sample in "variable" (matrix columns).
moa_lookup = profiles[["Metadata_broad_sample", "Metadata_moa.x"]]
df2 = df
for sample_column in ("index", "variable"):
    annotated = pd.merge(
        df2,
        moa_lookup,
        how="left",
        left_on=sample_column,
        right_on="Metadata_broad_sample",
    )
    # The join key coming from the lookup duplicates sample_column; drop it again.
    df2 = annotated.drop("Metadata_broad_sample", axis=1)

In [None]:
# Give the five columns their final names (assigned by position) and write the
# table, including its row index, to MATRIX_FILE.
matrix_columns = ["Var1", "Var2", "value", "Metadata_moa.x", "Metadata_moa.y"]
df2.columns = matrix_columns
df2.to_csv(MATRIX_FILE)