## 0. Preprocess nELISA and Cell Painting profiles

In [1]:
import pandas as pd

from nelisa_utils import normalize_select, unify_pert_ids

In [2]:
# Feature-selection operation names handed to `normalize_select` for the
# Cell Painting profiles only (nELISA passes None below).
# NOTE(review): these look like pycytominer `feature_select` operation names;
# confirm against nelisa_utils.normalize_select.
cp_feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

# One entry per assay. "path" is the parquet file to read; the remaining keys
# are forwarded as keyword arguments to `normalize_select`.
norm_confis = dict(
    cellpainting=dict(
        path="inputs/cellpainting_profiles.parquet",
        norm_group_col="Metadata_Plate",
        select_features=cp_feature_select_ops,
    ),
    nelisa=dict(
        path="inputs/nelisa_profiles.parquet",
        norm_group_col="Metadata_nelisa_plate_id",
        select_features=None,
    ),
)

In [3]:
# Read each assay's profiles and pass them through `normalize_select`,
# collecting the results keyed by assay name.
normed_dfs = {}

for norm_name, norm_conf in norm_confis.items():
    # Separate the input path from the keyword arguments WITHOUT mutating the
    # shared config: popping "path" would remove it from `norm_confis`, so
    # re-running this cell on the same kernel would fail with a KeyError.
    norm_kwargs = {key: value for key, value in norm_conf.items() if key != "path"}
    df = pd.read_parquet(norm_conf["path"])
    print(f"Normalizing {norm_name} profiles of shape {df.shape}...")
    normed_df = normalize_select(df, **norm_kwargs)
    print(f"Normalized {norm_name} profiles to shape {normed_df.shape}")
    normed_dfs[norm_name] = normed_df

Normalizing cellpainting profiles of shape (1535, 919)...
Normalized cellpainting profiles to shape (1535, 630)
Normalizing nelisa profiles of shape (1525, 222)...
Normalized nelisa profiles to shape (1525, 222)


In [4]:
# Pass every normalized frame through `unify_pert_ids` on the
# "Metadata_broad_sample" column and store the results back under the same
# assay names.
# NOTE(review): presumably this keeps only perturbations present in every
# frame; confirm in nelisa_utils.unify_pert_ids.

# Take the names from the same dict that supplies the frames, so each returned
# frame is paired with the assay it actually came from (rather than relying on
# `norm_confis` happening to have the same keys in the same order).
norm_names = list(normed_dfs)
unified_dfs = unify_pert_ids(
    *(normed_dfs[name] for name in norm_names), pert_col="Metadata_broad_sample"
)
# One frame must come back per input, all with the same number of rows.
assert len(unified_dfs) == len(norm_names)
assert all(len(df) == len(unified_dfs[0]) for df in unified_dfs)

for norm_name, unified_df in zip(norm_names, unified_dfs):
    print(f"Unified {norm_name} profiles to shape {unified_df.shape}")
    normed_dfs[norm_name] = unified_df

Unified cellpainting profiles to shape (1512, 630)
Unified nelisa profiles to shape (1512, 222)


In [5]:
# Write every frame currently held in `normed_dfs` to its own parquet file
# under outputs/ and report where it went.
for norm_name, normed_df in normed_dfs.items():
    # Build the destination once so the write and the log line always agree.
    out_path = f"outputs/{norm_name}_profiles_normalized.parquet"
    normed_df.to_parquet(out_path)
    print(f"Saved normalized {norm_name} profiles to {out_path}")

Saved normalized cellpainting profiles to outputs/cellpainting_profiles_normalized.parquet
Saved normalized nelisa profiles to outputs/nelisa_profiles_normalized.parquet
