# Preprocessing data 

In this notebook, we will preprocess the dataset by loading all profiles in batches, adding additional labels, and concatenating them into a single dataset for further exploration.

In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd

sys.path.append("../../../")
from utils import io_utils, data_utils

## Helper functions
These are helper functions that will be used only in this notebook

In [2]:
def add_control_type(profile: pd.DataFrame) -> pd.DataFrame:
    """Add control type metadata to the dataframe based on cell type.

    Parameters
    ----------
    profile : pandas.DataFrame
        DataFrame containing the profiles with 'Metadata_cell_type' column.

    Returns
    -------
    pandas.DataFrame
        DataFrame with an additional 'Metadata_control_type' column.
    """
    # add a new column to the dataframe
    profile.insert(2, "Metadata_control_type", np.nan)

    # this adds the label "positive" to wells that contains healthy cells and treated with DMSO
    profile.loc[(profile["Metadata_cell_type"] == "healthy") & (profile["Metadata_treatment"] == "DMSO"), "Metadata_control_type"] = (
        "positive"
    )
    
    # this adds the label "negative" to wells that contains failing CF cells and treated with DMSO
    profile.loc[(profile["Metadata_cell_type"] == "failing") & (profile["Metadata_treatment"] == "DMSO"), "Metadata_control_type"] = (
        "negative"
    )
    
    # this adds the label "trt" to wells that contains failing CF cells and treated with a compound
    profile.loc[(profile["Metadata_cell_type"] == "failing") & (profile["Metadata_treatment"] != "DMSO"), "Metadata_control_type"] = (
        "trt"
    )
    return profile


Setting up paths on what to load and output directories

In [3]:
# setting in input paths
data_dir_path = pathlib.Path("../../data")

# selecting aggregated feature selected files
list_of_paths = list(
    (data_dir_path / "agg_fs_profiles/").resolve(strict=True).glob("*.parquet")
)

# shared features columns
shared_features_path = pathlib.Path(
    "../../1.map-analysis/results/shared_features.json"
).resolve(strict=True)

# set configs path
config_path = pathlib.Path("../../config.yaml").resolve(strict=True)

# creating a results output directory
results_dir = pathlib.Path("./results").resolve()
results_dir.mkdir(exist_ok=True)

Next, we load the configuration file that contains the shared features across all plates within the batch. Then, we load each plate within the batch and add new metadata columns for downstream analysis.

In [4]:
# loading config
config = io_utils.load_config(config_path)

# loading shared features
shared_features = io_utils.load_config(shared_features_path)["shared_features"]

# loading all feature selected aggregated profiles and updating it with the shared features
loaded_aggregated_profiles = []
loaded_shuffled_profiles = []
for plate_idx, profile_path in enumerate(list_of_paths):
    # getting the plate name
    plate_name = profile_path.stem.split("_")[0]

    # loading aggregated profiles
    aggregated_profiles = pd.read_parquet(profile_path)

    # updating the profile with the shared features
    aggregated_profiles = aggregated_profiles[shared_features]

    # inserting the plate name at the first column
    aggregated_profiles.insert(0, "Metadata_plate_barcode", plate_name)
    aggregated_profiles.insert(1, "Metadata_plate_index", plate_idx + 1)

    # next is to shuffled the data
    shuffled_aggregated_profiles = data_utils.shuffle_features(aggregated_profiles)

    # append it to the list
    loaded_aggregated_profiles.append(aggregated_profiles)
    loaded_shuffled_profiles.append(shuffled_aggregated_profiles)

# concatenating all the profiles
loaded_aggregated_profiles = pd.concat(loaded_aggregated_profiles).reset_index(
    drop=True
)
shuffled_aggregated_profiles = pd.concat(loaded_shuffled_profiles).reset_index(
    drop=True
)

# add metadata into the dmso profile where if Metadata_cell_type == "healthy" then Metadata_control_type == "positive"
# add if Metadata_cell_type == "failing" then Metadata_control_type == "negative"
# Apply to both aggregated and shuffled profiles
loaded_aggregated_profiles = add_control_type(loaded_aggregated_profiles)
shuffled_aggregated_profiles = add_control_type(shuffled_aggregated_profiles)

# split metadata and morphology feature columns
meta_cols, feat_cols = data_utils.split_meta_and_features(loaded_aggregated_profiles)

# store aggregate data profiles as batched
loaded_profiles = {"batch_1": loaded_aggregated_profiles}
shuffled_loaded_profiles = {"batch_1": shuffled_aggregated_profiles}

# display only not shuffled aggregated profiles dmso profiles
print(loaded_profiles["batch_1"].shape)
loaded_profiles["batch_1"].head()

(220, 485)


  profile.loc[(profile["Metadata_cell_type"] == "healthy") & (profile["Metadata_treatment"] == "DMSO"), "Metadata_control_type"] = (
  profile.insert(2, "Metadata_control_type", np.nan)
  profile.loc[(profile["Metadata_cell_type"] == "healthy") & (profile["Metadata_treatment"] == "DMSO"), "Metadata_control_type"] = (


Unnamed: 0,Metadata_plate_barcode,Metadata_plate_index,Metadata_control_type,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Pathway,...,Nuclei_Texture_InfoMeas2_PM_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Hoechst_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Hoechst_3_01_256,Nuclei_Texture_InverseDifferenceMoment_Hoechst_3_02_256,Nuclei_Texture_InverseDifferenceMoment_Hoechst_3_03_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumEntropy_PM_3_01_256
0,localhost240927060001,1,positive,B,2,7,healthy,,DMSO,,...,0.340226,-0.397719,-0.55441,-0.597032,-0.599841,-0.452233,-0.51269,-0.47906,-0.487456,0.377711
1,localhost240927060001,1,trt,B,3,19,failing,dilated_cardiomyopathy,UCD-0159256,Apoptosis,...,0.426732,0.192443,0.201591,0.193599,0.207943,-0.591082,-0.595971,-0.438921,-0.572451,0.509203
2,localhost240927060001,1,trt,B,4,19,failing,dilated_cardiomyopathy,UCD-0001766,Angiogenesis,...,0.97009,-0.78076,-0.846263,-0.65464,-0.655297,-1.28222,-1.381008,-1.344134,-1.241213,1.432368
3,localhost240927060001,1,positive,B,5,7,healthy,,DMSO,,...,0.218604,-0.12584,-0.107253,-0.217232,-0.302984,-0.218122,-0.285449,-0.181189,-0.307638,0.22736
4,localhost240927060001,1,trt,B,6,19,failing,dilated_cardiomyopathy,UCD-0159262,Others,...,1.039161,-1.04313,-1.173101,-1.0735,-0.994948,-1.680824,-1.553425,-1.727551,-1.443283,1.551807


We save the concatenated profiles of this batch.

In [6]:
loaded_profiles["batch_1"].to_csv(results_dir / "batch1_concat_agg_fs.csv", index=False)