# Load, clean, and transform data

In [22]:
import sys
import os
import pandas as pd

# The helper modules live in ../ncp/src, resolved relative to the directory
# the notebook kernel was started from.
notebook_dir = os.path.abspath(os.getcwd())
ncp_src_path = os.path.normpath(os.path.join(notebook_dir, os.pardir, "ncp", "src"))

# Register that directory on the import path once, so re-running this cell
# does not keep appending duplicates.
if sys.path.count(ncp_src_path) == 0:
    sys.path.append(ncp_src_path)

# Now you can import the modules
from data_management import (
    load_dataframe,
    process_dataframes_by_cell_type,
    check_matching_columns,
)

In [23]:
# Root directories of the profile stores, keyed by the repository they live in
base_paths = {
    "local_repo": "../1.run-workflows/profiles",
    "external_repo": "../2019_05_28_Neuronal_Cell_Painting/profiles",
}

# One row per plate: (dataset key, repository, batch, plate)
plate_specs = [
    ("stem", "local_repo", "NCP_STEM_1", "BR_NCP_STEM_1"),
    ("progen", "local_repo", "NCP_PROGENITORS_1", "BR00127194"),
    ("neuron_p72", "external_repo", "2022_03_03_NCP_NEURONS_2_20x", "BR00132672"),
    ("neuron_p73", "external_repo", "2022_03_03_NCP_NEURONS_2_20x", "BR00132673"),
    ("astro_p1", "external_repo", "2022-08-07_batch", "Plate1_PE_PP96"),
    ("astro_p2", "external_repo", "2022-08-07_batch", "PE_PP_Plate2"),
]

# Expand each row into the dictionary layout used when loading the plates;
# every plate is read at the "augmented" data level.
datasets = [
    {
        "key": key,
        "base_path": base_paths[repo],
        "batch": batch,
        "plate": plate,
        "data_level": "augmented",
    }
    for key, repo, batch, plate in plate_specs
]

# Read every configured plate and index the loaded frames by dataset key
dfs = {}

for spec in datasets:
    location = (spec["base_path"], spec["batch"], spec["plate"], spec["data_level"])
    dfs[spec["key"]] = load_dataframe(*location)

In [24]:
# Expression used by every cell type to infer the per-well object count
OBJECT_COUNT_EXPR = "2 * Metadata_Site_Count * Cells_Number_Object_Number"

# Column names that recur in the per-cell-type drop lists
PARENT_COLS = ["Cytoplasm_Parent_Cells", "Cytoplasm_Parent_Nuclei"]
SHARED_DROP_COLS = ["Image_", "Metadata_well_position", "Metadata_Assay_Plate_Barcode"]
TRUNKS_COL = "Nuclei_ObjectSkeleton_NumberTrunks_CellImageSkel"

# Per-cell-type processing recipe: which loaded plates ("keys") belong to the
# cell type, plus the columns to drop / rename / add / compute for them.
# Lists and dicts are built fresh for each entry so no two cell types share a
# mutable object.
cell_types_data = {
    "stem": {
        "keys": ["stem"],
        "columns_to_drop": [*PARENT_COLS, *SHARED_DROP_COLS],
        "columns_to_compute": {
            "Metadata_Object_Count": OBJECT_COUNT_EXPR,
            "Metadata_Object_Count_inferred": OBJECT_COUNT_EXPR,
        },
        "columns_to_add": {
            "Metadata_Site_Count": 9,
            "Nuclei_ObjectSkeleton_NumberNonTrunkBranches_CellImageSkel": 0,
            "Nuclei_ObjectSkeleton_NumberBranchEnds_CellImageSkel": 0,
            "Nuclei_ObjectSkeleton_TotalObjectSkeletonLength_CellImageSkel": 0,
        },
    },
    "progen": {
        "keys": ["progen"],
        "columns_to_drop": [*PARENT_COLS, *SHARED_DROP_COLS, TRUNKS_COL],
        "columns_to_compute": {"Metadata_Object_Count_inferred": OBJECT_COUNT_EXPR},
    },
    "neuron": {
        "keys": ["neuron_p72", "neuron_p73"],
        "columns_to_drop": [TRUNKS_COL, *SHARED_DROP_COLS, "Metadata_EXPERIMENT_NAME"],
        "columns_to_rename": {
            "Metadata_LINE_ID": "Metadata_line_ID",
            "Metadata_DENSITY": "Metadata_plating_density",
            "Metadata_plate_map_name": "Metadata_Plate_Map_Name",
            "Metadata_GENOTYPE": "Metadata_line_condition",
        },
        "columns_to_add": {"Metadata_line_source": "unknown"},
        "columns_to_compute": {"Metadata_Object_Count_inferred": OBJECT_COUNT_EXPR},
    },
    "astro": {
        "keys": ["astro_p1", "astro_p2"],
        "columns_to_drop": [
            TRUNKS_COL,
            *SHARED_DROP_COLS,
            "Metadata_METADATA_SOURCE",
            "Metadata_Count_Nuclei",
            "Metadata_Count_Cells",
            "Metadata_Count_Cytoplasm",
        ],
        "columns_to_rename": {
            "Metadata_METADATA_SAMPLE_ID": "Metadata_line_ID",
            "Metadata_METADATA_DENSITY": "Metadata_plating_density",
            "Metadata_plate_map_name": "Metadata_Plate_Map_Name",
            "Metadata_METADATA_CONDITION": "Metadata_line_condition",
        },
        "columns_to_add": {"Metadata_line_source": "unknown"},
        "columns_to_compute": {"Metadata_Object_Count_inferred": OBJECT_COUNT_EXPR},
    },
}

# Apply each cell type's recipe to the loaded plates, giving one processed
# frame per cell type.
processed_dfs = dict(
    (name, process_dataframes_by_cell_type(dfs, recipe))
    for name, recipe in cell_types_data.items()
)

In [25]:
# Report the (rows, columns) shape of each processed frame
for name in processed_dfs:
    print(f"{name} shape: {processed_dfs[name].shape}")

stem shape: (384, 4304)
progen shape: (384, 4304)
neuron shape: (768, 4304)
astro shape: (188, 4304)


In [26]:
# Compare the column sets of the stem and progenitor frames
check_matching_columns([processed_dfs[name] for name in ("stem", "progen")])

In [27]:
# Compare the column sets of the progenitor and neuron frames
check_matching_columns([processed_dfs[name] for name in ("progen", "neuron")])

In [28]:
# Compare the column sets of the progenitor and astrocyte frames
check_matching_columns([processed_dfs[name] for name in ("progen", "astro")])

In [29]:
# Compare the column sets of the neuron and astrocyte frames
check_matching_columns([processed_dfs[name] for name in ("neuron", "astro")])

In [30]:
# For every cell type, list the distinct values of each metadata column.
# Metadata_Well is skipped.
for name, frame in processed_dfs.items():
    print(f"Summary for {name}:\n")

    for column in frame.columns:
        # Only "Metadata_" columns are summarised, and never the well column
        if column == "Metadata_Well" or not column.startswith("Metadata_"):
            continue

        distinct = frame[column].unique()
        print(f"{column}: {len(distinct)} unique values")
        print(distinct)
        print("\n")

    # Visual separator between cell types
    print("=" * 80)
    print("\n")

Summary for stem:

Metadata_Plate: 1 unique values
['BR_NCP_STEM_1']


Metadata_Plate_Map_Name: 1 unique values
['BR_NCP_STEM_1']


Metadata_plating_density: 1 unique values
[10000]


Metadata_line_ID: 48 unique values
[ 1 30  9 26  4 34 15 36  2 16 20 12 17 39 37 42  7 29 11 38 43 18  6 27
 41 32 46 33 14 23 22  3 28  5 35 13 19 24 21 31 10 45 44 47 40 48  8 25]


Metadata_line_condition: 2 unique values
['control' 'deletion']


Metadata_line_source: 3 unique values
['human' 'isogenic_deletion' 'isogenic_control']


Metadata_Site_Count: 1 unique values
[9]


Metadata_Object_Count: 383 unique values
[1496.196 2075.22  2084.04  3082.68   862.884  806.922 1206.54  1299.654
 1077.246  890.496 2017.98  2626.74   573.966  897.732 1719.972 2037.6
 1932.3   1992.96  1193.058 1803.78   651.924 1105.866  334.062  520.524
 1667.394 1205.964 1643.94  1490.004  964.512  816.12  1244.754 1506.204
 1549.53  1015.002 2800.44  3560.58  1117.278  759.708 2183.58  2131.92
 2378.88  2325.06  1104.3    93

In [31]:
# Check the column sets of all four processed frames together before stacking
check_matching_columns(
    [
        processed_dfs["stem"],
        processed_dfs["progen"],
        processed_dfs["neuron"],
        processed_dfs["astro"],
    ]
)

# Stack the per-cell-type frames in a single pass. The dictionary keys become
# an index level named "Metadata_cell_type", which reset_index then turns into
# a regular column so every row records the cell type it came from.
# (The earlier plain pd.concat was removed: its result was overwritten
# immediately, so it only cost time and memory.)
all_df = pd.concat(
    processed_dfs.values(), keys=processed_dfs.keys(), names=["Metadata_cell_type"]
).reset_index(level="Metadata_cell_type")

In [32]:
# Build a line-ID -> (condition, source) lookup from the 'stem' frame, which
# has the most complete line annotations.
lookup_df = (
    processed_dfs["stem"][
        ["Metadata_line_ID", "Metadata_line_condition", "Metadata_line_source"]
    ]
    .drop_duplicates()
    .set_index("Metadata_line_ID")
)

# After de-duplication each line ID must appear exactly once; a repeated ID
# means the stem plate annotates one line with conflicting values, and the
# mapping below would be ambiguous.
assert lookup_df.index.is_unique, (
    "Metadata_line_ID maps to more than one (condition, source) pair in the "
    "stem data: "
    f"{sorted(lookup_df.index[lookup_df.index.duplicated()].unique())}"
)

# Overwrite both annotation columns for every row using the lookup table.
# Line IDs that are absent from the stem plate come out as NaN.
for column in ("Metadata_line_condition", "Metadata_line_source"):
    all_df[column] = all_df["Metadata_line_ID"].map(lookup_df[column])

In [33]:
# Metadata columns that lead the output table, in this exact order
desired_order = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Plate_Map_Name",
    "Metadata_cell_type",
    "Metadata_line_ID",
    "Metadata_line_condition",
    "Metadata_line_source",
    "Metadata_plating_density",
    "Metadata_Object_Count",
    "Metadata_Object_Count_inferred",
    "Metadata_Site_Count",
]

# Every other column keeps its current relative order. A set difference is
# avoided here because set iteration order for strings varies between
# interpreter sessions, which would make the saved column order change from
# run to run.
leading_columns = set(desired_order)
remaining_columns = [col for col in all_df.columns if col not in leading_columns]

# Every metadata column must have an explicit position in desired_order
stray_metadata = [col for col in remaining_columns if col.startswith("Metadata_")]
assert not stray_metadata, (
    f"Metadata columns missing from desired_order: {stray_metadata}"
)

# Combine to form the new column order and reorder the DataFrame columns
new_column_order = desired_order + remaining_columns
all_df = all_df[new_column_order]

# Save the DataFrame to a Parquet file, creating the output directory first so
# a fresh checkout does not fail on the write
output_path = "output/ncp_augmented.parquet"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
all_df.to_parquet(output_path, index=False)