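# Transform features

Normalize the augmented profiles plate by plate with pycytominer, apply feature selection to the normalized profiles, and check the per-plate medians of one normalized feature.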

In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sys
import os
import pandas as pd

from pycytominer import (
    normalize,
    feature_select,
)

# Locate the project's helper modules, expected under ../ncp/src relative to
# the kernel's current working directory.
notebook_dir = os.path.abspath(os.getcwd())
ncp_src_path = os.path.abspath(
    os.path.join(notebook_dir, "..", "ncp", "src")
)

# Only register the directory when it is not already on sys.path, so that
# re-running this cell does not append duplicate entries.
if ncp_src_path not in sys.path:
    sys.path.append(ncp_src_path)

# This import has to come after the sys.path update above.
from utils import apply_function_to_groups

## Load data

In [2]:
# Read the combined profile table written for the "augmented" data level.
# The path is relative to the kernel's working directory.
data_level = "augmented"
augmented_file = f"output/processed/{data_level}/combined.parquet"
augmented_df = pd.read_parquet(path=augmented_file)

## Normalize and select features

In [3]:
# Normalize the augmented profiles, grouped by plate, and save the result.
data_level = "normalized"
normalized_file = f"output/processed/{data_level}/combined.parquet"

# The output directory is the parent folder of the parquet file.
os.makedirs(os.path.dirname(normalized_file), exist_ok=True)

# NOTE(review): apply_function_to_groups is a project helper; presumably it
# calls `func` once per "Metadata_Plate" group, forwards the remaining keyword
# arguments to it, and recombines the results -- confirm against ncp/src/utils.
normalized_df = apply_function_to_groups(
    df=augmented_df,
    group_col="Metadata_Plate",
    func=normalize,
    # pycytominer normalize() options:
    method="mad_robustize",
    mad_robustize_epsilon=1e-6,
    samples="all",
    features="infer",
    image_features=False,
)

normalized_df.to_parquet(normalized_file)

In [14]:
# Run pycytominer feature selection on the normalized profiles and save the
# reduced table under its own data level.
data_level = "normalized_feature_select"
normalized_feature_select_file = f"output/processed/{data_level}/combined.parquet"

# The output directory is the parent folder of the parquet file.
os.makedirs(os.path.dirname(normalized_feature_select_file), exist_ok=True)

# Two selection operations are requested: a variance threshold and a
# correlation threshold. Feature columns are inferred by pycytominer.
normalized_feature_select_df = feature_select(
    profiles=normalized_df,
    operation=["variance_threshold", "correlation_threshold"],
    features="infer",
    image_features=False,
)

normalized_feature_select_df.to_parquet(normalized_feature_select_file)

## Run checks

In [9]:
# Sanity check on the normalized table: median of one feature per plate.
# The recorded output shows values at floating-point noise level (about 0),
# which is presumably what per-plate robust normalization should produce.
grouped_median = normalized_df.groupby("Metadata_Plate", as_index=False)[
    "Cells_AreaShape_Area"
].median()

grouped_median

Unnamed: 0,Metadata_Plate,Cells_AreaShape_Area
0,BR00127194,4.343314e-16
1,BR00132672,0.0
2,BR00132673,0.0
3,BR_NCP_STEM_1,8.387388e-16
4,PE_PP_Plate2,0.0
5,Plate1_PE_PP96,0.0
