## Apply normalization and feature selection to single cell DMSO profiles

In [1]:
import pathlib
import numpy as np
import pandas as pd

from pycytominer import normalize, feature_select, aggregate
from pycytominer.cyto_utils import infer_cp_features

In [2]:
# Load constants
data_dir = pathlib.Path("../data")

feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

feature_select_summary_file = pathlib.Path("tables/feature_select_summary.csv")
aggregate_full_file = pathlib.Path(f"{data_dir}/dmso_aggregate_all_plates.csv")

In [3]:
# Load data
data_dir = pathlib.Path("../data")
data_files = [x for x in data_dir.iterdir() if "_dmso.csv" in str(x)]
data_files

[PosixPath('../data/SQ00015145_dmso.csv'),
 PosixPath('../data/SQ00015143_dmso.csv'),
 PosixPath('../data/SQ00015201_dmso.csv'),
 PosixPath('../data/SQ00015142_dmso.csv'),
 PosixPath('../data/SQ00015144_dmso.csv')]

In [4]:
features_selected_list = []
aggregated_list = []
for file in data_files:
    plate = str(file).split("/")[-1].split("_")[0]
    
    print(f"Now processing {plate}...")
    df = pd.read_csv(file)

    metadata_cols = ["Image_Metadata_Well"] + infer_cp_features(df, metadata=True)
    feature_cols = infer_cp_features(df, compartments=["Cells", "Cytoplasm", "Nuclei"])

    output_file = pathlib.Path(f"{data_dir}/{plate}_dmso_normalized.csv")
    
    # Apply normalization and output files
    normalize(
        profiles=df,
        features=feature_cols,
        meta_features=metadata_cols,
        method="standardize",
        output_file=output_file
    )
    
    # Apply feature selection only to determine which features to use
    feature_select_df = feature_select(
        profiles=output_file,
        features="infer",
        operation=feature_select_ops,
        na_cutoff=0,
        corr_threshold=0.8
    )
    
    # Identify which features were selected
    selected_features = pd.DataFrame(np.zeros((len(df.columns), 1)), index=df.columns, columns=[plate])
    selected_features.loc[selected_features.index.isin(feature_select_df.columns), plate] = 1
    selected_features = selected_features.astype(int)
    
    features_selected_list.append(selected_features)
    
    # Aggregate the normalized single cell files
    aggregate_df = aggregate(
        population_df=df,
        strata=["Image_Metadata_Well", "Metadata_Plate"],
        features=feature_cols,
        operation="median"
    )
    
    aggregated_list.append(aggregate_df)

Now processing SQ00015145...
Now processing SQ00015143...
Now processing SQ00015201...
Now processing SQ00015142...
Now processing SQ00015144...


In [5]:
# Track features that were selected
feature_select_summary_df = pd.concat(features_selected_list, axis="columns")

feature_select_summary_df.to_csv(feature_select_summary_file, sep=",", index=True)

print(feature_select_summary_df.shape)
feature_select_summary_df.head()

(2448, 5)


Unnamed: 0,SQ00015145,SQ00015143,SQ00015201,SQ00015142,SQ00015144
Image_Metadata_Well,1,1,1,1,1
TableNumber,0,0,0,0,0
ImageNumber,0,0,0,0,0
ObjectNumber,0,0,0,0,0
Cells_AreaShape_Area,0,0,0,0,0


In [6]:
# Output the aggregate profiles
aggregate_full_df = pd.concat(aggregated_list, axis="rows")

aggregate_full_df.to_csv(aggregate_full_file, sep=",", index=False)

print(aggregate_full_df.shape)
aggregate_full_df.head()

(120, 1785)


Unnamed: 0,Image_Metadata_Well,Metadata_Plate,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,A01,SQ00015145,9311.5,1128.0,1067.5,1.152393,0.721164,1.0,0.587911,0.330187,...,3.189956,1.713095,1.581675,1.823595,1.560678,1.420565,1.716546,2.743128,2.70308,2.681895
1,A02,SQ00015145,9746.0,1077.0,1027.0,1.150973,0.71903,1.0,0.587361,0.326732,...,3.240498,1.672825,1.515702,1.756751,1.523617,1.391794,1.686863,2.732458,2.697093,2.671854
2,A03,SQ00015145,10024.0,1141.0,983.5,1.141808,0.708185,1.0,0.592471,0.322619,...,3.204776,1.673316,1.530477,1.78303,1.570991,1.433625,1.747671,2.774419,2.707194,2.71141
3,A04,SQ00015145,10654.0,1114.0,1050.0,1.150804,0.718026,1.0,0.587334,0.322159,...,3.229464,1.585951,1.411656,1.71414,1.642538,1.478418,1.802124,2.858609,2.790332,2.810005
4,A05,SQ00015145,9789.5,1117.0,1009.5,1.145755,0.714386,1.0,0.590788,0.323806,...,3.22527,1.647358,1.488801,1.73555,1.606238,1.461589,1.768636,2.821141,2.748339,2.771754
