## Normalize and select CP features using Pycytominer 

In [1]:
import pandas as pd

# to install pycytominer, run `poetry run pip install <path_to_pycytominer>`
from pycytominer import normalize, feature_select

Predefine parameters for feature normalization (RobustMAD) and selection

In [2]:
# Epsilon forwarded to `normalize` as `mad_robustize_epsilon`.
MAD_EPSILON = 0.0

# Forwarded as `image_features` to both `normalize` and `feature_select`.
INC_IMAGE_FEATURES = True

# Feature-selection operations handed to `feature_select` via `operation`.
FEAT_SELECT_OPS = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
]

In [3]:
# Load the profiles stored in output/raw_profiles.parquet; later cells
# normalize and feature-select this frame.
profiles = pd.read_parquet(path="output/raw_profiles.parquet")

# Preview the first five rows as the cell output.
profiles.head(n=5)

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,Metadata_Insert_Length,Metadata_pert_type,Metadata_Source,Metadata_Plate,Metadata_Well,Cells_AreaShape_Eccentricity,Nuclei_AreaShape_Area
0,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125633,M20,0.78118,1074.6
1,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125634,M20,0.77979,1103.1
2,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125635,M20,0.7777,1111.2
3,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125636,M20,0.79073,1085.8
4,JCP2022_900007,ccsbBroad304_00009,ORF009407.1_TRC304.1,pLX_304,NM_001096.3,ACLY,47,9606,ATP citrate lyase,100.0,3303.0,trt,source_4,BR00125633,F03,0.78059,1053.1


Run feature normalization with `RobustMAD`

In [4]:
# Normalize with the "mad_robustize" method, using the epsilon and
# image-feature switch defined in the parameter cell.
# NOTE(review): `profiles` is rebound to the result, so re-running this cell on
# its own feeds already-normalized data back in; use Restart & Run All.
# NOTE(review): no `samples` or per-plate grouping is passed here, while the
# preview shows several Metadata_Plate values — confirm that fitting on all
# rows at once is intended.
profiles = normalize(
    profiles,
    method="mad_robustize",
    mad_robustize_epsilon=MAD_EPSILON,
    image_features=INC_IMAGE_FEATURES,
)

# Preview the normalized frame.
profiles.head()

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,Metadata_Insert_Length,Metadata_pert_type,Metadata_Source,Metadata_Plate,Metadata_Well,Cells_AreaShape_Eccentricity,Nuclei_AreaShape_Area
0,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125633,M20,-0.186118,-0.384347
1,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125634,M20,-0.393081,0.689563
2,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125635,M20,-0.70427,0.99478
3,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125636,M20,1.235822,0.037681
4,JCP2022_900007,ccsbBroad304_00009,ORF009407.1_TRC304.1,pLX_304,NM_001096.3,ACLY,47,9606,ATP citrate lyase,100.0,3303.0,trt,source_4,BR00125633,F03,-0.273965,-1.194489


Run feature selection

In [5]:
# Apply the operations listed in FEAT_SELECT_OPS to the normalized profiles.
# NOTE(review): `profiles` is rebound to the result, so this cell is meant to
# run once, directly after the normalization cell.
profiles = feature_select(
    profiles,
    operation=FEAT_SELECT_OPS,
    image_features=INC_IMAGE_FEATURES,
)

# Preview the frame after feature selection.
profiles.head()

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,Metadata_Insert_Length,Metadata_pert_type,Metadata_Source,Metadata_Plate,Metadata_Well,Cells_AreaShape_Eccentricity,Nuclei_AreaShape_Area
0,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125633,M20,-0.186118,-0.384347
1,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125634,M20,-0.393081,0.689563
2,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125635,M20,-0.70427,0.99478
3,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt,source_4,BR00125636,M20,1.235822,0.037681
4,JCP2022_900007,ccsbBroad304_00009,ORF009407.1_TRC304.1,pLX_304,NM_001096.3,ACLY,47,9606,ATP citrate lyase,100.0,3303.0,trt,source_4,BR00125633,F03,-0.273965,-1.194489


Save preprocessed profiles

In [6]:
# Persist the normalized, feature-selected profiles next to the raw input.
profiles.to_parquet(path="output/processed_profiles.parquet")