## Reprocess Cell Health profiles

Use a whole-plate normalization scheme instead of normalization by controls only.

We will use the control normalization in downstream analyses, but we are interested in comparing the impact of normalization strategy on grit calculations.

In [1]:
import pathlib
import pandas as pd

from pycytominer import normalize, feature_select

In [2]:
def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482bb664d9de1"):
    link = f"https://github.com/broadinstitute/cell-health/raw/{commit}/1.generate-profiles/data/profiles/{plate}/{plate}_augmented.csv.gz"
    
    annotate_df = pd.read_csv(link)
    
    norm_file = pathlib.Path(f"{output_dir}/{plate}_wholeplate_normalized.csv.gz")
    feat_select_file = pathlib.Path(f"{output_dir}/{plate}_wholeplate_normalized_feature_selected.csv.gz")
    
    normalize(
        profiles=annotate_df,
        features="infer",
        meta_features=meta_features,
        samples="all",
        method="mad_robustize",
        output_file=norm_file,
        compression_options={"method": "gzip", "mtime": 1}
    )

In [3]:
# Define the plates
plates = [
    "SQ00014610",
    "SQ00014611",
    "SQ00014612",
    "SQ00014613",
    "SQ00014614",
    "SQ00014615",
    "SQ00014616",
    "SQ00014617",
    "SQ00014618"
]

# Define metadata features
meta_features = [
    "Image_Metadata_Plate",
    "Image_Metadata_Well",
    "Metadata_WellRow",
    "Metadata_WellCol",
    "Metadata_gene_name",
    "Metadata_pert_name",
    "Metadata_broad_sample",
    "Metadata_cell_line",
]

output_dir = pathlib.Path("data/cell-health/profiles")

commit = "cd91bd0daacef2b5ea25dcceb62482bb664d9de1"

In [4]:
for plate in plates:
    normalize_profile(plate, output_dir, commit)

## Now form a single merged dataset to perform feature selection

In [5]:
# Load different normalized data
plate_files = [x for x in output_dir.iterdir() if "_normalized.csv.gz" in x.name]

In [6]:
# Concatentate all plates
x_df = (
    pd.concat(
        [pd.read_csv(x) for x in plate_files],
        sort=True
    )
    .rename(
        {
            "Image_Metadata_Plate": "Metadata_Plate",
            "Image_Metadata_Well": "Metadata_Well"
        },
        axis="columns")
    .drop(["Metadata_broad_sample"], axis="columns")
)

# Realign metadata column names
x_metadata_cols = x_df.columns[x_df.columns.str.startswith("Metadata")]
x_metadata_df = x_df.loc[:, x_metadata_cols]

x_df = x_df.drop(x_metadata_cols, axis="columns")
x_df = pd.concat([x_metadata_df, x_df], axis="columns")

print(x_df.shape)
x_df.head()

(3456, 1790)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,SQ00014614,A01,1,A,ES2,EMPTY,EMPTY,0.198679,0.144891,-0.75259,...,-0.603577,-1.843878,-1.565373,-1.684384,-1.720628,-2.109142,-1.599917,0.068394,0.705348,-0.357706
1,SQ00014614,A02,2,A,ES2,MCL1,MCL1-5,1.028739,0.864347,-1.043686,...,0.65036,-0.223084,-0.102971,-0.207832,-0.435918,-0.844094,-0.205457,-0.956752,-0.101427,-1.087406
2,SQ00014614,A03,3,A,ES2,AKT1,AKT1-1,0.12799,-0.354732,-0.234297,...,0.660783,-0.013818,-0.27722,-0.097459,-0.484098,-0.695575,-0.154885,-0.928039,-0.239437,-0.870137
3,SQ00014614,A04,4,A,ES2,KRAS,KRAS-2B,0.29186,-0.634521,-0.291096,...,-0.157085,-1.565116,-1.696331,-1.43414,-1.082391,-1.208717,-0.872927,-1.132148,-0.481178,-1.259132
4,SQ00014614,A05,5,A,ES2,AKT1,AKT1-2,-0.397893,-0.174868,-0.546693,...,0.117074,-0.885385,-0.859397,-0.691697,-0.656142,-0.900225,-0.532379,-1.211184,-1.078744,-1.295632


In [7]:
# Perform feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

x_df = feature_select(
    profiles=x_df,
    operation=feature_select_ops,
    na_cutoff=0
)

print(x_df.shape)
x_df.head(2)

(3456, 440)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_Variance_ER_20_0
0,SQ00014614,A01,1,A,ES2,EMPTY,EMPTY,0.193423,0.810565,0.394067,...,-1.069082,-0.157315,-0.114324,2.222454,-1.883674,-0.661164,-2.079387,-0.879608,-0.586046,-1.565373
1,SQ00014614,A02,2,A,ES2,MCL1,MCL1-5,0.063269,0.17946,-0.621186,...,0.745802,1.866147,1.293618,2.372686,-0.420094,-0.140664,-0.323854,-0.878583,-0.928709,-0.102971


In [8]:
# Also drop Costes features
costes_cols_to_drop = [x for x in x_df.columns if "costes" in x.lower()]
print("Dropping {} costes features".format(len(costes_cols_to_drop)))
x_df = x_df.drop(costes_cols_to_drop, axis="columns")

print(x_df.shape)
x_df.head(2)

Dropping 3 costes features
(3456, 437)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_Variance_ER_20_0
0,SQ00014614,A01,1,A,ES2,EMPTY,EMPTY,0.193423,0.810565,0.394067,...,-1.069082,-0.157315,-0.114324,2.222454,-1.883674,-0.661164,-2.079387,-0.879608,-0.586046,-1.565373
1,SQ00014614,A02,2,A,ES2,MCL1,MCL1-5,0.063269,0.17946,-0.621186,...,0.745802,1.866147,1.293618,2.372686,-0.420094,-0.140664,-0.323854,-0.878583,-0.928709,-0.102971


In [9]:
# Output
profile_file = pathlib.Path(f"{output_dir}/cell_health_profiles_merged_wholeplate_normalized_featureselected.tsv.gz")
x_df.to_csv(profile_file, index=False, sep="\t")