# Normalize Cell Health Labels

**Gregory Way, 2019**

In [1]:
import os
import numpy as np
from scipy.stats import median_absolute_deviation
import pandas as pd

from pycytominer import write_gct

In [2]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [3]:
# Scaling helper applied to the cell health target variables
def scale(x):
    """Center values on their median and divide by their standard deviation.

    Both statistics are computed with the NaN-ignoring NumPy reductions, so
    missing entries do not influence them and stay missing in the result.

    Parameters
    ----------
    x : array-like of numbers
        Values to scale (for example, one column of one group when used
        with ``groupby(...).transform``).

    Returns
    -------
    Same type and shape as ``x``
        ``(x - nanmedian(x)) / nanstd(x)``.

    Notes
    -----
    No guard is applied when the standard deviation is zero; the result then
    follows the usual NumPy/pandas division-by-zero semantics.
    """
    center = np.nanmedian(x)
    spread = np.nanstd(x)
    return (x - center) / spread

In [4]:
# Read the cell health labels table (tab-separated)
file = os.path.join("data", "labels", "cell_health_labels.tsv")
label_df = pd.read_csv(file, sep="\t")

# Report the table dimensions, then preview the first two rows
print(label_df.shape)
label_df.head(2)

(2302, 75)


Unnamed: 0,cell_id,guide,plate_name,well_col,well_row,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,ES2,AKT1-1,Plate 1,3,A,0.04287,0.007976,0.003988,1003,1.777,...,1465.0,0.0271,1.64,0.03173,0.01652,0.0119,0.0152,0.9683,,
1,ES2,AKT1-1,Plate 1,22,O,0.02635,0.005988,0.005988,835,1.582,...,1575.0,0.03169,1.405,0.03961,0.02255,0.01463,0.01706,0.9598,279.6,2083.0


In [5]:
# Infinite measurements are present in the labels; treat them as missing
label_df = label_df.replace([np.inf, -np.inf], np.nan)

# Scale every measurement column within each (cell_id, plate_name) group.
# The identifier columns are taken unchanged from label_df and placed in
# front of the scaled measurements; concat aligns the two parts on the
# shared row index.
normalized_label_df = pd.concat(
    [
        label_df[["cell_id", "guide", "plate_name", "well_col", "well_row"]],
        label_df
        .drop(columns=["guide", "well_col", "well_row"])
        .groupby(["cell_id", "plate_name"])
        .transform(scale),
    ],
    axis=1,
)

print(normalized_label_df.shape)
normalized_label_df.head(2)

(2302, 75)


Unnamed: 0,cell_id,guide,plate_name,well_col,well_row,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,ES2,AKT1-1,Plate 1,3,A,0.655229,-0.565658,-0.839186,-0.513748,0.3136,...,0.281397,-0.279051,-0.9203,-0.139875,-0.016549,-0.429141,-0.177258,0.14057,,
1,ES2,AKT1-1,Plate 1,22,O,-0.251336,-0.816445,-0.52594,-0.81981,-0.450799,...,0.543716,-0.221588,-1.070176,-0.046783,0.268559,-0.311041,-0.149198,0.040163,-0.29248,0.008339


In [6]:
# Replicates per perturbation: count rows with a plate_name for each
# (cell_id, guide) pair, then tally how often each count occurs
(
    normalized_label_df
    .groupby(["cell_id", "guide"])["plate_name"]
    .count()
    .value_counts()
)

4      307
16      42
8        6
10       3
6        3
112      1
111      1
83       1
Name: plate_name, dtype: int64

In [7]:
# Save the normalized labels as a tab-separated file, omitting the row index
file = os.path.join("data", "labels", "normalized_cell_health_labels.tsv")
normalized_label_df.to_csv(file, sep="\t", index=False)

## Build Cell Health Target Variable GCT Files

For viewing heatmaps in Morpheus.

In [8]:
# Prefix the identifier columns with "Metadata_" so they can be told apart
# from the cell health measurement columns by name
normalized_label_df = normalized_label_df.rename(
    columns={
        col: "Metadata_" + col
        for col in ["cell_id", "guide", "plate_name", "well_col", "well_row"]
    }
)

print(normalized_label_df.shape)
normalized_label_df.head(2)

(2302, 75)


Unnamed: 0,Metadata_cell_id,Metadata_guide,Metadata_plate_name,Metadata_well_col,Metadata_well_row,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,ES2,AKT1-1,Plate 1,3,A,0.655229,-0.565658,-0.839186,-0.513748,0.3136,...,0.281397,-0.279051,-0.9203,-0.139875,-0.016549,-0.429141,-0.177258,0.14057,,
1,ES2,AKT1-1,Plate 1,22,O,-0.251336,-0.816445,-0.52594,-0.81981,-0.450799,...,0.543716,-0.221588,-1.070176,-0.046783,0.268559,-0.311041,-0.149198,0.040163,-0.29248,0.008339


In [9]:
# Load the feature annotation table, indexed by its first column
file = os.path.join("data", "labels", "feature_mapping_annotated.csv")

# The transpose -> reset_index -> transpose round-trip moves the original
# column headers into the first row of the table and leaves integer column
# labels; that first row is then relabelled from "index" to "id".
feature_map = (
    pd.read_csv(file, index_col=0)
    .rename(columns={"id": "variable_name"})
    .T
    .reset_index()
    .T
    .rename(index={"index": "id"})
)

feature_map.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
id,readable_name,original_name,feature_type,measurement,gate_required,assay,hoechst,edu,ph3,gh2ax,caspase,draq7,cell_rox,dpc,description
well_row,Row,Row,metadata,metadata,,,,,,,,,,,
well_col,Column,Column,metadata,metadata,,,,,,,,,,,
plate_name,Plate Name,Plate Name,metadata,metadata,,,,,,,,,,,
guide,sgRNA,sgRNA,metadata,metadata,,,,,,,,,,,


In [10]:
# Every column without the "Metadata_" prefix is passed as a feature
cell_health_features = [
    col for col in normalized_label_df.columns
    if not col.startswith("Metadata_")
]
output_file = os.path.join("data", "labels", "normalized_cell_health_labels.gct")

# Write the GCT file; metadata columns are inferred by write_gct and the
# feature map is supplied as feature-level metadata
write_gct(
    profiles=normalized_label_df,
    output_file=output_file,
    features=cell_health_features,
    meta_features="infer",
    feature_metadata=feature_map,
)