# Generate Consensus Signatures

**Gregory Way, 2019**

We do not have well-level information for the cell health data.
Therefore, we cannot map to cell painting replicates.

Instead, we generate consensus signatures for each treatment.
We will use the MODZ (moderated z-score) transform used in the L1000 analysis paper ([Subramanian et al. 2017](https://doi.org/10.1016/j.cell.2017.10.049)).

We apply this transformation to both:

* Cell Painting Data
* Cell Health Assay Readout Data

In [1]:
import os
import numpy as np
import pandas as pd

from pycytominer.consensus import modz
from pycytominer.get_na_columns import get_na_columns

## Load Cell Painting Data

This will be our x matrix in machine learning appications.

In [2]:
batch = "CRISPR_PILOT_B1"
profile_dir = os.path.join("data", "profiles", batch)

all_profile_files = []
for plate in os.listdir(profile_dir):
    plate_dir = os.path.join(profile_dir, plate)
    
    if plate == '.DS_Store':
        continue

    for profile_file in os.listdir(plate_dir):
        if "feature_select" in profile_file:
            all_profile_files.append(os.path.join(plate_dir, profile_file))

In [3]:
# Concatentate all cell painting datasets
x_df = (
    pd.concat(
        [pd.read_csv(x) for x in all_profile_files],
        sort=True
    )
    .rename(
        {
            "Image_Metadata_Plate": "Metadata_Plate",
            "Image_Metadata_Well": "Metadata_Well"
        },
        axis="columns")
)

# Realign metadata column names
x_metadata_cols = x_df.columns[x_df.columns.str.startswith("Metadata")]
x_metadata_df = x_df.loc[:, x_metadata_cols]

x_df = x_df.drop(x_metadata_cols, axis="columns")
x_df = pd.concat([x_metadata_df, x_df], axis="columns")

# Drop all features that have missing values
additional_exclude_features = get_na_columns(x_df, features="infer", cutoff=0)
print("Drop {} features for missing values".format(len(additional_exclude_features)))
x_df = x_df.drop(additional_exclude_features, axis="columns")

print(x_df.shape)
x_df.head(2)

Drop 21 features for missing values
(3456, 1288)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,SQ00014611,A01,1,A,A549,EMPTY,EMPTY,-0.303953,2.314706,0.424618,...,0.935404,1.080805,0.916672,1.541207,0.133984,0.132009,0.219259,-0.137173,0.617988,-1.180383
1,SQ00014611,A02,2,A,A549,MCL1,MCL1-5,-0.190626,1.057335,0.162234,...,1.222029,1.57802,1.2658,1.989133,0.502503,0.513252,0.656997,1.445111,0.822091,1.997297


## Load Cell Health Assay Data

This will be the y matrix in machine learning applications.

In [4]:
file = os.path.join("data", "labels", "normalized_cell_health_labels.tsv")
y_df = pd.read_csv(file, sep='\t').drop(["plate_name", "well_col", "well_row"], axis="columns")

print(y_df.shape)
y_df.head(2)

(2302, 72)


Unnamed: 0,cell_id,guide,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,ES2,AKT1-1,0.655229,-0.565658,-0.839186,-0.513748,0.3136,0.263062,0.109983,-0.226513,...,0.281397,-0.279051,-0.429141,-0.177258,-0.9203,-0.139875,-0.016549,0.14057,,
1,ES2,AKT1-1,-0.251336,-0.816445,-0.52594,-0.81981,-0.450799,-0.811628,-0.468875,-0.167787,...,0.543716,-0.221588,-0.311041,-0.149198,-1.070176,-0.046783,0.268559,0.040163,-0.29248,0.008339


## Determine how many Cell Painting profiles have Cell Health status labels

In [5]:
x_groupby_cols = ["Metadata_gene_name", "Metadata_pert_name", "Metadata_cell_line"]

x_metacount_df = (
    x_df
    .loc[:, x_groupby_cols]
    .assign(n_measurements=1)
    .groupby(x_groupby_cols)
    .count()
    .reset_index()
    .assign(data_type="cell_painting")
    .merge(x_df.loc[:, x_groupby_cols + ["Metadata_Well", "Metadata_Plate"]],
           how="left",
           on=x_groupby_cols)
)

print(x_metacount_df.shape)
x_metacount_df.head(2)

(3456, 7)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,n_measurements,data_type,Metadata_Well,Metadata_Plate
0,AKT1,AKT1-1,A549,6,cell_painting,A03,SQ00014611
1,AKT1,AKT1-1,A549,6,cell_painting,O22,SQ00014611


In [6]:
y_groupby_cols = ["guide", "cell_id"]

y_metacount_df = (
    y_df
    .loc[:, y_groupby_cols]
    .assign(n_measurements=1)
    .groupby(y_groupby_cols)
    .count()
    .reset_index()
    .assign(data_type="cell_health")
)

print(y_metacount_df.shape)
y_metacount_df.head(2)

(364, 4)


Unnamed: 0,guide,cell_id,n_measurements,data_type
0,AKT1-1,A549,4,cell_health
1,AKT1-1,ES2,4,cell_health


In [7]:
all_measurements_df = (
    x_metacount_df
    .merge(
        y_metacount_df,
        left_on=["Metadata_pert_name", "Metadata_cell_line"],
        right_on=["guide", "cell_id"],
        suffixes=["_paint", "_health"],
        how="inner")
    .sort_values(by=["Metadata_cell_line", "Metadata_pert_name"])
    .reset_index(drop=True)
    .drop(["Metadata_Well", "guide", "cell_id"], axis="columns")
)

file = os.path.join("results", "all_profile_metadata.tsv")
all_measurements_df.to_csv(file, sep='\t', index=False)

print(all_measurements_df.shape)
all_measurements_df.head()

(3456, 8)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,n_measurements_paint,data_type_paint,Metadata_Plate,n_measurements_health,data_type_health
0,AKT1,AKT1-1,A549,6,cell_painting,SQ00014611,4,cell_health
1,AKT1,AKT1-1,A549,6,cell_painting,SQ00014611,4,cell_health
2,AKT1,AKT1-1,A549,6,cell_painting,SQ00014610,4,cell_health
3,AKT1,AKT1-1,A549,6,cell_painting,SQ00014610,4,cell_health
4,AKT1,AKT1-1,A549,6,cell_painting,SQ00014612,4,cell_health


## Apply the MODZ Consensus Aggregation

### 1) To the Cell Painting Data

In [8]:
x_consensus_df = modz(
    x_df,
    replicate_columns=["Metadata_cell_line", "Metadata_pert_name"],
    precision=5
)

x_consensus_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,Cells_AreaShape_MaxFeretDiameter,Cells_AreaShape_MaximumRadius,Cells_AreaShape_MeanRadius,Cells_AreaShape_MedianRadius,Cells_AreaShape_MinFeretDiameter,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
Metadata_cell_line,Metadata_pert_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A549,AKT1-1,0.07193,0.058673,-0.048603,0.0,0.092972,0.11407,0.117479,0.120168,0.095324,0.119927,...,0.933031,1.271866,1.1438,1.246878,0.649138,0.590802,0.590566,0.342978,0.164454,0.529136
A549,AKT1-2,-0.138047,0.396241,0.464852,0.0,0.268839,-0.156438,-0.116528,-0.083662,-0.064358,-0.166207,...,0.526123,0.371192,0.194148,0.412839,0.012015,0.055485,0.089602,0.162255,0.147265,0.086007
A549,ARID1B-1,0.178516,0.739506,-0.348868,0.0,-0.481855,0.321717,0.120957,0.102188,0.113411,0.20658,...,-0.521309,-0.019196,0.134715,-0.100963,-0.049586,0.076895,-0.202828,0.331005,0.200163,0.346467
A549,ARID1B-2,0.486482,-0.019855,0.370251,0.0,-0.373809,0.62825,0.41706,0.419055,0.394183,0.548287,...,-0.018916,-0.361304,-0.291002,-0.287105,-0.12557,-0.115333,0.073608,0.619306,0.429432,0.680737
A549,ATF4-1,3.42064,-0.124241,-0.071464,0.0,0.46515,3.456872,3.254955,3.235557,3.225669,3.290238,...,0.417267,-0.250491,0.05976,-0.144306,0.012049,-0.202045,0.220401,1.356764,2.36021,0.349848


In [9]:
x_consensus_df = (
    x_consensus_df
    .reset_index()
    .query("Metadata_pert_name in @all_measurements_df.Metadata_pert_name.unique()")
    .query("Metadata_cell_line in @all_measurements_df.Metadata_cell_line.unique()")
    .reset_index(drop=True)
    .reset_index()
    .rename({"index": "Metadata_profile_id"}, axis='columns')
)
x_consensus_df.Metadata_profile_id = ["profile_{}".format(x) for x in x_consensus_df.Metadata_profile_id]

print(x_consensus_df.shape)
x_consensus_df.head(5)

(357, 1284)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,Cells_AreaShape_MaxFeretDiameter,Cells_AreaShape_MaximumRadius,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,profile_0,A549,AKT1-1,0.07193,0.058673,-0.048603,0.0,0.092972,0.11407,0.117479,...,0.933031,1.271866,1.1438,1.246878,0.649138,0.590802,0.590566,0.342978,0.164454,0.529136
1,profile_1,A549,AKT1-2,-0.138047,0.396241,0.464852,0.0,0.268839,-0.156438,-0.116528,...,0.526123,0.371192,0.194148,0.412839,0.012015,0.055485,0.089602,0.162255,0.147265,0.086007
2,profile_2,A549,ARID1B-1,0.178516,0.739506,-0.348868,0.0,-0.481855,0.321717,0.120957,...,-0.521309,-0.019196,0.134715,-0.100963,-0.049586,0.076895,-0.202828,0.331005,0.200163,0.346467
3,profile_3,A549,ARID1B-2,0.486482,-0.019855,0.370251,0.0,-0.373809,0.62825,0.41706,...,-0.018916,-0.361304,-0.291002,-0.287105,-0.12557,-0.115333,0.073608,0.619306,0.429432,0.680737
4,profile_4,A549,ATF4-1,3.42064,-0.124241,-0.071464,0.0,0.46515,3.456872,3.254955,...,0.417267,-0.250491,0.05976,-0.144306,0.012049,-0.202045,0.220401,1.356764,2.36021,0.349848


In [10]:
# Output Profile Mapping for Downstream Analysis
profile_id_mapping_df = x_consensus_df.loc[:, x_consensus_df.columns.str.startswith("Metadata")]
file = os.path.join("data", "profile_id_metadata_mapping.tsv")
profile_id_mapping_df.to_csv(file, sep='\t', index=False)

profile_id_mapping_df.head()

Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name
0,profile_0,A549,AKT1-1
1,profile_1,A549,AKT1-2
2,profile_2,A549,ARID1B-1
3,profile_3,A549,ARID1B-2
4,profile_4,A549,ATF4-1


### 2) To the Cell Health Assay Data

In [11]:
cell_health_meta_features = ["cell_id", "guide"]
cell_health_features = y_df.drop(cell_health_meta_features, axis="columns").columns.tolist()

In [12]:
y_consensus_df = modz(
    y_df,
    features=cell_health_features,
    replicate_columns=cell_health_meta_features,
    precision=5)

print(y_consensus_df.shape)
y_consensus_df.head()

(364, 70)


Unnamed: 0_level_0,Unnamed: 1_level_0,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,cc_cc_edu_pos_mean,cc_cc_g1_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
cell_id,guide,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A549,AKT1-1,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,0.040322,0.091643,0.043915,...,0.438339,0.059414,0.082424,0.0,-0.06505,-0.020236,-0.00797,0.020263,0.408214,0.654575
A549,AKT1-2,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,-0.182571,0.893453,-1.023968,...,0.067568,0.256141,0.132834,0.386327,0.575026,0.225091,0.220461,-0.224965,0.284962,0.567898
A549,ARID1B-1,0.118598,1.198685,0.16514,0.330071,0.417723,0.514065,-0.308749,-0.172735,0.455015,-0.606599,...,0.100365,0.299229,0.249557,0.283246,0.501941,0.055517,0.047697,-0.055445,-0.363766,-0.205937
A549,ARID1B-2,-0.072919,0.317079,-0.089281,0.155305,-0.385316,-0.300279,-0.120261,-0.299718,0.131195,-0.221492,...,0.283802,0.143096,0.13679,0.111855,0.493883,-0.084415,-0.085658,0.084439,0.198285,-0.162976
A549,ATF4-1,4.286179,0.007467,3.284383,-2.246762,3.015881,3.065773,2.854124,1.246508,-1.623145,0.923187,...,-2.434136,0.315381,0.332291,0.192878,0.509217,0.100668,0.104857,-0.100596,-0.811271,-0.092505


In [13]:
y_meta_cols = ["Metadata_profile_id", "Metadata_pert_name", "Metadata_cell_line"]

y_consensus_df = (
    y_consensus_df
    .reset_index()
    .reset_index(drop=True)
    .merge(
        x_consensus_df.loc[:, y_meta_cols],
        left_on=["guide", "cell_id"],
        right_on=["Metadata_pert_name", "Metadata_cell_line"],
        how="right"
    )
)

# Get columns in correct order
y_columns = (
    y_meta_cols +
    y_consensus_df
    .loc[:, ~y_consensus_df.columns.str.startswith("Metadata_")]
    .columns
    .tolist()
)

y_consensus_df = (
    y_consensus_df
    .loc[:, y_columns]
    .drop(["guide", "cell_id"], axis="columns")
)

print(y_consensus_df.shape)
y_consensus_df.head(5)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,...,0.438339,0.059414,0.082424,0.0,-0.06505,-0.020236,-0.00797,0.020263,0.408214,0.654575
1,profile_1,AKT1-2,A549,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,...,0.067568,0.256141,0.132834,0.386327,0.575026,0.225091,0.220461,-0.224965,0.284962,0.567898
2,profile_2,ARID1B-1,A549,0.118598,1.198685,0.16514,0.330071,0.417723,0.514065,-0.308749,...,0.100365,0.299229,0.249557,0.283246,0.501941,0.055517,0.047697,-0.055445,-0.363766,-0.205937
3,profile_3,ARID1B-2,A549,-0.072919,0.317079,-0.089281,0.155305,-0.385316,-0.300279,-0.120261,...,0.283802,0.143096,0.13679,0.111855,0.493883,-0.084415,-0.085658,0.084439,0.198285,-0.162976
4,profile_4,ATF4-1,A549,4.286179,0.007467,3.284383,-2.246762,3.015881,3.065773,2.854124,...,-2.434136,0.315381,0.332291,0.192878,0.509217,0.100668,0.104857,-0.100596,-0.811271,-0.092505


In [14]:
# Confirm that matrices are aligned
pd.testing.assert_series_equal(x_consensus_df.Metadata_profile_id,
                               y_consensus_df.Metadata_profile_id, check_names=True)

# Are the guides aligned?
pd.testing.assert_series_equal(x_consensus_df.Metadata_pert_name,
                               y_consensus_df.Metadata_pert_name, check_names=True)

# Are the cells aligned?
pd.testing.assert_series_equal(x_consensus_df.Metadata_cell_line,
                               y_consensus_df.Metadata_cell_line, check_names=True)

## Output Consensus Signatures

In [15]:
file = os.path.join("data", "consensus", "cell_painting_modz.tsv.gz")
x_consensus_df.to_csv(file, sep="\t", index=False)

file = os.path.join("data", "consensus", "cell_health_modz.tsv.gz")
y_consensus_df.to_csv(file, sep="\t", index=False)