# Generate Consensus Signatures

**Gregory Way, 2019**

We do not have well-level information for the cell health data.
Therefore, we cannot map to cell painting replicates.

Instead, we generate consensus signatures for each treatment.
We generate consensus signatures in two ways.

1. Median consensus
2. MODZ (moderated z-score) transform used in the L1000 analysis paper ([Subramanian et al. 2017](https://doi.org/10.1016/j.cell.2017.10.049)).

We apply these transformations to both:

* Cell Painting Data
* Cell Health Assay Readout Data

In [1]:
import os
import numpy as np
import pandas as pd

from pycytominer.consensus import modz
from pycytominer import get_na_columns, aggregate

## Load Cell Painting Data

This will be our x matrix in machine learning applications.

In [2]:
# Collect the feature-selected profile file(s) found in every plate directory
profile_dir = os.path.join("data", "profiles")

all_profile_files = []
for plate in os.listdir(profile_dir):
    plate_dir = os.path.join(profile_dir, plate)

    # Only directories can hold plate profiles. Skipping every non-directory
    # entry covers the macOS ".DS_Store" file as well as any other stray file,
    # which would otherwise make os.listdir(plate_dir) raise.
    if not os.path.isdir(plate_dir):
        continue

    all_profile_files.extend(
        os.path.join(plate_dir, profile_file)
        for profile_file in os.listdir(plate_dir)
        if "feature_select" in profile_file
    )

In [3]:
# Inspect the list of feature-selected profile paths that will be loaded
all_profile_files

['data/profiles/SQ00014618/SQ00014618_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014611/SQ00014611_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014616/SQ00014616_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014617/SQ00014617_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014610/SQ00014610_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014615/SQ00014615_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014612/SQ00014612_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014613/SQ00014613_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014614/SQ00014614_normalized_feature_select.csv.gz']

In [4]:
# Concatenate all cell painting datasets and harmonize the metadata column names
x_df = (
    pd.concat(
        [pd.read_csv(profile_path) for profile_path in all_profile_files],
        sort=True
    )
    .rename(
        columns={
            "Image_Metadata_Plate": "Metadata_Plate",
            "Image_Metadata_Well": "Metadata_Well"
        }
    )
    .drop(columns=["Metadata_broad_sample"])
)

# Move the metadata columns to the front of the data frame
is_metadata_col = x_df.columns.str.startswith("Metadata")
x_metadata_cols = x_df.columns[is_metadata_col]
x_metadata_df = x_df.loc[:, x_metadata_cols]
x_df = pd.concat(
    [x_metadata_df, x_df.drop(columns=x_metadata_cols)],
    axis="columns"
)

# Drop the columns that get_na_columns reports at a cutoff of 0
na_cols_to_drop = get_na_columns(x_df, cutoff=0)
print("Dropping {} columns because of missing data".format(len(na_cols_to_drop)))
x_df = x_df.drop(columns=na_cols_to_drop)

# Also drop Costes features (matched case-insensitively on the column name)
costes_cols_to_drop = [col for col in x_df.columns if "costes" in col.lower()]
print("Dropping {} costes features".format(len(costes_cols_to_drop)))
x_df = x_df.drop(columns=costes_cols_to_drop)

print(x_df.shape)
x_df.head(2)

Dropping 675 columns because of missing data
Dropping 1 costes features
(3456, 956)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,-0.894997,-1.515696,-1.787667,...,0.107581,-0.659049,-0.676846,-1.229791,-1.336051,-1.125138,-0.97236,-1.393856,-1.244227,-1.308729
1,SQ00014618,A02,2,A,HCC44,MCL1,MCL1-5,-0.479926,0.246423,0.629901,...,0.165935,1.999006,1.204036,0.560228,0.686189,0.601634,1.154001,0.596441,0.680359,0.715469


## Output Full Merged Profiles DataFrame

In [5]:
# Save the merged Cell Painting profiles as a tab-separated file
processed_dir = os.path.join("data", "processed")
profile_file = os.path.join(processed_dir, "cell_health_profiles_merged.tsv.gz")
x_df.to_csv(profile_file, sep="\t", index=False)

## Load Cell Health Assay Data

This will be the y matrix in machine learning applications.

In [6]:
# Read the normalized Cell Health labels and discard the plate/well position columns
file = os.path.join("data", "labels", "normalized_cell_health_labels.tsv")
y_df = pd.read_csv(file, sep="\t")
y_df = y_df.drop(columns=["plate_name", "well_col", "well_row"])

print(y_df.shape)
y_df.head(2)

(2302, 72)


Unnamed: 0,cell_id,guide,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,ES2,AKT1-1,0.655229,-0.565658,-0.839186,-0.513748,0.3136,0.263062,0.109983,-0.226513,...,0.281397,-0.279051,-0.9203,-0.139875,-0.016549,-0.429141,-0.177258,0.14057,,
1,ES2,AKT1-1,-0.251336,-0.816445,-0.52594,-0.81981,-0.450799,-0.811628,-0.468875,-0.167787,...,0.543716,-0.221588,-1.070176,-0.046783,0.268559,-0.311041,-0.149198,0.040163,-0.29248,0.008339


## Determine how many Cell Painting profiles have Cell Health status labels

In [7]:
x_groupby_cols = ["Metadata_gene_name", "Metadata_pert_name", "Metadata_cell_line"]

# Count the Cell Painting profiles available for each gene, perturbation, and cell line
x_replicate_count_df = (
    x_df
    .groupby(x_groupby_cols)
    .size()
    .reset_index(name="n_measurements")
    .assign(data_type="cell_painting")
)

# Attach the well and plate of every profile that contributed to each count
x_well_df = x_df.loc[:, x_groupby_cols + ["Metadata_Well", "Metadata_Plate"]]
x_metacount_df = x_replicate_count_df.merge(
    x_well_df,
    how="left",
    on=x_groupby_cols
)

print(x_metacount_df.shape)
x_metacount_df.head(2)

(3456, 7)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,n_measurements,data_type,Metadata_Well,Metadata_Plate
0,AKT1,AKT1-1,A549,6,cell_painting,A03,SQ00014611
1,AKT1,AKT1-1,A549,6,cell_painting,O22,SQ00014611


In [8]:
y_groupby_cols = ["guide", "cell_id"]

# Count the Cell Health measurements available for each guide and cell_id
y_metacount_df = (
    y_df
    .groupby(y_groupby_cols)
    .size()
    .reset_index(name="n_measurements")
    .assign(data_type="cell_health")
)

print(y_metacount_df.shape)
y_metacount_df.head(2)

(364, 4)


Unnamed: 0,guide,cell_id,n_measurements,data_type
0,AKT1-1,A549,4,cell_health
1,AKT1-1,ES2,4,cell_health


In [9]:
# Pair the Cell Painting counts with the Cell Health counts; the inner join
# keeps only perturbation / cell line combinations present in both assays
paired_counts_df = x_metacount_df.merge(
    y_metacount_df,
    how="inner",
    left_on=["Metadata_pert_name", "Metadata_cell_line"],
    right_on=["guide", "cell_id"],
    suffixes=["_paint", "_health"]
)

# Order by cell line and perturbation, then drop the well column and the
# Cell Health keys that duplicate the Cell Painting metadata
all_measurements_df = (
    paired_counts_df
    .sort_values(by=["Metadata_cell_line", "Metadata_pert_name"])
    .reset_index(drop=True)
    .drop(columns=["Metadata_Well", "guide", "cell_id"])
)

file = os.path.join("results", "all_profile_metadata.tsv")
all_measurements_df.to_csv(file, sep="\t", index=False)

print(all_measurements_df.shape)
all_measurements_df.head()

(3456, 8)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,n_measurements_paint,data_type_paint,Metadata_Plate,n_measurements_health,data_type_health
0,AKT1,AKT1-1,A549,6,cell_painting,SQ00014611,4,cell_health
1,AKT1,AKT1-1,A549,6,cell_painting,SQ00014611,4,cell_health
2,AKT1,AKT1-1,A549,6,cell_painting,SQ00014610,4,cell_health
3,AKT1,AKT1-1,A549,6,cell_painting,SQ00014610,4,cell_health
4,AKT1,AKT1-1,A549,6,cell_painting,SQ00014612,4,cell_health


## A. Apply Median Consensus Aggregation

### 1) To the Cell Painting Data

In [10]:
# Median-aggregate the replicate profiles of every cell line and perturbation
x_median_df = aggregate(
    x_df,
    strata=["Metadata_cell_line", "Metadata_pert_name"],
    features="infer",
    operation="median"
)

# Keep only the (cell line, perturbation) pairs that also have Cell Health
# measurements. The pair is tested jointly: checking each column on its own
# would also keep a perturbation in a cell line where it has no Cell Health
# data, as long as it has data in some other cell line, and that profile would
# then receive an all-missing label row in the right merge further down.
profile_key_cols = ["Metadata_cell_line", "Metadata_pert_name"]
shared_profile_keys = set(
    zip(*(all_measurements_df[col] for col in profile_key_cols))
)
has_cell_health = [
    key in shared_profile_keys
    for key in zip(*(x_median_df[col] for col in profile_key_cols))
]
x_median_df = x_median_df.loc[has_cell_health, :].reset_index(drop=True)

# Label every retained consensus profile with a sequential identifier,
# stored as the first column
x_median_df.insert(
    0,
    "Metadata_profile_id",
    ["profile_{}".format(idx) for idx in range(x_median_df.shape[0])]
)

print(x_median_df.shape)
x_median_df.head()

(357, 952)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,profile_0,A549,AKT1-1,0.596128,0.39128,0.463658,-0.221246,1.0115,-0.305663,-0.440232,...,0.647205,0.827639,0.863358,0.929565,0.886017,0.985453,0.892091,1.071022,1.072497,1.120483
1,profile_1,A549,AKT1-2,0.515609,-0.156584,0.092082,0.330569,-0.208782,0.083119,0.506794,...,-0.071948,0.597993,0.455386,0.537999,0.664216,0.527794,0.528653,0.608856,0.747248,0.555587
2,profile_2,A549,ARID1B-1,-0.363977,1.120855,1.093451,-0.812694,0.625135,-1.211121,-0.490666,...,0.082424,-0.650126,-0.749495,-0.56024,-0.310823,-0.490446,-0.6498,-0.52915,-0.357135,-0.516982
3,profile_3,A549,ARID1B-2,0.494768,0.3042,0.731194,-0.233966,-0.048039,-0.725937,0.088292,...,0.475349,-0.502658,-0.790581,0.077721,0.463475,0.008737,-0.694479,-0.125924,0.02021,-0.030855
4,profile_4,A549,ATF4-1,-0.260876,-0.214451,0.202429,0.560428,0.506326,-1.213235,1.06049,...,-0.400131,0.782377,0.310806,0.459031,1.301323,0.362271,0.188553,-0.016442,0.20045,0.251894


In [11]:
# Output Profile Mapping for Downstream Analysis
profile_id_mapping_df = x_median_df.loc[:, x_median_df.columns.str.startswith("Metadata")]
file = os.path.join("data", "profile_id_metadata_mapping.tsv")
profile_id_mapping_df.to_csv(file, sep='\t', index=False)

profile_id_mapping_df.head()

Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name
0,profile_0,A549,AKT1-1
1,profile_1,A549,AKT1-2
2,profile_2,A549,ARID1B-1
3,profile_3,A549,ARID1B-2
4,profile_4,A549,ATF4-1


### 2) To the Cell Health Assay Data

In [12]:
# Columns that identify a Cell Health sample; every other column is a readout
cell_health_meta_features = ["cell_id", "guide"]
cell_health_features = [
    col for col in y_df.columns if col not in cell_health_meta_features
]

# Cell Painting metadata columns that are merged onto the Cell Health data
y_meta_merge_cols = ["Metadata_profile_id", "Metadata_pert_name", "Metadata_cell_line"]

In [13]:
# Collapse the Cell Health replicates of each cell_id / guide pair to their median
median_settings = {
    "strata": cell_health_meta_features,
    "features": cell_health_features,
    "operation": "median",
}
y_median_df = aggregate(y_df, **median_settings)

print(y_median_df.shape)
y_median_df.head()

(364, 72)


Unnamed: 0,cell_id,guide,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,A549,AKT1-1,0.008156,0.587977,0.01882,0.381501,0.176564,0.187675,-0.170616,0.039147,...,0.399842,0.0,-0.118976,-0.132871,-0.12109,0.0,0.0,0.132882,0.80697,1.293984
1,A549,AKT1-2,0.056667,1.264627,0.24145,0.568443,0.235304,0.372684,-0.276888,-0.183445,...,0.10167,0.318027,0.621374,0.100032,0.074036,0.132751,0.467027,-0.099917,0.558041,1.151867
2,A549,ARID1B-1,0.111163,1.092964,0.151393,0.290203,0.402121,0.4817,-0.27698,-0.149979,...,0.080701,0.3391,0.598093,0.055951,0.042014,0.165161,0.247058,-0.05592,-0.393937,0.103202
3,A549,ARID1B-2,-0.061528,0.320829,-0.091007,0.141819,-0.378769,-0.288693,-0.108741,-0.300783,...,0.265754,0.098699,0.37193,-0.063935,-0.05516,0.138654,0.0,0.063946,0.210005,0.055291
4,A549,ATF4-1,3.967818,0.0034,3.268615,-2.246887,2.891737,2.878938,2.853995,1.243444,...,-2.343919,0.0,-0.089544,0.141535,0.131393,0.0,0.0,-0.141397,-0.63139,0.106477


In [14]:
# Attach the Cell Painting profile metadata. The right join keeps every
# Cell Painting consensus profile, whether or not it has Cell Health data.
median_profile_metadata_df = x_median_df.loc[:, y_meta_merge_cols]
y_median_df = y_median_df.reset_index(drop=True).merge(
    median_profile_metadata_df,
    how="right",
    left_on=["guide", "cell_id"],
    right_on=["Metadata_pert_name", "Metadata_cell_line"]
)

# Get columns in correct order: profile metadata first, then everything else
non_metadata_cols = [
    col for col in y_median_df.columns if not col.startswith("Metadata_")
]
y_columns = y_meta_merge_cols + non_metadata_cols

# Reorder, then discard the Cell Health keys that duplicate the metadata
y_median_df = y_median_df.loc[:, y_columns].drop(columns=["guide", "cell_id"])

print(y_median_df.shape)
y_median_df.head(5)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,0.008156,0.587977,0.01882,0.381501,0.176564,0.187675,-0.170616,...,0.399842,0.0,-0.118976,-0.132871,-0.12109,0.0,0.0,0.132882,0.80697,1.293984
1,profile_1,AKT1-2,A549,0.056667,1.264627,0.24145,0.568443,0.235304,0.372684,-0.276888,...,0.10167,0.318027,0.621374,0.100032,0.074036,0.132751,0.467027,-0.099917,0.558041,1.151867
2,profile_2,ARID1B-1,A549,0.111163,1.092964,0.151393,0.290203,0.402121,0.4817,-0.27698,...,0.080701,0.3391,0.598093,0.055951,0.042014,0.165161,0.247058,-0.05592,-0.393937,0.103202
3,profile_3,ARID1B-2,A549,-0.061528,0.320829,-0.091007,0.141819,-0.378769,-0.288693,-0.108741,...,0.265754,0.098699,0.37193,-0.063935,-0.05516,0.138654,0.0,0.063946,0.210005,0.055291
4,profile_4,ATF4-1,A549,3.967818,0.0034,3.268615,-2.246887,2.891737,2.878938,2.853995,...,-2.343919,0.0,-0.089544,0.141535,0.131393,0.0,0.0,-0.141397,-0.63139,0.106477


In [15]:
# Confirm that matrices are aligned
pd.testing.assert_series_equal(
    x_median_df.Metadata_profile_id,
    y_median_df.Metadata_profile_id,
    check_names=True
)

# Are the guides aligned?
pd.testing.assert_series_equal(
    x_median_df.Metadata_pert_name,
    y_median_df.Metadata_pert_name,
    check_names=True
)

# Are the cells aligned?
pd.testing.assert_series_equal(
    x_median_df.Metadata_cell_line,
    y_median_df.Metadata_cell_line,
    check_names=True
)

## B. Apply the MODZ Consensus Aggregation

### 1) To the Cell Painting Data

In [16]:
# Build MODZ consensus profiles from the replicates of each cell line and perturbation
painting_replicate_cols = ["Metadata_cell_line", "Metadata_pert_name"]
x_consensus_df = modz(x_df, replicate_columns=painting_replicate_cols, precision=5)

x_consensus_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_3_3,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
Metadata_cell_line,Metadata_pert_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A549,AKT1-1,-0.18016,-0.155631,0.014646,0.188053,1.231056,0.031064,-0.585477,0.467976,0.254203,0.129446,...,0.562585,0.988876,0.87995,0.904785,0.906875,0.923143,0.944998,0.984938,1.122724,0.961945
A549,AKT1-2,0.370572,-0.247842,-0.030773,0.433778,0.062456,0.26686,0.838679,-0.467417,0.340793,-0.02445,...,0.018933,0.446225,0.359496,0.557998,0.631931,0.504751,0.407462,0.522251,0.64437,0.519441
A549,ARID1B-1,-0.360905,0.79474,0.743296,-0.438752,0.827288,-0.728499,-0.422394,-0.147624,0.08387,-0.160722,...,-0.042733,-0.721832,-0.840771,-0.536215,-0.330336,-0.497296,-0.748232,-0.51524,-0.42144,-0.526734
A549,ARID1B-2,0.26245,0.480421,0.821096,-0.337898,0.054479,-0.662314,0.067979,0.02501,0.158138,-0.342062,...,0.688435,-0.345377,-0.521766,0.124801,0.457536,0.063444,-0.560178,-0.062851,0.085026,0.026056
A549,ATF4-1,-0.110264,-0.074895,0.569377,0.571678,0.976504,-1.111947,1.348542,0.584906,-3.128483,-2.324773,...,-0.638839,1.133282,0.753884,0.808944,1.721671,0.594059,0.674015,0.140325,0.29123,0.417465


In [17]:
# Move the cell line / perturbation index levels into regular columns
x_consensus_df = x_consensus_df.reset_index()

# Keep only the (cell line, perturbation) pairs that also have Cell Health
# measurements. The pair is tested jointly: checking each column on its own
# would also keep a perturbation in a cell line where it has no Cell Health
# data, as long as it has data in some other cell line, and that profile would
# then receive an all-missing label row in the right merge further down.
profile_key_cols = ["Metadata_cell_line", "Metadata_pert_name"]
shared_profile_keys = set(
    zip(*(all_measurements_df[col] for col in profile_key_cols))
)
has_cell_health = [
    key in shared_profile_keys
    for key in zip(*(x_consensus_df[col] for col in profile_key_cols))
]
x_consensus_df = x_consensus_df.loc[has_cell_health, :].reset_index(drop=True)

# Label every retained consensus profile with a sequential identifier,
# stored as the first column
x_consensus_df.insert(
    0,
    "Metadata_profile_id",
    ["profile_{}".format(idx) for idx in range(x_consensus_df.shape[0])]
)

print(x_consensus_df.shape)
x_consensus_df.head(5)

(357, 952)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,profile_0,A549,AKT1-1,-0.18016,-0.155631,0.014646,0.188053,1.231056,0.031064,-0.585477,...,0.562585,0.988876,0.87995,0.904785,0.906875,0.923143,0.944998,0.984938,1.122724,0.961945
1,profile_1,A549,AKT1-2,0.370572,-0.247842,-0.030773,0.433778,0.062456,0.26686,0.838679,...,0.018933,0.446225,0.359496,0.557998,0.631931,0.504751,0.407462,0.522251,0.64437,0.519441
2,profile_2,A549,ARID1B-1,-0.360905,0.79474,0.743296,-0.438752,0.827288,-0.728499,-0.422394,...,-0.042733,-0.721832,-0.840771,-0.536215,-0.330336,-0.497296,-0.748232,-0.51524,-0.42144,-0.526734
3,profile_3,A549,ARID1B-2,0.26245,0.480421,0.821096,-0.337898,0.054479,-0.662314,0.067979,...,0.688435,-0.345377,-0.521766,0.124801,0.457536,0.063444,-0.560178,-0.062851,0.085026,0.026056
4,profile_4,A549,ATF4-1,-0.110264,-0.074895,0.569377,0.571678,0.976504,-1.111947,1.348542,...,-0.638839,1.133282,0.753884,0.808944,1.721671,0.594059,0.674015,0.140325,0.29123,0.417465


### 2) To the Cell Health Assay Data

In [18]:
# Build MODZ consensus readouts from the replicates of each cell_id / guide pair
modz_settings = {
    "features": cell_health_features,
    "replicate_columns": cell_health_meta_features,
    "precision": 5,
}
y_consensus_df = modz(y_df, **modz_settings)

print(y_consensus_df.shape)
y_consensus_df.head()

(364, 70)


Unnamed: 0_level_0,Unnamed: 1_level_0,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,cc_cc_early_mitosis,cc_cc_g1,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
cell_id,guide,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A549,AKT1-1,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,0.040322,0.375966,0.043915,...,0.438339,0.059414,-0.06505,-0.020236,-0.00797,0.082424,0.0,0.020263,0.408214,0.654575
A549,AKT1-2,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,-0.182571,0.736046,-1.023968,...,0.067568,0.256141,0.575026,0.225091,0.220461,0.132834,0.386327,-0.224965,0.284962,0.567898
A549,ARID1B-1,0.118598,1.198685,0.16514,0.330071,0.417723,0.514065,-0.308749,-0.172735,0.148168,-0.606599,...,0.100365,0.299229,0.501941,0.055517,0.047697,0.249557,0.283246,-0.055445,-0.363766,-0.205937
A549,ARID1B-2,-0.072919,0.317079,-0.089281,0.155305,-0.385316,-0.300279,-0.120261,-0.299718,0.012239,-0.221492,...,0.283802,0.143096,0.493883,-0.084415,-0.085658,0.13679,0.111855,0.084439,0.198285,-0.162976
A549,ATF4-1,4.286179,0.007467,3.284383,-2.246762,3.015881,3.065773,2.854124,1.246508,0.049095,0.923187,...,-2.434136,0.315381,0.509217,0.100668,0.104857,0.332291,0.192878,-0.100596,-0.811271,-0.092505


In [19]:
# Move the cell_id / guide index levels into columns, then attach the Cell
# Painting profile metadata. The right join keeps every Cell Painting
# consensus profile, whether or not it has Cell Health data.
consensus_profile_metadata_df = x_consensus_df.loc[:, y_meta_merge_cols]
y_consensus_df = y_consensus_df.reset_index().merge(
    consensus_profile_metadata_df,
    how="right",
    left_on=["guide", "cell_id"],
    right_on=["Metadata_pert_name", "Metadata_cell_line"]
)

# Apply the column order stored in y_columns, then discard the Cell Health
# keys that duplicate the metadata
y_consensus_df = y_consensus_df.loc[:, y_columns].drop(columns=["guide", "cell_id"])

print(y_consensus_df.shape)
y_consensus_df.head(5)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,...,0.438339,0.059414,-0.06505,-0.020236,-0.00797,0.082424,0.0,0.020263,0.408214,0.654575
1,profile_1,AKT1-2,A549,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,...,0.067568,0.256141,0.575026,0.225091,0.220461,0.132834,0.386327,-0.224965,0.284962,0.567898
2,profile_2,ARID1B-1,A549,0.118598,1.198685,0.16514,0.330071,0.417723,0.514065,-0.308749,...,0.100365,0.299229,0.501941,0.055517,0.047697,0.249557,0.283246,-0.055445,-0.363766,-0.205937
3,profile_3,ARID1B-2,A549,-0.072919,0.317079,-0.089281,0.155305,-0.385316,-0.300279,-0.120261,...,0.283802,0.143096,0.493883,-0.084415,-0.085658,0.13679,0.111855,0.084439,0.198285,-0.162976
4,profile_4,ATF4-1,A549,4.286179,0.007467,3.284383,-2.246762,3.015881,3.065773,2.854124,...,-2.434136,0.315381,0.509217,0.100668,0.104857,0.332291,0.192878,-0.100596,-0.811271,-0.092505


In [20]:
# Confirm that matrices are aligned
pd.testing.assert_series_equal(
    x_consensus_df.Metadata_profile_id,
    y_consensus_df.Metadata_profile_id,
    check_names=True
)

# Are the guides aligned?
pd.testing.assert_series_equal(
    x_consensus_df.Metadata_pert_name,
    y_consensus_df.Metadata_pert_name,
    check_names=True
)

# Are the cells aligned?
pd.testing.assert_series_equal(
    x_consensus_df.Metadata_cell_line,
    y_consensus_df.Metadata_cell_line,
    check_names=True
)

## Output Median and MODZ Consensus Signatures

In [21]:
consensus_dir = os.path.join("data", "consensus")

# (output file name, consensus data frame) pairs, written in this order
consensus_outputs = [
    ("cell_painting_median.tsv.gz", x_median_df),
    ("cell_health_median.tsv.gz", y_median_df),
    ("cell_painting_modz.tsv.gz", x_consensus_df),
    ("cell_health_modz.tsv.gz", y_consensus_df),
]

for file_name, consensus_df in consensus_outputs:
    file = os.path.join(consensus_dir, file_name)
    consensus_df.to_csv(file, sep="\t", index=False)