# Generate Consensus Signatures

**Gregory Way, 2019**

We do not have well-level information for the cell health data.
Therefore, we cannot map to cell painting replicates.

Instead, we generate consensus signatures for each treatment.
We generate consensus signatures in two ways.

1. Median consensus
2. MODZ (moderated z-score) transform used in the L1000 analysis paper ([Subramanian et al. 2017](https://doi.org/10.1016/j.cell.2017.10.049)).

We apply these transformations to both:

* Cell Painting Data
* Cell Health Assay Readout Data

In [1]:
import os
import numpy as np
import pandas as pd

from pycytominer.consensus import modz
from pycytominer import get_na_columns, aggregate

## Load Cell Painting Data

This will be our x matrix in machine learning appications.

In [2]:
profile_dir = os.path.join("data", "profiles")

all_profile_files = []
for plate in os.listdir(profile_dir):
    plate_dir = os.path.join(profile_dir, plate)
    
    if plate == '.DS_Store':
        continue

    for profile_file in os.listdir(plate_dir):
        if "feature_select" in profile_file:
            all_profile_files.append(os.path.join(plate_dir, profile_file))

In [3]:
all_profile_files

['data/profiles/SQ00014618/SQ00014618_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014611/SQ00014611_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014616/SQ00014616_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014617/SQ00014617_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014610/SQ00014610_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014615/SQ00014615_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014612/SQ00014612_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014613/SQ00014613_normalized_feature_select.csv.gz',
 'data/profiles/SQ00014614/SQ00014614_normalized_feature_select.csv.gz']

In [4]:
# Concatentate all cell painting datasets
x_df = (
    pd.concat(
        [pd.read_csv(x) for x in all_profile_files],
        sort=True
    )
    .rename(
        {
            "Image_Metadata_Plate": "Metadata_Plate",
            "Image_Metadata_Well": "Metadata_Well"
        },
        axis="columns")
    .drop(["Metadata_broad_sample"], axis="columns")
)

# Realign metadata column names
x_metadata_cols = x_df.columns[x_df.columns.str.startswith("Metadata")]
x_metadata_df = x_df.loc[:, x_metadata_cols]

x_df = x_df.drop(x_metadata_cols, axis="columns")
x_df = pd.concat([x_metadata_df, x_df], axis="columns")

# Drop columns with na values
na_cols_to_drop = get_na_columns(x_df, cutoff=0)
print("Dropping {} columns because of missing data".format(len(na_cols_to_drop)))
x_df = x_df.drop(na_cols_to_drop, axis="columns")

# Also drop Costes features
costes_cols_to_drop = [x for x in x_df.columns if "costes" in x.lower()]
print("Dropping {} costes features".format(len(costes_cols_to_drop)))
x_df = x_df.drop(costes_cols_to_drop, axis="columns")

print(x_df.shape)
x_df.head(2)

Dropping 34 columns because of missing data
Dropping 2 costes features
(3456, 1605)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,-0.891801,0.870588,-0.683168,...,-0.888368,-0.273388,-0.136467,-0.709549,-0.754013,-0.834143,-0.937134,0.829446,0.69203,0.691189
1,SQ00014618,A02,2,A,HCC44,MCL1,MCL1-5,0.739305,1.247059,-0.366337,...,0.485662,1.515287,1.43019,1.170855,0.48787,0.880912,0.231417,0.937816,1.829701,0.636011


## Load Cell Health Assay Data

This will be the y matrix in machine learning applications.

In [5]:
file = os.path.join("data", "labels", "normalized_cell_health_labels.tsv")
y_df = pd.read_csv(file, sep='\t').drop(["plate_name", "well_col", "well_row"], axis="columns")

print(y_df.shape)
y_df.head(2)

(2302, 72)


Unnamed: 0,cell_id,guide,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,ES2,AKT1-1,0.655229,-0.565658,-0.839186,-0.513748,0.3136,0.263062,0.109983,-0.226513,...,0.281397,-0.279051,-0.429141,-0.177258,-0.9203,-0.139875,-0.016549,0.14057,,
1,ES2,AKT1-1,-0.251336,-0.816445,-0.52594,-0.81981,-0.450799,-0.811628,-0.468875,-0.167787,...,0.543716,-0.221588,-0.311041,-0.149198,-1.070176,-0.046783,0.268559,0.040163,-0.29248,0.008339


## Determine how many Cell Painting profiles have Cell Health status labels

In [6]:
x_groupby_cols = ["Metadata_gene_name", "Metadata_pert_name", "Metadata_cell_line"]

x_metacount_df = (
    x_df
    .loc[:, x_groupby_cols]
    .assign(n_measurements=1)
    .groupby(x_groupby_cols)
    .count()
    .reset_index()
    .assign(data_type="cell_painting")
    .merge(x_df.loc[:, x_groupby_cols + ["Metadata_Well", "Metadata_Plate"]],
           how="left",
           on=x_groupby_cols)
)

print(x_metacount_df.shape)
x_metacount_df.head(2)

(3456, 7)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,n_measurements,data_type,Metadata_Well,Metadata_Plate
0,AKT1,AKT1-1,A549,6,cell_painting,A03,SQ00014611
1,AKT1,AKT1-1,A549,6,cell_painting,O22,SQ00014611


In [7]:
y_groupby_cols = ["guide", "cell_id"]

y_metacount_df = (
    y_df
    .loc[:, y_groupby_cols]
    .assign(n_measurements=1)
    .groupby(y_groupby_cols)
    .count()
    .reset_index()
    .assign(data_type="cell_health")
)

print(y_metacount_df.shape)
y_metacount_df.head(2)

(364, 4)


Unnamed: 0,guide,cell_id,n_measurements,data_type
0,AKT1-1,A549,4,cell_health
1,AKT1-1,ES2,4,cell_health


In [8]:
all_measurements_df = (
    x_metacount_df
    .merge(
        y_metacount_df,
        left_on=["Metadata_pert_name", "Metadata_cell_line"],
        right_on=["guide", "cell_id"],
        suffixes=["_paint", "_health"],
        how="inner")
    .sort_values(by=["Metadata_cell_line", "Metadata_pert_name"])
    .reset_index(drop=True)
    .drop(["Metadata_Well", "guide", "cell_id"], axis="columns")
)

file = os.path.join("results", "all_profile_metadata.tsv")
all_measurements_df.to_csv(file, sep='\t', index=False)

print(all_measurements_df.shape)
all_measurements_df.head()

(3456, 8)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,n_measurements_paint,data_type_paint,Metadata_Plate,n_measurements_health,data_type_health
0,AKT1,AKT1-1,A549,6,cell_painting,SQ00014611,4,cell_health
1,AKT1,AKT1-1,A549,6,cell_painting,SQ00014611,4,cell_health
2,AKT1,AKT1-1,A549,6,cell_painting,SQ00014610,4,cell_health
3,AKT1,AKT1-1,A549,6,cell_painting,SQ00014610,4,cell_health
4,AKT1,AKT1-1,A549,6,cell_painting,SQ00014612,4,cell_health


## A. Apply Median Consensus Aggregation

### 1) To the Cell Painting Data

In [9]:
x_median_df = aggregate(
    x_df,
    strata=["Metadata_cell_line", "Metadata_pert_name"],
    features="infer",
    operation="median"
)


x_median_df = (
    x_median_df
    .query("Metadata_pert_name in @all_measurements_df.Metadata_pert_name.unique()")
    .query("Metadata_cell_line in @all_measurements_df.Metadata_cell_line.unique()")
    .reset_index(drop=True)
    .reset_index()
    .rename({"index": "Metadata_profile_id"}, axis='columns')
)
x_median_df.Metadata_profile_id = ["profile_{}".format(x) for x in x_median_df.Metadata_profile_id]

print(x_median_df.shape)
x_median_df.head()

(357, 1601)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,profile_0,A549,AKT1-1,0.390141,-0.472324,0.454178,0.300041,0.322093,-0.160304,0.093525,...,0.896247,0.678922,0.798388,0.95074,0.520392,0.225345,0.714348,0.212517,0.189997,0.253847
1,profile_1,A549,AKT1-2,0.045949,0.436621,0.403512,-0.118672,0.052351,0.231665,-0.081538,...,0.421047,0.035434,0.010126,0.200546,-0.15023,-0.097422,0.01191,0.176618,0.06061,-0.153813
2,profile_2,A549,ARID1B-1,0.249454,0.702219,-0.286093,0.857301,0.856635,-0.603966,0.200036,...,-0.418698,0.068137,0.212118,0.064007,-0.151543,0.00275,-0.350798,0.14106,0.242216,0.117324
3,profile_3,A549,ARID1B-2,0.896802,-0.404669,0.390859,0.233926,0.508465,-0.178178,-0.070928,...,-0.023804,-0.140058,0.05211,-0.091973,0.052738,-0.141271,0.096027,0.593044,0.191046,0.431945
4,profile_4,A549,ATF4-1,3.839796,0.465345,-0.206819,-0.160101,0.164711,0.43323,-1.220911,...,0.186003,-0.120315,0.261485,0.006571,-0.053772,-0.372899,0.242087,0.858518,2.205523,0.117149


In [10]:
# Output Profile Mapping for Downstream Analysis
profile_id_mapping_df = x_median_df.loc[:, x_median_df.columns.str.startswith("Metadata")]
file = os.path.join("data", "profile_id_metadata_mapping.tsv")
profile_id_mapping_df.to_csv(file, sep='\t', index=False)

profile_id_mapping_df.head()

Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name
0,profile_0,A549,AKT1-1
1,profile_1,A549,AKT1-2
2,profile_2,A549,ARID1B-1
3,profile_3,A549,ARID1B-2
4,profile_4,A549,ATF4-1


### 2) To the Cell Health Assay Data

In [11]:
cell_health_meta_features = ["cell_id", "guide"]
cell_health_features = y_df.drop(cell_health_meta_features, axis="columns").columns.tolist()
y_meta_merge_cols = ["Metadata_profile_id", "Metadata_pert_name", "Metadata_cell_line"]

In [12]:
y_median_df = aggregate(
    y_df,
    strata=cell_health_meta_features,
    features=cell_health_features,
    operation="median"
)

print(y_median_df.shape)
y_median_df.head()

(364, 72)


Unnamed: 0,cell_id,guide,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,A549,AKT1-1,0.008156,0.587977,0.01882,0.381501,0.176564,0.187675,-0.170616,0.039147,...,0.399842,0.0,0.0,0.0,-0.118976,-0.132871,-0.12109,0.132882,0.80697,1.293984
1,A549,AKT1-2,0.056667,1.264627,0.24145,0.568443,0.235304,0.372684,-0.276888,-0.183445,...,0.10167,0.318027,0.132751,0.467027,0.621374,0.100032,0.074036,-0.099917,0.558041,1.151867
2,A549,ARID1B-1,0.111163,1.092964,0.151393,0.290203,0.402121,0.4817,-0.27698,-0.149979,...,0.080701,0.3391,0.165161,0.247058,0.598093,0.055951,0.042014,-0.05592,-0.393937,0.103202
3,A549,ARID1B-2,-0.061528,0.320829,-0.091007,0.141819,-0.378769,-0.288693,-0.108741,-0.300783,...,0.265754,0.098699,0.138654,0.0,0.37193,-0.063935,-0.05516,0.063946,0.210005,0.055291
4,A549,ATF4-1,3.967818,0.0034,3.268615,-2.246887,2.891737,2.878938,2.853995,1.243444,...,-2.343919,0.0,0.0,0.0,-0.089544,0.141535,0.131393,-0.141397,-0.63139,0.106477


In [13]:
y_median_df = (
    y_median_df
    .reset_index(drop=True)
    .merge(
        x_median_df.loc[:, y_meta_merge_cols],
        left_on=["guide", "cell_id"],
        right_on=["Metadata_pert_name", "Metadata_cell_line"],
        how="right"
    )
)

# Get columns in correct order
y_columns = (
    y_meta_merge_cols +
    y_median_df
    .loc[:, ~y_median_df.columns.str.startswith("Metadata_")]
    .columns
    .tolist()
)

y_median_df = (
    y_median_df
    .loc[:, y_columns]
    .drop(["guide", "cell_id"], axis="columns")
)

print(y_median_df.shape)
y_median_df.head(5)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,0.008156,0.587977,0.01882,0.381501,0.176564,0.187675,-0.170616,...,0.399842,0.0,0.0,0.0,-0.118976,-0.132871,-0.12109,0.132882,0.80697,1.293984
1,profile_1,AKT1-2,A549,0.056667,1.264627,0.24145,0.568443,0.235304,0.372684,-0.276888,...,0.10167,0.318027,0.132751,0.467027,0.621374,0.100032,0.074036,-0.099917,0.558041,1.151867
2,profile_2,ARID1B-1,A549,0.111163,1.092964,0.151393,0.290203,0.402121,0.4817,-0.27698,...,0.080701,0.3391,0.165161,0.247058,0.598093,0.055951,0.042014,-0.05592,-0.393937,0.103202
3,profile_3,ARID1B-2,A549,-0.061528,0.320829,-0.091007,0.141819,-0.378769,-0.288693,-0.108741,...,0.265754,0.098699,0.138654,0.0,0.37193,-0.063935,-0.05516,0.063946,0.210005,0.055291
4,profile_4,ATF4-1,A549,3.967818,0.0034,3.268615,-2.246887,2.891737,2.878938,2.853995,...,-2.343919,0.0,0.0,0.0,-0.089544,0.141535,0.131393,-0.141397,-0.63139,0.106477


In [14]:
# Confirm that matrices are aligned
pd.testing.assert_series_equal(x_median_df.Metadata_profile_id,
                               y_median_df.Metadata_profile_id, check_names=True)

# Are the guides aligned?
pd.testing.assert_series_equal(x_median_df.Metadata_pert_name,
                               y_median_df.Metadata_pert_name, check_names=True)

# Are the cells aligned?
pd.testing.assert_series_equal(x_median_df.Metadata_cell_line,
                               y_median_df.Metadata_cell_line, check_names=True)

## B. Apply the MODZ Consensus Aggregation

### 1) To the Cell Painting Data

In [15]:
x_consensus_df = modz(
    x_df,
    replicate_columns=["Metadata_cell_line", "Metadata_pert_name"],
    precision=5
)

x_consensus_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_MajorAxisLength,Cells_AreaShape_MaxFeretDiameter,Cells_AreaShape_MaximumRadius,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
Metadata_cell_line,Metadata_pert_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A549,AKT1-1,0.42668,0.016416,-0.057528,-0.178226,-0.083852,0.172003,-0.280522,0.380334,0.377695,0.437981,...,0.71846,0.87726,0.811628,1.068138,0.37077,0.287482,0.38751,0.264552,0.25883,0.354644
A549,AKT1-2,0.29864,0.371576,0.295355,-0.176274,-0.003109,0.319832,0.247319,0.19708,0.209207,0.28362,...,0.413436,0.281365,0.152485,0.386029,-0.062285,-0.011781,-0.011991,0.124322,0.221694,-0.008421
A549,ARID1B-1,0.566238,0.707406,-0.277417,0.560425,0.513895,-0.271136,0.222813,0.604164,0.617983,0.45649,...,-0.408473,0.02352,0.176094,-0.031756,-0.084702,0.037931,-0.27555,0.272115,0.253081,0.282218
A549,ARID1B-2,0.885068,0.027469,0.209359,0.370753,0.60686,-0.254413,0.021339,0.92479,0.930532,0.710241,...,-0.02628,-0.227188,-0.197654,-0.12428,-0.13167,-0.130329,0.016847,0.478789,0.442927,0.499079
A549,ATF4-1,4.524627,-0.245799,-0.074467,-0.050822,0.453385,0.446055,-1.67263,4.191498,4.263122,3.843347,...,0.347311,-0.155483,0.078138,-0.017778,-0.024974,-0.22279,0.206561,1.083604,2.080401,0.176088


In [16]:
x_consensus_df = (
    x_consensus_df
    .reset_index()
    .query("Metadata_pert_name in @all_measurements_df.Metadata_pert_name.unique()")
    .query("Metadata_cell_line in @all_measurements_df.Metadata_cell_line.unique()")
    .reset_index(drop=True)
    .reset_index()
    .rename({"index": "Metadata_profile_id"}, axis='columns')
)
x_consensus_df.Metadata_profile_id = ["profile_{}".format(x) for x in x_consensus_df.Metadata_profile_id]

print(x_consensus_df.shape)
x_consensus_df.head(5)

(357, 1601)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,profile_0,A549,AKT1-1,0.42668,0.016416,-0.057528,-0.178226,-0.083852,0.172003,-0.280522,...,0.71846,0.87726,0.811628,1.068138,0.37077,0.287482,0.38751,0.264552,0.25883,0.354644
1,profile_1,A549,AKT1-2,0.29864,0.371576,0.295355,-0.176274,-0.003109,0.319832,0.247319,...,0.413436,0.281365,0.152485,0.386029,-0.062285,-0.011781,-0.011991,0.124322,0.221694,-0.008421
2,profile_2,A549,ARID1B-1,0.566238,0.707406,-0.277417,0.560425,0.513895,-0.271136,0.222813,...,-0.408473,0.02352,0.176094,-0.031756,-0.084702,0.037931,-0.27555,0.272115,0.253081,0.282218
3,profile_3,A549,ARID1B-2,0.885068,0.027469,0.209359,0.370753,0.60686,-0.254413,0.021339,...,-0.02628,-0.227188,-0.197654,-0.12428,-0.13167,-0.130329,0.016847,0.478789,0.442927,0.499079
4,profile_4,A549,ATF4-1,4.524627,-0.245799,-0.074467,-0.050822,0.453385,0.446055,-1.67263,...,0.347311,-0.155483,0.078138,-0.017778,-0.024974,-0.22279,0.206561,1.083604,2.080401,0.176088


### 2) To the Cell Health Assay Data

In [17]:
y_consensus_df = modz(
    y_df,
    features=cell_health_features,
    replicate_columns=cell_health_meta_features,
    precision=5)

print(y_consensus_df.shape)
y_consensus_df.head()

(364, 70)


Unnamed: 0_level_0,Unnamed: 1_level_0,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,cc_cc_edu_pos_mean,cc_cc_g1_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
cell_id,guide,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A549,AKT1-1,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,0.040322,0.091643,0.043915,...,0.438339,0.059414,0.082424,0.0,-0.06505,-0.020236,-0.00797,0.020263,0.408214,0.654575
A549,AKT1-2,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,-0.182571,0.893453,-1.023968,...,0.067568,0.256141,0.132834,0.386327,0.575026,0.225091,0.220461,-0.224965,0.284962,0.567898
A549,ARID1B-1,0.118598,1.198685,0.16514,0.330071,0.417723,0.514065,-0.308749,-0.172735,0.455015,-0.606599,...,0.100365,0.299229,0.249557,0.283246,0.501941,0.055517,0.047697,-0.055445,-0.363766,-0.205937
A549,ARID1B-2,-0.072919,0.317079,-0.089281,0.155305,-0.385316,-0.300279,-0.120261,-0.299718,0.131195,-0.221492,...,0.283802,0.143096,0.13679,0.111855,0.493883,-0.084415,-0.085658,0.084439,0.198285,-0.162976
A549,ATF4-1,4.286179,0.007467,3.284383,-2.246762,3.015881,3.065773,2.854124,1.246508,-1.623145,0.923187,...,-2.434136,0.315381,0.332291,0.192878,0.509217,0.100668,0.104857,-0.100596,-0.811271,-0.092505


In [18]:
y_consensus_df = (
    y_consensus_df
    .reset_index()
    .reset_index(drop=True)
    .merge(
        x_consensus_df.loc[:, y_meta_merge_cols],
        left_on=["guide", "cell_id"],
        right_on=["Metadata_pert_name", "Metadata_cell_line"],
        how="right"
    )
    .loc[:, y_columns]
    .drop(["guide", "cell_id"], axis="columns")
)

print(y_consensus_df.shape)
y_consensus_df.head(5)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,...,0.438339,0.059414,0.082424,0.0,-0.06505,-0.020236,-0.00797,0.020263,0.408214,0.654575
1,profile_1,AKT1-2,A549,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,...,0.067568,0.256141,0.132834,0.386327,0.575026,0.225091,0.220461,-0.224965,0.284962,0.567898
2,profile_2,ARID1B-1,A549,0.118598,1.198685,0.16514,0.330071,0.417723,0.514065,-0.308749,...,0.100365,0.299229,0.249557,0.283246,0.501941,0.055517,0.047697,-0.055445,-0.363766,-0.205937
3,profile_3,ARID1B-2,A549,-0.072919,0.317079,-0.089281,0.155305,-0.385316,-0.300279,-0.120261,...,0.283802,0.143096,0.13679,0.111855,0.493883,-0.084415,-0.085658,0.084439,0.198285,-0.162976
4,profile_4,ATF4-1,A549,4.286179,0.007467,3.284383,-2.246762,3.015881,3.065773,2.854124,...,-2.434136,0.315381,0.332291,0.192878,0.509217,0.100668,0.104857,-0.100596,-0.811271,-0.092505


In [19]:
# Confirm that matrices are aligned
pd.testing.assert_series_equal(x_consensus_df.Metadata_profile_id,
                               y_consensus_df.Metadata_profile_id, check_names=True)

# Are the guides aligned?
pd.testing.assert_series_equal(x_consensus_df.Metadata_pert_name,
                               y_consensus_df.Metadata_pert_name, check_names=True)

# Are the cells aligned?
pd.testing.assert_series_equal(x_consensus_df.Metadata_cell_line,
                               y_consensus_df.Metadata_cell_line, check_names=True)

## Output Median and MODZ Consensus Signatures

In [20]:
consensus_dir = os.path.join("data", "consensus")

file = os.path.join(consensus_dir, "cell_painting_median.tsv.gz")
x_median_df.to_csv(file, sep="\t", index=False)

file = os.path.join(consensus_dir, "cell_health_median.tsv.gz")
y_median_df.to_csv(file, sep="\t", index=False)

file = os.path.join(consensus_dir, "cell_painting_modz.tsv.gz")
x_consensus_df.to_csv(file, sep="\t", index=False)

file = os.path.join(consensus_dir, "cell_health_modz.tsv.gz")
y_consensus_df.to_csv(file, sep="\t", index=False)