## Apply UMAP to CRISPR perturbations

These are the Cell Painting profiles of the CRISPR perturbations used to train and test each Cell Health model.

In [1]:
import umap
import pathlib
import numpy as np
import pandas as pd

from pycytominer.cyto_utils import infer_cp_features

In [2]:
# Seed NumPy's global random number generator so that anything drawing from
# np.random yields the same values on every run of the notebook
random_seed = 123
np.random.seed(random_seed)

In [3]:
# Constants and file locations used throughout the notebook
consensus = "modz"

# Input directories live in the sibling profile-generation module
generate_profiles_dir = pathlib.Path("..") / "1.generate-profiles"
data_dir = generate_profiles_dir / "data" / "consensus"
cell_process_dir = generate_profiles_dir / "tables"

results_dir = pathlib.Path("results")
shiny_app_dir = pathlib.Path("..") / "4.apply" / "repurposing_cellhealth_shiny" / "data"

# Inputs: consensus profiles, cell health readouts, and perturbation annotations
profile_file = data_dir / f"cell_painting_{consensus}.tsv.gz"
cell_health_file = data_dir / f"cell_health_{consensus}.tsv.gz"
cell_process_file = cell_process_dir / "supplementary_table_1_perturbation_details.tsv"

# Output: UMAP coordinates combined with cell health data
output_file = shiny_app_dir / f"profile_umap_with_cell_health_{consensus}.tsv"

In [4]:
# Read the consensus Cell Painting profiles, order the rows by profile ID, and
# renumber the index so that it follows the sorted order
profiles_unsorted_df = pd.read_csv(profile_file, sep="\t")
df = profiles_unsorted_df.sort_values(by="Metadata_profile_id").reset_index(drop=True)

# Feature column names as inferred by pycytominer; these are the UMAP inputs
cp_features = infer_cp_features(df)

print(df.shape)
df.head()

(357, 952)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,profile_0,A549,AKT1-1,-0.18016,-0.155631,0.014646,0.188053,1.231056,0.031064,-0.585477,...,0.562585,0.988876,0.87995,0.904785,0.906875,0.923143,0.944998,0.984938,1.122724,0.961945
1,profile_1,A549,AKT1-2,0.370572,-0.247842,-0.030773,0.433778,0.062456,0.26686,0.838679,...,0.018933,0.446225,0.359496,0.557998,0.631931,0.504751,0.407462,0.522251,0.64437,0.519441
2,profile_10,A549,BCL2-2,-0.0419,-0.252931,-0.299617,0.559805,1.18016,0.232533,-0.049973,...,-0.654379,-0.565796,-0.666583,-1.153182,-0.780638,-1.193731,-0.642472,-1.215133,-0.938655,-1.246239
3,profile_100,A549,RAF1-2,-0.299418,0.074748,-0.059569,-0.162925,-0.029864,-0.281596,-0.410077,...,0.316486,-0.066283,-0.218801,0.863651,1.15553,0.849225,-0.303395,0.576961,0.619277,0.688809
4,profile_101,A549,RHOA-1,0.35182,0.115802,0.144107,0.229938,-0.860244,-0.413477,0.02779,...,0.53751,-0.190554,-0.30321,0.212603,0.663154,0.111954,-0.428024,-0.088491,-0.035262,0.071793


In [5]:
# Read the consensus cell health readouts and apply the same ordering used for
# the profiles: sorted by profile ID with a fresh 0..n-1 index
cell_health_unsorted_df = pd.read_csv(cell_health_file, sep="\t")
cell_health_df = (
    cell_health_unsorted_df
    .sort_values(by="Metadata_profile_id")
    .reset_index(drop=True)
)

print(cell_health_df.shape)
cell_health_df.head()

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,...,0.438339,0.059414,-0.06505,-0.020236,-0.00797,0.082424,0.0,0.020263,0.408214,0.654575
1,profile_1,AKT1-2,A549,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,...,0.067568,0.256141,0.575026,0.225091,0.220461,0.132834,0.386327,-0.224965,0.284962,0.567898
2,profile_10,BCL2-2,A549,-0.182172,0.270253,-0.165335,1.022081,-0.443078,-0.456076,-0.595775,...,0.529686,0.051138,-0.056488,-0.015343,-0.013721,0.0,0.118264,0.015366,0.232668,1.311557
3,profile_100,RAF1-2,A549,0.06411,0.025085,0.01467,-0.231155,0.05395,0.05855,0.459818,...,-0.575798,0.128562,0.517263,-0.013249,-0.016064,0.081094,0.16382,0.013285,-1.057565,-0.888677
4,profile_101,RHOA-1,A549,1.767355,1.501962,0.626605,-0.328481,1.599174,2.183969,0.467122,...,-0.208857,0.060459,0.34681,-0.205431,-0.204576,0.083873,0.0,0.205403,-0.574627,-0.314264


In [6]:
# Read the perturbation annotation table and prefix every column name with
# "Metadata_" so the names line up with the metadata columns of the other tables
cell_process_df = pd.read_csv(cell_process_file, sep="\t").add_prefix("Metadata_")

print(cell_process_df.shape)
cell_process_df.head()

(127, 6)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_process,Metadata_cell_health_data,Metadata_cell_painting_data
0,AKT1,AKT1-1,BRDN0001054908,PIK3CA,True,True
1,AKT1,AKT1-2,BRDN0001055115,PIK3CA,True,True
2,ARID1B,ARID1B-1,,Chromatin Modifiers,True,True
3,ARID1B,ARID1B-2,,Chromatin Modifiers,True,True
4,ATF4,ATF4-1,,ER Stress/UPR,True,True


In [7]:
# Ensure the Cell Painting profiles and the cell health readouts list the same
# profile IDs in the same row order
profile_ids = df["Metadata_profile_id"].tolist()
cell_health_ids = cell_health_df["Metadata_profile_id"].tolist()
assert profile_ids == cell_health_ids

In [8]:
# Fit a two-component UMAP (fixed random_state) on the Cell Painting feature
# columns and store the coordinates, one row per profile in the order of df
reducer = umap.UMAP(n_components=2, random_state=1234)
embedding = reducer.fit_transform(df[cp_features])

predict_embedding_df = pd.DataFrame(embedding, columns=["umap_x", "umap_y"])

print(predict_embedding_df.shape)
predict_embedding_df.head()

(357, 2)


Unnamed: 0,umap_x,umap_y
0,6.63646,10.349184
1,6.886117,10.342171
2,4.860858,10.511171
3,4.533332,9.541967
4,3.937612,9.187695


In [9]:
# Combine data to form a single output file: UMAP coordinates, cell health
# readouts, and perturbation annotations, with one row per profile.

# The annotation table lists some perturbation names more than once (the
# redundant "Empty" entries). Keep only the first annotation row per name so
# the left merge below cannot multiply profile rows.
pert_annotation_df = cell_process_df.drop_duplicates(subset="Metadata_pert_name")

output_df = (
    predict_embedding_df
    # Positional join: both frames carry a fresh 0..n-1 index
    .merge(
        cell_health_df,
        left_index=True,
        right_index=True
    )
    # validate raises MergeError if the annotation keys are not unique,
    # instead of silently duplicating profiles
    .merge(
        pert_annotation_df,
        on="Metadata_pert_name",
        how="left",
        validate="many_to_one"
    )
)

# Every embedded profile must appear exactly once in the combined table
assert output_df.shape[0] == predict_embedding_df.shape[0]

print(output_df.shape)
output_df.head()

(357, 80)


Unnamed: 0,umap_x,umap_y,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,...,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean,Metadata_gene_name,Metadata_broad_sample,Metadata_cell_process,Metadata_cell_health_data,Metadata_cell_painting_data
0,6.63646,10.349184,profile_0,AKT1-1,A549,-0.005795,0.580351,0.013975,0.381958,0.150696,...,0.082424,0.0,0.020263,0.408214,0.654575,AKT1,BRDN0001054908,PIK3CA,True,True
1,6.886117,10.342171,profile_1,AKT1-2,A549,0.050169,1.27773,0.241808,0.577422,0.220829,...,0.132834,0.386327,-0.224965,0.284962,0.567898,AKT1,BRDN0001055115,PIK3CA,True,True
2,4.860858,10.511171,profile_10,BCL2-2,A549,-0.182172,0.270253,-0.165335,1.022081,-0.443078,...,0.0,0.118264,0.015366,0.232668,1.311557,BCL2,,Apoptosis,True,True
3,4.533332,9.541967,profile_100,RAF1-2,A549,0.06411,0.025085,0.01467,-0.231155,0.05395,...,0.081094,0.16382,0.013285,-1.057565,-0.888677,RAF1,,MAPK,True,True
4,3.937612,9.187695,profile_101,RHOA-1,A549,1.767355,1.501962,0.626605,-0.328481,1.599174,...,0.083873,0.0,0.205403,-0.574627,-0.314264,RHOA,BRDN0000990371,Cytoskeletal Re-org/Integrin,True,True


In [10]:
# Output to file
# to_csv does not create missing parent directories, so make sure the
# destination directory exists before writing (no-op if it already does)
output_file.parent.mkdir(parents=True, exist_ok=True)

# Tab-separated, without the DataFrame index column
output_df.to_csv(output_file, sep="\t", index=False)