## Apply UMAP to visualize relationships between the CRISPR perturbations

In [9]:
import umap
import pathlib
import numpy as np
import pandas as pd
import plotnine as gg

from pycytominer.cyto_utils import infer_cp_features

In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness
# in this notebook is repeatable across runs
np.random.seed(seed=123)

In [3]:
# Configuration: consensus method plus input/output locations.
# `consensus` is interpolated into every file name below.
consensus = "modz"

# Input profiles live in a sibling module directory; predictions and the
# UMAP output share the local results directory.
data_dir = pathlib.Path("..") / "1.generate-profiles" / "data" / "consensus"
results_dir = pathlib.Path("results")

profile_file = data_dir / f"cell_painting_{consensus}.tsv.gz"
pred_file = results_dir / f"all_model_predictions_{consensus}.tsv"
output_file = results_dir / f"profile_umap_with_predictions_{consensus}.tsv"

In [4]:
# Read the tab-separated profile table, then order rows by profile id and
# replace the index with a fresh 0..n-1 range
df = pd.read_csv(profile_file, sep="\t")
df = df.sort_values("Metadata_profile_id").reset_index(drop=True)

# Feature column names as determined by pycytominer's infer_cp_features
cp_features = infer_cp_features(df)

print(df.shape)
df.head()

(357, 952)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,profile_0,A549,AKT1-1,-0.18016,-0.155631,0.014646,0.188053,1.231056,0.031064,-0.585477,...,0.562585,0.988876,0.87995,0.904785,0.906875,0.923143,0.944998,0.984938,1.122724,0.961945
1,profile_1,A549,AKT1-2,0.370572,-0.247842,-0.030773,0.433778,0.062456,0.26686,0.838679,...,0.018933,0.446225,0.359496,0.557998,0.631931,0.504751,0.407462,0.522251,0.64437,0.519441
2,profile_10,A549,BCL2-2,-0.0419,-0.252931,-0.299617,0.559805,1.18016,0.232533,-0.049973,...,-0.654379,-0.565796,-0.666583,-1.153182,-0.780638,-1.193731,-0.642472,-1.215133,-0.938655,-1.246239
3,profile_100,A549,RAF1-2,-0.299418,0.074748,-0.059569,-0.162925,-0.029864,-0.281596,-0.410077,...,0.316486,-0.066283,-0.218801,0.863651,1.15553,0.849225,-0.303395,0.576961,0.619277,0.688809
4,profile_101,A549,RHOA-1,0.35182,0.115802,0.144107,0.229938,-0.860244,-0.413477,0.02779,...,0.53751,-0.190554,-0.30321,0.212603,0.663154,0.111954,-0.428024,-0.088491,-0.035262,0.071793


In [5]:
# Read the tab-separated model prediction table, then order rows by profile
# id and replace the index with a fresh 0..n-1 range
pred_df = pd.read_csv(pred_file, sep="\t")
pred_df = pred_df.sort_values("Metadata_profile_id").reset_index(drop=True)

print(pred_df.shape)
pred_df.head()

(357, 74)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Metadata_data_type,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,A549,AKT1-1,train,0.219139,0.347752,-0.014601,0.151728,0.273698,0.216029,...,0.021872,0.144581,0.306686,0.150832,0.201712,0.150316,0.058777,-0.15146,0.281601,0.317835
1,profile_1,A549,AKT1-2,train,0.180354,0.313098,0.242201,0.215245,0.109491,0.138052,...,0.036097,0.085218,0.281135,0.190021,0.250567,0.110907,0.15619,-0.190896,0.081968,0.392189
2,profile_10,A549,BCL2-2,train,0.449809,0.192186,0.544346,-0.036563,0.325523,0.27989,...,-0.123482,0.396847,0.38423,0.371537,0.412755,0.201875,0.397542,-0.371221,0.2039,0.671339
3,profile_100,A549,RAF1-2,train,0.436651,0.490893,-0.169349,-0.132025,0.446476,0.506515,...,-0.033119,0.074532,0.253269,-0.181613,-0.221795,0.109818,0.202373,0.181713,-0.655237,-0.74869
4,profile_101,A549,RHOA-1,train,0.511667,0.965652,0.849319,-0.3506,0.606788,0.819342,...,-0.489086,-0.013456,0.290836,-0.11364,-0.196848,0.103972,-0.15283,0.112496,-0.160598,-0.148568


In [6]:
# Sanity check: profiles and predictions must list the same profile ids in
# the same row order
assert df["Metadata_profile_id"].tolist() == pred_df["Metadata_profile_id"].tolist()

In [7]:
# Fit a two-component UMAP on the inferred feature columns
reducer = umap.UMAP(random_state=1234, n_components=2)
umap_coordinates = reducer.fit_transform(df[cp_features])

# Attach the predictions to the embedding by row index; both frames carry a
# default 0..n-1 index at this point
predict_embedding_df = pd.merge(
    pd.DataFrame(umap_coordinates, columns=["umap_x", "umap_y"]),
    pred_df,
    left_index=True,
    right_index=True,
)

print(predict_embedding_df.shape)
predict_embedding_df.head()

Compilation is falling back to object mode WITH looplifting enabled because Function "fuzzy_simplicial_set" failed type inference due to: Untyped global name 'nearest_neighbors': cannot determine Numba type of <class 'function'>

File "../../../../../../../miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/umap_.py", line 446:
def fuzzy_simplicial_set(
    <source elided>
    if knn_indices is None or knn_dists is None:
        knn_indices, knn_dists, _ = nearest_neighbors(
        ^

  @numba.jit()

File "../../../../../../../miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/umap_.py", line 329:
@numba.jit()
def fuzzy_simplicial_set(
^

  self.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "../../../../../..

(357, 76)


Unnamed: 0,umap_x,umap_y,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Metadata_data_type,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,-1.220839,8.164794,profile_0,A549,AKT1-1,train,0.219139,0.347752,-0.014601,0.151728,...,0.021872,0.144581,0.306686,0.150832,0.201712,0.150316,0.058777,-0.15146,0.281601,0.317835
1,-0.793706,8.108212,profile_1,A549,AKT1-2,train,0.180354,0.313098,0.242201,0.215245,...,0.036097,0.085218,0.281135,0.190021,0.250567,0.110907,0.15619,-0.190896,0.081968,0.392189
2,-3.017267,8.141235,profile_10,A549,BCL2-2,train,0.449809,0.192186,0.544346,-0.036563,...,-0.123482,0.396847,0.38423,0.371537,0.412755,0.201875,0.397542,-0.371221,0.2039,0.671339
3,-3.329382,7.089287,profile_100,A549,RAF1-2,train,0.436651,0.490893,-0.169349,-0.132025,...,-0.033119,0.074532,0.253269,-0.181613,-0.221795,0.109818,0.202373,0.181713,-0.655237,-0.74869
4,-3.895667,6.63448,profile_101,A549,RHOA-1,train,0.511667,0.965652,0.849319,-0.3506,...,-0.489086,-0.013456,0.290836,-0.11364,-0.196848,0.103972,-0.15283,0.112496,-0.160598,-0.148568


In [8]:
# Save the embedding plus predictions as a tab-separated file, omitting the
# row index
predict_embedding_df.to_csv(
    output_file,
    sep="\t",
    index=False,
)