## Write all Profiles to GCT for Heatmap Visualization

**Gregory Way, 2020**

I also build consensus signatures for all unique treatments and output associated files.

In [1]:
import os
import pandas as pd

from pycytominer import (
    feature_select,
    write_gct
)

from pycytominer.consensus import modz
from pycytominer.cyto_utils import infer_cp_features

from scripts.processing_utils import load_data

In [2]:
# Feature selection operations, passed as `operation` to every feature_select call
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blacklist",
    "drop_outliers",
]

# Input folders (profiles and cell counts live under ../0.generate-profiles)
profile_dir = os.path.join("..", "0.generate-profiles", "profiles")
cell_count_dir = os.path.join("..", "0.generate-profiles", "cell_counts")

# Output folders for the GCT files and the merged profiles
gct_dir = os.path.join("data", "gct_files")
output_dir = os.path.join("data", "merged")

# File suffix handed to load_data when reading the profiles
suffix = "normalized.csv.gz"

# Every entry of the profile folder is treated as a batch, except the
# ".DS_Store" file. os.listdir returns entries in arbitrary order.
profile_dir_entries = os.listdir(profile_dir)
batches = [entry for entry in profile_dir_entries if entry != ".DS_Store"]
batches

['2019_02_15_Batch1_40X',
 '2019_03_20_Batch2',
 '2019_06_25_Batch3',
 '2019_11_11_Batch4',
 '2019_11_20_Batch6',
 '2019_02_15_Batch1_20X',
 '2019_11_19_Batch5',
 '2019_11_22_Batch7']

In [3]:
# Maps batch name -> loaded profiles (before the feature selection done below)
profile_batches = {}

# load_data arguments that are identical for every batch
shared_load_kwargs = dict(
    suffix=suffix,
    profile_dir=profile_dir,
    combine_dfs=True,
    add_cell_count=True,
    cell_count_dir=cell_count_dir,
)

for batch in batches:
    # Each batch writes its GCT file into its own sub-folder of gct_dir
    output_gct_dir = os.path.join(gct_dir, batch)
    os.makedirs(output_gct_dir, exist_ok=True)
    output_gct_file = os.path.join(output_gct_dir, f"{batch}_feature_select.gct")

    # Load the batch profiles with cell counts added, and keep this
    # non-feature-selected version for the merge and consensus steps
    df = load_data(batch=batch, **shared_load_kwargs)
    profile_batches[batch] = df

    # Feature selection is applied again on the loaded batch data (noted as
    # particularly important for batches with multiple plates)
    df = feature_select(df, operation=feature_select_ops)

    # GCT output is the input format for Morpheus heatmaps
    write_gct(profiles=df, output_file=output_gct_file)

## Merge Profiles Together and Output

In [4]:
# Stack all batches row-wise (sort=True sorts the combined column labels)
# and replace the per-batch row labels with a fresh 0..n-1 index
batch_frames = list(profile_batches.values())
all_profiles_df = pd.concat(batch_frames, sort=True)
all_profiles_df = all_profiles_df.reset_index(drop=True)

# Reorder columns: metadata columns first, then the remaining feature columns
cp_cols = infer_cp_features(all_profiles_df, metadata=False)
meta_features = infer_cp_features(all_profiles_df, metadata=True)
all_profiles_df = all_profiles_df.reindex(columns=meta_features + cp_cols)

print(all_profiles_df.shape)
all_profiles_df.head()

(10062, 4334)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,...,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_50_00,Nuclei_Texture_Variance_RNA_50_01,Nuclei_Texture_Variance_RNA_50_02,Nuclei_Texture_Variance_RNA_50_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,WT,0.0,HCT116bortezomib,1,B03,2019_02_15_Batch1_40X,387,,,,...,-1.247347,-1.240386,-1.041955,-1.01431,-1.081468,-1.040797,,,,
1,WT,0.0,HCT116bortezomib,2,B03,2019_02_15_Batch1_40X,377,,,,...,-1.092795,-1.098571,-1.335448,-1.209457,-1.328965,-1.213524,,,,
2,WT,0.0,HCT116bortezomib,3,B03,2019_02_15_Batch1_40X,403,,,,...,-0.827655,-0.818969,-0.834124,-0.949275,-0.808757,-0.842403,,,,
3,WT,0.0,HCT116bortezomib,4,B03,2019_02_15_Batch1_40X,414,,,,...,-0.873986,-0.903269,-1.046058,-1.094919,-1.035641,-1.136826,,,,
4,WT,0.0,HCT116bortezomib,5,B03,2019_02_15_Batch1_40X,413,,,,...,-0.875399,-0.871332,-1.335448,-1.209457,-1.328965,-1.213524,,,,


In [5]:
# Run the shared feature selection operations on the merged profiles
all_profiles_df = feature_select(
    all_profiles_df,
    operation=feature_select_ops,
)

print(all_profiles_df.shape)
all_profiles_df.head()

(10062, 317)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,...,Nuclei_Texture_Correlation_Mito_20_02,Nuclei_Texture_Correlation_RNA_20_01,Nuclei_Texture_Correlation_RNA_20_03,Nuclei_Texture_InfoMeas1_DNA_10_01,Nuclei_Texture_InfoMeas1_DNA_10_02,Nuclei_Texture_InfoMeas1_DNA_20_02,Nuclei_Texture_InfoMeas1_ER_20_03,Nuclei_Texture_InfoMeas1_Mito_10_00,Nuclei_Texture_InfoMeas1_Mito_10_01,Nuclei_Texture_InfoMeas1_Mito_20_03
0,WT,0.0,HCT116bortezomib,1,B03,2019_02_15_Batch1_40X,387,,,,...,-0.470456,0.048037,-1.021192,1.280061,1.1731,1.179094,0.604495,0.49135,0.554789,0.631287
1,WT,0.0,HCT116bortezomib,2,B03,2019_02_15_Batch1_40X,377,,,,...,-0.487478,0.199301,-0.794176,1.477292,1.348551,1.19778,0.262709,0.232968,0.361193,-0.160685
2,WT,0.0,HCT116bortezomib,3,B03,2019_02_15_Batch1_40X,403,,,,...,-0.191161,0.230228,-1.095142,1.245091,1.064989,1.163397,0.323375,0.409219,0.308227,0.106943
3,WT,0.0,HCT116bortezomib,4,B03,2019_02_15_Batch1_40X,414,,,,...,-0.678452,-1.460303,-0.93553,1.141819,1.511604,1.186359,0.509416,0.655411,0.704067,0.272486
4,WT,0.0,HCT116bortezomib,5,B03,2019_02_15_Batch1_40X,413,,,,...,-0.356237,0.206316,-0.867236,0.892838,1.125734,0.768944,0.448945,0.731992,0.620672,0.133346


In [6]:
# Make sure the output folder exists before writing; the GCT folders are
# created the same way, and to_csv fails if the target folder is missing
os.makedirs(output_dir, exist_ok=True)

# Write the merged, feature-selected profiles as a gzip-compressed CSV
output_file = os.path.join(output_dir, "all_merged_profiles.csv.gz")
all_profiles_df.to_csv(output_file, index=False, compression="gzip")

## Generate Consensus Signatures

In [7]:
# Maps batch name -> median consensus profiles for that batch
consensus_data = {}
for batch, batch_df in profile_batches.items():
    # Group on every metadata column except the well and site columns
    meta_features = [
        x
        for x in infer_cp_features(batch_df, metadata=True)
        if "well" not in x.lower() and "site" not in x.lower()
    ]

    # NOTE(review): Metadata_cell_count, when present, is not filtered out above
    # and so appears to be part of the grouping key, which splits groups by cell
    # count - confirm this is intended.
    # NOTE(review): groupby drops rows whose key columns contain NaN by default -
    # confirm no profiles are lost this way.
    consensus_df = (
        batch_df
        .groupby(meta_features)
        # numeric_only=True skips string columns that are not group keys
        # (e.g. the well column) instead of raising on recent pandas versions
        .median(numeric_only=True)
        # A median site number is meaningless; tolerate batches without the column
        .drop("Metadata_Site", axis="columns", errors="ignore")
        .reset_index(drop=False)
    )

    # The index was already reset above; resetting it a second time would add a
    # stray "index" column to every stored consensus frame
    consensus_data[batch] = consensus_df

In [8]:
# Stack the per-batch consensus frames row-wise and renumber the rows
full_consensus_df = pd.concat(consensus_data.values(), sort=True)
full_consensus_df = full_consensus_df.reset_index(drop=True)

cp_cols = infer_cp_features(full_consensus_df, metadata=False)
meta_features = infer_cp_features(full_consensus_df, metadata=True)

# Metadata columns first, then feature columns; the cell count column is
# removed from the consensus output
ordered_columns = meta_features + cp_cols
full_consensus_df = full_consensus_df.reindex(columns=ordered_columns)
full_consensus_df = full_consensus_df.drop(columns="Metadata_cell_count")

print(full_consensus_df.shape)
full_consensus_df.head()

(9548, 4331)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_batch,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,Cells_AreaShape_Area,...,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_50_00,Nuclei_Texture_Variance_RNA_50_01,Nuclei_Texture_Variance_RNA_50_02,Nuclei_Texture_Variance_RNA_50_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.44548,...,-1.062497,-1.059947,-0.748613,-0.619886,-0.726331,-0.67838,,,,
1,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.214707,...,-0.909872,-0.926507,-0.6889,-0.593487,-0.693049,-0.613265,,,,
2,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,-0.180964,...,-1.032693,-1.033343,-0.650021,-0.503968,-0.680283,-0.444709,,,,
3,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.045045,...,-1.043915,-1.013009,-0.735652,-0.585777,-0.692562,-0.619315,,,,
4,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.046495,...,-1.239337,-1.241671,-0.971648,-0.943969,-0.984278,-1.00206,,,,


In [9]:
# Run the shared feature selection operations on the merged consensus profiles
consensus_df = feature_select(
    full_consensus_df,
    operation=feature_select_ops,
)

print(consensus_df.shape)
consensus_df.head()

(9548, 311)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_batch,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,Cells_AreaShape_Eccentricity,...,Nuclei_Texture_Correlation_Mito_20_02,Nuclei_Texture_Correlation_RNA_20_01,Nuclei_Texture_Correlation_RNA_20_03,Nuclei_Texture_InfoMeas1_DNA_10_01,Nuclei_Texture_InfoMeas1_DNA_10_02,Nuclei_Texture_InfoMeas1_DNA_20_02,Nuclei_Texture_InfoMeas1_ER_20_03,Nuclei_Texture_InfoMeas1_Mito_10_00,Nuclei_Texture_InfoMeas1_Mito_10_01,Nuclei_Texture_InfoMeas1_Mito_20_03
0,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.215897,...,-0.843256,-1.310903,0.412465,1.14364,0.775286,0.981503,1.541588,0.96023,1.194925,1.028573
1,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.516477,...,-0.861476,-0.481976,0.089565,0.847193,0.797284,1.065048,1.468746,0.880934,0.939595,0.84181
2,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.547577,...,-0.990443,0.025102,-0.565973,0.984887,0.883996,1.116078,1.89679,1.077582,1.299896,1.482821
3,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.737125,...,-0.813235,-0.144382,-0.765505,0.925841,0.93626,1.154712,1.706626,1.223876,1.217978,1.327418
4,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,1.262829,...,-0.837698,-0.410006,-1.624857,1.126458,1.357034,1.128997,1.131859,1.017592,1.06858,0.93469


In [10]:
# Write the feature-selected consensus profiles as one GCT file in gct_dir
output_gct_file = os.path.join(gct_dir, "consensus_feature_select.gct")
write_gct(
    profiles=consensus_df,
    output_file=output_gct_file,
)