## Write all Profiles to GCT for Heatmap Visualization

**Gregory Way, 2020**

I also build consensus signatures for all unique treatments and output associated files.

In [1]:
import os
import pandas as pd

from pycytominer import (
    feature_select,
    write_gct
)

from pycytominer.consensus import modz
from pycytominer.cyto_utils import infer_cp_features

from scripts.processing_utils import load_data

In [2]:
# Set constants
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blacklist",
    "drop_outliers"
]
gct_dir = os.path.join("data", "gct_files")
profile_dir = os.path.join("..", "0.generate-profiles", "profiles")
cell_count_dir = os.path.join("..", "0.generate-profiles", "cell_counts")
output_dir = os.path.join("data", "merged")

suffix = "normalized.csv.gz"

batches = [x for x in os.listdir(profile_dir) if x != ".DS_Store"]
batches

['2019_02_15_Batch1_40X',
 '2019_03_20_Batch2',
 '2019_06_25_Batch3',
 '2019_11_11_Batch4',
 '2019_11_20_Batch6',
 '2019_02_15_Batch1_20X',
 '2019_11_19_Batch5',
 '2019_11_22_Batch7']

In [3]:
profile_batches = {}
for batch in batches:
    # Build output information
    output_gct_dir = os.path.join(gct_dir, batch)
    os.makedirs(output_gct_dir, exist_ok=True)
    output_gct_file = os.path.join(
        output_gct_dir, "{}_feature_select.gct".format(batch)
    )
    
    # Load the profile data and add cell counts
    df = load_data(
        batch=batch,
        suffix=suffix,
        profile_dir=profile_dir,
        combine_dfs=True,
        add_cell_count=True,
        cell_count_dir=cell_count_dir
    )
    
    # Save normalized and non-feature selected data
    profile_batches[batch] = df
    
    # Apply feature selection again - this is particularly important for batches
    # with multiple plates
    df = feature_select(df, operation=feature_select_ops)
    
    # Write the dataframe as a gct file for input into Morpheus
    write_gct(profiles=df, output_file=output_gct_file)

## Merge Profiles Together and Output

In [4]:
all_profiles_df = pd.concat(profile_batches.values(), sort=True).reset_index(drop=True)

meta_features = infer_cp_features(all_profiles_df, metadata=True)
cp_cols = infer_cp_features(all_profiles_df, metadata=False)

all_profiles_df = all_profiles_df.reindex(meta_features + cp_cols, axis="columns")

print(all_profiles_df.shape)
all_profiles_df.head()

(633, 4333)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,...,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_50_00,Nuclei_Texture_Variance_RNA_50_01,Nuclei_Texture_Variance_RNA_50_02,Nuclei_Texture_Variance_RNA_50_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,WT,0.0,HCT116bortezomib,B03,2019_02_15_Batch1_40X,7166,,,,PlateMap_HCT116bortezomib,...,-1.00083,-1.00863,-1.051993,-1.100879,-1.069711,-1.10915,,,,
1,WT,0.0,HCT116bortezomib,B04,2019_02_15_Batch1_40X,6718,,,,PlateMap_HCT116bortezomib,...,-0.670609,-0.675413,-0.850818,-0.924916,-0.861662,-0.933925,,,,
2,WT,0.0,HCT116bortezomib,B05,2019_02_15_Batch1_40X,6421,,,,PlateMap_HCT116bortezomib,...,-0.693224,-0.706368,-0.79145,-0.815241,-0.774766,-0.826583,,,,
3,CloneA,0.0,HCT116bortezomib,B06,2019_02_15_Batch1_40X,6384,,,,PlateMap_HCT116bortezomib,...,-0.967157,-0.97077,-0.902676,-0.905242,-0.901244,-0.873114,,,,
4,CloneA,0.0,HCT116bortezomib,B07,2019_02_15_Batch1_40X,6586,,,,PlateMap_HCT116bortezomib,...,-1.157542,-1.152628,-1.160002,-1.135083,-1.150242,-1.109386,,,,


In [5]:
all_profiles_df = feature_select(all_profiles_df, operation=feature_select_ops)

print(all_profiles_df.shape)
all_profiles_df.head()

(633, 285)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,...,Nuclei_Texture_Correlation_DNA_10_03,Nuclei_Texture_Correlation_DNA_20_00,Nuclei_Texture_Correlation_DNA_20_03,Nuclei_Texture_Correlation_ER_20_02,Nuclei_Texture_Correlation_Mito_10_01,Nuclei_Texture_Correlation_Mito_20_02,Nuclei_Texture_Correlation_RNA_10_03,Nuclei_Texture_Correlation_RNA_20_00,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_InfoMeas1_DNA_10_02
0,WT,0.0,HCT116bortezomib,B03,2019_02_15_Batch1_40X,7166,,,,PlateMap_HCT116bortezomib,...,-0.830788,-0.374677,0.616864,-0.563396,-0.326782,-0.377877,-1.430643,-1.098005,-1.949559,1.134156
1,WT,0.0,HCT116bortezomib,B04,2019_02_15_Batch1_40X,6718,,,,PlateMap_HCT116bortezomib,...,-0.656543,-0.485547,0.490874,-0.724942,-0.357694,-0.464324,-0.899586,-0.868361,-1.102183,0.879197
2,WT,0.0,HCT116bortezomib,B05,2019_02_15_Batch1_40X,6421,,,,PlateMap_HCT116bortezomib,...,-0.586268,-0.563266,0.357584,-0.60269,-0.44764,-0.48397,-0.560636,-0.530838,-1.03459,0.754502
3,CloneA,0.0,HCT116bortezomib,B06,2019_02_15_Batch1_40X,6384,,,,PlateMap_HCT116bortezomib,...,-1.344829,-1.227813,-0.62252,-1.267774,-0.718975,-0.764489,-1.059889,-1.157662,-1.124696,1.19123
4,CloneA,0.0,HCT116bortezomib,B07,2019_02_15_Batch1_40X,6586,,,,PlateMap_HCT116bortezomib,...,-1.640214,-1.617616,-0.637429,-1.178447,-0.822978,-0.765152,-0.942117,-0.865444,-0.921934,1.196038


In [6]:
output_file = os.path.join(output_dir, "all_merged_profiles.csv.gz")
all_profiles_df.to_csv(output_file, index=False, compression="gzip")

## Generate Consensus Signatures

In [7]:
consensus_data = {}
for batch in profile_batches:
    meta_features = infer_cp_features(profile_batches[batch], metadata=True)
    meta_features = [x for x in meta_features if "well" not in x.lower()]
    meta_features = [x for x in meta_features if "site" not in x.lower()]
    
    consensus_df = (
        profile_batches[batch]
        .groupby(meta_features)
        .median()
        .reset_index(drop=False)
    )
    
    consensus_data[batch] = consensus_df.reset_index()

In [8]:
full_consensus_df = (
    pd.concat(consensus_data.values(), sort=True)
    .reset_index(drop=True)
)

meta_features = infer_cp_features(full_consensus_df, metadata=True)
cp_cols = infer_cp_features(full_consensus_df, metadata=False)

full_consensus_df = (
    full_consensus_df
    .reindex(meta_features + cp_cols, axis="columns")
    .drop("Metadata_cell_count", axis="columns")
)

print(full_consensus_df.shape)
full_consensus_df.head()

(633, 4331)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_batch,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,Cells_AreaShape_Area,...,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_50_00,Nuclei_Texture_Variance_RNA_50_01,Nuclei_Texture_Variance_RNA_50_02,Nuclei_Texture_Variance_RNA_50_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,-0.698887,...,-1.074325,-1.070013,-0.941575,-0.905685,-0.943621,-0.886827,,,,
1,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,-0.928542,...,-0.967157,-0.97077,-0.902676,-0.905242,-0.901244,-0.873114,,,,
2,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,-1.053205,...,-1.157542,-1.152628,-1.160002,-1.135083,-1.150242,-1.109386,,,,
3,CloneA,0.7,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,1.539363,...,-1.077198,-1.064422,-0.781498,-0.669055,-0.778658,-0.657093,,,,
4,CloneA,0.7,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,-1.060789,...,-0.919956,-0.9111,-1.014027,-0.951112,-0.995318,-0.980548,,,,


In [9]:
consensus_df = feature_select(full_consensus_df, operation=feature_select_ops)

print(consensus_df.shape)
consensus_df.head()

(633, 283)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_batch,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,Cells_AreaShape_Compactness,...,Nuclei_Texture_Correlation_DNA_10_03,Nuclei_Texture_Correlation_DNA_20_00,Nuclei_Texture_Correlation_DNA_20_03,Nuclei_Texture_Correlation_ER_20_02,Nuclei_Texture_Correlation_Mito_10_01,Nuclei_Texture_Correlation_Mito_20_02,Nuclei_Texture_Correlation_RNA_10_03,Nuclei_Texture_Correlation_RNA_20_00,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_InfoMeas1_DNA_10_02
0,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.471212,...,-1.244542,-1.329819,0.489244,-1.308251,-1.212452,-0.985233,-1.1802,-1.038099,-0.96888,1.188811
1,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.411564,...,-1.344829,-1.227813,-0.62252,-1.267774,-0.718975,-0.764489,-1.059889,-1.157662,-1.124696,1.19123
2,CloneA,0.0,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,0.488648,...,-1.640214,-1.617616,-0.637429,-1.178447,-0.822978,-0.765152,-0.942117,-0.865444,-0.921934,1.196038
3,CloneA,0.7,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,1.822111,...,-0.144076,0.023856,1.298568,-0.610659,-1.367032,-1.020629,-0.749865,-0.728012,-0.953453,0.612121
4,CloneA,0.7,HCT116bortezomib,2019_02_15_Batch1_40X,,,,PlateMap_HCT116bortezomib,,1.058081,...,-1.419423,-1.178322,0.347429,-0.69531,-0.557807,-0.484975,-0.599939,-0.330812,-0.985439,1.300751


In [10]:
output_gct_file = os.path.join(gct_dir, "consensus_feature_select.gct")
write_gct(profiles=consensus_df, output_file=output_gct_file)