## Write all Profiles to GCT for Heatmap Visualization

**Gregory Way, 2020**

I also build consensus signatures for all unique treatments and output associated files.

In [1]:
import os
import pandas as pd

from pycytominer import (
    feature_select,
    write_gct
)

from pycytominer.consensus import modz
from pycytominer.cyto_utils import infer_cp_features

from scripts.processing_utils import load_data

In [2]:
# Set constants
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blacklist",
    "drop_outliers"
]
gct_dir = os.path.join("data", "gct_files")
profile_dir = os.path.join("..", "0.generate-profiles", "profiles")
cell_count_dir = os.path.join("..", "0.generate-profiles", "cell_counts")
output_dir = os.path.join("data", "merged")

suffix = "normalized.csv.gz"

batches = [x for x in os.listdir(profile_dir) if x != ".DS_Store"]
batches

['2019_02_15_Batch1_40X',
 '2019_03_20_Batch2',
 '2019_06_25_Batch3',
 '2019_11_11_Batch4',
 '2019_11_20_Batch6',
 '2019_02_15_Batch1_20X',
 '2019_11_19_Batch5',
 '2019_11_22_Batch7']

In [3]:
profile_batches = {}
for batch in batches:
    # Build output information
    output_gct_dir = os.path.join(gct_dir, batch)
    os.makedirs(output_gct_dir, exist_ok=True)
    output_gct_file = os.path.join(
        output_gct_dir, "{}_feature_select.gct".format(batch)
    )
    
    # Load the profile data
    df = load_data(
        batch=batch, suffix=suffix, profile_dir=profile_dir, combine_dfs=True
    )
    
    # Save normalized and non-feature selected data
    profile_batches[batch] = df
    
    # Apply feature selection again - this is particularly important for batches
    # with multiple plates
    df = feature_select(df, operation=feature_select_ops)
    
    # Load cell counts for the specific plates
    count_files = [
        os.path.join(cell_count_dir, x) for x in os.listdir(cell_count_dir) if batch in x
    ]   
    all_plate_dfs = []
    for count_file in count_files:
        plate = os.path.basename(count_file)
        plate = plate.replace(batch, "").replace("cell_count.tsv", "").strip("_")

        plate_df = (
            pd.read_csv(count_file, sep='\t')
            .rename(
                {
                    plate: "Metadata_cell_count"
                },
                axis="columns"
            )
        )
        all_plate_dfs.append(plate_df)

    # Merge all plates and append cell count information as a metadata feature
    plate_df = pd.concat(all_plate_dfs, sort=True)
    df = (
        plate_df
        .merge(
            df,
            on=plate_df.drop("Metadata_cell_count", axis="columns").columns.tolist()
        )
    )
    
    # Write the dataframe as a gct file for input into Morpheus
    write_gct(profiles=df, output_file=output_gct_file)

## Merge Profiles Together and Output

In [4]:
all_profiles_df = pd.concat(profile_batches.values(), sort=True).reset_index(drop=True)

meta_features = infer_cp_features(all_profiles_df, metadata=True)
cp_cols = infer_cp_features(all_profiles_df, metadata=False)

all_profiles_df = all_profiles_df.reindex(meta_features + cp_cols, axis="columns")

print(all_profiles_df.shape)
all_profiles_df.head()

(633, 4331)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_Well,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,Cells_AreaShape_Area,...,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_50_00,Nuclei_Texture_Variance_RNA_50_01,Nuclei_Texture_Variance_RNA_50_02,Nuclei_Texture_Variance_RNA_50_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,WT,0.0,HCT116bortezomib,B03,,,,PlateMap_HCT116bortezomib,,-1.665382,...,-1.00083,-1.00863,-1.051993,-1.100879,-1.069711,-1.10915,,,,
1,WT,0.0,HCT116bortezomib,B04,,,,PlateMap_HCT116bortezomib,,-1.424351,...,-0.670609,-0.675413,-0.850818,-0.924916,-0.861662,-0.933925,,,,
2,WT,0.0,HCT116bortezomib,B05,,,,PlateMap_HCT116bortezomib,,-1.138526,...,-0.693224,-0.706368,-0.79145,-0.815241,-0.774766,-0.826583,,,,
3,CloneA,0.0,HCT116bortezomib,B06,,,,PlateMap_HCT116bortezomib,,-0.928542,...,-0.967157,-0.97077,-0.902676,-0.905242,-0.901244,-0.873114,,,,
4,CloneA,0.0,HCT116bortezomib,B07,,,,PlateMap_HCT116bortezomib,,-1.053205,...,-1.157542,-1.152628,-1.160002,-1.135083,-1.150242,-1.109386,,,,


In [5]:
all_profiles_df = feature_select(all_profiles_df, operation=feature_select_ops)

print(all_profiles_df.shape)
all_profiles_df.head()

(633, 283)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_Well,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,Cells_AreaShape_Compactness,...,Nuclei_Texture_Correlation_DNA_10_03,Nuclei_Texture_Correlation_DNA_20_00,Nuclei_Texture_Correlation_DNA_20_03,Nuclei_Texture_Correlation_ER_20_02,Nuclei_Texture_Correlation_Mito_10_01,Nuclei_Texture_Correlation_Mito_20_02,Nuclei_Texture_Correlation_RNA_10_03,Nuclei_Texture_Correlation_RNA_20_00,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_InfoMeas1_DNA_10_02
0,WT,0.0,HCT116bortezomib,B03,,,,PlateMap_HCT116bortezomib,,-0.33257,...,-0.830788,-0.374677,0.616864,-0.563396,-0.326782,-0.377877,-1.430643,-1.098005,-1.949559,1.134156
1,WT,0.0,HCT116bortezomib,B04,,,,PlateMap_HCT116bortezomib,,-0.299568,...,-0.656543,-0.485547,0.490874,-0.724942,-0.357694,-0.464324,-0.899586,-0.868361,-1.102183,0.879197
2,WT,0.0,HCT116bortezomib,B05,,,,PlateMap_HCT116bortezomib,,-0.248175,...,-0.586268,-0.563266,0.357584,-0.60269,-0.44764,-0.48397,-0.560636,-0.530838,-1.03459,0.754502
3,CloneA,0.0,HCT116bortezomib,B06,,,,PlateMap_HCT116bortezomib,,0.411564,...,-1.344829,-1.227813,-0.62252,-1.267774,-0.718975,-0.764489,-1.059889,-1.157662,-1.124696,1.19123
4,CloneA,0.0,HCT116bortezomib,B07,,,,PlateMap_HCT116bortezomib,,0.488648,...,-1.640214,-1.617616,-0.637429,-1.178447,-0.822978,-0.765152,-0.942117,-0.865444,-0.921934,1.196038


In [6]:
output_file = os.path.join(output_dir, "all_merged_profiles.csv")
all_profiles_df.to_csv(output_file, index=False)

## Generate Consensus Signatures

In [7]:
consensus_data = {}
for batch in profile_batches:
    meta_features = infer_cp_features(profile_batches[batch], metadata=True)
    meta_features = [x for x in meta_features if "well" not in x.lower()]
    
    consensus_df = modz(
        profile_batches[batch],
        replicate_columns = meta_features,
    )
    
    consensus_data[batch] = consensus_df.reset_index()

In [8]:
full_consensus_df = pd.concat(consensus_data.values(), sort=True).reset_index(drop=True)

meta_features = infer_cp_features(full_consensus_df, metadata=True)
cp_cols = infer_cp_features(full_consensus_df, metadata=False)

full_consensus_df = full_consensus_df.reindex(meta_features + cp_cols, axis="columns")

print(full_consensus_df.shape)
full_consensus_df.head()

(197, 4330)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,Cells_AreaShape_Area,Cells_AreaShape_Center_X,...,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_50_00,Nuclei_Texture_Variance_RNA_50_01,Nuclei_Texture_Variance_RNA_50_02,Nuclei_Texture_Variance_RNA_50_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,CloneA,0.0,HCT116bortezomib,,,,PlateMap_HCT116bortezomib,,-0.893895,0.255437,...,-1.066815,-1.064924,-1.002073,-0.982593,-0.999003,-0.957047,,,,
1,CloneA,0.7,HCT116bortezomib,,,,PlateMap_HCT116bortezomib,,-0.693841,-0.270798,...,-0.93451,-0.929737,-1.002195,-0.963557,-0.989579,-0.97799,,,,
2,CloneA,7.0,HCT116bortezomib,,,,PlateMap_HCT116bortezomib,,0.90795,0.9056,...,-0.215912,-0.212157,0.022518,-0.053756,0.020729,-0.046313,,,,
3,CloneA,70.0,HCT116bortezomib,,,,PlateMap_HCT116bortezomib,,0.97447,0.107325,...,0.608321,0.585632,0.950771,1.061018,0.943978,1.004792,,,,
4,CloneE,0.0,HCT116bortezomib,,,,PlateMap_HCT116bortezomib,,0.012068,-0.1583,...,-0.786629,-0.783663,-0.721301,-0.717333,-0.715175,-0.695062,,,,


In [9]:
consensus_df = feature_select(full_consensus_df, operation=feature_select_ops)

print(consensus_df.shape)
consensus_df.head()

(197, 239)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,...,Nuclei_Texture_Correlation_AGP_20_02,Nuclei_Texture_Correlation_AGP_20_03,Nuclei_Texture_Correlation_DNA_10_03,Nuclei_Texture_Correlation_DNA_20_03,Nuclei_Texture_Correlation_Mito_10_01,Nuclei_Texture_Correlation_Mito_20_02,Nuclei_Texture_Correlation_RNA_10_03,Nuclei_Texture_Correlation_RNA_20_00,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_InfoMeas1_DNA_10_01
0,CloneA,0.0,HCT116bortezomib,,,,PlateMap_HCT116bortezomib,,0.457331,-0.015568,...,-1.301552,-1.352057,-1.410633,-0.257088,-0.918336,-0.838264,-1.060417,-1.019667,-1.00467,1.076234
1,CloneA,0.7,HCT116bortezomib,,,,PlateMap_HCT116bortezomib,,0.717219,0.161159,...,-0.900436,-0.939445,-1.17414,0.304955,-0.72016,-0.628171,-0.674569,-0.514953,-0.978249,1.018162
2,CloneA,7.0,HCT116bortezomib,,,,PlateMap_HCT116bortezomib,,0.189762,0.59285,...,0.632659,0.739319,0.404211,-0.045157,0.218847,0.163968,1.223269,1.006203,0.339973,0.015188
3,CloneA,70.0,HCT116bortezomib,,,,PlateMap_HCT116bortezomib,,1.966712,1.670742,...,1.21073,1.183234,1.038936,1.631791,1.678181,1.729909,1.111597,1.1016,1.084341,-0.797373
4,CloneE,0.0,HCT116bortezomib,,,,PlateMap_HCT116bortezomib,,-0.240478,-0.1472,...,-0.969682,-0.694606,-0.841988,-1.07066,-1.12214,-0.910489,-1.009617,-1.175506,-0.60687,0.80061


In [10]:
output_gct_file = os.path.join(gct_dir, "consensus_feature_select.gct")
write_gct(profiles=consensus_df, output_file=output_gct_file)