## Write all Profiles to GCT for Heatmap Visualization

**Gregory Way, 2020**

I also build median consensus signatures for each unique clone and treatment combination and output the associated files.

In [1]:
import os
import pandas as pd

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features, write_gct

from scripts.processing_utils import load_data

In [2]:
# Set constants

# Operations handed to pycytominer's `feature_select`
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

# Input locations (produced by the upstream 0.generate-profiles module)
profile_dir = os.path.join("..", "0.generate-profiles", "profiles")
cell_count_dir = os.path.join("..", "0.generate-profiles", "cell_counts")

# Output locations
gct_dir = os.path.join("data", "gct_files")
output_dir = os.path.join("data", "merged")

# Only the normalized profile files are loaded
suffix = "normalized.csv.gz"

# Ignore 40X batch (and the macOS folder metadata file)
# NOTE(review): os.listdir returns entries in arbitrary order, so the batch
# order below (and the row order of the merged profiles) may differ by machine.
skipped_entries = {".DS_Store", "2019_02_15_Batch1_40X"}
batches = [
    entry for entry in os.listdir(profile_dir) if entry not in skipped_entries
]
batches

['2019_03_20_Batch2',
 '2019_06_25_Batch3',
 '2020_07_02_Batch8',
 '2019_11_11_Batch4',
 '2019_11_20_Batch6',
 '2019_02_15_Batch1_20X',
 '2019_11_19_Batch5',
 '2019_11_22_Batch7']

In [3]:
# Normalized (not feature selected) profiles for every batch, keyed by batch name
profile_batches = {}

for batch in batches:
    # Every batch writes its GCT file into its own sub-folder of `gct_dir`
    output_gct_dir = os.path.join(gct_dir, batch)
    os.makedirs(output_gct_dir, exist_ok=True)
    gct_file_name = "{}_feature_select.gct".format(batch)
    output_gct_file = os.path.join(output_gct_dir, gct_file_name)

    # Read this batch's profiles, requesting that cell counts be added
    load_kwargs = {
        "batch": batch,
        "suffix": suffix,
        "profile_dir": profile_dir,
        "combine_dfs": True,
        "add_cell_count": True,
        "harmonize_cols": True,
        "cell_count_dir": cell_count_dir,
    }
    batch_df = load_data(**load_kwargs)

    # Keep the full normalized profiles around for the cross-batch merge below
    profile_batches[batch] = batch_df

    # Feature selection is done per batch for the per-batch heatmap only
    selected_df = feature_select(batch_df, operation=feature_select_ops)

    # Morpheus reads GCT files, so export the selected features in that format
    write_gct(profiles=selected_df, output_file=output_gct_file)

## Merge Profiles Together and Output

In [4]:
# Stack the profiles of every batch into one dataframe with a fresh 0..n-1 index
all_profiles_df = pd.concat(
    list(profile_batches.values()), sort=True
).reset_index(drop=True)

# Annotate each profile with a clone type: every clone starts as "resistant" and
# clones whose identifier contains "WT" are relabelled "wildtype".
all_profiles_df = all_profiles_df.assign(Metadata_clone_type="resistant")

# na=False keeps the mask strictly boolean: without it a missing clone number
# yields NaN in the mask and the `.loc` assignment below raises. Profiles with
# a missing clone number therefore keep the default "resistant" label.
is_wildtype = all_profiles_df["Metadata_clone_number"].str.contains("WT", na=False)
all_profiles_df.loc[is_wildtype, "Metadata_clone_type"] = "wildtype"

# Put metadata columns first, followed by the CellProfiler feature columns
meta_features = infer_cp_features(all_profiles_df, metadata=True)
cp_cols = infer_cp_features(all_profiles_df, metadata=False)

all_profiles_df = all_profiles_df.reindex(meta_features + cp_cols, axis="columns")

print(all_profiles_df.shape)
all_profiles_df.head()

(837, 3540)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,207106_exposure320,B02,2019_03_20_Batch2,7616,,WT_parental,,,PlateMap_207106_exposure320,0.1% DMSO,...,-0.460955,-0.471566,-0.472902,-0.463908,-0.491301,-0.491651,-0.46513,-0.463746,-0.458918,-0.461595
1,207106_exposure320,B03,2019_03_20_Batch2,7316,,WT_parental,,,PlateMap_207106_exposure320,0.1% DMSO,...,-0.407693,-0.409836,-0.41875,-0.400586,-0.431729,-0.435196,-0.407739,-0.408274,-0.404893,-0.403287
2,207106_exposure320,B04,2019_03_20_Batch2,7082,,WT_parental,,,PlateMap_207106_exposure320,0.1% DMSO,...,-0.418464,-0.430647,-0.440995,-0.44215,-0.445753,-0.467471,-0.423247,-0.419649,-0.414115,-0.415869
3,207106_exposure320,B05,2019_03_20_Batch2,11872,,CloneA,,,PlateMap_207106_exposure320,0.1% DMSO,...,-0.821523,-0.818418,-0.812761,-0.830032,-0.805281,-0.798594,-0.821887,-0.822436,-0.825891,-0.824499
4,207106_exposure320,B06,2019_03_20_Batch2,7898,,CloneA,,,PlateMap_207106_exposure320,0.1% DMSO,...,-0.862363,-0.858491,-0.852563,-0.877956,-0.843264,-0.838066,-0.863207,-0.863628,-0.867076,-0.866256


In [5]:
# Run feature selection on the merged profiles, then discard the two plate
# bookkeeping columns that are not wanted in the merged output
all_profiles_df = feature_select(
    all_profiles_df, operation=feature_select_ops
).drop(columns=["Metadata_plate_ID", "Metadata_plate_filename"])

print(all_profiles_df.shape)
all_profiles_df.head()

(837, 450)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_clone_number,Metadata_plate_map_name,Metadata_treatment,Metadata_treatment_time,Metadata_clone_type,...,Nuclei_Texture_InfoMeas1_Mito_10_01,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas1_RNA_20_03,Nuclei_Texture_InfoMeas2_AGP_5_02,Nuclei_Texture_InfoMeas2_ER_5_02,Nuclei_Texture_InfoMeas2_Mito_5_00,Nuclei_Texture_InfoMeas2_RNA_5_02,Nuclei_Texture_InverseDifferenceMoment_ER_20_03,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01,Nuclei_Texture_SumAverage_DNA_20_03
0,207106_exposure320,B02,2019_03_20_Batch2,7616,,WT_parental,PlateMap_207106_exposure320,0.1% DMSO,,wildtype,...,-0.893567,-0.239818,0.195564,0.130019,-0.178045,0.653219,0.036653,-0.151917,-0.502186,-0.552138
1,207106_exposure320,B03,2019_03_20_Batch2,7316,,WT_parental,PlateMap_207106_exposure320,0.1% DMSO,,wildtype,...,-0.854676,-0.142585,0.275259,0.020791,-0.205324,0.62601,-0.003572,-0.090079,-0.540892,-0.422352
2,207106_exposure320,B04,2019_03_20_Batch2,7082,,WT_parental,PlateMap_207106_exposure320,0.1% DMSO,,wildtype,...,-0.771894,-0.174641,0.234954,-0.000766,-0.301922,0.630362,-0.060523,0.01785,-0.375019,-0.254405
3,207106_exposure320,B05,2019_03_20_Batch2,11872,,CloneA,PlateMap_207106_exposure320,0.1% DMSO,,resistant,...,-0.008874,0.271534,1.257187,-1.408223,-1.432898,0.264408,-1.417631,1.423207,1.062621,-1.357518
4,207106_exposure320,B06,2019_03_20_Batch2,7898,,CloneA,PlateMap_207106_exposure320,0.1% DMSO,,resistant,...,0.304577,0.306925,1.301993,-1.551025,-1.424243,0.070937,-1.449556,1.308897,1.403606,-1.3248


In [6]:
# Make sure both output folders exist before writing: `DataFrame.to_csv` does not
# create missing parent directories, and nothing earlier creates `output_dir`.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(gct_dir, exist_ok=True)

# Save the merged, feature selected profiles as a gzip-compressed CSV
output_file = os.path.join(output_dir, "all_merged_profiles.csv.gz")
all_profiles_df.to_csv(output_file, index=False, compression="gzip")

# Write the same profiles as a GCT file
output_gct_file = os.path.join(gct_dir, "all_merged_profiles.gct")
write_gct(profiles=all_profiles_df, output_file=output_gct_file)

## Collapse replicates into consensus profiles

In [7]:
# Collapse replicate profiles into one median profile per clone/treatment pair.
# numeric_only=True makes the handling of the remaining string metadata columns
# (plate, well, batch, ...) explicit: they are excluded from the median. Relying
# on pandas to drop them implicitly raises a TypeError on pandas >= 2.0.
median_consensus_df = (
    all_profiles_df.groupby(["Metadata_clone_number", "Metadata_treatment"])
    .median(numeric_only=True)
    .reset_index()
)

print(median_consensus_df.shape)
median_consensus_df.head()

(62, 443)


Unnamed: 0,Metadata_clone_number,Metadata_treatment,Metadata_cell_count,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_FormFactor,Cells_AreaShape_Orientation,Cells_AreaShape_Solidity,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_InfoMeas1_Mito_10_01,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas1_RNA_20_03,Nuclei_Texture_InfoMeas2_AGP_5_02,Nuclei_Texture_InfoMeas2_ER_5_02,Nuclei_Texture_InfoMeas2_Mito_5_00,Nuclei_Texture_InfoMeas2_RNA_5_02,Nuclei_Texture_InverseDifferenceMoment_ER_20_03,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01,Nuclei_Texture_SumAverage_DNA_20_03
0,BZ001,0.1% DMSO,22309.0,7.9e-05,0.274804,0.655391,0.237734,0.802531,0.011388,0.087811,...,-0.626803,-1.31567,-0.74707,1.19036,1.238898,0.579792,1.148875,-0.964271,-0.944357,-0.039049
1,BZ001,21 nM bortezomib,24448.0,0.163763,0.030651,0.257632,-0.345442,0.047413,0.158036,-0.895545,...,-0.264567,0.36055,0.09591,-0.141483,-0.31918,0.214372,-0.370733,-0.426399,0.056538,0.586299
2,BZ001,Untreated,8173.0,-0.175639,0.309249,0.689368,0.059856,0.718336,0.106474,0.254899,...,0.134517,-1.016777,-1.003935,0.960564,1.045144,0.521635,0.929576,-0.787173,-0.962902,-0.337988
3,BZ002,Untreated,26721.0,-0.226119,-0.982649,-1.382389,0.616968,-1.232016,0.657342,0.168716,...,2.383163,2.968393,1.323178,-1.978941,-2.241185,-3.040321,-1.5,2.482429,2.017309,-1.78013
4,BZ003,Untreated,6100.0,2.345599,2.174142,-0.628086,0.096371,0.093406,-2.200576,-2.264208,...,0.692943,-0.713152,-0.381738,0.81968,0.66314,-0.25311,0.723661,-0.075372,-0.373831,-1.342225


In [8]:
# Destinations for the consensus profiles: a gzipped CSV and a GCT file
output_file = os.path.join(output_dir, "consensus_profiles.csv.gz")
output_gct_file = os.path.join(gct_dir, "consensus_profiles.gct")

# Write the CSV without the dataframe index, then the GCT copy
median_consensus_df.to_csv(output_file, compression="gzip", index=False)
write_gct(output_file=output_gct_file, profiles=median_consensus_df)