## Write all Profiles to GCT for Heatmap Visualization

**Gregory Way, 2020**

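This notebook writes the normalized, feature-selected profiles of every batch to GCT files for heatmap visualization in Morpheus. It also merges all batches, builds consensus signatures for each unique clone and treatment combination, and outputs the associated files.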

In [1]:
import os
import pandas as pd

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features, write_gct

from scripts.processing_utils import load_data

In [2]:
# Feature selection operations handed to pycytominer's feature_select
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

# Input locations: per-batch profiles and cell counts
profile_dir = os.path.join("..", "0.generate-profiles", "profiles")
cell_count_dir = os.path.join("..", "0.generate-profiles", "cell_counts")

# Output locations: GCT files and merged profile tables
gct_dir = os.path.join("data", "gct_files")
output_dir = os.path.join("data", "merged")

# File suffix passed to load_data when reading each batch
suffix = "normalized.csv.gz"

# Every entry of the profile directory is treated as a batch, except for
# the macOS metadata file and the 40X batch, which are skipped
ignored_entries = {".DS_Store", "2019_02_15_Batch1_40X"}
batches = [entry for entry in os.listdir(profile_dir) if entry not in ignored_entries]
batches

['2021_03_03_Batch14',
 '2021_03_03_Batch13',
 '2021_03_03_Batch12',
 '2021_03_03_Batch15',
 '2020_08_24_Batch9',
 '2021_03_05_Batch16',
 '2021_02_08_Batch11',
 '2019_03_20_Batch2',
 '2019_06_25_Batch3',
 '2020_09_08_Batch10',
 '2020_07_02_Batch8',
 '2019_11_11_Batch4',
 '2019_11_20_Batch6',
 '2019_02_15_Batch1_20X',
 '2019_11_19_Batch5',
 '2019_11_22_Batch7']

In [3]:
# Normalized (not yet feature selected) profiles, keyed by batch name
profile_batches = {}
for batch in batches:
    # Each batch gets its own directory for GCT output
    batch_gct_dir = os.path.join(gct_dir, batch)
    os.makedirs(batch_gct_dir, exist_ok=True)
    batch_gct_file = os.path.join(batch_gct_dir, f"{batch}_feature_select.gct")

    # Load the batch profiles (with cell counts added) and keep this
    # pre-feature-selection dataframe for the merge across batches below
    profile_batches[batch] = load_data(
        batch=batch,
        suffix=suffix,
        profile_dir=profile_dir,
        cell_count_dir=cell_count_dir,
        combine_dfs=True,
        add_cell_count=True,
        harmonize_cols=True,
    )

    # Feature select within the batch, then write a GCT file that can be
    # loaded into Morpheus
    selected_df = feature_select(profile_batches[batch], operation=feature_select_ops)
    write_gct(profiles=selected_df, output_file=batch_gct_file)

## Merge Profiles Together and Output

In [4]:
# Stack the per-batch profiles into a single dataframe with a fresh index
all_profiles_df = pd.concat(profile_batches.values(), sort=True)
all_profiles_df = all_profiles_df.reset_index(drop=True)

# Label every profile "resistant" by default, then relabel the clones whose
# clone number contains "WT" as "wildtype"
wildtype_mask = all_profiles_df.Metadata_clone_number.str.contains("WT")
all_profiles_df = all_profiles_df.assign(Metadata_clone_type="resistant")
all_profiles_df.loc[wildtype_mask, "Metadata_clone_type"] = "wildtype"

# Order the columns so that metadata comes first, followed by the
# CellProfiler features (inferred after the clone type column was added)
meta_features = infer_cp_features(all_profiles_df, metadata=True)
cp_cols = infer_cp_features(all_profiles_df, metadata=False)
all_profiles_df = all_profiles_df.reindex(columns=meta_features + cp_cols)

print(all_profiles_df.shape)
all_profiles_df.head()

(2397, 3544)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_ID,Metadata_plate_filename,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,219905,B02,2021_03_03_Batch14,4269,2.5x10^3 cells/well,1.0,WT_parental,20210219.0,,,...,-1.007074,-1.010473,-1.018616,-1.038237,-1.005599,-1.015286,-1.020732,-1.009446,-1.019474,-1.012719
1,219905,B03,2021_03_03_Batch14,1688,2.5x10^3 cells/well,2.0,CloneA,20210219.0,,,...,-0.986309,-1.000248,-1.02219,-1.045947,-1.016479,-1.038013,-0.999631,-0.999447,-0.987759,-0.977462
2,219905,B04,2021_03_03_Batch14,2238,2.5x10^3 cells/well,3.0,Clone E,20210219.0,,,...,-1.371187,-1.358646,-1.353107,-1.355997,-1.357253,-1.352782,-1.360769,-1.364501,-1.366883,-1.360609
3,219905,B05,2021_03_03_Batch14,3358,2.5x10^3 cells/well,4.0,WT clone 01,20210219.0,,,...,-1.454741,-1.437857,-1.441065,-1.458476,-1.441738,-1.481096,-1.456043,-1.463368,-1.462129,-1.456112
4,219905,B06,2021_03_03_Batch14,291,2.5x10^3 cells/well,5.0,WT clone 02,20210219.0,,,...,0.334975,0.403553,0.461756,0.434181,0.315446,0.58655,0.342533,0.354969,0.379586,0.36145


In [5]:
# Apply the same feature selection operations to the merged profiles, then
# remove the two plate bookkeeping columns from the merged output
unused_metadata = ["Metadata_plate_ID", "Metadata_plate_filename"]
all_profiles_df = feature_select(
    all_profiles_df, operation=feature_select_ops
).drop(columns=unused_metadata)

print(all_profiles_df.shape)
all_profiles_df.head()

(2397, 444)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,...,Nuclei_Texture_Correlation_RNA_20_02,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_Entropy_ER_20_03,Nuclei_Texture_InfoMeas1_AGP_5_00,Nuclei_Texture_InfoMeas1_DNA_5_02,Nuclei_Texture_InfoMeas1_ER_5_00,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas2_Mito_5_00,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01,Nuclei_Texture_SumAverage_DNA_20_03
0,219905,B02,2021_03_03_Batch14,4269,2.5x10^3 cells/well,1.0,WT_parental,20210219.0,219905,48 hr,...,-0.656457,-1.525862,-0.689886,1.751063,1.38123,1.683708,0.890482,-1.186232,0.755303,-0.654685
1,219905,B03,2021_03_03_Batch14,1688,2.5x10^3 cells/well,2.0,CloneA,20210219.0,219905,48 hr,...,1.040639,-0.737619,-0.873918,-0.668798,0.133264,-0.753028,-0.729984,-0.378977,0.973475,-1.68899
2,219905,B04,2021_03_03_Batch14,2238,2.5x10^3 cells/well,3.0,Clone E,20210219.0,219905,48 hr,...,1.151987,-1.144326,-2.081855,-0.578513,0.613646,0.124588,0.677733,-1.351052,2.183734,-1.776875
3,219905,B05,2021_03_03_Batch14,3358,2.5x10^3 cells/well,4.0,WT clone 01,20210219.0,219905,48 hr,...,-0.534683,-1.467039,-1.865624,1.584778,1.895769,1.285182,0.735492,-1.582528,2.297254,-1.205825
4,219905,B06,2021_03_03_Batch14,291,2.5x10^3 cells/well,5.0,WT clone 02,20210219.0,219905,48 hr,...,1.228002,1.874837,0.763804,-0.381752,-1.437746,-0.150566,-0.905429,1.400678,-0.268371,1.571506


In [6]:
# Make sure both output directories exist before writing. Nothing else in
# this notebook creates `output_dir`, and `DataFrame.to_csv` does not create
# missing parent directories.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(gct_dir, exist_ok=True)

# Write the merged, feature selected profiles as a gzip-compressed csv
output_file = os.path.join(output_dir, "all_merged_profiles.csv.gz")
all_profiles_df.to_csv(output_file, index=False, compression="gzip")

# Write the same profiles as a gct file
output_gct_file = os.path.join(gct_dir, "all_merged_profiles.gct")
write_gct(profiles=all_profiles_df, output_file=output_gct_file)

## Collapse replicates into consensus profiles

In [7]:
# Collapse replicate profiles into one consensus profile per unique
# clone/treatment pair by taking the median of each numeric column.
#
# `numeric_only=True` is passed explicitly: the merged dataframe still holds
# string metadata columns (e.g. Metadata_Well, Metadata_batch), and recent
# pandas versions raise a TypeError instead of silently dropping them.
# NOTE: numeric metadata columns (e.g. Metadata_cell_count, Metadata_date)
# are aggregated by median as well.
consensus_keys = ["Metadata_clone_number", "Metadata_treatment"]
median_consensus_df = (
    all_profiles_df.groupby(consensus_keys)
    .median(numeric_only=True)
    .reset_index()
)

print(median_consensus_df.shape)
median_consensus_df.head()

(143, 436)


Unnamed: 0,Metadata_clone_number,Metadata_treatment,Metadata_cell_count,Metadata_celltype_shorthand_from_plate_graph,Metadata_date,Metadata_treatment_shorthand_from_plate_graph,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_FormFactor,Cells_AreaShape_MaximumRadius,...,Nuclei_Texture_Correlation_RNA_20_02,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_Entropy_ER_20_03,Nuclei_Texture_InfoMeas1_AGP_5_00,Nuclei_Texture_InfoMeas1_DNA_5_02,Nuclei_Texture_InfoMeas1_ER_5_00,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas2_Mito_5_00,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01,Nuclei_Texture_SumAverage_DNA_20_03
0,BZ001,0.1% DMSO,3901.0,9.0,20210222.0,1.0,-0.257844,0.118679,0.981971,-0.07604,...,-0.635365,0.294644,0.949884,-1.193691,-0.783082,-1.196832,-0.889893,0.933164,-1.028981,-0.032877
1,BZ001,0.78 nM bortezomib,855.0,9.0,20210222.0,2.0,-0.328213,0.012501,0.613288,-0.129423,...,-0.99344,0.199651,0.264583,-0.670404,-0.632004,-0.695834,-0.824063,0.634478,-0.638031,-0.076202
2,BZ001,2.33 nM bortezomib,911.0,9.0,20210222.0,2.0,-0.731109,-0.105897,0.757219,-0.237247,...,-0.951312,-0.156238,0.249815,-0.267378,0.021592,-0.574679,-0.360796,0.556962,-0.64181,-0.34714
3,BZ001,21 nM bortezomib,24448.0,,,,0.163763,0.030651,0.257632,-0.122719,...,-0.716287,0.254292,-0.148769,0.120423,0.189309,0.170067,0.36055,0.214372,0.056538,0.586299
4,BZ001,7 nM bortezomib,886.0,9.0,20210222.0,2.0,-0.60056,-0.227341,0.963467,-0.395344,...,-1.199652,-0.698344,0.629368,-0.768096,-0.223588,-0.947327,-1.147544,0.947677,-0.917935,-0.518066


In [8]:
# Destinations for the consensus profiles: a gzip-compressed csv alongside
# the merged profiles, and a gct file alongside the other gct output
output_file = os.path.join(output_dir, "consensus_profiles.csv.gz")
output_gct_file = os.path.join(gct_dir, "consensus_profiles.gct")

# Write the consensus profiles in both formats
median_consensus_df.to_csv(output_file, compression="gzip", index=False)
write_gct(profiles=median_consensus_df, output_file=output_gct_file)