## Write all Profiles to GCT for Heatmap Visualization

**Gregory Way, 2020**

I also build consensus signatures for all unique treatments and output associated files.

In [1]:
import os
import pandas as pd

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features, write_gct

from scripts.processing_utils import load_data

In [2]:
# Set constants
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers"
]

gct_dir = os.path.join("data", "gct_files")
profile_dir = os.path.join("..", "0.generate-profiles", "profiles")
cell_count_dir = os.path.join("..", "0.generate-profiles", "cell_counts")
output_dir = os.path.join("data", "merged")

suffix = "normalized.csv.gz"

# Ignore 40X batch
batches = [x for x in os.listdir(profile_dir) if x not in [".DS_Store", "2019_02_15_Batch1_40X"]]
batches

['2020_08_24_Batch9',
 '2021_02_08_Batch11',
 '2019_03_20_Batch2',
 '2019_06_25_Batch3',
 '2020_09_08_Batch10',
 '2020_07_02_Batch8',
 '2019_11_11_Batch4',
 '2019_11_20_Batch6',
 '2019_02_15_Batch1_20X',
 '2019_11_19_Batch5',
 '2019_11_22_Batch7']

In [3]:
profile_batches = {}
for batch in batches:
    # Build output information
    output_gct_dir = os.path.join(gct_dir, batch)
    os.makedirs(output_gct_dir, exist_ok=True)
    output_gct_file = os.path.join(
        output_gct_dir, "{}_feature_select.gct".format(batch)
    )
    
    # Load the profile data and add cell counts
    df = load_data(
        batch=batch,
        suffix=suffix,
        profile_dir=profile_dir,
        combine_dfs=True,
        add_cell_count=True,
        harmonize_cols=True,
        cell_count_dir=cell_count_dir
    )

    # Save normalized and non-feature selected data
    profile_batches[batch] = df
    
    # Apply feature selection
    feature_select_df = feature_select(df, operation=feature_select_ops)
        
    # Write the dataframe as a gct file for input into Morpheus
    write_gct(profiles=feature_select_df, output_file=output_gct_file)

## Merge Profiles Together and Output

In [4]:
all_profiles_df = pd.concat(profile_batches.values(), sort=True).reset_index(drop=True)

all_profiles_df = all_profiles_df.assign(Metadata_clone_type="resistant")
all_profiles_df.loc[all_profiles_df.Metadata_clone_number.str.contains("WT"), "Metadata_clone_type"] = "wildtype"

meta_features = infer_cp_features(all_profiles_df, metadata=True)
cp_cols = infer_cp_features(all_profiles_df, metadata=False)

all_profiles_df = all_profiles_df.reindex(meta_features + cp_cols, axis="columns")

print(all_profiles_df.shape)
all_profiles_df.head()

(1737, 3544)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_ID,Metadata_plate_filename,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,218775,B02,2020_08_24_Batch9,15706,2.5x10^3 cells/well,1.0,WT_parental,20200823.0,,,...,-0.851626,-0.856488,-0.826184,-0.869362,-0.838123,-0.875987,-0.857426,-0.850151,-0.864484,-0.852818
1,218775,B03,2020_08_24_Batch9,15075,2.5x10^3 cells/well,2.0,WT clone 04,20200823.0,,,...,-0.195527,-0.185573,-0.212889,-0.267897,-0.203758,-0.278983,-0.21156,-0.205353,-0.210934,-0.202412
2,218775,B04,2020_08_24_Batch9,11376,2.5x10^3 cells/well,3.0,WT clone 05,20200823.0,,,...,-0.483175,-0.476485,-0.462694,-0.533479,-0.454691,-0.533141,-0.498262,-0.485512,-0.494864,-0.486388
3,218775,B05,2020_08_24_Batch9,17344,2.5x10^3 cells/well,4.0,WT clone 06,20200823.0,,,...,-0.930589,-0.930296,-0.900038,-0.932495,-0.907216,-0.94669,-0.934647,-0.932163,-0.935771,-0.930488
4,218775,B06,2020_08_24_Batch9,8212,2.5x10^3 cells/well,5.0,WT clone 07,20200823.0,,,...,-0.011289,-0.012983,0.075844,0.055304,0.064201,0.041423,-0.02291,-0.020309,-0.024352,-0.012677


In [5]:
all_profiles_df = feature_select(all_profiles_df, operation=feature_select_ops)

all_profiles_df = all_profiles_df.drop(["Metadata_plate_ID", "Metadata_plate_filename"], axis="columns")

print(all_profiles_df.shape)
all_profiles_df.head()

(1737, 400)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,...,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_Entropy_RNA_20_03,Nuclei_Texture_InfoMeas1_AGP_5_00,Nuclei_Texture_InfoMeas1_DNA_10_01,Nuclei_Texture_InfoMeas1_DNA_5_02,Nuclei_Texture_InfoMeas1_ER_5_00,Nuclei_Texture_InfoMeas1_Mito_20_03,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas2_Mito_5_00,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01
0,218775,B02,2020_08_24_Batch9,15706,2.5x10^3 cells/well,1.0,WT_parental,20200823.0,218775,72 hr,...,-0.908673,-0.781888,1.52218,1.491122,1.361473,1.40389,1.382169,1.301569,-1.759342,1.572735
1,218775,B03,2020_08_24_Batch9,15075,2.5x10^3 cells/well,2.0,WT clone 04,20200823.0,218775,72 hr,...,-1.059297,-0.150116,0.7131,-0.173273,0.146974,0.21818,-0.726868,0.709481,-0.24525,-0.363203
2,218775,B04,2020_08_24_Batch9,11376,2.5x10^3 cells/well,3.0,WT clone 05,20200823.0,218775,72 hr,...,0.258403,-0.249873,0.318414,0.786991,0.629246,0.130438,0.331432,0.456838,-0.322296,0.529083
3,218775,B05,2020_08_24_Batch9,17344,2.5x10^3 cells/well,4.0,WT clone 06,20200823.0,218775,72 hr,...,-0.834195,-1.176168,1.073112,0.769149,1.139623,1.032126,0.235015,1.260055,-0.846602,1.511606
4,218775,B06,2020_08_24_Batch9,8212,2.5x10^3 cells/well,5.0,WT clone 07,20200823.0,218775,72 hr,...,0.92222,0.451641,0.050115,-0.479176,-0.743397,-0.14445,-0.699011,0.844333,0.488508,-0.24235


In [6]:
output_file = os.path.join(output_dir, "all_merged_profiles.csv.gz")
all_profiles_df.to_csv(output_file, index=False, compression="gzip")

output_gct_file = os.path.join(gct_dir, "all_merged_profiles.gct")
write_gct(profiles=all_profiles_df, output_file=output_gct_file)

## Collapse replicates into consensus profiles

In [7]:
median_consensus_df = (
    all_profiles_df.groupby(["Metadata_clone_number", "Metadata_treatment"])
    .median()
    .reset_index()
)

print(median_consensus_df.shape)
median_consensus_df.head()

(104, 392)


Unnamed: 0,Metadata_clone_number,Metadata_treatment,Metadata_cell_count,Metadata_celltype_shorthand_from_plate_graph,Metadata_date,Metadata_treatment_shorthand_from_plate_graph,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_FormFactor,Cells_AreaShape_MaximumRadius,...,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_Entropy_RNA_20_03,Nuclei_Texture_InfoMeas1_AGP_5_00,Nuclei_Texture_InfoMeas1_DNA_10_01,Nuclei_Texture_InfoMeas1_DNA_5_02,Nuclei_Texture_InfoMeas1_ER_5_00,Nuclei_Texture_InfoMeas1_Mito_20_03,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas2_Mito_5_00,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01
0,BZ001,0.1% DMSO,20898.0,9.0,20210205.0,1.0,-0.050926,0.177409,0.981971,-0.115291,...,0.205719,0.996802,-1.543196,-0.326894,-0.964639,-1.305482,-0.543705,-0.997025,0.992328,-0.995768
1,BZ001,21 nM bortezomib,24448.0,,,,0.163763,0.030651,0.257632,-0.122719,...,0.254292,-0.452921,0.120423,-0.735252,0.189309,0.170067,-0.306509,0.36055,0.214372,0.056538
2,BZ001,Untreated,8173.0,,,,-0.175639,0.309249,0.689368,0.841431,...,0.755299,1.271127,-0.971689,0.229886,-0.828853,-1.106801,-0.275005,-1.016777,0.521635,-0.962902
3,BZ002,0.1% DMSO,6731.0,10.0,20210205.0,1.0,-0.07654,-0.284017,-0.030171,0.38704,...,0.269052,-0.147975,0.236602,0.408315,-0.22622,0.245273,0.770121,0.751107,-0.695531,0.160829
4,BZ002,Untreated,26721.0,,,,-0.226119,-0.982649,-1.382389,-0.630871,...,-1.317973,-1.534522,1.933899,1.777682,1.568107,2.146568,2.321129,2.968393,-3.040321,2.017309


In [8]:
output_file = os.path.join(output_dir, "consensus_profiles.csv.gz")
median_consensus_df.to_csv(output_file, index=False, compression="gzip")

output_gct_file = os.path.join(gct_dir, "consensus_profiles.gct")
write_gct(profiles=median_consensus_df, output_file=output_gct_file)