## Write all Profiles to GCT for Heatmap Visualization

**Gregory Way, 2020**

This notebook also merges the profiles from all batches, builds median consensus signatures for each unique clone and treatment combination, and writes the associated output files.

In [1]:
import os
import pandas as pd

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features, write_gct

from scripts.processing_utils import load_data

In [2]:
# Feature selection operations passed to `feature_select`
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

# Input locations, both under ../0.generate-profiles
generate_profiles_dir = os.path.join("..", "0.generate-profiles")
profile_dir = os.path.join(generate_profiles_dir, "profiles")
cell_count_dir = os.path.join(generate_profiles_dir, "cell_counts")

# Output locations
gct_dir = os.path.join("data", "gct_files")
output_dir = os.path.join("data", "merged")

# File suffix of the profiles to load
suffix = "normalized.csv.gz"

# Every entry in the profile directory is a batch, except for the
# macOS metadata file and the 40X batch, which is ignored
ignored_entries = {".DS_Store", "2019_02_15_Batch1_40X"}
batches = [
    entry
    for entry in os.listdir(profile_dir)
    if entry not in ignored_entries
]
batches

['2021_03_03_Batch13',
 '2021_03_03_Batch12',
 '2020_08_24_Batch9',
 '2021_02_08_Batch11',
 '2019_03_20_Batch2',
 '2019_06_25_Batch3',
 '2020_09_08_Batch10',
 '2020_07_02_Batch8',
 '2019_11_11_Batch4',
 '2019_11_20_Batch6',
 '2019_02_15_Batch1_20X',
 '2019_11_19_Batch5',
 '2019_11_22_Batch7']

In [3]:
# Maps batch name -> normalized (not feature selected) profiles
profile_batches = {}
for batch in batches:
    # Each batch writes its GCT file into its own sub-directory
    output_gct_dir = os.path.join(gct_dir, batch)
    output_gct_file = os.path.join(output_gct_dir, f"{batch}_feature_select.gct")
    os.makedirs(output_gct_dir, exist_ok=True)

    # Load the profile data for this batch and add cell counts
    load_kwargs = {
        "batch": batch,
        "suffix": suffix,
        "profile_dir": profile_dir,
        "combine_dfs": True,
        "add_cell_count": True,
        "harmonize_cols": True,
        "cell_count_dir": cell_count_dir,
    }
    df = load_data(**load_kwargs)

    # Keep the normalized, non-feature selected data for the merge below
    profile_batches[batch] = df

    # Feature select within the batch, then write the result as a gct file
    # for input into Morpheus
    feature_select_df = feature_select(df, operation=feature_select_ops)
    write_gct(profiles=feature_select_df, output_file=output_gct_file)

## Merge Profiles Together and Output

In [4]:
# Stack the per-batch profiles into one dataframe with a fresh 0..n-1 index
batch_frames = list(profile_batches.values())
all_profiles_df = pd.concat(batch_frames, sort=True).reset_index(drop=True)

# Label every profile as resistant, then relabel the clones whose
# clone number contains "WT" as wildtype
is_wildtype = all_profiles_df.Metadata_clone_number.str.contains("WT")
all_profiles_df = all_profiles_df.assign(Metadata_clone_type="resistant")
all_profiles_df.loc[is_wildtype, "Metadata_clone_type"] = "wildtype"

# Reorder the columns so that metadata comes first, followed by the features
meta_features = infer_cp_features(all_profiles_df, metadata=True)
cp_cols = infer_cp_features(all_profiles_df, metadata=False)
ordered_columns = meta_features + cp_cols

all_profiles_df = all_profiles_df.reindex(columns=ordered_columns)

print(all_profiles_df.shape)
all_profiles_df.head()

(1857, 3544)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_ID,Metadata_plate_filename,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,219973,B02,2021_03_03_Batch13,3767,2.5x10^3 cells/well,1.0,WT_parental,20210226.0,,,...,0.084843,0.102437,-0.030491,-0.075074,0.001607,0.022429,0.098508,0.117252,0.133341,0.108706
1,219973,B03,2021_03_03_Batch13,1998,2.5x10^3 cells/well,2.0,CloneA,20210226.0,,,...,-0.223822,-0.238489,-0.136545,-0.07884,-0.193393,-0.096617,-0.147783,-0.132505,-0.144746,-0.166323
2,219973,B04,2021_03_03_Batch13,1892,2.5x10^3 cells/well,3.0,Clone E,20210226.0,,,...,-1.517928,-1.510258,-1.450743,-1.430143,-1.448436,-1.42149,-1.51013,-1.510141,-1.502031,-1.490307
3,219973,B05,2021_03_03_Batch13,3392,2.5x10^3 cells/well,4.0,WT clone 01,20210226.0,,,...,-1.154861,-1.140066,-1.114752,-1.099559,-1.152448,-1.047178,-1.156163,-1.141424,-1.152997,-1.144164
4,219973,B06,2021_03_03_Batch13,3203,2.5x10^3 cells/well,5.0,WT clone 02,20210226.0,,,...,-0.211744,-0.222014,-0.213728,-0.153409,-0.203818,-0.152359,-0.222704,-0.227339,-0.231151,-0.223477


In [5]:
# Apply the same feature selection operations to the merged profiles
all_profiles_df = feature_select(all_profiles_df, operation=feature_select_ops)

# Remove the plate ID and plate filename metadata columns
plate_columns_to_drop = ["Metadata_plate_ID", "Metadata_plate_filename"]
all_profiles_df = all_profiles_df.drop(columns=plate_columns_to_drop)

print(all_profiles_df.shape)
all_profiles_df.head()

(1857, 407)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,...,Nuclei_Texture_Correlation_RNA_20_00,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_Entropy_RNA_20_03,Nuclei_Texture_InfoMeas1_AGP_5_00,Nuclei_Texture_InfoMeas1_DNA_10_01,Nuclei_Texture_InfoMeas1_DNA_5_02,Nuclei_Texture_InfoMeas1_ER_5_00,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas2_Mito_5_00,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01
0,219973,B02,2021_03_03_Batch13,3767,2.5x10^3 cells/well,1.0,WT_parental,20210226.0,219973,48 hr,...,0.289131,-1.728045,-0.100393,1.121394,-0.022202,1.531376,1.331481,-3.10409,2.279966,-0.624593
1,219973,B03,2021_03_03_Batch13,1998,2.5x10^3 cells/well,2.0,CloneA,20210226.0,219973,48 hr,...,0.436948,-0.64633,1.004583,0.457522,1.972381,-0.054827,0.44468,0.815532,-0.047137,0.405071
2,219973,B04,2021_03_03_Batch13,1892,2.5x10^3 cells/well,3.0,Clone E,20210226.0,219973,48 hr,...,1.157071,-1.564313,-1.669906,0.251932,2.174847,-0.073711,0.604467,1.677229,-1.687651,2.476248
3,219973,B05,2021_03_03_Batch13,3392,2.5x10^3 cells/well,4.0,WT clone 01,20210226.0,219973,48 hr,...,0.29559,-1.252088,-0.779466,1.952128,1.474118,0.968177,0.491561,-0.171471,-0.077082,1.196919
4,219973,B06,2021_03_03_Batch13,3203,2.5x10^3 cells/well,5.0,WT clone 02,20210226.0,219973,48 hr,...,0.363286,-0.194208,0.118434,0.721196,-0.092052,0.396447,0.715941,0.139263,-0.163614,-0.141617


In [6]:
# Make sure both output directories exist before writing; `to_csv` does not
# create missing parent directories and would fail on a fresh checkout
os.makedirs(output_dir, exist_ok=True)
os.makedirs(gct_dir, exist_ok=True)

# Write the merged, feature selected profiles as a gzipped CSV
output_file = os.path.join(output_dir, "all_merged_profiles.csv.gz")
all_profiles_df.to_csv(output_file, index=False, compression="gzip")

# Write the same profiles as a gct file for input into Morpheus
output_gct_file = os.path.join(gct_dir, "all_merged_profiles.gct")
write_gct(profiles=all_profiles_df, output_file=output_gct_file)

## Collapse replicates into consensus profiles

In [7]:
# Collapse replicate profiles into one consensus profile per
# clone number / treatment pair by taking the per-group median.
# numeric_only=True is passed explicitly: a median is undefined for the
# non-numeric metadata columns, and pandas >= 2.0 raises a TypeError for
# them instead of silently dropping them as older versions did.
consensus_group_cols = ["Metadata_clone_number", "Metadata_treatment"]
median_consensus_df = (
    all_profiles_df.groupby(consensus_group_cols)
    .median(numeric_only=True)
    .reset_index()
)

print(median_consensus_df.shape)
median_consensus_df.head()

(104, 399)


Unnamed: 0,Metadata_clone_number,Metadata_treatment,Metadata_cell_count,Metadata_celltype_shorthand_from_plate_graph,Metadata_date,Metadata_treatment_shorthand_from_plate_graph,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_FormFactor,Cells_AreaShape_Orientation,...,Nuclei_Texture_Correlation_RNA_20_00,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_Entropy_RNA_20_03,Nuclei_Texture_InfoMeas1_AGP_5_00,Nuclei_Texture_InfoMeas1_DNA_10_01,Nuclei_Texture_InfoMeas1_DNA_5_02,Nuclei_Texture_InfoMeas1_ER_5_00,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas2_Mito_5_00,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01
0,BZ001,0.1% DMSO,12491.0,9.0,20210205.0,1.0,-0.257844,0.128931,0.981971,0.187379,...,-0.4895,0.492163,1.009886,-1.236462,-0.519807,-0.935643,-1.255208,-1.034675,1.043896,-1.126122
1,BZ001,21 nM bortezomib,24448.0,,,,0.163763,0.030651,0.257632,-0.345442,...,-0.568693,0.254292,-0.452921,0.120423,-0.735252,0.189309,0.170067,0.36055,0.214372,0.056538
2,BZ001,Untreated,8173.0,,,,-0.175639,0.309249,0.689368,0.059856,...,-0.57418,0.755299,1.271127,-0.971689,0.229886,-0.828853,-1.106801,-1.016777,0.521635,-0.962902
3,BZ002,0.1% DMSO,2895.0,10.0,20210205.0,1.0,-0.07654,-0.280775,-0.030171,-0.350826,...,0.677405,0.13477,-0.13837,-0.269741,0.653912,-0.337982,-0.052744,0.266448,-0.270937,0.153387
4,BZ002,Untreated,26721.0,,,,-0.226119,-0.982649,-1.382389,0.616968,...,1.111341,-1.317973,-1.534522,1.933899,1.777682,1.568107,2.146568,2.968393,-3.040321,2.017309


In [8]:
# Destinations for the consensus profiles: a gzipped CSV and a gct file
output_file = os.path.join(output_dir, "consensus_profiles.csv.gz")
output_gct_file = os.path.join(gct_dir, "consensus_profiles.gct")

# Write the consensus profiles in both formats
median_consensus_df.to_csv(output_file, compression="gzip", index=False)
write_gct(output_file=output_gct_file, profiles=median_consensus_df)