# Aggregate single cells of EMPTY wells into bulk profiles using standard median aggregation

In this notebook, we isolate **single-cell** profiles of the `EMPTY` perturbation from the Cell Health experiment and use the standard median aggregation method to form bulk profiles. 

EMPTY wells were excluded from the initial single-cell grit calculations; therefore, grit-informed aggregation techniques are not applied here.

In [1]:
import os
import glob
import gzip
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.special import softmax 

from pycytominer import aggregate, get_na_columns
from pycytominer.cyto_utils import infer_cp_features
from cytominer_eval import evaluate
from scripts.utils import calculate_weighted_agg

In [2]:
def merge_metadata(cell_line, level3_profile):
    """Attach Cell Health platemap metadata to well-level profiles.

    Downloads the ``DEPENDENCIES1_{cell_line}.csv`` platemap from a pinned
    commit of the broadinstitute/cell-health repository, prefixes every
    platemap column with ``Metadata_``, and merges it onto ``level3_profile``
    by matching ``Metadata_Well`` to the platemap's ``Metadata_well_position``.

    Parameters
    ----------
    cell_line : str
        Cell line name used to build the platemap file name.
    level3_profile : pandas.DataFrame
        Profiles to annotate; must contain a ``Metadata_Well`` column.

    Returns
    -------
    pandas.DataFrame
        Merged frame in which all columns whose name contains "Metadata" come
        first (relative order preserved), followed by the remaining columns.
        The merge uses pandas' default inner join, so rows whose well is not
        present in the platemap are dropped.
    """
    # load metadata file from Cell Health data (commit pinned for reproducibility)
    commit = "07e4b40c39dd27084be36fbef4d64c5654b2960f"
    base_url = f"https://github.com/broadinstitute/cell-health/raw/{commit}"
    url = f"{base_url}/1.generate-profiles/data/metadata/platemap/DEPENDENCIES1_{cell_line}.csv"
    platemap = pd.read_csv(url, sep=",")
    platemap.columns = ['Metadata_' + str(x) for x in platemap.columns]

    # merge with the aggregated files
    meta_df = pd.merge(level3_profile, platemap, left_on='Metadata_Well', right_on='Metadata_well_position')

    # reorder columns for metadata to be in front; the set of metadata columns
    # is computed once instead of re-filtering the frame for every column.
    # `sorted` is stable, so the relative order within each group is kept.
    metadata_cols = set(meta_df.filter(like="Metadata").columns)
    meta_df = meta_df[sorted(meta_df, key=lambda x: x not in metadata_cols)]

    return meta_df

## Load level 2 data

In [3]:
# Plate IDs grouped by cell line; used to decide which plate files belong to
# which cell line when the single-cell profiles are loaded.
plate_dict = dict(
    ES2=["SQ00014613", "SQ00014614", "SQ00014615"],
    A549=["SQ00014610", "SQ00014611", "SQ00014612"],
    HCC44=["SQ00014616", "SQ00014617", "SQ00014618"],
)


In [4]:
# take the same columns as original Cell Health paper did:
# the feature-selected Cell Health file (pinned grit-benchmark commit) defines
# which CellProfiler features are kept when the single-cell plates are loaded.
commit = "2916770cc9cc9e75b693348b683aa398987fb9f9"
base_url = f"https://github.com/broadinstitute/grit-benchmark/raw/{commit}"
url = f"{base_url}/1.calculate-metrics/cell-health/data/cell_health_merged_feature_select.csv.gz"

df = pd.read_csv(url, sep=",")
print(df.shape)

# NOTE: a bare `df.head(2)` used to sit here; as a mid-cell expression it was
# never displayed, so it has been removed.
cols_to_keep = infer_cp_features(df)

(3456, 402)


In [None]:
%%time
# Output directory for the aggregated EMPTY profiles; created up front so the
# `to_csv` calls below cannot fail on a fresh checkout.
results_folder = 'data/aggregated-profiles/'
Path(results_folder).mkdir(parents=True, exist_ok=True)

# The set of normalized single-cell files does not depend on the cell line, so
# list it once. Sorting makes the processing order (and the log) reproducible,
# since glob returns files in arbitrary order.
profile_folder = '../../../0.download-data/data/cell_health/normalized/'
profile_files = sorted(glob.glob(profile_folder + '*normalized.csv.gz'))

# Set lookup instead of scanning the list once per plate column.
features_to_keep = set(cols_to_keep)

for cell_line in ['ES2', 'HCC44', 'A549']:
    ####### read in single-cell cell painting profiles #######
    plate_profiles = []
    for file in profile_files:
        # The plate ID is taken as the file-name prefix before the first underscore
        # (Path.name instead of splitting on '/' keeps this OS-independent).
        plate_name = Path(file).name.split('_')[0]
        if plate_name not in plate_dict[cell_line]:
            continue
        print(f"adding scprofiles of {plate_name} to list of {cell_line}")
        scprofile_plate = (
            pd.read_csv(file, sep=',', low_memory=False)
            # keep each cell's row position in the plate file as an identifier;
            # this must happen before filtering so the positions refer to the full file
            .reset_index()
            .rename({'index': 'Metadata_cell_identity'}, axis='columns')
            .assign(cell_line=cell_line)
            # only cells from EMPTY wells are aggregated in this notebook
            .query("Metadata_gene_name == 'EMPTY' & Metadata_pert_name == 'EMPTY' ")
        )
        # drop CellProfiler features that are not in the reference feature list
        drop_cols = [x for x in infer_cp_features(scprofile_plate) if x not in features_to_keep]
        plate_profiles.append(scprofile_plate.drop(columns=drop_cols))

    if not plate_profiles:
        # pd.concat on an empty list raises an uninformative ValueError
        raise FileNotFoundError(
            f"no normalized single-cell files for {cell_line} found in {profile_folder}"
        )
    scprofiles_df = pd.concat(plate_profiles, sort=False)
    print(f'total shape of scprofiles_df for {cell_line} is: {scprofiles_df.shape}')

    # remove columns with any NA entries
    na_cols_to_drop = get_na_columns(scprofiles_df, cutoff=0)
    print(f"Dropping {len(na_cols_to_drop)} columns because of missing data")
    scprofiles_df = scprofiles_df.drop(na_cols_to_drop, axis="columns")
    print(f"FINAL shape of merged data {scprofiles_df.shape}")

    ###### standard median aggregation ######
    # timing starts here, so the reported time covers aggregation + writing only
    start_agg = datetime.now()
    agg_df = aggregate(
        population_df=scprofiles_df,
        strata=["Metadata_Plate", "Metadata_Well"],
        features="infer",
        operation="median",
    ).assign(Metadata_agg_method='median', cell_line=cell_line)
    agg_meta_df = merge_metadata(cell_line, agg_df)
    display(agg_meta_df.head())
    # writing data
    agg_meta_df.to_csv(Path(results_folder) / f"{cell_line}_median_EMPTY.tsv", index=False, sep='\t')

    # Grit-weighted aggregation variants (raw, softmax, clipped) are intentionally
    # not run here: EMPTY wells were excluded from the single-cell grit
    # calculations, so there are no grit scores to use as weights.

    print(f"TOTAL TIME performing aggregation for cell_line {cell_line} : {str(datetime.now()-start_agg)}")


adding scprofiles of SQ00014613 to list of ES2
adding scprofiles of SQ00014615 to list of ES2
adding scprofiles of SQ00014614 to list of ES2
total shape of scprofiles_df for ES2 is: (357419, 410)
Dropping 4 columns because of missing data
FINAL shape of merged data (357419, 406)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumAverage_Mito_20_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0,cell_line
0,SQ00014613,A01,median,A,1,A01,EMPTY,EMPTY,,ES2,...,0.074865,0.081375,-0.18641,0.18801,0.006715,0.260625,0.150855,0.11631,-0.04441,ES2
1,SQ00014614,A01,median,A,1,A01,EMPTY,EMPTY,,ES2,...,-6.5e-05,0.02294,0.53998,-0.32356,-0.50933,-0.24021,-0.14203,-0.279705,-0.08708,ES2
2,SQ00014615,A01,median,A,1,A01,EMPTY,EMPTY,,ES2,...,0.0102,0.11628,0.59835,-0.14867,-0.24256,-0.11796,0.0621,-0.05909,-0.13158,ES2
3,SQ00014613,A06,median,A,6,A06,EMPTY,EMPTY,,ES2,...,0.22587,0.21693,0.16312,0.03754,-0.14398,-0.05396,0.05857,-0.07664,-0.20989,ES2
4,SQ00014614,A06,median,A,6,A06,EMPTY,EMPTY,,ES2,...,0.121515,0.06512,0.285865,-0.10645,-0.05129,-0.08578,0.1015,0.10926,-0.00947,ES2


TOTAL TIME performing aggregation for cell_line ES2 : 0:00:05.960605
adding scprofiles of SQ00014617 to list of HCC44
adding scprofiles of SQ00014618 to list of HCC44
adding scprofiles of SQ00014616 to list of HCC44
total shape of scprofiles_df for HCC44 is: (396287, 410)
Dropping 4 columns because of missing data
FINAL shape of merged data (396287, 406)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumAverage_Mito_20_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0,cell_line
0,SQ00014616,A01,median,A,1,A01,EMPTY,EMPTY,,HCC44,...,0.00066,-0.00939,0.64333,-0.29511,-0.34944,-0.17226,-0.07515,-0.06384,0.08315,HCC44
1,SQ00014617,A01,median,A,1,A01,EMPTY,EMPTY,,HCC44,...,0.16212,0.03496,0.54961,-0.21965,-0.45371,-0.40244,-0.00983,-0.15204,-0.13723,HCC44
2,SQ00014618,A01,median,A,1,A01,EMPTY,EMPTY,,HCC44,...,-0.07714,-0.00642,0.41968,-0.52551,-0.56298,-0.32615,-0.24438,-0.19062,0.11452,HCC44
3,SQ00014616,A06,median,A,6,A06,EMPTY,EMPTY,,HCC44,...,-0.12656,-0.06985,0.22345,-0.16012,-0.22411,-0.09953,0.12275,0.00518,0.12212,HCC44
4,SQ00014617,A06,median,A,6,A06,EMPTY,EMPTY,,HCC44,...,0.0791,0.08545,0.25111,-0.2632,-0.19289,-0.26599,0.05323,0.06072,0.05336,HCC44


TOTAL TIME performing aggregation for cell_line HCC44 : 0:00:06.197306
adding scprofiles of SQ00014611 to list of A549
adding scprofiles of SQ00014610 to list of A549
adding scprofiles of SQ00014612 to list of A549
