# Aggregate single cells into bulk profiles using grit information

We use normalized **single-cell profiles** from the Cell Health experiment and **single-cell grit scores** (calculated with respect to Chr2 cutting controls, using normalized, feature-selected profiles).

We keep only the Cell Painting feature columns that are used for the Cell Health data elsewhere in the **Grit Benchmarking project**, to ensure consistency across analyses.

Here we aggregate profiles from the Cell Health experiments using several "grit-informed" methods:
1. Standard median aggregation
2. Weighted mean, weighting by raw grit scores
3. Weighted mean, weighting by softmax-transformed grit scores
4. Weighted mean, weighting by grit scores that are clipped at 0, i.e. any cell with a grit score < 0 is assigned a grit score of 0.
5. Weighted mean, weighting by grit scores that are clipped to 0 then softmax-transformed. 

Methods 3-5 were exploratory and did not yield dramatically improved results ("improved" assessed by replicate reproducibility measures). Therefore, they are commented out in this notebook to save on runtime and compute.

Note: depending on AWS instance size, some cell lines were aggregated with a python script of the same name in `scripts/`.

In [1]:
import os
import glob
import gzip
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.special import softmax 

from pycytominer import aggregate, get_na_columns
from pycytominer.cyto_utils import infer_cp_features
from cytominer_eval import evaluate
from scripts.utils import calculate_weighted_agg

In [2]:
def merge_metadata(cell_line, level3_profile):
    """Annotate aggregated profiles with the Cell Health platemap of one cell line.

    The platemap CSV is read from a pinned commit of the
    ``broadinstitute/cell-health`` repository, every platemap column is
    prefixed with ``Metadata_``, and the result is joined to
    ``level3_profile`` on the well identifier.

    Parameters
    ----------
    cell_line : str
        Cell line name; used to build the platemap file name
        ``DEPENDENCIES1_{cell_line}.csv``.
    level3_profile : pandas.DataFrame
        Profiles to annotate. Must contain a ``Metadata_Well`` column, which
        is matched against the platemap's ``Metadata_well_position``.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``level3_profile`` and the platemap. Columns whose name
        contains ``"Metadata"`` come first, followed by all other columns;
        the relative order within each group is preserved.
    """
    # load metadata file from Cell Health data
    commit = "07e4b40c39dd27084be36fbef4d64c5654b2960f"
    base_url = f"https://github.com/broadinstitute/cell-health/raw/{commit}"
    url = f"{base_url}/1.generate-profiles/data/metadata/platemap/DEPENDENCIES1_{cell_line}.csv"
    platemap = pd.read_csv(url, sep=",")
    platemap.columns = ['Metadata_' + str(x) for x in platemap.columns]

    # merge with the aggregated files
    meta_df = pd.merge(level3_profile, platemap, left_on='Metadata_Well', right_on='Metadata_well_position')

    # reorder columns for metadata to be in front.
    # The set of metadata columns is computed once instead of once per column;
    # sorted() is stable, so the original relative order is kept in each group.
    metadata_cols = set(meta_df.filter(like="Metadata").columns)
    meta_df = meta_df[sorted(meta_df, key=lambda col: col not in metadata_cols)]

    return meta_df

## Load level 2 data

In [3]:
# Plate barcodes grouped by cell line; the per-plate input files are matched
# against these lists to decide which cell line each file belongs to.
plate_dict = dict(
    ES2=[
        "SQ00014613",
        "SQ00014614",
        "SQ00014615",
    ],
    A549=[
        "SQ00014610",
        "SQ00014611",
        "SQ00014612",
    ],
    HCC44=[
        "SQ00014616",
        "SQ00014617",
        "SQ00014618",
    ],
)


In [4]:
# Take the same feature columns as the Cell Health data in the rest of the
# Grit Benchmark project. The feature-selected profiles are read from a pinned
# commit of the grit-benchmark repository so the column set is reproducible.
commit = "2916770cc9cc9e75b693348b683aa398987fb9f9"
base_url = f"https://github.com/broadinstitute/grit-benchmark/raw/{commit}"
url = f"{base_url}/1.calculate-metrics/cell-health/data/cell_health_merged_feature_select.csv.gz"

df = pd.read_csv(url, sep=",")
print(df.shape)

# Cell Painting feature columns inferred from the feature-selected profiles;
# used later to decide which feature columns of the single-cell data to keep.
cols_to_keep = infer_cp_features(df)

(3456, 402)


In [None]:
%%time
results_folder = 'data/aggregated-profiles/'
    
for cell_line in ['ES2', 'HCC44', 'A549']: 
    ####### read in single-cell grit data #######
    start_merge = datetime.now()
    grit_folder = '../../../1.calculate-metrics/cell-health/results/'
    grit_files = glob.glob(grit_folder+'*single_cell_grit*.tsv.gz')

    scgrit_df = []
    for file in grit_files:
        plate_name=file.split('/')[-1].split('_')[-2]
        if plate_name in plate_dict[cell_line]:
            print(f"adding scrgrit of {plate_name} to list of {cell_line}")
            scgrit_plate = pd.read_csv(file, sep='\t').assign(plate=plate_name, cell_line = cell_line)
            print(scgrit_plate.shape)
            scgrit_df.append(scgrit_plate)
    scgrit_df = pd.concat(scgrit_df)
    scgrit_df['cell_identity'] = scgrit_df.perturbation.str.split("_", expand=True)[1].astype(int)
    scgrit_df.columns = ['Metadata_'  + str(col) for col in scgrit_df.columns]
    print(f"total shape of of scgrit_df for {cell_line} is: {scgrit_df.shape}")
    
    ####### read in single-cell cell painting profiles #######
    profile_folder = '../../../0.download-data/data/cell_health/normalized/' 
    profile_files = glob.glob(profile_folder+'*normalized.csv.gz')

    scprofiles_df = []
    for file in profile_files:
        plate_name=file.split('/')[-1].split('_')[0]
        if plate_name in plate_dict[cell_line]:
            print(f"adding scprofiles of {plate_name} to list of {cell_line}")
            scprofile_plate = (pd.read_csv(file, sep=',', low_memory=False)
                               .reset_index()
                               .rename({'index':'Metadata_cell_identity'}, axis='columns')
                              ).assign(cell_line = cell_line)
            plate_cols = infer_cp_features(scprofile_plate)
            drop_cols = [x for x in plate_cols if x not in cols_to_keep]
            scprofile_plate.drop(columns = drop_cols, inplace=True)
            scprofiles_df.append(scprofile_plate)
    scprofiles_df = pd.concat(scprofiles_df, sort=False)
    print(f'total shape of scprofiles_df for {cell_line} is: {scprofiles_df.shape}')

    ####### merge scgrit scores + cell painting profiles #######
    scprofiles_df = (pd.merge(scprofiles_df, scgrit_df, 
         left_on=['Metadata_cell_identity', 'Metadata_Plate', 'Metadata_pert_name'], 
                 right_on=['Metadata_cell_identity', 'Metadata_plate', 'Metadata_group'])
        )
    del scgrit_df
    print(f"total shape of sc_df for {cell_line} is: {scprofiles_df.shape}")
    # remove columns with any NA entries
    na_cols_to_drop = get_na_columns(scprofiles_df, cutoff=0)
    print(f"Dropping {len(na_cols_to_drop)} columns because of missing data")
    scprofiles_df = scprofiles_df.drop(na_cols_to_drop, axis="columns")
    print(f"FINAL shape of merged data {scprofiles_df.shape}")

    print(f"TOTAL TIME constructing merged df for cell_line {cell_line} : {str(datetime.now()-start_merge)}")
    
    
    ###### standard median aggregation ######
    start_agg = datetime.now()
    agg_df = aggregate(
        population_df = scprofiles_df,
        strata = ["Metadata_Plate", "Metadata_Well"],
        features = "infer",
        operation ="median"
    ).assign(Metadata_agg_method = 'median', cell_line = cell_line)
    agg_meta_df = merge_metadata(cell_line, agg_df)
    display(agg_meta_df.head())
    # writing data
    agg_meta_df.to_csv(Path(results_folder + cell_line + "_median.tsv"), index=False, sep='\t')
    
    ###### grit-informed aggregation methods ######
    ### raw grit as weights ###
    agg_df = (calculate_weighted_agg(
        population_df = scprofiles_df,
        columns = ['Metadata_Plate', 'Metadata_Well'],
        features = 'infer',
        transform = 'weighted_grit', weight = 'Metadata_grit')
                    ).assign(Metadata_agg_method = 'weighted', cell_line = cell_line)
    agg_meta_df = merge_metadata(cell_line, agg_df)
    display(agg_meta_df.head())
    # writing data
    agg_meta_df.to_csv(Path(results_folder + cell_line + "_weighted.tsv"), index=False, sep='\t')
    
#     ### grit that is softmax-transformed as weights ###
#     agg_df = (calculate_weighted_agg(
#         population_df = scprofiles_df,
#         columns = ['Metadata_Plate', 'Metadata_Well'],
#         features = 'infer',
#         transform = 'softmax_grit', weight = 'Metadata_grit')
#                    ).assign(Metadata_agg_method = 'softmax', cell_line = cell_line)
#     agg_meta_df = merge_metadata(cell_line, agg_df)
#     # writing data
#     agg_meta_df.to_csv(Path(results_folder + cell_line + "_softmax.tsv"), index=False, sep='\t')
    
#     ### grit clipped to 0 (as lowest values), as weights ###
#     agg_df = (calculate_weighted_agg(
#         population_df = scprofiles_df,
#         columns = ['Metadata_Plate', 'Metadata_Well'], 
#         features = 'infer',
#         transform = 'weighted_grit', weight='Metadata_clipped_grit', lower_threshold=0)
#                        ).assign(Metadata_agg_method = 'clipped0_weighted', cell_line = cell_line)
#     agg_meta_df = merge_metadata(cell_line, agg_df)
#     # writing data
#     agg_meta_df.to_csv(Path(results_folder + cell_line + "_clipped0_weighted.tsv"), index=False, sep='\t')
    
#     ### grit clipped to 0 (as lowest values), then softmax-transfored, as weights ###
#     agg_df = (calculate_weighted_agg(
#         population_df = scprofiles_df, 
#         columns = ['Metadata_Plate', 'Metadata_Well'], 
#         features = 'infer',
#         transform = 'softmax_grit', weight='Metadata_clipped_grit', lower_threshold=0)
#                        ).assign(Metadata_agg_method = 'clipped0_softmax')
#     agg_meta_df = merge_metadata(cell_line, agg_df)
#     # writing data
#     agg_meta_df.to_csv(Path(results_folder + cell_line + "_clipped0_softmax.tsv"), index=False, sep='\t')
    

    print(f"TOTAL TIME performing aggregation for cell_line {cell_line} : {str(datetime.now()-start_agg)}")


adding scrgrit of SQ00014614 to list of ES2
(988494, 9)
adding scrgrit of SQ00014613 to list of ES2
(876530, 9)
adding scrgrit of SQ00014615 to list of ES2
(966400, 9)
total shape of of scgrit_df for ES2 is: (2831424, 10)
adding scprofiles of SQ00014613 to list of ES2
adding scprofiles of SQ00014615 to list of ES2
adding scprofiles of SQ00014614 to list of ES2
total shape of scprofiles_df for ES2 is: (2182688, 410)
total shape of sc_df for ES2 is: (2831424, 419)
Dropping 9 columns because of missing data
FINAL shape of merged data (2831424, 410)
TOTAL TIME constructing merged df for cell_line ES2 : 0:32:45.990254


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumAverage_Mito_20_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0,cell_line
0,SQ00014613,A02,median,A,2,A02,MCL1,MCL1-5,,ES2,...,0.20615,0.090475,0.5224,0.134695,-0.089,0.06109,0.188135,0.02096,-0.064245,ES2
1,SQ00014614,A02,median,A,2,A02,MCL1,MCL1-5,,ES2,...,0.35814,0.20426,0.58672,-0.067,-0.21768,-0.10394,0.06454,0.01784,-0.16262,ES2
2,SQ00014615,A02,median,A,2,A02,MCL1,MCL1-5,,ES2,...,0.00897,0.09371,0.33165,-0.25913,-0.36323,-0.28684,0.0172,-0.08949,-0.162,ES2
3,SQ00014613,A03,median,A,3,A03,AKT1,AKT1-1,BRDN0001054908,ES2,...,0.066575,-0.06607,0.51927,0.47574,0.395135,0.60169,0.42106,0.47417,0.245865,ES2
4,SQ00014614,A03,median,A,3,A03,AKT1,AKT1-1,BRDN0001054908,ES2,...,0.25445,0.10768,0.4144,-0.04705,-0.11018,-0.02567,0.07987,0.06338,-0.09919,ES2


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumAverage_Mito_20_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0,cell_line
0,SQ00014613,A02,weighted,A,2,A02,MCL1,MCL1-5,,ES2,...,0.1828,0.24016,0.422827,-0.295515,-0.608752,-0.040608,-0.588076,-0.661759,-0.435866,ES2
1,SQ00014614,A02,weighted,A,2,A02,MCL1,MCL1-5,,ES2,...,0.557861,0.638709,0.565629,-0.799344,-1.038416,-1.113257,-0.804987,-1.143842,-1.211263,ES2
2,SQ00014615,A02,weighted,A,2,A02,MCL1,MCL1-5,,ES2,...,-0.207994,0.179889,0.230644,-0.649638,-0.821169,-0.8222,-0.469439,-0.772563,-0.661172,ES2
3,SQ00014613,A03,weighted,A,3,A03,AKT1,AKT1-1,BRDN0001054908,ES2,...,-0.003206,-0.379703,0.925545,1.029963,0.911331,1.109489,0.762895,0.867513,0.655954,ES2
4,SQ00014614,A03,weighted,A,3,A03,AKT1,AKT1-1,BRDN0001054908,ES2,...,0.525038,0.223702,0.655633,0.208878,0.087791,0.056018,0.109554,0.024908,-0.139356,ES2


TOTAL TIME performing aggregation for cell_line ES2 : 0:00:57.239114
adding scrgrit of SQ00014616 to list of HCC44
(808243, 9)
adding scrgrit of SQ00014618 to list of HCC44
(819330, 9)
adding scrgrit of SQ00014617 to list of HCC44
(836453, 9)
total shape of of scgrit_df for HCC44 is: (2464026, 10)
adding scprofiles of SQ00014617 to list of HCC44
adding scprofiles of SQ00014618 to list of HCC44
adding scprofiles of SQ00014616 to list of HCC44
total shape of scprofiles_df for HCC44 is: (2462075, 410)
total shape of sc_df for HCC44 is: (2464026, 419)
Dropping 16 columns because of missing data
FINAL shape of merged data (2464026, 403)
TOTAL TIME constructing merged df for cell_line HCC44 : 0:37:14.274309


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumAverage_Mito_20_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0,cell_line
0,SQ00014616,A02,median,A,2,A02,MCL1,MCL1-5,,HCC44,...,0.03954,-0.02786,0.514825,-0.13396,-0.24867,-0.050715,0.07277,0.097245,0.134875,HCC44
1,SQ00014617,A02,median,A,2,A02,MCL1,MCL1-5,,HCC44,...,-0.012115,0.002385,0.430125,-0.227555,-0.214985,-0.23921,0.043055,0.023655,0.03988,HCC44
2,SQ00014618,A02,median,A,2,A02,MCL1,MCL1-5,,HCC44,...,0.09071,-0.01719,0.73299,-0.09675,-0.19868,-0.05966,0.13852,0.06072,0.12089,HCC44
3,SQ00014616,A03,median,A,3,A03,AKT1,AKT1-1,BRDN0001054908,HCC44,...,-0.156895,-0.064795,0.274815,-0.25718,-0.26587,-0.17986,-0.007395,-0.053745,0.082905,HCC44
4,SQ00014617,A03,median,A,3,A03,AKT1,AKT1-1,BRDN0001054908,HCC44,...,-0.01701,0.025245,0.257695,-0.278425,-0.314695,-0.290535,-0.013745,-0.10082,-0.02989,HCC44


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumAverage_Mito_20_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0,cell_line
0,SQ00014616,A02,weighted,A,2,A02,MCL1,MCL1-5,,HCC44,...,0.09443,-0.120016,0.765638,-0.074387,-0.198808,-0.037867,-0.043378,0.04733,0.141164,HCC44
1,SQ00014617,A02,weighted,A,2,A02,MCL1,MCL1-5,,HCC44,...,-0.246802,-0.104734,0.351034,-0.296622,-0.33969,-0.492832,-0.241436,-0.317508,-0.356073,HCC44
2,SQ00014618,A02,weighted,A,2,A02,MCL1,MCL1-5,,HCC44,...,0.296493,0.039699,0.808674,0.142087,-0.035677,0.032796,0.174022,0.127833,0.136506,HCC44
3,SQ00014616,A03,weighted,A,3,A03,AKT1,AKT1-1,BRDN0001054908,HCC44,...,-1.006866,-0.554451,0.014792,-0.558036,-0.652535,-0.826281,-0.61019,-0.935854,-0.6045,HCC44
4,SQ00014617,A03,weighted,A,3,A03,AKT1,AKT1-1,BRDN0001054908,HCC44,...,-0.488312,-0.125402,0.004596,-0.56201,-0.760358,-0.937186,-0.494233,-0.851214,-0.742682,HCC44


TOTAL TIME performing aggregation for cell_line HCC44 : 0:00:49.526009
adding scrgrit of SQ00014612 to list of A549
(1408094, 9)
adding scrgrit of SQ00014610 to list of A549
(1460518, 9)
adding scrgrit of SQ00014611 to list of A549
(1429783, 9)
total shape of of scgrit_df for A549 is: (4298395, 10)
adding scprofiles of SQ00014611 to list of A549
