# Aggregate single cells into bulk profiles using grit information

We use normalized **single-cell profiles** from the Cell Health experiment and **single-cell grit scores** (calculated with respect to Chr2 cutting controls, using normalized, feature-selected profiles).

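Here we aggregate profiles from the Cell Health experiments using standard median aggregation as a baseline and several "grit-informed" weighted means (methods 3 and 4 are currently commented out in the code below):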
1. Standard median aggregation
2. Weighted mean, weighting by raw grit scores
3. Weighted mean, weighting by softmax-transformed grit scores
4. Weighted mean, weighting by grit scores clipped at 0, i.e. any cell with a grit score < 0 is assigned a grit score of 0.

In [2]:
import os
import glob
import gzip
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.special import softmax

from pycytominer import aggregate, get_na_columns
from pycytominer.cyto_utils import infer_cp_features
from cytominer_eval import evaluate
from scripts.utils import calculate_weighted_agg

In [3]:
def merge_metadata(cell_line, level3_profile, platemap=None):
    """Annotate aggregated profiles with the Cell Health platemap metadata.

    Parameters
    ----------
    cell_line : str
        Cell line name; used to build the platemap file name
        (``DEPENDENCIES1_{cell_line}.csv``) when the platemap is downloaded.
    level3_profile : pandas.DataFrame
        Profiles to annotate. Must contain a ``Metadata_Well`` column.
    platemap : pandas.DataFrame, optional
        Raw platemap with un-prefixed column names (must include
        ``well_position``). When omitted, the platemap is downloaded from a
        pinned commit of the broadinstitute/cell-health repository.

    Returns
    -------
    pandas.DataFrame
        Inner merge of ``level3_profile`` and the platemap on the well, with
        every column whose name contains "Metadata" moved to the front
        (relative column order is otherwise preserved).
    """
    if platemap is None:
        # load metadata file from Cell Health data (pinned to a fixed commit)
        commit = "07e4b40c39dd27084be36fbef4d64c5654b2960f"
        base_url = f"https://github.com/broadinstitute/cell-health/raw/{commit}"
        url = f"{base_url}/1.generate-profiles/data/metadata/platemap/DEPENDENCIES1_{cell_line}.csv"
        platemap = pd.read_csv(url, sep=",")

    # rename() returns a new frame, so a caller-supplied platemap is left untouched
    platemap = platemap.rename(columns=lambda col: "Metadata_" + str(col))

    # merge with the aggregated files
    meta_df = pd.merge(
        level3_profile,
        platemap,
        left_on="Metadata_Well",
        right_on="Metadata_well_position",
    )

    # reorder columns for metadata to be in front; the metadata column set is
    # computed once instead of once per column inside the sort key
    metadata_cols = set(meta_df.filter(like="Metadata").columns)
    # sorted() is stable and False < True, so metadata columns come first
    ordered_cols = sorted(meta_df.columns, key=lambda col: col not in metadata_cols)

    return meta_df[ordered_cols]

## Load level 2 data

In [4]:
# Plate names belonging to each cell line; the loading loop below uses this
# mapping to pick the grit and profile files of the current cell line.
plate_dict = dict(
    ES2=["SQ00014613", "SQ00014614", "SQ00014615"],
    A549=["SQ00014610", "SQ00014611", "SQ00014612"],
    HCC44=["SQ00014616", "SQ00014617", "SQ00014618"],
)

In [21]:
%%time
results_folder = "data/aggregated_profiles/"
# DataFrame.to_csv does not create missing folders, so make sure the target exists
Path(results_folder).mkdir(parents=True, exist_ok=True)

cell_lines = ["ES2", "A549", "HCC44"]
strata = ["Metadata_Plate", "Metadata_Well"]

# The input folders are the same for every cell line, so list the files once.
# Sorting makes the concatenation order independent of the file system.
grit_folder = "../../../1.calculate-metrics/cell-health/results/"
grit_files = sorted(glob.glob(grit_folder + "*single_cell_grit*.tsv.gz"))
profile_folder = "../../../0.download-data/data/cell_health/normalized/"
profile_files = sorted(glob.glob(profile_folder + "*normalized.csv.gz"))

# Grit-informed aggregations to run, one entry per output file:
# (aggregation label, transform, weight column, extra keyword arguments).
# The label is stored in Metadata_agg_method and used in the output file name.
weighted_methods = [
    # raw grit as weights
    ("weighted", "weighted_grit", "Metadata_grit", {}),
    # Disabled in this run; uncomment an entry to also write those profiles.
    # grit that is softmax-transformed as weights
    # ("softmax", "softmax_grit", "Metadata_grit", {}),
    # grit clipped to 0 (as lowest values), as weights
    # ("clipped0_weighted", "weighted_grit", "Metadata_clipped_grit", {"lower_threshold": 0}),
    # grit clipped to 0 (as lowest values), then softmax-transformed, as weights
    # ("clipped0_softmax", "softmax_grit", "Metadata_clipped_grit", {"lower_threshold": 0}),
]


def annotate_and_save(agg_df, cell_line, agg_method, out_folder):
    """Label aggregated profiles, attach platemap metadata, preview and save them.

    Adds the ``Metadata_agg_method`` and ``cell_line`` columns, merges the
    platemap via ``merge_metadata``, displays the first rows and writes
    ``{out_folder}/{cell_line}_{agg_method}.tsv`` (tab-separated, no index).

    Returns the annotated data frame.
    """
    labelled_df = agg_df.assign(Metadata_agg_method=agg_method, cell_line=cell_line)
    annotated_df = merge_metadata(cell_line, labelled_df)
    display(annotated_df.head())
    # writing data
    annotated_df.to_csv(
        Path(out_folder) / f"{cell_line}_{agg_method}.tsv", index=False, sep="\t"
    )
    return annotated_df


for cell_line in cell_lines:
    start_merge = datetime.now()
    plates = set(plate_dict[cell_line])

    ####### read in single-cell grit data #######
    scgrit_plates = []
    for grit_file in grit_files:
        # the plate name is the second-to-last "_"-separated token of the file name
        plate_name = Path(grit_file).name.split("_")[-2]
        if plate_name not in plates:
            continue
        print(f"adding scrgrit of {plate_name} to list of {cell_line}")
        scgrit_plate = pd.read_csv(grit_file, sep="\t").assign(
            plate=plate_name, cell_line=cell_line
        )
        print(scgrit_plate.shape)
        scgrit_plates.append(scgrit_plate)
    if not scgrit_plates:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
        raise ValueError(
            f"No single-cell grit files found for {cell_line} in {grit_folder}"
        )
    scgrit_df = pd.concat(scgrit_plates)
    del scgrit_plates, scgrit_plate  # release the per-plate copies
    # the integer after the first "_" of `perturbation` is used as the cell
    # identifier when merging with the profiles below
    scgrit_df["cell_identity"] = scgrit_df.perturbation.str.split("_", expand=True)[
        1
    ].astype(int)
    scgrit_df.columns = ["Metadata_" + str(col) for col in scgrit_df.columns]
    print(f"total shape of of scgrit_df for {cell_line} is: {scgrit_df.shape}")

    ####### read in single-cell cell painting profiles #######
    scprofile_plates = []
    for profile_file in profile_files:
        # the plate name is the first "_"-separated token of the file name
        plate_name = Path(profile_file).name.split("_")[0]
        if plate_name not in plates:
            continue
        print(f"adding scprofiles of {plate_name} to list of {cell_line}")
        # the row position within the plate file becomes the cell identifier
        scprofile_plate = (
            pd.read_csv(profile_file, sep=",", low_memory=False)
            .reset_index()
            .rename({"index": "Metadata_cell_identity"}, axis="columns")
        ).assign(cell_line=cell_line)
        scprofile_plates.append(scprofile_plate)
    if not scprofile_plates:
        raise ValueError(
            f"No single-cell profile files found for {cell_line} in {profile_folder}"
        )
    scprofiles_df = pd.concat(scprofile_plates, sort=False)
    del scprofile_plates, scprofile_plate  # release the per-plate copies
    print(f"total shape of scprofiles_df for {cell_line} is: {scprofiles_df.shape}")

    ####### merge scgrit scores + cell painting profiles #######
    # NOTE(review): in the recorded ES2 run the merged frame has more rows
    # (876530) than the profiles (685505), i.e. some cells match several grit
    # rows -- confirm that this duplication is intended.
    scprofiles_df = pd.merge(
        scprofiles_df,
        scgrit_df,
        left_on=["Metadata_cell_identity", "Metadata_Plate", "Metadata_pert_name"],
        right_on=["Metadata_cell_identity", "Metadata_plate", "Metadata_group"],
    )
    del scgrit_df
    print(f"total shape of sc_df for {cell_line} is: {scprofiles_df.shape}")
    # remove columns with any NA entries
    na_cols_to_drop = get_na_columns(scprofiles_df, cutoff=0)
    print(f"Dropping {len(na_cols_to_drop)} columns because of missing data")
    scprofiles_df = scprofiles_df.drop(na_cols_to_drop, axis="columns")
    print(f"FINAL shape of merged data {scprofiles_df.shape}")

    print(
        f"TOTAL TIME constructing merged df for cell_line {cell_line} : {str(datetime.now()-start_merge)}"
    )

    ###### standard median aggregation ######
    start_agg = datetime.now()
    agg_df = aggregate(
        population_df=scprofiles_df,
        strata=list(strata),
        features="infer",
        operation="median",
    )
    agg_meta_df = annotate_and_save(agg_df, cell_line, "median", results_folder)

    ###### grit-informed aggregation methods ######
    for agg_method, transform, weight, extra_kwargs in weighted_methods:
        agg_df = calculate_weighted_agg(
            population_df=scprofiles_df,
            columns=list(strata),
            features="infer",
            transform=transform,
            weight=weight,
            **extra_kwargs,
        )
        agg_meta_df = annotate_and_save(agg_df, cell_line, agg_method, results_folder)

    print(
        f"TOTAL TIME performing aggregation for cell_line {cell_line} : {str(datetime.now()-start_agg)}"
    )

adding scrgrit of SQ00014613 to list of ES2
(876530, 9)
total shape of of scgrit_df for ES2 is: (876530, 10)
adding scprofiles of SQ00014613 to list of ES2
total shape of scprofiles_df for ES2 is: (685505, 1796)
total shape of sc_df for ES2 is: (876530, 1805)
Dropping 39 columns because of missing data
FINAL shape of merged data (876530, 1766)
TOTAL TIME constructing merged df for cell_line ES2 : 0:10:54.915196


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0,cell_line
0,SQ00014613,A02,median,A,2,A02,MCL1,MCL1-5,,ES2,...,0.14335,0.143565,0.09113,-0.157715,-0.148025,-0.172415,-0.20107,-0.09962,-0.27301,ES2
1,SQ00014613,A03,median,A,3,A03,AKT1,AKT1-1,BRDN0001054908,ES2,...,0.04613,0.002785,0.06012,0.066905,-0.011635,0.095225,-0.17611,-0.125825,-0.17387,ES2
2,SQ00014613,A04,median,A,4,A04,KRAS,KRAS-2B,,ES2,...,-0.01182,-0.08123,0.00711,-0.00319,-0.068795,0.0118,-0.15107,-0.14304,-0.149895,ES2
3,SQ00014613,A05,median,A,5,A05,AKT1,AKT1-2,BRDN0001055115,ES2,...,0.0445,-0.01732,0.07302,0.01738,-0.04466,0.02401,-0.14602,-0.1439,-0.14173,ES2
4,SQ00014613,A07,median,A,7,A07,BRAF1,BRAF1-1,BRDN0000986682,ES2,...,-0.180925,-0.181965,-0.19075,-0.262165,-0.232625,-0.24843,-0.211485,-0.1628,-0.236195,ES2


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0,cell_line
0,SQ00014613,A02,weighted,A,2,A02,MCL1,MCL1-5,,ES2,...,-0.295424,-0.219383,-0.347669,-0.445315,-0.368547,-0.467934,-0.401504,-0.195283,-0.520941,ES2
1,SQ00014613,A03,weighted,A,3,A03,AKT1,AKT1-1,BRDN0001054908,ES2,...,0.287212,0.151165,0.386326,0.52235,0.376236,0.595809,0.288888,0.276258,0.385232,ES2
2,SQ00014613,A04,weighted,A,4,A04,KRAS,KRAS-2B,,ES2,...,0.043744,-0.021193,0.082772,0.126749,0.075433,0.146416,0.011163,-0.010858,0.042068,ES2
3,SQ00014613,A05,weighted,A,5,A05,AKT1,AKT1-2,BRDN0001055115,ES2,...,0.125474,0.022332,0.19779,0.207468,0.118959,0.242513,0.046306,0.017989,0.109033,ES2
4,SQ00014613,A07,weighted,A,7,A07,BRAF1,BRAF1-1,BRDN0000986682,ES2,...,-0.281803,-0.224593,-0.323946,-0.359573,-0.297367,-0.393332,-0.237927,-0.1938,-0.285573,ES2


TOTAL TIME performing aggregation for cell_line ES2 : 0:01:27.168997
CPU times: user 10min 18s, sys: 1min 26s, total: 11min 45s
Wall time: 12min 22s
