## Reprocess Cell Health profiles

Use a whole-plate normalization scheme instead of normalization by controls only.

We will use the control normalization in downstream analyses, but we are interested in comparing the impact of normalization strategy on grit calculations.

In [1]:
import sys
import pathlib

from tqdm import tqdm
import pandas as pd

from pycytominer import normalize, feature_select

sys.path.append("scripts")
from normalization import sphere

In [2]:
def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482bb664d9de1"):
    link = f"https://github.com/broadinstitute/cell-health/raw/{commit}/1.generate-profiles/data/profiles/{plate}/{plate}_augmented.csv.gz"

    annotate_df = pd.read_csv(link)

    norm_file = pathlib.Path(f"{output_dir}/{plate}_wholeplate_normalized.csv.gz")
    # feat_select_file = pathlib.Path(f"{output_dir}/{plate}_wholeplate_normalized_feature_selected.csv.gz")

    normalize(
        profiles=annotate_df,
        features="infer",
        meta_features=meta_features,
        samples="all",
        method="mad_robustize",
        output_file=norm_file,
        compression_options={"method": "gzip", "mtime": 1},
    )

In [3]:
# Define the plates
plates = [
    "SQ00014610",
    "SQ00014611",
    "SQ00014612",
    "SQ00014613",
    "SQ00014614",
    "SQ00014615",
    "SQ00014616",
    "SQ00014617",
    "SQ00014618",
]

# Define metadata features
meta_features = [
    "Image_Metadata_Plate",
    "Image_Metadata_Well",
    "Metadata_WellRow",
    "Metadata_WellCol",
    "Metadata_gene_name",
    "Metadata_pert_name",
    "Metadata_broad_sample",
    "Metadata_cell_line",
]

output_dir = pathlib.Path("data/cell-health/profiles")

commit = "cd91bd0daacef2b5ea25dcceb62482bb664d9de1"

In [4]:
for plate in tqdm(plates):
    normalize_profile(plate, output_dir, commit)

100%|██████████| 9/9 [01:30<00:00, 10.06s/it]


## Now form a single merged dataset to perform feature selection

In [5]:
# Load different normalized data
plate_files = [x for x in output_dir.iterdir() if "_normalized.csv.gz" in x.name]

In [6]:
# Concatentate all plates
x_df = (
    pd.concat([pd.read_csv(x) for x in plate_files], sort=True)
    .rename(
        {
            "Image_Metadata_Plate": "Metadata_Plate",
            "Image_Metadata_Well": "Metadata_Well",
        },
        axis="columns",
    )
    .drop(["Metadata_broad_sample"], axis="columns")
)

# Realign metadata column names
x_metadata_cols = x_df.columns[x_df.columns.str.startswith("Metadata")]
x_metadata_df = x_df.loc[:, x_metadata_cols]

x_df = x_df.drop(x_metadata_cols, axis="columns")
x_df = pd.concat([x_metadata_df, x_df], axis="columns")

print(x_df.shape)
x_df.head()

(3456, 1790)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,SQ00014612,A01,1,A,A549,EMPTY,EMPTY,-1.79196,0.035973,-0.422573,...,0.762343,2.211214,1.211959,2.46051,1.318201,0.904173,1.180967,0.283193,-0.604721,0.89125
1,SQ00014612,A02,2,A,A549,MCL1,MCL1-5,-1.650852,1.798642,0.0,...,0.484508,2.250169,1.126989,2.269059,1.464555,1.149692,1.259498,0.538925,-0.352901,0.961563
2,SQ00014612,A03,3,A,A549,AKT1,AKT1-1,-0.825819,1.906561,0.438825,...,0.911406,2.449983,2.086322,2.240322,1.551818,1.824093,1.246285,0.076692,-0.369901,0.570666
3,SQ00014612,A04,4,A,A549,KRAS,KRAS-2B,-2.052952,-0.071946,0.520089,...,0.978916,1.900017,0.468119,2.142986,1.111607,0.520477,1.00064,-0.689098,-1.738253,-0.276086
4,SQ00014612,A05,5,A,A549,AKT1,AKT1-2,-1.523107,0.971267,0.032506,...,0.222017,2.055557,1.554514,2.038328,1.041199,1.547912,0.584615,0.029147,-0.707015,0.414359


In [7]:
# Perform feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

x_df = feature_select(profiles=x_df, operation=feature_select_ops, na_cutoff=0)

print(x_df.shape)
x_df

(3456, 517)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_ER_20_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_RNA_10_0
0,SQ00014612,A01,1,A,A549,EMPTY,EMPTY,-0.378191,-0.030782,2.089629,...,-2.462620,2.556333,0.635753,2.262901,1.020190,-1.051497,1.690149,1.211959,0.904173,0.283193
1,SQ00014612,A02,2,A,A549,MCL1,MCL1-5,0.739037,-0.466950,2.245620,...,-2.527320,2.099596,0.387399,1.606377,0.619004,-1.123289,1.319333,1.126989,1.149692,0.538925
2,SQ00014612,A03,3,A,A549,AKT1,AKT1-1,0.663179,-0.614422,1.552016,...,-0.861919,1.447807,0.372708,1.232601,0.744217,0.065222,1.338340,2.086322,1.824093,0.076692
3,SQ00014612,A04,4,A,A549,KRAS,KRAS-2B,1.142720,-1.444163,2.473148,...,-1.710899,2.102723,1.144608,2.397073,1.158156,-1.639257,1.082772,0.468119,0.520477,-0.689098
4,SQ00014612,A05,5,A,A549,AKT1,AKT1-2,0.506535,-0.384485,2.334123,...,-1.934975,1.380920,-0.091729,1.584126,0.110702,-0.332856,0.630777,1.554514,1.547912,0.029147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,SQ00014613,P20,20,P,ES2,Chr2,Chr2-4,-0.682856,1.065890,0.249829,...,1.426622,-1.377794,-1.181549,-0.949335,-1.054564,0.215409,0.147263,0.291476,-0.281496,1.065962
380,SQ00014613,P21,21,P,ES2,EMPTY,EMPTY,-0.167718,0.326432,-0.671362,...,0.575870,-2.570827,-1.134039,-0.703302,-1.251033,1.073462,0.888439,-1.108953,-1.346037,1.883433
381,SQ00014613,P22,22,P,ES2,POLR2D,POLR2D-2,0.825134,-1.343931,-1.244786,...,0.586443,0.112912,-1.565975,-0.367032,-1.076886,-0.862479,-0.640960,-0.096550,0.307688,-0.079216
382,SQ00014613,P23,23,P,ES2,PPIB,PPIB-2,-0.026582,0.409760,-1.885286,...,0.520229,-2.690358,-1.749121,-1.007546,-1.702999,1.137241,1.008825,-0.914257,-1.062819,2.591231


In [8]:
# Also drop Costes features
costes_cols_to_drop = [x for x in x_df.columns if "costes" in x.lower()]
print("Dropping {} costes features".format(len(costes_cols_to_drop)))
x_df = x_df.drop(costes_cols_to_drop, axis="columns")

print(x_df.shape)
x_df

Dropping 4 costes features
(3456, 513)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_ER_20_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_RNA_10_0
0,SQ00014612,A01,1,A,A549,EMPTY,EMPTY,-0.378191,-0.030782,2.089629,...,-2.462620,2.556333,0.635753,2.262901,1.020190,-1.051497,1.690149,1.211959,0.904173,0.283193
1,SQ00014612,A02,2,A,A549,MCL1,MCL1-5,0.739037,-0.466950,2.245620,...,-2.527320,2.099596,0.387399,1.606377,0.619004,-1.123289,1.319333,1.126989,1.149692,0.538925
2,SQ00014612,A03,3,A,A549,AKT1,AKT1-1,0.663179,-0.614422,1.552016,...,-0.861919,1.447807,0.372708,1.232601,0.744217,0.065222,1.338340,2.086322,1.824093,0.076692
3,SQ00014612,A04,4,A,A549,KRAS,KRAS-2B,1.142720,-1.444163,2.473148,...,-1.710899,2.102723,1.144608,2.397073,1.158156,-1.639257,1.082772,0.468119,0.520477,-0.689098
4,SQ00014612,A05,5,A,A549,AKT1,AKT1-2,0.506535,-0.384485,2.334123,...,-1.934975,1.380920,-0.091729,1.584126,0.110702,-0.332856,0.630777,1.554514,1.547912,0.029147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,SQ00014613,P20,20,P,ES2,Chr2,Chr2-4,-0.682856,1.065890,0.249829,...,1.426622,-1.377794,-1.181549,-0.949335,-1.054564,0.215409,0.147263,0.291476,-0.281496,1.065962
380,SQ00014613,P21,21,P,ES2,EMPTY,EMPTY,-0.167718,0.326432,-0.671362,...,0.575870,-2.570827,-1.134039,-0.703302,-1.251033,1.073462,0.888439,-1.108953,-1.346037,1.883433
381,SQ00014613,P22,22,P,ES2,POLR2D,POLR2D-2,0.825134,-1.343931,-1.244786,...,0.586443,0.112912,-1.565975,-0.367032,-1.076886,-0.862479,-0.640960,-0.096550,0.307688,-0.079216
382,SQ00014613,P23,23,P,ES2,PPIB,PPIB-2,-0.026582,0.409760,-1.885286,...,0.520229,-2.690358,-1.749121,-1.007546,-1.702999,1.137241,1.008825,-0.914257,-1.062819,2.591231


In [9]:
# Output
profile_file = pathlib.Path(f"{output_dir}/cell_health_profiles_merged_wholeplate_normalized_featureselected.tsv.gz")
x_df.to_csv(profile_file, index=False, sep="\t")

## Add sphering

In [11]:
# x_df = normalize(
#     profiles=x_df,
#     features="infer",
#     meta_features=x_df.filter(regex="Metadata_").columns.tolist(),
#     samples="Metadata_gene_name == 'EMPTY'",
#     method="spherize",
# )

x_df

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_ER_20_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_RNA_10_0
0,SQ00014612,A01,1,A,A549,EMPTY,EMPTY,-0.378191,-0.030782,2.089629,...,-2.462620,2.556333,0.635753,2.262901,1.020190,-1.051497,1.690149,1.211959,0.904173,0.283193
1,SQ00014612,A02,2,A,A549,MCL1,MCL1-5,0.739037,-0.466950,2.245620,...,-2.527320,2.099596,0.387399,1.606377,0.619004,-1.123289,1.319333,1.126989,1.149692,0.538925
2,SQ00014612,A03,3,A,A549,AKT1,AKT1-1,0.663179,-0.614422,1.552016,...,-0.861919,1.447807,0.372708,1.232601,0.744217,0.065222,1.338340,2.086322,1.824093,0.076692
3,SQ00014612,A04,4,A,A549,KRAS,KRAS-2B,1.142720,-1.444163,2.473148,...,-1.710899,2.102723,1.144608,2.397073,1.158156,-1.639257,1.082772,0.468119,0.520477,-0.689098
4,SQ00014612,A05,5,A,A549,AKT1,AKT1-2,0.506535,-0.384485,2.334123,...,-1.934975,1.380920,-0.091729,1.584126,0.110702,-0.332856,0.630777,1.554514,1.547912,0.029147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,SQ00014613,P20,20,P,ES2,Chr2,Chr2-4,-0.682856,1.065890,0.249829,...,1.426622,-1.377794,-1.181549,-0.949335,-1.054564,0.215409,0.147263,0.291476,-0.281496,1.065962
380,SQ00014613,P21,21,P,ES2,EMPTY,EMPTY,-0.167718,0.326432,-0.671362,...,0.575870,-2.570827,-1.134039,-0.703302,-1.251033,1.073462,0.888439,-1.108953,-1.346037,1.883433
381,SQ00014613,P22,22,P,ES2,POLR2D,POLR2D-2,0.825134,-1.343931,-1.244786,...,0.586443,0.112912,-1.565975,-0.367032,-1.076886,-0.862479,-0.640960,-0.096550,0.307688,-0.079216
382,SQ00014613,P23,23,P,ES2,PPIB,PPIB-2,-0.026582,0.409760,-1.885286,...,0.520229,-2.690358,-1.749121,-1.007546,-1.702999,1.137241,1.008825,-0.914257,-1.062819,2.591231


In [12]:
# profile_file = pathlib.Path(f"{output_dir}/cell_health_profiles_merged_wholeplate_normalized_featureselected_spherized.tsv.gz")
# x_df.to_csv(profile_file, index=False, sep="\t")

In [13]:
x_df = sphere(x_df, regularization=1e-5, mode="corr", column_norm="Metadata_gene_name", values_norm=["EMPTY"])[0]
x_df

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_ER_20_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_RNA_10_0
0,SQ00014612,A01,1,A,A549,EMPTY,EMPTY,-1.935996,-0.673692,0.847742,...,-1.532641,0.113030,0.433600,0.255874,-0.518409,-1.146619,1.118104,1.150425,0.567928,0.369171
1,SQ00014612,A02,2,A,A549,MCL1,MCL1-5,-5.766285,9.060817,-4.167489,...,-4.497974,-10.435532,0.824320,1.875632,4.994489,0.319888,0.934372,-0.002759,-1.218939,2.291609
2,SQ00014612,A03,3,A,A549,AKT1,AKT1-1,-1.598747,-5.361239,8.036478,...,-1.667609,0.590655,-12.083847,4.326957,3.125793,-0.597046,-6.557250,1.775098,-0.041881,3.093983
3,SQ00014612,A04,4,A,A549,KRAS,KRAS-2B,2.168759,-10.507735,-6.338868,...,3.232671,-4.023018,-4.859928,5.658337,3.489841,1.664227,-1.002914,3.163857,-5.282648,-0.744153
4,SQ00014612,A05,5,A,A549,AKT1,AKT1-2,2.246771,-7.710614,1.823458,...,-2.131146,-0.191313,-10.390250,1.773041,-8.999976,-0.078887,-6.427427,1.354188,5.611050,-2.634557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,SQ00014613,P20,20,P,ES2,Chr2,Chr2-4,-3.154688,-1.109687,7.828422,...,-1.921272,12.260365,-6.088907,-5.932748,-0.801599,3.603677,7.224991,0.144833,-1.720884,8.955756
380,SQ00014613,P21,21,P,ES2,EMPTY,EMPTY,0.229929,-0.652326,-0.467426,...,0.208389,0.885847,0.948840,0.444595,0.287073,0.548459,0.873887,-0.053018,-0.427338,1.206029
381,SQ00014613,P22,22,P,ES2,POLR2D,POLR2D-2,4.475561,3.567198,3.232490,...,-12.754598,-13.915999,25.838897,6.346540,18.824343,7.043592,-10.681416,-13.615297,-3.306112,4.882997
382,SQ00014613,P23,23,P,ES2,PPIB,PPIB-2,-11.605747,10.024449,-14.671390,...,0.451709,-10.725376,-19.136646,10.603513,5.215194,-5.147056,0.264905,-2.345584,1.625935,7.503229


In [14]:
profile_file = pathlib.Path(f"{output_dir}/cell_health_profiles_merged_wholeplate_normalized_featureselected_spherized.tsv.gz")
x_df.to_csv(profile_file, index=False, sep="\t")

## Save not normalized profiles

In [41]:
def load_raw_profiles(plate, commit="cd91bd0daacef2b5ea25dcceb62482bb664d9de1"):
    link = f"https://github.com/broadinstitute/cell-health/raw/{commit}/1.generate-profiles/data/profiles/{plate}/{plate}_augmented.csv.gz"

    return pd.read_csv(link)


In [42]:
raw_profiles = pd.concat([load_raw_profiles(plate, commit) for plate in tqdm(plates)], sort=True)

100%|██████████| 9/9 [00:07<00:00,  1.23it/s]


In [43]:
x_df = (
    raw_profiles
    .rename(
        {
            "Image_Metadata_Plate": "Metadata_Plate",
            "Image_Metadata_Well": "Metadata_Well",
        },
        axis="columns",
    )
    .drop(["Metadata_broad_sample"], axis="columns")
)

# Realign metadata column names
x_metadata_cols = x_df.columns[x_df.columns.str.startswith("Metadata")]
x_metadata_df = x_df.loc[:, x_metadata_cols]

x_df = x_df.drop(x_metadata_cols, axis="columns")
x_df = pd.concat([x_metadata_df, x_df], axis="columns").reset_index(drop=True)

print(x_df.shape)
x_df

(3456, 1790)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,SQ00014610,A01,1,A,A549,EMPTY,EMPTY,6142.0,1087.0,1108.0,...,3.559823,2.321827,2.215901,2.348993,2.067028,1.875259,2.230317,2.911926,2.641810,2.997173
1,SQ00014610,A02,2,A,A549,MCL1,MCL1-5,7981.5,1074.0,1084.0,...,3.461207,2.144722,1.968777,2.201527,1.892255,1.716484,2.094723,3.000435,2.806122,3.056962
2,SQ00014610,A03,3,A,A549,AKT1,AKT1-1,7772.5,1057.0,1122.5,...,3.330129,2.111943,1.962466,2.141256,1.832483,1.668058,1.987148,2.935527,2.720891,2.985796
3,SQ00014610,A04,4,A,A549,KRAS,KRAS-2B,7899.0,1042.5,1069.5,...,3.161432,2.047220,1.911537,2.111766,1.826544,1.669786,1.952193,2.918356,2.824358,2.854221
4,SQ00014610,A05,5,A,A549,AKT1,AKT1-2,6127.0,1081.0,1089.0,...,3.451520,2.047491,1.723013,2.139267,1.947429,1.699718,2.130986,2.898105,2.656090,2.934385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3451,SQ00014618,P20,20,P,HCC44,Chr2,Chr2-4,11255.5,1096.0,1078.5,...,3.041736,2.189158,2.099769,2.291720,1.865265,1.705550,1.989196,2.403242,2.294250,2.465146
3452,SQ00014618,P21,21,P,HCC44,EMPTY,EMPTY,11076.0,1086.0,1031.5,...,2.737910,2.170629,2.048944,2.258886,1.796615,1.638885,1.926364,2.462477,2.342575,2.511645
3453,SQ00014618,P22,22,P,HCC44,POLR2D,POLR2D-2,19025.0,1079.0,1002.0,...,3.162322,2.058264,1.872603,2.187862,1.857193,1.621113,2.032905,2.368877,2.311463,2.423977
3454,SQ00014618,P23,23,P,HCC44,PPIB,PPIB-2,12710.0,1110.0,1048.5,...,2.930082,2.204242,2.073079,2.289184,1.792738,1.628610,1.928525,2.452355,2.372163,2.490241


In [44]:
x_df = normalize(
    profiles=x_df,
    features="infer",
    meta_features=x_df.filter(regex="Metadata_").columns.tolist(),
    samples="all",
    method="standardize"
)
x_df

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,SQ00014610,A01,1,A,A549,EMPTY,EMPTY,-1.440620,-0.297351,0.713185,...,0.427678,1.282520,1.493875,0.844112,1.346465,1.535849,1.287128,1.289999,1.043189,1.364738
1,SQ00014610,A02,2,A,A549,MCL1,MCL1-5,-0.820022,-0.593748,0.157797,...,0.169831,0.076940,0.024301,-0.156710,0.383498,0.507997,0.591366,1.548969,1.647975,1.528259
2,SQ00014610,A03,3,A,A549,AKT1,AKT1-1,-0.890533,-0.981345,1.048732,...,-0.172889,-0.146191,-0.013228,-0.565758,0.054167,0.194504,0.039379,1.359054,1.334264,1.333620
3,SQ00014610,A04,4,A,A549,KRAS,KRAS-2B,-0.847855,-1.311942,-0.177750,...,-0.613971,-0.586773,-0.316083,-0.765898,0.021440,0.205691,-0.139984,1.308812,1.715099,0.973765
4,SQ00014610,A05,5,A,A549,AKT1,AKT1-2,-1.445681,-0.434150,0.273502,...,0.144505,-0.584928,-1.437178,-0.579254,0.687498,0.399458,0.777442,1.249560,1.095750,1.193013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3451,SQ00014618,P20,20,P,HCC44,Chr2,Chr2-4,0.284539,-0.092153,0.030520,...,-0.926931,0.379419,0.803273,0.455408,0.234789,0.437217,0.049888,-0.198370,-0.236087,-0.090345
3452,SQ00014618,P21,21,P,HCC44,EMPTY,EMPTY,0.223981,-0.320151,-1.057115,...,-1.721325,0.253292,0.501032,0.232570,-0.143463,0.005651,-0.272520,-0.025052,-0.058216,0.036826
3453,SQ00014618,P22,22,P,HCC44,POLR2D,POLR2D-2,2.905763,-0.479749,-1.739780,...,-0.611642,-0.511595,-0.547615,-0.249453,0.190314,-0.109398,0.274164,-0.298920,-0.172729,-0.202944
3454,SQ00014618,P23,23,P,HCC44,PPIB,PPIB-2,0.775249,0.227045,-0.663715,...,-1.218865,0.482099,0.644556,0.438197,-0.164824,-0.060865,-0.261433,-0.054668,0.050691,-0.021712


In [45]:
# Perform feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

x_df = feature_select(profiles=x_df, operation=feature_select_ops, na_cutoff=0)

print(x_df.shape)
x_df

(3456, 405)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_FormFactor,Cells_AreaShape_Solidity,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_InverseDifferenceMoment_Mito_10_0,Nuclei_Texture_InverseDifferenceMoment_Mito_20_0,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_ER_20_0,Nuclei_Texture_SumAverage_RNA_20_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_Mito_20_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_Mito_20_0
0,SQ00014610,A01,1,A,A549,EMPTY,EMPTY,1.389069,0.544848,1.115782,...,-0.454312,-1.092728,-0.151705,-0.292525,-1.128694,-0.300575,0.517903,-0.491960,1.282520,1.535849
1,SQ00014610,A02,2,A,A549,MCL1,MCL1-5,0.589662,0.834696,0.389911,...,-0.000350,-0.647848,0.556318,-0.104040,-1.197435,-0.296174,0.275071,-0.854271,0.076940,0.507997
2,SQ00014610,A03,3,A,A549,AKT1,AKT1-1,0.468350,0.723579,0.855382,...,0.176164,-0.487622,0.451854,0.300909,-1.303741,-0.717634,-0.358357,-1.369487,-0.146191,0.194504
3,SQ00014610,A04,4,A,A549,KRAS,KRAS-2B,2.074089,0.907895,0.473071,...,0.040333,-0.662136,0.297309,0.404368,0.346135,-0.698184,-1.153863,-1.156340,-0.586773,0.205691
4,SQ00014610,A05,5,A,A549,AKT1,AKT1-2,0.990033,0.655080,0.401046,...,-0.480897,-1.114307,-0.149586,0.066005,-1.182267,-0.443063,0.199746,-0.616599,-0.584928,0.399458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3451,SQ00014618,P20,20,P,HCC44,Chr2,Chr2-4,0.112918,1.081731,0.604378,...,0.709441,0.939859,0.035536,-0.259901,-0.089036,0.392783,-0.296372,1.141510,0.379419,0.437217
3452,SQ00014618,P21,21,P,HCC44,EMPTY,EMPTY,-0.151096,1.318775,0.288216,...,0.852550,1.094910,-0.190898,-1.086341,-0.367832,0.044236,-1.551133,0.896821,0.253292,0.005651
3453,SQ00014618,P22,22,P,HCC44,POLR2D,POLR2D-2,-2.388129,0.240751,0.120395,...,1.999277,1.291891,2.273384,-0.905629,0.354684,0.973695,0.082808,0.824652,-0.511595,-0.109398
3454,SQ00014618,P23,23,P,HCC44,PPIB,PPIB-2,-0.421905,1.148067,0.241985,...,1.021808,1.052520,0.405216,-0.766357,-0.081998,0.254301,-0.576479,1.072265,0.482099,-0.060865


In [46]:
# Also drop Costes features
costes_cols_to_drop = [x for x in x_df.columns if "costes" in x.lower()]
print("Dropping {} costes features".format(len(costes_cols_to_drop)))
x_df = x_df.drop(costes_cols_to_drop, axis="columns")

print(x_df.shape)
x_df

Dropping 2 costes features
(3456, 403)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_FormFactor,Cells_AreaShape_Solidity,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_InverseDifferenceMoment_Mito_10_0,Nuclei_Texture_InverseDifferenceMoment_Mito_20_0,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_ER_20_0,Nuclei_Texture_SumAverage_RNA_20_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_Mito_20_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_Mito_20_0
0,SQ00014610,A01,1,A,A549,EMPTY,EMPTY,1.389069,0.544848,1.115782,...,-0.454312,-1.092728,-0.151705,-0.292525,-1.128694,-0.300575,0.517903,-0.491960,1.282520,1.535849
1,SQ00014610,A02,2,A,A549,MCL1,MCL1-5,0.589662,0.834696,0.389911,...,-0.000350,-0.647848,0.556318,-0.104040,-1.197435,-0.296174,0.275071,-0.854271,0.076940,0.507997
2,SQ00014610,A03,3,A,A549,AKT1,AKT1-1,0.468350,0.723579,0.855382,...,0.176164,-0.487622,0.451854,0.300909,-1.303741,-0.717634,-0.358357,-1.369487,-0.146191,0.194504
3,SQ00014610,A04,4,A,A549,KRAS,KRAS-2B,2.074089,0.907895,0.473071,...,0.040333,-0.662136,0.297309,0.404368,0.346135,-0.698184,-1.153863,-1.156340,-0.586773,0.205691
4,SQ00014610,A05,5,A,A549,AKT1,AKT1-2,0.990033,0.655080,0.401046,...,-0.480897,-1.114307,-0.149586,0.066005,-1.182267,-0.443063,0.199746,-0.616599,-0.584928,0.399458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3451,SQ00014618,P20,20,P,HCC44,Chr2,Chr2-4,0.112918,1.081731,0.604378,...,0.709441,0.939859,0.035536,-0.259901,-0.089036,0.392783,-0.296372,1.141510,0.379419,0.437217
3452,SQ00014618,P21,21,P,HCC44,EMPTY,EMPTY,-0.151096,1.318775,0.288216,...,0.852550,1.094910,-0.190898,-1.086341,-0.367832,0.044236,-1.551133,0.896821,0.253292,0.005651
3453,SQ00014618,P22,22,P,HCC44,POLR2D,POLR2D-2,-2.388129,0.240751,0.120395,...,1.999277,1.291891,2.273384,-0.905629,0.354684,0.973695,0.082808,0.824652,-0.511595,-0.109398
3454,SQ00014618,P23,23,P,HCC44,PPIB,PPIB-2,-0.421905,1.148067,0.241985,...,1.021808,1.052520,0.405216,-0.766357,-0.081998,0.254301,-0.576479,1.072265,0.482099,-0.060865


In [47]:
# Output
profile_file = pathlib.Path(f"{output_dir}/cell_health_profiles_merged_standardized_featureselected.tsv.gz")
x_df.to_csv(profile_file, index=False, sep="\t")