## Adjust batch effects with a spherize transform

Here, we load all normalized (level 4a) profiles across all plates, perform feature selection, and apply a spherize transform using the DMSO profiles as the background distribution.

We've previously observed that sphering (aka whitening) the data successfully adjusts for technical artifacts induced by batch-to-batch variation and plate position effects.

In [1]:
import os
import pathlib
import subprocess
import pandas as pd

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import output, infer_cp_features

In [2]:
# Batch directory; each plate lives in its own sub-directory of it
batch = "2016_04_01_a549_48hr_batch1"
suffix = "_normalized.csv.gz"

# Only directories can be plates, which also skips stray files such as the
# macOS ".DS_Store". iterdir() yields entries in arbitrary order, so sort to
# keep the load order (and the row order of the combined profiles)
# reproducible across machines.
plates = sorted(
    x.name
    for x in pathlib.Path(batch).iterdir()
    if x.is_dir() and ".DS_Store" not in x.name
)

# Expected layout: <batch>/<plate>/<plate>_normalized.csv.gz
files = [pathlib.Path(batch, x, f"{x}{suffix}") for x in plates]

# Feature-selection operations, in the order they are listed
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blacklist",
    "drop_outliers",
]

# Thresholds intended for the feature-selection step
# NOTE(review): na_cut is presumably the missing-value cutoff for
# "drop_na_columns" -- confirm against the pycytominer documentation.
na_cut = 0
corr_threshold = 0.95
outlier_cutoff = 50

In [3]:
# Stack the per-plate normalized profiles into one dataframe with a fresh
# 0..n-1 row index
profile_df = pd.concat([pd.read_csv(x) for x in files], ignore_index=True)

# Perform feature selection
# Every threshold comes from the configuration cell so the settings live in
# one place (na_cutoff was previously hard-coded to 0 here, leaving na_cut
# unused).
profile_df = feature_select(
    profiles=profile_df,
    operation=feature_select_ops,
    na_cutoff=na_cut,
    corr_threshold=corr_threshold,
    outlier_cutoff=outlier_cutoff,
)

print(profile_df.shape)
profile_df.head()

(52223, 678)


Unnamed: 0,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,Metadata_pert_id,Metadata_pert_mfc_id,Metadata_pert_well,Metadata_pert_id_vendor,Metadata_cell_id,...,Nuclei_Texture_SumEntropy_DNA_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_RNA_20_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0
0,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,,,A01,,A549,...,2.3769,-1.1715,-0.2945,-0.8074,3.1837,-0.26162,2.5106,1.497,-1.4594,-0.88395
1,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,,,A02,,A549,...,1.4697,-0.61328,-1.5295,-1.8441,1.8186,-1.3297,1.5711,1.5446,-0.70841,0.032044
2,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,,,A03,,A549,...,2.1943,-0.48598,-0.066499,-1.1862,2.5874,0.23626,2.3031,1.2753,-0.82674,-0.16491
3,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,,,A04,,A549,...,1.8409,-0.56187,-0.2185,-0.52165,2.1313,0.63081,1.9742,0.89729,0.056799,0.74014
4,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,,,A05,,A549,...,2.7893,0.62797,0.2565,0.12294,3.0817,0.5519,2.92,1.8428,-0.37866,0.10082


In [4]:
# Query selecting the DMSO rows that the transform uses as its background
# distribution
dmso_query = "Metadata_broad_sample == 'DMSO'"

# Spherize (whiten) the feature-selected profiles; feature and metadata
# columns are left for pycytominer to infer.
spherize_df = normalize(
    method="whiten",
    samples=dmso_query,
    profiles=profile_df,
    features="infer",
    meta_features="infer",
)

# Sanity check: report the dimensions and preview the first rows
print(spherize_df.shape)
spherize_df.head()

(52223, 678)


Unnamed: 0,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,Metadata_pert_id,Metadata_pert_mfc_id,Metadata_pert_well,Metadata_pert_id_vendor,Metadata_cell_id,...,Nuclei_Texture_SumEntropy_DNA_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_RNA_20_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0
0,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,,,A01,,A549,...,0.814065,-1.416636,0.685388,0.218629,2.58371,0.782355,-0.73473,1.451314,0.032766,0.232238
1,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,,,A02,,A549,...,0.233154,-0.690759,-0.741916,-1.870173,1.656417,-1.73091,-1.185649,1.95674,-0.038255,-0.885976
2,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,,,A03,,A549,...,0.270186,-0.865425,1.993448,-1.992879,3.114174,1.25974,0.113122,0.468954,-1.452096,-0.785315
3,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,,,A04,,A549,...,-0.512029,-0.301055,-0.627546,-0.662708,2.181266,1.653466,0.285741,0.215257,-1.069456,0.751037
4,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,,,A05,,A549,...,1.071772,-1.059712,0.112003,-0.355878,3.840473,0.212448,0.945978,1.506297,-0.598833,0.342307


In [5]:
# Destination for the spherized profiles, named after the batch
# (presumably written as gzip-compressed CSV, per the extension -- confirm
# against pycytominer's output() defaults)
output_file = f"{batch}_spherized_profiles.csv.gz"

# Persist the spherized profiles to disk
output(output_filename=output_file, df=spherize_df)