In [1]:
import pathlib
import pandas as pd

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

In [2]:
# --- Locations -------------------------------------------------------------
# Input profiles are read from `data_dir`; results are written to `output_dir`.
data_dir = pathlib.Path("..") / "0.generate-profiles" / "profiles"
output_dir = pathlib.Path("data")

# --- Plates to load --------------------------------------------------------
# Maps each batch directory name to the plate names loaded from it.
batches = {
    "2019_11_11_Batch4": [
        "WTmut04hWed",
        "WTmut04hTh",
    ],
    "2019_11_19_Batch5": [
        "217755",
    ],
    "2019_11_20_Batch6": [
        "217760",
        "217762",
    ],
    "2019_11_22_Batch7": [
        "217766",
        "217768",
    ],
}

# File name suffix appended to the plate name to locate its profile file.
profile_suffix = "_normalized.csv.gz"
# Metadata column(s) the loading cell removes from plate profiles.
drop_cols = ["Metadata_plate_filename"]

# --- Feature selection settings -------------------------------------------
# Thresholds passed to pycytominer's `feature_select` as `corr_threshold`
# and `na_cutoff`.
corr_threshold = 0.8
na_cutoff = 0

# Operations passed to `feature_select` as `operation`.
feature_select_opts = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

In [3]:
# Load every plate profile listed in `batches`, annotate it with its batch and
# a resistant/sensitive clone type, and collect the per-plate frames in `dfs`.
dfs = []
for batch, plates in batches.items():
    for plate in plates:
        # Profiles are laid out as <data_dir>/<batch>/<plate>/<plate><suffix>.
        profile_file = data_dir / batch / plate / f"{plate}{profile_suffix}"
        profile_df = pd.read_csv(profile_file)

        # Every plate starts out labelled "resistant"; matching clones are
        # relabelled below.
        profile_df = profile_df.assign(
            Metadata_batch=batch, Metadata_clone_type="resistant"
        )

        # Remove the unwanted metadata column(s) wherever they are present.
        # `errors="ignore"` replaces the former hard-coded batch-name check, so
        # plates that lack the column are handled without special-casing.
        profile_df = profile_df.drop(columns=drop_cols, errors="ignore")

        # Show the metadata columns of each plate so schema mismatches between
        # plates are visible before concatenation.
        print(infer_cp_features(profile_df, metadata=True))

        # Clones whose identifier contains "WT" are labelled "sensitive".
        # `na=False` treats a missing identifier as "no match" (it stays
        # "resistant") instead of producing a NaN mask that `.loc` rejects.
        is_wt_clone = profile_df.Metadata_clone_number.str.contains("WT", na=False)
        profile_df.loc[is_wt_clone, "Metadata_clone_type"] = "sensitive"

        dfs.append(profile_df)

['Metadata_plate_map_name', 'Metadata_clone_number', 'Metadata_plate_ID', 'Metadata_treatment', 'Metadata_Plate', 'Metadata_Well', 'Metadata_batch', 'Metadata_clone_type']
['Metadata_plate_map_name', 'Metadata_clone_number', 'Metadata_plate_ID', 'Metadata_treatment', 'Metadata_Plate', 'Metadata_Well', 'Metadata_batch', 'Metadata_clone_type']
['Metadata_plate_map_name', 'Metadata_clone_number', 'Metadata_plate_ID', 'Metadata_treatment', 'Metadata_Plate', 'Metadata_Well', 'Metadata_batch', 'Metadata_clone_type']
['Metadata_plate_map_name', 'Metadata_clone_number', 'Metadata_plate_ID', 'Metadata_treatment', 'Metadata_Plate', 'Metadata_Well', 'Metadata_batch', 'Metadata_clone_type']
['Metadata_plate_map_name', 'Metadata_clone_number', 'Metadata_plate_ID', 'Metadata_treatment', 'Metadata_Plate', 'Metadata_Well', 'Metadata_batch', 'Metadata_clone_type']
['Metadata_plate_map_name', 'Metadata_clone_number', 'Metadata_plate_ID', 'Metadata_treatment', 'Metadata_Plate', 'Metadata_Well', 'Metadata

In [4]:
# Stack the per-plate profiles row-wise into one frame with a fresh 0..n-1 index.
bulk_df = pd.concat(dfs, sort=False, ignore_index=True)

# Column order: metadata columns first, then the inferred feature columns.
meta = infer_cp_features(bulk_df, metadata=True)
feat = infer_cp_features(bulk_df)
bulk_df = bulk_df.reindex(columns=meta + feat)

print(bulk_df.shape)
bulk_df.head()

(420, 3536)


Unnamed: 0,Metadata_plate_map_name,Metadata_clone_number,Metadata_plate_ID,Metadata_treatment,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_clone_type,Cells_AreaShape_Area,Cells_AreaShape_Center_X,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,WTmut04hWed,BZ017,217744,DMSO,WTmut04hWed,B02,2019_11_11_Batch4,resistant,-0.339587,-1.910937,...,1.62467,1.620789,1.480625,1.570777,1.473312,1.557271,1.627277,1.607243,1.629955,1.628896
1,WTmut04hWed,WT002,217744,DMSO,WTmut04hWed,B03,2019_11_11_Batch4,sensitive,0.617688,-1.715722,...,1.506541,1.515823,1.550858,1.562888,1.557828,1.559924,1.52536,1.506269,1.503591,1.520258
2,WTmut04hWed,WT008,217744,DMSO,WTmut04hWed,B04,2019_11_11_Batch4,sensitive,-0.286907,-0.284146,...,-0.268834,-0.296211,-0.29148,-0.240747,-0.274642,-0.216133,-0.261745,-0.272104,-0.266434,-0.274902
3,WTmut04hWed,WT009,217744,DMSO,WTmut04hWed,B05,2019_11_11_Batch4,sensitive,-0.67072,0.301499,...,-0.227733,-0.254327,-0.30041,-0.319758,-0.279814,-0.362015,-0.261662,-0.246862,-0.261455,-0.247199
4,WTmut04hWed,BZ018,217744,DMSO,WTmut04hWed,B06,2019_11_11_Batch4,resistant,0.43707,-0.316682,...,-0.436134,-0.400178,-0.347382,-0.253614,-0.355295,-0.240678,-0.477345,-0.469789,-0.476312,-0.461479


In [5]:
# Apply the configured feature-selection operations to the combined profiles.
# NOTE: this overwrites `bulk_df`, so re-running this cell alone selects from
# the already-reduced frame; re-run the concatenation cell first.
feature_select_kwargs = {
    "operation": feature_select_opts,
    "na_cutoff": na_cutoff,
    "corr_threshold": corr_threshold,
}
bulk_df = feature_select(bulk_df, **feature_select_kwargs)

print(bulk_df.shape)
bulk_df.head()

(420, 162)


Unnamed: 0,Metadata_plate_map_name,Metadata_clone_number,Metadata_plate_ID,Metadata_treatment,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_clone_type,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,...,Nuclei_RadialDistribution_RadialCV_RNA_1of4,Nuclei_Texture_Correlation_AGP_20_00,Nuclei_Texture_Correlation_AGP_20_03,Nuclei_Texture_Correlation_AGP_5_00,Nuclei_Texture_InfoMeas1_DNA_10_02,Nuclei_Texture_InfoMeas1_DNA_5_00,Nuclei_Texture_InfoMeas1_Mito_10_00,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas2_DNA_5_00,Nuclei_Texture_SumAverage_DNA_20_00
0,WTmut04hWed,BZ017,217744,DMSO,WTmut04hWed,B02,2019_11_11_Batch4,resistant,-0.013031,-0.552375,...,-0.036359,0.267998,-0.479067,0.432208,1.294007,1.070871,0.454538,-0.388005,-1.567204,-1.124995
1,WTmut04hWed,WT002,217744,DMSO,WTmut04hWed,B03,2019_11_11_Batch4,sensitive,0.540148,-0.688571,...,0.524096,-0.846893,-0.886894,-0.753365,1.755599,-0.715581,1.596588,-0.130438,6.8e-05,-1.114912
2,WTmut04hWed,WT008,217744,DMSO,WTmut04hWed,B04,2019_11_11_Batch4,sensitive,-1.801681,0.967419,...,-0.990714,-0.518431,-0.545056,0.573946,0.788443,-0.418218,1.101691,-0.314686,-0.023801,-0.090529
3,WTmut04hWed,WT009,217744,DMSO,WTmut04hWed,B05,2019_11_11_Batch4,sensitive,-1.507963,1.590062,...,0.127708,-0.31985,0.06125,-1.289395,0.918946,1.180027,1.09335,1.125821,-1.388672,-0.27786
4,WTmut04hWed,BZ018,217744,DMSO,WTmut04hWed,B06,2019_11_11_Batch4,resistant,1.363978,-1.787466,...,0.531589,-0.152575,-1.076457,0.280363,1.182714,-0.153243,-0.138147,-1.455125,-0.593253,-1.372097


In [6]:
# Write the feature-selected profiles as a gzip-compressed CSV without the
# row index. The output directory is created first so the write also succeeds
# on a fresh checkout where it does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "bulk_profiles_four_clone.csv.gz"
bulk_df.to_csv(output_file, sep=",", compression="gzip", index=False)