In [1]:
import sys
import pathlib
import pandas as pd

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

sys.path.insert(0, "../2.describe-data/scripts")
from processing_utils import load_data

In [2]:
# Notebook configuration: input/output locations, batches to process and
# feature-selection settings. Everything below is read by later cells.

# Passed to load_data() as `profile_dir` and `cell_count_dir`, respectively
data_dir = pathlib.Path("../0.generate-profiles/profiles")
cell_count_dir = pathlib.Path("../0.generate-profiles/cell_counts/")

# Directory that receives the bulk_profiles_<dataset>.csv.gz files
output_dir = pathlib.Path("data")
# Batches loaded one at a time; the loading cell routes "2020_07_02_Batch8"
# to the "cloneAE" dataset and every other batch to "four_clone"
batches = [
    "2019_11_11_Batch4",
    "2019_11_19_Batch5",
    "2019_11_20_Batch6",
    "2019_11_22_Batch7",
    "2020_07_02_Batch8",
]

# Passed to load_data() as `suffix` (presumably selects the normalized
# profile files - confirm against processing_utils.load_data)
profile_suffix = "normalized.csv.gz"
# Columns removed from the combined profiles after column reordering
drop_cols = ["Metadata_plate_filename"]

# Operations handed to pycytominer.feature_select via `operation`
feature_select_opts = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]
# Forwarded to feature_select as `corr_threshold` and `na_cutoff`
# NOTE(review): exact threshold semantics are defined by pycytominer - see its docs
corr_threshold = 0.8
na_cutoff = 0

In [3]:
# Collect per-batch profiles, grouped by the dataset they belong to
dfs = {"four_clone": [], "cloneAE": []}
for batch in batches:
    # Load and harmonize the batch, then tag every row with its batch name
    # and a default clone type of "resistant"
    df = load_data(
        batch=batch,
        profile_dir=data_dir,
        suffix=profile_suffix,
        combine_dfs=True,
        harmonize_cols=True,
        cell_count_dir=cell_count_dir,
    ).assign(Metadata_batch=batch, Metadata_clone_type="resistant")

    # Rows whose clone number contains "WT" are relabelled as sensitive
    is_wildtype = df.Metadata_clone_number.str.contains("WT")
    df.loc[is_wildtype, "Metadata_clone_type"] = "sensitive"

    # Batch 8 forms the cloneAE dataset; all other batches are four_clone
    df_index = "cloneAE" if batch == "2020_07_02_Batch8" else "four_clone"
    dfs[df_index].append(df)

In [4]:
# Replace each dataset's list of per-batch frames with one combined frame
for dataset in list(dfs):
    combined = pd.concat(dfs[dataset], sort=False).reset_index(drop=True)

    # Put metadata columns first, followed by the CellProfiler features,
    # then discard the columns listed in drop_cols
    feature_cols = infer_cp_features(combined)
    metadata_cols = infer_cp_features(combined, metadata=True)
    combined = combined.reindex(columns=metadata_cols + feature_cols)
    combined = combined.drop(columns=drop_cols)

    dfs[dataset] = combined
    print(dataset)
    print(combined.shape)

four_clone
(420, 3536)
cloneAE
(240, 3538)


In [5]:
# Feature selection is fit on the four_clone dataset only; the cloneAE
# dataset is then aligned to the columns that survived selection
selection_kwargs = {
    "operation": feature_select_opts,
    "na_cutoff": na_cutoff,
    "corr_threshold": corr_threshold,
}
dfs["four_clone"] = feature_select(dfs["four_clone"], **selection_kwargs)

selected_columns = dfs["four_clone"].columns
dfs["cloneAE"] = dfs["cloneAE"].reindex(columns=selected_columns)
dfs["cloneAE"].head()

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_map_name,Metadata_treatment,Metadata_clone_type,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,...,Nuclei_RadialDistribution_RadialCV_RNA_1of4,Nuclei_Texture_Correlation_AGP_20_00,Nuclei_Texture_Correlation_AGP_20_03,Nuclei_Texture_Correlation_AGP_5_00,Nuclei_Texture_InfoMeas1_DNA_10_02,Nuclei_Texture_InfoMeas1_DNA_5_00,Nuclei_Texture_InfoMeas1_Mito_10_00,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas2_DNA_5_00,Nuclei_Texture_SumAverage_DNA_20_00
0,218361,B02,2020_07_02_Batch8,WT_parental,218361,218361,0.1% DMSO,sensitive,0.849812,-0.624847,...,-0.173356,1.488903,0.509627,-0.119362,1.032186,0.960587,0.160435,0.206745,-0.908971,-1.415259
1,218361,B03,2020_07_02_Batch8,WT_parental,218361,218361,2.1 nM bortezomib,sensitive,-1.051046,1.524303,...,-0.372291,0.409426,0.784427,-1.684882,0.493551,0.515151,0.371197,0.510833,-0.285236,-0.558645
2,218361,B04,2020_07_02_Batch8,WT_parental,218361,218361,21 nM bortezomib,sensitive,-1.485347,1.882431,...,-1.301111,0.238388,3.391447,-0.94302,-0.500219,-0.874694,0.370207,0.456534,0.666356,0.406644
3,218361,B05,2020_07_02_Batch8,WT_parental,218361,218361,210 nM bortezomib,sensitive,-1.743359,2.415426,...,-1.268014,-1.852838,-0.1398,-1.298234,-0.733132,-0.80926,-1.640103,-1.59912,0.792672,0.712894
4,218361,B06,2020_07_02_Batch8,WT_parental,218361,218361,0.1% DMSO,sensitive,-0.178021,-0.344715,...,-0.701555,1.502827,0.523103,-0.37307,0.439813,0.320412,0.046112,0.141671,-0.099179,-0.685314


In [9]:
# Number of four_clone profiles per clone type (sensitive vs. resistant)
dfs["four_clone"]["Metadata_clone_type"].value_counts()

sensitive    252
resistant    168
Name: Metadata_clone_type, dtype: int64

In [6]:
# Report (rows, columns) of each dataset after feature selection
for dataset_name in ("four_clone", "cloneAE"):
    print(dfs[dataset_name].shape)

(420, 162)
(240, 162)


In [7]:
# Write one gzip-compressed CSV of bulk profiles per dataset.
# Create the output directory first so a fresh checkout (no "data" folder)
# does not fail inside to_csv with a missing-directory error.
output_dir.mkdir(parents=True, exist_ok=True)

for dataset in dfs:
    output_file = output_dir / f"bulk_profiles_{dataset}.csv.gz"
    dfs[dataset].to_csv(output_file, sep=",", compression="gzip", index=False)