In [1]:
import sys
import pathlib
import pandas as pd

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features, write_gct

sys.path.insert(0, "../2.describe-data/scripts")
from processing_utils import load_data

In [2]:
data_dir = pathlib.Path("../0.generate-profiles/profiles")
cell_count_dir = pathlib.Path("../0.generate-profiles/cell_counts/")

output_dir = pathlib.Path("data")
batches = [
    "2019_11_11_Batch4",
    "2019_11_19_Batch5",
    "2019_11_20_Batch6",
    "2019_11_22_Batch7",
    "2020_07_02_Batch8",
]

profile_suffix = "normalized.csv.gz"
drop_cols = ["Metadata_plate_filename"]

feature_select_opts = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]
corr_threshold = 0.95
na_cutoff = 0

In [3]:
dfs = {"four_clone": [], "cloneAE": []}
for batch in batches:
    # Load and harmonize data
    df = load_data(
        batch=batch,
        profile_dir=data_dir,
        suffix=profile_suffix,
        combine_dfs=True,
        harmonize_cols=True,
        cell_count_dir=cell_count_dir
    )
    
    # Add important metadata features
    df = df.assign(
        Metadata_batch=batch,
        Metadata_clone_type="resistant",
        Metadata_clone_type_indicator=1
    )
    df.loc[df.Metadata_clone_number.str.contains("WT"), "Metadata_clone_type"] = "sensitive"
    df.loc[df.Metadata_clone_number.str.contains("WT"), "Metadata_clone_type_indicator"] = 0

    # Store in dictionary
    if batch == "2020_07_02_Batch8":
        df_index = "cloneAE"
    else:
        df_index = "four_clone"

    dfs[df_index].append(df)

In [4]:
for dataset in dfs:
    bulk_df = pd.concat(dfs[dataset], sort=False).reset_index(drop=True)
    bulk_df = bulk_df.assign(
        Metadata_sample_index=[f"sample_index_{x}" for x in range(0, bulk_df.shape[0])]
    )
    
    # Reorder features
    feat = infer_cp_features(bulk_df)
    meta = infer_cp_features(bulk_df, metadata=True)
    bulk_df = bulk_df.reindex(meta + feat, axis="columns").drop(drop_cols, axis="columns")

    dfs[dataset] = bulk_df
    print(dataset)
    print(bulk_df.shape)

four_clone
(420, 3538)
cloneAE
(240, 3540)


In [5]:
# Apply feature selection in only the four_clone dataset and reindex features in clone AE
dfs["four_clone"] = feature_select(
    dfs["four_clone"],
    operation=feature_select_opts,
    na_cutoff=na_cutoff,
)

dfs["cloneAE"] = dfs["cloneAE"].reindex(dfs["four_clone"].columns, axis="columns")
dfs["cloneAE"].head()

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_map_name,Metadata_treatment,Metadata_clone_type,Metadata_clone_type_indicator,Metadata_sample_index,...,Nuclei_Texture_InfoMeas1_DNA_10_03,Nuclei_Texture_InfoMeas1_DNA_5_00,Nuclei_Texture_InfoMeas1_ER_20_03,Nuclei_Texture_InfoMeas1_ER_5_00,Nuclei_Texture_InfoMeas1_Mito_10_00,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_InfoMeas2_AGP_5_02,Nuclei_Texture_InfoMeas2_DNA_5_00,Nuclei_Texture_InfoMeas2_Mito_10_00,Nuclei_Texture_SumAverage_DNA_20_00
0,218361,B02,2020_07_02_Batch8,WT_parental,218361,218361,0.1% DMSO,sensitive,0,sample_index_0,...,1.040228,0.960587,-0.650456,-1.06689,0.160435,0.206745,1.190005,-0.908971,0.077366,-1.415259
1,218361,B03,2020_07_02_Batch8,WT_parental,218361,218361,2.1 nM bortezomib,sensitive,0,sample_index_1,...,0.438775,0.515151,-1.591195,0.024224,0.371197,0.510833,0.587124,-0.285236,0.029077,-0.558645
2,218361,B04,2020_07_02_Batch8,WT_parental,218361,218361,21 nM bortezomib,sensitive,0,sample_index_2,...,-0.432516,-0.874694,0.500517,0.380207,0.370207,0.456534,0.842274,0.666356,0.100272,0.406644
3,218361,B05,2020_07_02_Batch8,WT_parental,218361,218361,210 nM bortezomib,sensitive,0,sample_index_3,...,-0.799819,-0.80926,-0.221061,0.81485,-1.640103,-1.59912,0.571637,0.792672,0.834448,0.712894
4,218361,B06,2020_07_02_Batch8,WT_parental,218361,218361,0.1% DMSO,sensitive,0,sample_index_4,...,0.411099,0.320412,0.553904,-0.611991,0.046112,0.141671,1.072625,-0.099179,0.244864,-0.685314


In [6]:
print(dfs["four_clone"].shape)
print(dfs["cloneAE"].shape)

(420, 338)
(240, 338)


In [7]:
for dataset in dfs:
    output_file = pathlib.Path(f"{output_dir}/bulk_profiles_{dataset}.csv.gz")
    output_gct_file = pathlib.Path(f"{output_dir}/bulk_profiles_{dataset}.gct")
    
    dfs[dataset].to_csv(output_file, sep=",", compression="gzip", index=False)
    write_gct(profiles=dfs[dataset], output_file=output_gct_file)