## Compile dataset of clones resistant to other drugs

**Gregory Way, 2021**

**Yu Han, 2021**

This script is modified from Greg Way's original scripts of 8.compile-otherclone-dataset.

This dataset includes new batches of 24~27 including WT (10, 12-15) and Bz-resistant (6-10) clones.
They are DMSO-treated clones to later apply the bortezomib resistance signature.

In [2]:
import sys
import pathlib
import numpy as np
import pandas as pd

from pycytominer.cyto_utils import infer_cp_features

sys.path.insert(0, "../2.describe-data/scripts")
from processing_utils import load_data

In [3]:
np.random.seed(1234)

In [4]:
data_dir = pathlib.Path("../0.generate-profiles/profiles")
cell_count_dir = pathlib.Path("../0.generate-profiles/cell_counts/")

output_dir = pathlib.Path("data")

profile_suffix = "normalized.csv.gz"

In [5]:
#Load new batches 24~27
datasets_val = {
    "WT": {
        "2021_08_02_Batch24": ["221057"],
        "2021_08_02_Batch25": ["221058"],
        "2021_08_03_Batch26": ["221093"],
        "2021_08_03_Batch27": ["221094"],
    },
    "BZ": {
        "2021_08_02_Batch24": ["221057"],
        "2021_08_02_Batch25": ["221058"],
    }
}

In [6]:
#added 'val' to each original variable names from 8.0

full_df_val = []
for dataset_val in datasets_val:
    dataset_df_val = []
    for batch in datasets_val[dataset_val]:
        plates = datasets_val[dataset_val][batch]
        
        df_val = load_data(
            batch=batch,
            plates=plates,
            profile_dir=data_dir,
            suffix=profile_suffix,
            combine_dfs=True,
            harmonize_cols=True,
            add_cell_count=True,
            cell_count_dir=cell_count_dir
        )
        
        # Add important metadata features
        df_val = df_val.assign(
            Metadata_dataset=dataset_val,
            Metadata_batch=batch,
            Metadata_clone_type="resistant",
            Metadata_clone_type_indicator=1,
            Metadata_model_split="otherclone"
        )

        df_val.loc[df_val.Metadata_clone_number.str.contains("WT"), "Metadata_clone_type"] = "sensitive"
        df_val.loc[df_val.Metadata_clone_number.str.contains("WT"), "Metadata_clone_type_indicator"] = 0
        dataset_df_val.append(df_val)

    # Merge plates of the same dataset together
    dataset_df_val = pd.concat(dataset_df_val, axis="rows", sort=False).reset_index(drop=True)
    
    # Generate a unique sample ID
    # (This will be used in singscore calculation)
    dataset_df_val = dataset_df_val.assign(
        Metadata_unique_sample_name=[f"profile_{x}_{dataset_val}" for x in range(0, dataset_df_val.shape[0])]
    )
    
    full_df_val.append(dataset_df_val)

#remove other clone types from the df
full_df_val = pd.concat(full_df_val, axis="rows", sort=False).reset_index(drop=True)
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'WT_parental']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'CloneA']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'CloneE']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'TX clone 2']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'TX clone 3']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'TX clone 4']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'TX clone 5']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'TX clone 6']
full_df_val

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,...,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03,Metadata_dataset,Metadata_clone_type,Metadata_clone_type_indicator,Metadata_model_split,Metadata_unique_sample_name
3,221057,B05,2021_08_02_Batch24,1814,2.5x10^3 cells/well,4,WT clone 10,20210728,221057,48 hr,...,-0.375636,-0.134097,-0.152386,-0.146352,-0.120216,WT,sensitive,0,otherclone,profile_3_WT
4,221057,B06,2021_08_02_Batch24,5481,2.5x10^3 cells/well,5,WT clone 12,20210728,221057,48 hr,...,-0.419307,-0.405749,-0.424020,-0.407004,-0.413901,WT,sensitive,0,otherclone,profile_4_WT
8,221057,B10,2021_08_02_Batch24,1925,2.5x10^3 cells/well,4,WT clone 10,20210728,221057,48 hr,...,0.020198,0.150755,0.168786,0.191561,0.189532,WT,sensitive,0,otherclone,profile_8_WT
9,221057,B11,2021_08_02_Batch24,3910,2.5x10^3 cells/well,5,WT clone 12,20210728,221057,48 hr,...,-0.837201,-0.832439,-0.820963,-0.826323,-0.816295,WT,sensitive,0,otherclone,profile_9_WT
10,221057,C02,2021_08_02_Batch24,2230,2.5x10^3 cells/well,10,BZ007,20210728,221057,48 hr,...,0.900300,0.860706,0.872132,0.880284,0.884321,WT,resistant,1,otherclone,profile_10_WT
11,221057,C03,2021_08_02_Batch24,3910,2.5x10^3 cells/well,9,BZ006,20210728,221057,48 hr,...,-0.569203,-0.530383,-0.554569,-0.533412,-0.544110,WT,resistant,1,otherclone,profile_11_WT
12,221057,C04,2021_08_02_Batch24,3026,2.5x10^3 cells/well,8,WT clone 15,20210728,221057,48 hr,...,-0.921771,-0.885672,-0.858264,-0.871334,-0.858945,WT,sensitive,0,otherclone,profile_12_WT
13,221057,C05,2021_08_02_Batch24,3129,2.5x10^3 cells/well,7,WT clone 14,20210728,221057,48 hr,...,2.492172,2.475638,2.409932,2.409553,2.397484,WT,sensitive,0,otherclone,profile_13_WT
14,221057,C06,2021_08_02_Batch24,4511,2.5x10^3 cells/well,6,WT clone 13,20210728,221057,48 hr,...,1.141587,1.106881,1.066880,1.091601,1.084607,WT,sensitive,0,otherclone,profile_14_WT
15,221057,C07,2021_08_02_Batch24,2482,2.5x10^3 cells/well,10,BZ007,20210728,221057,48 hr,...,1.414635,1.425230,1.437950,1.402305,1.434483,WT,resistant,1,otherclone,profile_15_WT


In [7]:
# Reorder features
common_metadata = infer_cp_features(full_df_val, metadata=True)
morph_features = infer_cp_features(full_df_val)

full_df_val = full_df_val.reindex(common_metadata + morph_features, axis="columns")

print(full_df_val.shape)

(200, 3546)


In [21]:
pd.crosstab(full_df_val.Metadata_clone_type_indicator, full_df_val.Metadata_model_split)

Metadata_model_split,otherclone
Metadata_clone_type_indicator,Unnamed: 1_level_1
0,120
1,80


In [22]:
pd.crosstab(full_df_val.Metadata_clone_number, full_df_val.Metadata_model_split)

Metadata_model_split,otherclone
Metadata_clone_number,Unnamed: 1_level_1
BZ006,16
BZ007,16
BZ008,16
BZ009,16
BZ010,16
WT clone 10,24
WT clone 12,24
WT clone 13,24
WT clone 14,24
WT clone 15,24


In [25]:
#saved output file as 'otherclones_normalized_profiles_LAST_BATCH_VALIDATION.tsv.gz'
output_file = pathlib.Path(f"{output_dir}/otherclones_normalized_profiles_LAST_BATCH_VALIDATION.tsv.gz")
full_df_val.to_csv(output_file, sep="\t", index=False)