## Compile dataset of clones resistant to other drugs

**Gregory Way, 2021**

**Yu Han, 2021**

This script is modified from Greg Way's original scripts of 8.compile-otherclone-dataset.

This dataset includes new batches of 24~27 including WT (10, 12-15) and Bz-resistant (6-10) clones.
They are DMSO-treated clones to later apply the bortezomib resistance signature.

In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd

from pycytominer.cyto_utils import infer_cp_features

sys.path.insert(0, "../2.describe-data/scripts")
from processing_utils import load_data

In [2]:
np.random.seed(1234)

In [3]:
data_dir = pathlib.Path("../0.generate-profiles/profiles")
cell_count_dir = pathlib.Path("../0.generate-profiles/cell_counts/")

output_dir = pathlib.Path("data")

profile_suffix = "normalized.csv.gz"

In [4]:
datasets_val = {
        "2021_08_02_Batch24": ["221057"],
        "2021_08_02_Batch25": ["221058"],
        "2021_08_03_Batch26": ["221093"],
        "2021_08_03_Batch27": ["221094"],
}

In [12]:
#added 'val' to each original variable names from 8.0

full_df_val = []
for dataset_val in datasets_val:
    dataset_df_val = []
    for batch in datasets_val:
        plates = datasets_val[batch]
        
        df_val = load_data(
            batch=batch,
            plates=plates,
            profile_dir=data_dir,
            suffix=profile_suffix,
            combine_dfs=True,
            harmonize_cols=True,
            add_cell_count=True,
            cell_count_dir=cell_count_dir
        )
        
        # Add important metadata features
        df_val = df_val.assign(
            Metadata_dataset=dataset_val,
            Metadata_batch=batch,
            Metadata_clone_type="resistant",
            Metadata_clone_type_indicator=1,
            Metadata_model_split="otherclone"
        )

        df_val.loc[df_val.Metadata_clone_number.str.contains("WT"), "Metadata_clone_type"] = "sensitive"
        df_val.loc[df_val.Metadata_clone_number.str.contains("WT"), "Metadata_clone_type_indicator"] = 0
        dataset_df_val.append(df_val)

    # Merge plates of the same dataset together
    dataset_df_val = pd.concat(dataset_df_val, axis="rows", sort=False).reset_index(drop=True)
    
    # Generate a unique sample ID
    # (This will be used in singscore calculation)
    dataset_df_val = dataset_df_val.assign(
        Metadata_unique_sample_name=[f"profile_{x}_{dataset_val}" for x in range(0, dataset_df_val.shape[0])]
    )
    
    full_df_val.append(dataset_df_val)

#remove other clone types from the df
full_df_val = pd.concat(full_df_val, axis="rows", sort=False).reset_index(drop=True)
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'WT_parental']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'CloneA']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'CloneE']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'TX clone 2']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'TX clone 3']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'TX clone 4']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'TX clone 5']
full_df_val = full_df_val[full_df_val.Metadata_clone_number != 'TX clone 6']

In [11]:
full_df_val.head()

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
3,221057,B05,2021_08_02_Batch24,1814,2.5x10^3 cells/well,4,WT clone 10,20210728,221057,48 hr,...,-0.143612,-0.08332,-0.132236,-0.418824,-0.145102,-0.375636,-0.134097,-0.152386,-0.146352,-0.120216
4,221057,B06,2021_08_02_Batch24,5481,2.5x10^3 cells/well,5,WT clone 12,20210728,221057,48 hr,...,-0.429968,-0.433724,-0.457011,-0.373151,-0.479278,-0.419307,-0.405749,-0.42402,-0.407004,-0.413901
8,221057,B10,2021_08_02_Batch24,1925,2.5x10^3 cells/well,4,WT clone 10,20210728,221057,48 hr,...,0.188565,0.178594,0.208148,0.271487,0.15296,0.020198,0.150755,0.168786,0.191561,0.189532
9,221057,B11,2021_08_02_Batch24,3910,2.5x10^3 cells/well,5,WT clone 12,20210728,221057,48 hr,...,-0.794441,-0.806578,-0.774007,-0.88578,-0.794764,-0.837201,-0.832439,-0.820963,-0.826323,-0.816295
10,221057,C02,2021_08_02_Batch24,2230,2.5x10^3 cells/well,10,BZ007,20210728,221057,48 hr,...,0.88132,0.921265,1.033988,0.97868,0.970995,0.9003,0.860706,0.872132,0.880284,0.884321


In [7]:
# Reorder features
common_metadata = infer_cp_features(full_df_val, metadata=True)
morph_features = infer_cp_features(full_df_val)

full_df_val = full_df_val.reindex(common_metadata + morph_features, axis="columns")

print(full_df_val.shape)

(480, 3546)


In [8]:
pd.crosstab(full_df_val.Metadata_clone_type_indicator, full_df_val.Metadata_model_split)

Metadata_model_split,otherclone
Metadata_clone_type_indicator,Unnamed: 1_level_1
0,320
1,160


In [9]:
pd.crosstab(full_df_val.Metadata_clone_number, full_df_val.Metadata_model_split)

Metadata_model_split,otherclone
Metadata_clone_number,Unnamed: 1_level_1
BZ006,32
BZ007,32
BZ008,32
BZ009,32
BZ010,32
WT clone 10,64
WT clone 12,64
WT clone 13,64
WT clone 14,64
WT clone 15,64


In [10]:
#saved output file as 'otherclones_normalized_profiles_LAST_BATCH_VALIDATION.tsv.gz'
output_file = pathlib.Path(f"{output_dir}/otherclones_normalized_profiles_LAST_BATCH_VALIDATION.tsv.gz")
full_df_val.to_csv(output_file, sep="\t", index=False)