# Train Classifier Models to Predict Cell Health Phenotypes

**Gregory Way, 2019**

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier, ElasticNet
from sklearn.pipeline import Pipeline

from scripts.ml_utils import load_train_test, CellHealthPredict

In [3]:
np.random.seed(123)

## Load Data

In [4]:
consensus = "median"

In [5]:
x_train_df, x_test_df, y_train_df, y_test_df = (
    load_train_test(drop_metadata=True, consensus=consensus)
)
x_meta_train_df, x_meta_test_df, y_meta_train_df, y_meta_test_df = (
    load_train_test(output_metadata_only=True, consensus=consensus)
)

In [6]:
cell_lines = list(set(x_meta_train_df.Metadata_cell_line))
cell_lines

['ES2', 'A549', 'HCC44']

In [7]:
print(x_train_df.shape)
x_train_df.head(3)

(303, 1728)


Unnamed: 0_level_0,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_MajorAxisLength,Cells_AreaShape_MaxFeretDiameter,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
Metadata_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
profile_340,0.186717,-0.345053,0.373624,0.832899,-0.392186,0.0,-2.324244,-2.534054,0.209023,0.383929,...,-2.940099,-3.370468,-2.953973,-3.247083,-3.439135,-3.18017,-3.613814,-2.666236,-2.166383,-2.645068
profile_6,0.110125,0.089523,0.197536,0.284731,0.506326,0.0,-0.137631,0.062794,0.185235,0.187253,...,-0.398886,0.209164,0.293149,0.300067,0.55207,0.686105,0.46847,0.175433,0.227441,0.068244
profile_79,-0.028882,-0.063144,0.321617,1.861735,1.509734,0.0,-1.748299,-0.33905,0.195859,0.210499,...,-0.733359,-1.15317,-0.552812,-1.208141,-0.956042,-0.659138,-0.774468,-0.790275,-0.613076,-1.155496


## Setup Cross Validation

In [8]:
alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
l1_ratios = [0.1, 0.12, 0.14, 0.16, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
n_folds = 5

In [9]:
regression_parameters = {
    'regress__alpha': alphas,
    'regress__l1_ratio': l1_ratios
}

clf_parameters = {
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': alphas,
    'classify__l1_ratio': l1_ratios
}

In [10]:
estimator_regressor = Pipeline(
    steps=[(
        "regress",
        ElasticNet(
            random_state=42,
            max_iter=2000,
            tol=1e-3
        )
        
    )]
)

estimator_classifier = Pipeline(
    steps=[(
        'classify',
        SGDClassifier(
            random_state=42,
            class_weight='balanced',
            max_iter=2000,
            shuffle=True,
            tol=1e-3
        )
    )]
)

In [11]:
# Y labels and transform instructions
cell_health_targets = y_train_df.columns.tolist()
y_transforms = ["raw", "binarize"]

## Train Models

In [12]:
%%time

cv_results_list = []
roc_results_list = []
pr_results_list = []
all_coefs_list = []
all_y_label_list = []
regression_results_list = []
cannot_fit_list = []

for cell_health_target in cell_health_targets[0:1]:
    print(cell_health_target)
    
    for y_transform in y_transforms:

        if y_transform == "binarize":
            estimator = estimator_classifier
            scoring = "roc_auc"
            parameters = clf_parameters
            decision_function = True
        else:
            estimator = estimator_regressor
            scoring = "r2"
            parameters = regression_parameters
            decision_function = False
    
        for shuffle_type in [True, False]:

            # Initialize class
            chp = CellHealthPredict(
                x_df=x_train_df,
                y_df=y_train_df,
                parameters=parameters,
                estimator=estimator,
                n_folds=n_folds,
                cv_scoring=scoring,
                shuffle=shuffle_type
            )

            # Fit model
            is_fit = chp.fit_cell_health_target(cell_health_target,
                                                y_transform=y_transform,
                                                binarize_fit="median")
            
            if not is_fit:
                cannot_fit_list.append([cell_health_target, y_transform, shuffle_type])
                continue

            # Get performance metrics for training and testing
            metric_a, metric_b, y_true, y_pred = chp.get_performance(
                decision_function=decision_function,
                return_y=True,
                binarize_fit="median"
            )
            metric_test_a, metric_test_b, y_test_true, y_test_pred = chp.get_performance(
                x_test=x_test_df,
                y_test=y_test_df,
                decision_function=decision_function,
                return_y=True,
                binarize_fit="median",
                data_fit_type="test",
            )

            # Get Cell Line Specific Performance
            cell_line_metrics_a = []
            cell_line_metrics_b = []
            for cell_line in cell_lines:
                meta_train_subset_df = x_meta_train_df.query("Metadata_cell_line == @cell_line")
                meta_test_subset_df = x_meta_test_df.query("Metadata_cell_line == @cell_line")

                # Get Cell Line Specific Training Performance
                x_cell_line_df = x_train_df.reindex(meta_train_subset_df.index, axis="rows")
                y_cell_line_df = y_train_df.reindex(meta_train_subset_df.index, axis="rows")

                metric_cell_train_a, metric_cell_train_b, y_cell_train_true, y_cell_train_pred = (
                    chp.get_performance(
                        x_test=x_cell_line_df,
                        y_test=y_cell_line_df,
                        decision_function=decision_function,
                        return_y=True,
                        binarize_fit="median",
                        cell_line=cell_line
                    )
                )

                # Get Cell Line Specific Test Performance
                x_cell_line_df = x_test_df.reindex(meta_test_subset_df.index, axis="rows")
                y_cell_line_df = y_test_df.reindex(meta_test_subset_df.index, axis="rows")

                metric_cell_test_a, metric_cell_test_b, y_cell_test_true, y_cell_test_pred = (
                    chp.get_performance(
                        x_test=x_cell_line_df,
                        y_test=y_cell_line_df,
                        decision_function=decision_function,
                        return_y=True,
                        binarize_fit="median",
                        data_fit_type="test",
                        cell_line=cell_line
                    )
                )

                cell_line_metrics_a += [metric_cell_train_a, metric_cell_test_a]
                cell_line_metrics_b += [metric_cell_train_b, metric_cell_test_b]

            # Combine training and testing results
            if y_transform == "binarize":
                roc_results_list.append(pd.concat([metric_a, metric_test_a], axis='rows'))
                roc_results_list.append(pd.concat(cell_line_metrics_a, axis="rows"))
                pr_results_list.append(pd.concat([metric_b, metric_test_b], axis='rows'))
                pr_results_list.append(pd.concat(cell_line_metrics_b, axis="rows"))
            else:
                regression_results_list.append(pd.concat([metric_a, metric_test_a], axis='rows'))
                regression_results_list.append(pd.concat([metric_b, metric_test_b], axis='rows'))
                regression_results_list.append(pd.concat(cell_line_metrics_a, axis='rows'))
                regression_results_list.append(pd.concat(cell_line_metrics_b, axis="rows"))

            # Save cross validation results
            cv_results_list.append(chp.get_cv_results())

            # Save the model coefficients
            model_file = "cell_health_{}_target_{}_shuffle_{}_transform_{}.joblib".format(
                consensus, cell_health_target, shuffle_type, y_transform
            )
            model_file = os.path.join("models", model_file)
            coef_df = chp.get_coefficients(save_model=True, model_file=model_file)
            all_coefs_list.append(coef_df)
        
            # Store y predictions recoded values
            all_y_label_list.append(pd.concat([y_true, y_test_true, y_pred, y_test_pred]))

cc_all_high_n_spots_h2ax_mean








CPU times: user 16min 3s, sys: 22.4 s, total: 16min 25s
Wall time: 1min 33s


In [12]:
# Acquire output metrics
full_cv_df = pd.concat(cv_results_list).reset_index(drop=True)
full_regression_results_df = pd.concat(regression_results_list).reset_index(drop=True)
full_roc_df = pd.concat(roc_results_list).reset_index(drop=True)
full_pr_df = pd.concat(pr_results_list).reset_index(drop=True)
full_coef_df = pd.concat(all_coefs_list).reset_index(drop=True)
full_y_df = pd.concat(all_y_label_list).reset_index(drop=True)

In [13]:
# Save all results
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

file = os.path.join(results_dir, "full_cell_health_cv_results_{}.tsv.gz".format(consensus))
full_cv_df.to_csv(file, sep='\t', index=False)

file = os.path.join(results_dir, "full_cell_health_regression_{}.tsv.gz".format(consensus))
full_regression_results_df.to_csv(file, sep='\t', index=False)

file = os.path.join(results_dir, "full_cell_health_roc_results_{}.tsv.gz".format(consensus))
full_roc_df.to_csv(file, sep='\t', index=False)

file = os.path.join(results_dir, "full_cell_health_pr_results_{}.tsv.gz".format(consensus))
full_pr_df.to_csv(file, sep='\t', index=False)

file = os.path.join(results_dir, "full_cell_health_coefficients_{}.tsv.gz".format(consensus))
full_coef_df.to_csv(file, sep='\t', index=False)

file = os.path.join(results_dir, "full_cell_health_y_labels_{}.tsv.gz".format(consensus))
full_y_df.to_csv(file, sep='\t', index=False)