# Train Regression and Classifier Models to Predict Cell Health Phenotypes

**Gregory Way, 2019**

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier, ElasticNet
from sklearn.pipeline import Pipeline

from scripts.ml_utils import load_train_test, CellHealthPredict

In [2]:
# Pin NumPy's global random number generator so that anything drawing from it
# produces the same values on every fresh run of this notebook.
np.random.seed(seed=123)

## Load Data

In [3]:
# Load the train/test feature and target tables from the project loader.
# NOTE(review): drop_metadata=True presumably strips non-feature metadata
# columns from the returned frames - confirm in scripts/ml_utils.
(
    x_train_df,
    x_test_df,
    y_train_df,
    y_test_df,
) = load_train_test(drop_metadata=True)

## Set Up Cross Validation

In [4]:
# Number of cross validation folds
n_folds = 5

# Candidate values for the `alpha` hyperparameter
alphas = [
    0.01,
    0.1, 0.2, 0.3, 0.4,
    0.5, 0.6, 0.7, 0.8,
]

# Candidate values for the `l1_ratio` hyperparameter
l1_ratios = [
    0.1, 0.12, 0.14, 0.16,
    0.2, 0.3, 0.4, 0.5,
    0.7, 0.8, 0.9,
]

In [5]:
# Hyperparameter grids. Keys follow the "<pipeline step>__<parameter>" naming,
# where the step names ("regress", "classify") match the pipelines that these
# grids are paired with.

# Grid searched for the regression pipeline
regression_parameters = dict(
    regress__alpha=alphas,
    regress__l1_ratio=l1_ratios,
)

# Grid searched for the classification pipeline; loss and penalty are fixed to
# a single option each, so only alpha and l1_ratio actually vary.
# NOTE(review): newer scikit-learn releases rename the 'log' loss to
# 'log_loss' - confirm against the pinned scikit-learn version before upgrading.
clf_parameters = dict(
    classify__loss=['log'],
    classify__penalty=['elasticnet'],
    classify__alpha=alphas,
    classify__l1_ratio=l1_ratios,
)

In [6]:
# Single-step pipelines. The step names ("regress" / "classify") are the
# prefixes used by the hyperparameter grid keys.

# Elastic net regressor with a fixed seed
elastic_net_model = ElasticNet(random_state=42, max_iter=2000, tol=1e-3)
estimator_regressor = Pipeline(steps=[("regress", elastic_net_model)])

# SGD classifier with a fixed seed and balanced class weights; its loss and
# penalty are not set here and are expected to come from the search grid.
sgd_model = SGDClassifier(
    random_state=42,
    class_weight='balanced',
    max_iter=2000,
    shuffle=True,
    tol=1e-3,
)
estimator_classifier = Pipeline(steps=[('classify', sgd_model)])

In [7]:
# Every column of the training target table is modelled as its own target
cell_health_targets = y_train_df.columns.tolist()

# Target transformations attempted for each column. "binarize" is routed to the
# classifier; the other two are routed to the regressor.
# NOTE(review): the transforms themselves are applied inside CellHealthPredict -
# confirm their exact definitions in scripts/ml_utils.
y_transforms = [
    "raw",
    "zero-one",
    "binarize",
]

## Train Models

In [8]:
%%time

cv_results_list = []
roc_results_list = []
pr_results_list = []
all_coefs_list = []
all_y_label_list = []
regression_results_list = []
cannot_fit_list = []

for cell_health_target in cell_health_targets:
    print(cell_health_target)
    
    for y_transform in y_transforms:

        if y_transform == "binarize":
            estimator = estimator_classifier
            scoring = "roc_auc"
            parameters = clf_parameters
            decision_function = True
        else:
            estimator = estimator_regressor
            scoring = "r2"
            parameters = regression_parameters
            decision_function = False
    
        for shuffle_type in [True, False]:

            # Initialize class
            chp = CellHealthPredict(
                x_df=x_train_df,
                y_df=y_train_df,
                parameters=parameters,
                estimator=estimator,
                n_folds=n_folds,
                cv_scoring=scoring,
                shuffle=shuffle_type
            )

            # Fit model
            is_fit = chp.fit_cell_health_target(cell_health_target,
                                                y_transform=y_transform,
                                                binarize_fit="median")
            
            if not is_fit:
                cannot_fit_list.append([cell_health_target, y_transform, shuffle_type])
                continue

            # Get performance metrics for training and testing
            metric_a, metric_b, y_true, y_pred = chp.get_performance(
                decision_function=decision_function,
                return_y=True,
                binarize_fit="median"
            )
            metric_test_a, metric_test_b, y_test_true, y_test_pred = chp.get_performance(
                x_test=x_test_df,
                y_test=y_test_df,
                decision_function=decision_function,
                return_y=True,
                binarize_fit="median"
            )

            # Combine training and testing results
            if y_transform == "binarize":
                roc_results_list.append(pd.concat([metric_a, metric_test_a], axis='rows'))
                pr_results_list.append(pd.concat([metric_b, metric_test_b], axis='rows'))
            else:
                regression_results_list.append(pd.concat([metric_a, metric_test_a], axis='rows'))
                regression_results_list.append(pd.concat([metric_b, metric_test_b], axis='rows'))

            # Save cross validation results
            cv_results_list.append(chp.get_cv_results())

            # Save the model coefficients
            model_file = "cell_health_target_{}_shuffle_{}_transform_{}.joblib".format(
                cell_health_target, shuffle_type, y_transform
            )
            model_file = os.path.join("models", model_file)
            coef_df = chp.get_coefficients(save_model=True, model_file=model_file)
            all_coefs_list.append(coef_df)
        
            # Store y predictions recoded values
            all_y_label_list.append(pd.concat([y_true, y_test_true, y_pred, y_test_pred]))

cc_all_high_n_spots_h2ax_mean






cc_all_large_notround_polynuclear_mean






cc_all_large_round_polyploid_mean








cc_all_n_objects






cc_all_n_spots_mean






cc_all_n_spots_per_nucleus_area_mean






cc_all_nucleus_area_mean






cc_all_nucleus_roundness_mean






cc_cc_edu_pos_mean






cc_cc_g1_mean






cc_cc_g2_ph3_neg_mean






cc_cc_g2_ph3_pos_early_mitosis_mean






cc_cc_high_n_spots_h2ax_mean






cc_cc_n_objects






cc_cc_n_spots_mean






cc_cc_n_spots_per_nucleus_area_mean






cc_cc_ph3_neg_hoechst_late_mitosis_mean






cc_cc_ph3_pos_hoechst_mitosis_mean






cc_edu_pos_alexa647_intensity_nucleus_area_mean






cc_edu_pos_alexa647_intensity_nucleus_area_sum






cc_edu_pos_high_n_spots_h2ax_mean






cc_edu_pos_n_objects






cc_edu_pos_n_spots_mean






cc_edu_pos_n_spots_per_nucleus_area_mean






cc_g1_high_n_spots_h2ax_mean






cc_g1_n_objects






cc_g1_n_spots_mean






cc_g1_n_spots_per_nucleus_area_mean






cc_g1_plus_g2






cc_g2_g1






cc_g2_ph3_neg_high_n_spots_h2ax_mean






cc_g2_ph3_neg_n_objects






cc_g2_ph3_neg_n_spots_mean






cc_g2_ph3_neg_n_spots_per_nucleus_area_mean






cc_g2_ph3_pos_high_n_spots_h2ax_mean






cc_g2_ph3_pos_n_objects






cc_g2_ph3_pos_n_spots_mean






cc_g2_ph3_pos_n_spots_per_nucleus_area_mean






cc_g2_plus_all_m






cc_infection_percentage






cc_mitosis_ph3_neg_high_n_spots_h2ax_mean










cc_mitosis_ph3_neg_n_objects






cc_mitosis_ph3_neg_n_spots_mean






cc_mitosis_ph3_neg_n_spots_per_nucleus_area_mean






cc_mitosis_ph3_pos_high_n_spots_h2ax_mean






cc_mitosis_ph3_pos_n_objects






cc_mitosis_ph3_pos_n_spots_mean






cc_mitosis_ph3_pos_n_spots_per_nucleus_area_mean






cc_polynuclear_high_n_spots_h2ax_mean






cc_polynuclear_n_objects






cc_polynuclear_n_spots_mean






cc_polynuclear_n_spots_per_nucleus_area_mean






cc_polyploid_high_n_spots_h2ax_mean






cc_polyploid_n_objects






cc_polyploid_n_spots_mean






cc_polyploid_n_spots_per_nucleus_area_mean






vb_infection_percentage






vb_live_cell_area




vb_live_cell_roundness




vb_live_cell_width_length






vb_num_live_cells






vb_percent_all_apoptosis






vb_percent_all_early_apoptosis






vb_percent_all_late_apoptosis






vb_percent_caspase_dead_only




vb_percent_dead






vb_percent_dead_only






vb_percent_live






vb_ros_back_mean








vb_ros_mean






CPU times: user 1h 40min 16s, sys: 23min 42s, total: 2h 3min 58s
Wall time: 17min 44s


In [9]:
# Acquire output metrics
def _stack_results(frames):
    """Concatenate a list of pandas objects and renumber the rows from zero."""
    return pd.concat(frames).reset_index(drop=True)


full_cv_df = _stack_results(cv_results_list)
full_regression_results_df = _stack_results(regression_results_list)
full_roc_df = _stack_results(roc_results_list)
full_pr_df = _stack_results(pr_results_list)
full_coef_df = _stack_results(all_coefs_list)
full_y_df = _stack_results(all_y_label_list)

In [10]:
# Save all results
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

# (output file name, table) pairs, written in this order as tab-separated
# files without the row index.
# NOTE(review): the ".gz" suffix relies on pandas inferring gzip compression
# from the file name - confirm for the pinned pandas version.
results_to_save = [
    ("full_cell_health_cv_results.tsv.gz", full_cv_df),
    ("full_cell_health_regression_results.tsv.gz", full_regression_results_df),
    ("full_cell_health_roc_results.tsv.gz", full_roc_df),
    ("full_cell_health_pr_results.tsv.gz", full_pr_df),
    ("full_cell_health_coefficients.tsv.gz", full_coef_df),
    ("full_cell_health_y_labels.tsv.gz", full_y_df),
]

for file_name, results_df in results_to_save:
    output_file = os.path.join(results_dir, file_name)
    results_df.to_csv(output_file, sep='\t', index=False)