In [1]:
import sys
import pathlib

import joblib
import numpy as np
import pandas as pd
from scipy.stats import uniform
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

# import local modules
sys.path.append("../../")
from src.utils import shuffle_features, evaluate, train_multiclass, load_json_file

In [2]:
# fix the random seed so every stochastic step in this notebook is reproducible
seed = 0
np.random.seed(seed)

# input directories (strict resolution: they must already exist on disk)
results_dir = pathlib.Path("../../results").resolve(strict=True)
data_splits_dir = results_dir.joinpath("1.data_splits").resolve(strict=True)

# training dataset
training_dataset_path = data_splits_dir.joinpath("training_data.csv.gz").resolve(
    strict=True
)

# holdout datasets
plate_holdout_path = data_splits_dir.joinpath("plate_holdout.csv.gz").resolve(
    strict=True
)
treatment_holdout_path = data_splits_dir.joinpath("treatment_holdout.csv.gz").resolve(
    strict=True
)
wells_holdout_path = data_splits_dir.joinpath("wells_holdout.csv.gz").resolve(
    strict=True
)

# output directory for this notebook (created on first run)
modeling_dir = results_dir.joinpath("2.modeling").resolve()
modeling_dir.mkdir(exist_ok=True)

# search space handed to train_multiclass for hyperparameter tuning
# scipy's uniform(loc, scale) spans [loc, loc + scale], so C is drawn from
# [0.1, 10.1] and l1_ratio from [0, 1]
# NOTE(review): not every solver listed here supports every penalty in
# scikit-learn's LogisticRegression; presumably train_multiclass tolerates
# failed candidates -- confirm
param_grid = dict(
    estimator__C=uniform(0.1, 10),
    estimator__solver=["newton-cg", "liblinear", "sag", "saga"],
    estimator__penalty=["l1", "l2", "elasticnet"],
    estimator__l1_ratio=uniform(0, 1),
)

In [3]:
# read the injury codes JSON stored next to the data splits
injury_codes_path = data_splits_dir / "injury_codes.json"
injury_codes = load_json_file(injury_codes_path)

# read the training split into memory
training_df = pd.read_csv(training_dataset_path)

# report the dimensions, then preview the first rows
print("Shape: ", training_df.shape)
training_df.head()

Shape:  (13502, 379)


Unnamed: 0,injury_code,Mahalanobis distance,Experimental Condition [Treatment time (h)],Compound Name,Characteristics [Cell Line],Compound PubChem CID,Plate,Term Source 2 Accession,Control Type,Compound SMILES,...,Nuclei_Texture_InverseDifferenceMoment_DNA_20_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumVariance_DNA_20_0
0,0,7.51,24,DMSO,U2OS,679.0,BR00110363,EFO_0002869,Negative,CS(=O)C,...,-0.011258,9.8e-05,0.057244,0.160847,-0.083034,-0.02329,-0.066369,-0.015235,-0.035909,-0.032067
1,0,6.21,24,DMSO,U2OS,679.0,BR00110363,EFO_0002869,Negative,CS(=O)C,...,0.064689,0.025857,0.099848,0.017477,0.0213,0.058137,-0.09728,-0.073545,-0.044883,-0.01524
2,0,10.94,24,DMSO,U2OS,679.0,BR00110363,EFO_0002869,Negative,CS(=O)C,...,0.020937,0.04106,0.119247,0.111741,0.041592,0.224199,-0.088845,0.000327,-0.003115,-0.014406
3,0,7.59,24,DMSO,U2OS,679.0,BR00110363,EFO_0002869,Negative,CS(=O)C,...,0.006589,0.022156,0.036473,-0.013141,0.00869,0.06086,0.044924,0.040528,0.070877,0.072871
4,0,5.28,24,DMSO,U2OS,679.0,BR00110363,EFO_0002869,Negative,CS(=O)C,...,-0.028361,0.007213,0.023068,0.110361,0.054405,0.030157,0.06648,0.03891,0.048559,0.056829


In [4]:
# the first 33 columns are treated as metadata; every remaining column is a feature
n_meta_cols = 33
meta_cols = training_df.columns[:n_meta_cols]
feat_cols = training_df.columns[n_meta_cols:]

# y_labels = injury codes, X = morphology features
y_labels = training_df["injury_code"]
X = training_df[feat_cols].values

# this is a multi-class problem: for precision and recall to work, the labels
# are binarized into one indicator column per class
# (the class list is 0..n_classes-1, so injury codes are assumed to be contiguous)
# source: https://stackoverflow.com/questions/56090541/how-to-plot-precision-and-recall-of-multiclass-classifier
n_classes = len(np.unique(y_labels.values))
y = label_binarize(y_labels, classes=list(range(n_classes)))

# split using our newly binarized labels; stratifying on them keeps the class
# proportions consistent between the training and test datasets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=seed
)

## Training and Evaluating Multi-class Logistic Model with original dataset split


In [5]:
# fit on the original (unshuffled) training split and keep the returned model
best_model = train_multiclass(X_train, y_train, param_grid=param_grid, seed=seed)

# persist the fitted model inside the modeling directory
model_path = modeling_dir / "multi_class_model.joblib"
joblib.dump(best_model, model_path)

['/home/erikserrano/Desktop/Cytotoxic-Nuisance-Metadata-Analysis/results/2.modeling/multi_class_model.joblib']

In [6]:
# settings shared by both evaluations: original model, flagged as not shuffled
original_eval_kwargs = {"model": best_model, "shuffled": False, "seed": seed}

# score the test split first, then the training split
test_precision_recall_df, test_f1_score_df = evaluate(
    X=X_test, y=y_test, dataset="test", **original_eval_kwargs
)
train_precision_recall_df, train_f1_score_df = evaluate(
    X=X_train, y=y_train, dataset="train", **original_eval_kwargs
)

## Training and Evaluating Multi-class Logistic Model with shuffled dataset split


In [7]:
# shuffle the feature space of the training split (same seed as the rest of the
# notebook); the shuffled copy is used to train the shuffled model
# NOTE(review): the exact shuffling strategy lives in src.utils.shuffle_features
shuffled_X_train = shuffle_features(X_train, seed=seed)

In [8]:
# train a second model on the shuffled training features, using the same
# labels, search space and seed as the model trained on the original split
shuffled_best_model = train_multiclass(
    shuffled_X_train, y_train, param_grid=param_grid, seed=seed
)

In [None]:
# Evaluate the model that was trained on the shuffled feature space.
# These results are labelled shuffled=True, so they must come from
# `shuffled_best_model` (not `best_model`), matching how the holdout
# evaluations pair shuffled=True with the shuffled model.

# test split: shuffled model scored on the untouched test features
shuffle_test_precision_recall_df, shuffle_test_f1_score_df = evaluate(
    model=shuffled_best_model,
    X=X_test,
    y=y_test,
    dataset="test",
    shuffled=True,
    seed=seed,
)

# training split: shuffled model scored on the shuffled features it was fit on
shuffle_train_precision_recall_df, shuffle_train_f1_score_df = evaluate(
    model=shuffled_best_model,
    X=shuffled_X_train,
    y=y_train,
    dataset="train",
    shuffled=True,
    seed=seed,
)

## Evaluating Multi-class model with holdout data

In [None]:
# number of classes, derived from the training labels so that the holdout labels
# are binarized against the same class list as the training labels
n_classes = len(np.unique(y_labels.values))

# loading all holdouts
plate_holdout_df = pd.read_csv(plate_holdout_path)
treatment_holdout_df = pd.read_csv(treatment_holdout_path)
well_holdout_df = pd.read_csv(wells_holdout_path)


def _split_holdout(holdout_df):
    """Split one holdout dataframe into a feature matrix and binarized labels.

    Parameters
    ----------
    holdout_df : pd.DataFrame
        Holdout table containing the `feat_cols` columns and an
        "injury_code" column.

    Returns
    -------
    tuple
        (X, y), where X is the `feat_cols` values as a NumPy array and y is
        "injury_code" binarized against classes 0..n_classes-1.
    """
    # use the raw values (not a DataFrame) so the holdout input has the same
    # type as the array the models were trained on
    X_holdout = holdout_df[feat_cols].values
    y_holdout = label_binarize(
        y=holdout_df["injury_code"],
        classes=[*range(n_classes)],
    )
    return X_holdout, y_holdout


# splitting each holdout dataset into features (X) and binarized labels (y)
# (the `holout` spelling is kept because later cells reference these names)
X_plate_holdout, y_plate_holout = _split_holdout(plate_holdout_df)
X_treatment_holdout, y_treatment_holout = _split_holdout(treatment_holdout_df)
X_well_holdout, y_well_holout = _split_holdout(well_holdout_df)

### Evaluating Multi-class models trained with original and shuffled splits with holdout data

In [None]:
def _evaluate_holdout(X_holdout, y_holdout, dataset_name):
    """Score one holdout set with the original model, then the shuffled model.

    Both calls use the same features, labels, dataset name and seed; only the
    model and the `shuffled` flag differ.

    Returns
    -------
    tuple
        (original_scores, shuffled_scores), each being whatever `evaluate`
        returns (unpacked by the callers below into a precision/recall table
        and an F1 table).
    """
    original_scores = evaluate(
        model=best_model,
        X=X_holdout,
        y=y_holdout,
        dataset=dataset_name,
        shuffled=False,
        seed=seed,
    )
    shuffled_scores = evaluate(
        model=shuffled_best_model,
        X=X_holdout,
        y=y_holdout,
        dataset=dataset_name,
        shuffled=True,
        seed=seed,
    )
    return original_scores, shuffled_scores


# plate holdout
(
    (plate_ho_precision_recall_df, plate_ho_f1_score_df),
    (plate_ho_shuffle_precision_recall_df, plate_ho_shuffle_train_f1_score_df),
) = _evaluate_holdout(X_plate_holdout, y_plate_holout, "plate_holdout")

# treatment holdout
(
    (treatment_ho_precision_recall_df, treatment_ho_f1_score_df),
    (
        treatment_ho_shuffle_precision_recall_df,
        treatment_ho_shuffle_train_f1_score_df,
    ),
) = _evaluate_holdout(X_treatment_holdout, y_treatment_holout, "treatment_holdout")

# well holdout
(
    (well_ho_precision_recall_df, well_ho_test_f1_score_df),
    (well_ho_shuffle_precision_recall_df, well_ho_shuffle_train_f1_score_df),
) = _evaluate_holdout(X_well_holdout, y_well_holout, "well_holdout")

In [None]:
# gather every F1 table: train/test splits first, then the three holdouts,
# each with its original-model result followed by its shuffled-model result
f1_score_frames = [
    test_f1_score_df,
    train_f1_score_df,
    shuffle_test_f1_score_df,
    shuffle_train_f1_score_df,
    plate_ho_f1_score_df,
    plate_ho_shuffle_train_f1_score_df,
    treatment_ho_f1_score_df,
    treatment_ho_shuffle_train_f1_score_df,
    well_ho_test_f1_score_df,
    well_ho_shuffle_train_f1_score_df,
]
all_f1_scores = pd.concat(f1_score_frames)

# write the combined table as a gzip-compressed CSV without the index
all_f1_scores_path = modeling_dir / "all_f1_scores.csv.gz"
all_f1_scores.to_csv(all_f1_scores_path, index=False, compression="gzip")

In [None]:
# storing pr scores
# The list mirrors the order used for the F1 scores: original train/test
# results, shuffled train/test results, then each holdout (original followed by
# shuffled). The original-model test/train tables must be included here;
# listing the shuffled tables twice would duplicate their rows and drop the
# original-model curves from the saved file.
all_pr_scores = pd.concat(
    [
        test_precision_recall_df,
        train_precision_recall_df,
        shuffle_test_precision_recall_df,
        shuffle_train_precision_recall_df,
        plate_ho_precision_recall_df,
        plate_ho_shuffle_precision_recall_df,
        treatment_ho_precision_recall_df,
        treatment_ho_shuffle_precision_recall_df,
        well_ho_precision_recall_df,
        well_ho_shuffle_precision_recall_df,
    ]
)

# saving pr scores as a gzip-compressed CSV without the index
all_pr_scores.to_csv(
    modeling_dir / "precision_recall_scores.csv.gz", index=False, compression="gzip"
)