In [1]:
seed = 123

In [2]:
# import helper functions
from shutil import copyfile

copyfile(
    src = "/kaggle/input/bdc-p53/p53_helper_functions.py", 
    dst = "/kaggle/working/p53_helpers.py"
)

from p53_helpers import *

Using TensorFlow backend.


In [3]:
# load p53 ds (missing values removed)
X, Y = load_p53_ds()
    
# 80/20 stratified split
X_train, X_test, Y_train, Y_test = split_p53(X, Y)


Import completed after 4.4 sec

CLASS RATIOS
Training set active classes:   121/24927 (0.485 %)
Test set active classes:       30/6232 (0.481 %)

MATRIX DIMENSIONS
TRAINING SET
. Features:   (24927, 5408)
. Classes:    (24927,)
TEST SET
. Features:   (6232, 5408)
. Classes:    (6232,)


# Iterate

To evaluate the different pre-processing steps, we iterate over different algorithms and hyperparameters, more specifically:

* Remove most strongly correlated features
    * n_remove
* Center/scale
    * no scaling, StandardScaler, RobustScaler
* Univariate feature selection
    * F/MIC
    * k features
* Model-based feature selection
    * Linear/non-linear (non-LRC) models
    * m features
* Add principal components
    * n

4-fold CV is performed and the mean train and validation fold metrics (MCC and balanced accuracy) are collected for all conditions to jointly evaluate the best-performing combinations. Note: transformations are performed on the entire trainin/validation set prior to CV partitioning to speed up the iterations. Hence, validation fold performance might be expected to be a slight overestimation, proper evalution and hyperparameter optimization will be performed later.

In [4]:
# downsample majority class to speed up iterations
DS = RandomUnderSampler(sampling_strategy=0.01)
X_train, Y_train = DS.fit_resample(X_train, Y_train)

print(X_train.shape)

(12221, 5408)


In [5]:
CV = 4

# initialize df for results
results = pd.DataFrame(
    index=np.arange(1000), columns=[
    "remove_corr_n", "scaler", "fs_model", "fs_m", "fs_univ_k", "pca_n", "tot_n_features", 
    "train_mcc", "val_mcc", "train_balanced_acc", "val_balanced_acc"]
)

i = 0  # counter

# remove strongly correlated features
for n_remove in [0, 10, 100, 250, 500, 750]:
    
    if n_remove:
        rm = RemoveCorrelatedFeatures(n_remove=n_remove)
        X_t = rm.fit_transform(X_train)
    else:
        X_t = X_train

    for scaler in ["no_scaling"]:

        if scaler != "no_scaling":
            X_t = scaler.fit_transform(X_t)
            scaler_name = scaler.__class__.__name__
        else:
            scaler_name = scaler

        # dimension reduction
        X_train_pca = PCA(n_components=25).fit_transform(X_t)

        # univariate selection
        selector_F = SelectKBest(f_classif, k="all").fit(X_t, Y_train)

        # model-based selection
        for model_name, model in [
            ("LinearSVC", LinearSVC(max_iter=15000, random_state=seed, class_weight="balanced"))
        ]:

            # fit the model
            model.fit(X_t, Y_train)

            # select m best features according to model
            for m in [10, 250, 500, 1000, 4000]:

                sel = SelectFromModel(model, prefit=True, threshold=-np.inf, max_features=m)
                model_sel = set(sel.get_support(indices=True))

                # select a total of k original features based on F-stat and MIC
                for k in [10, 20, 50, 75]:

                    # get best-scoring feature indices
                    F_sel = set(selector_F.scores_.argsort()[-k // 2:][::-1])
                    
                    # take union of all selected features
                    feature_sel = F_sel | model_sel

                    # add PCs
                    for n in [0, 10, 25]:

                        if n:
                            X_train_pca_sel = X_train_pca[:, :n]

                            # join with union of selected features
                            X_train_selection = np.concatenate(
                                (X_train_pca_sel, np.array(X_t)[:, list(feature_sel)]),
                                axis=1
                            )

                        else:
                            X_train_selection = np.array(X_t)[:, list(feature_sel)]

                        # Logistic Regression model 4-fold cv
                        out = [n_remove, scaler_name, model_name, m, k, n, X_train_selection.shape[-1]]

                        print("=" * 80)
                        print(" | ".join(str(o).ljust(10) for o in out), f" |  (i={i})")
                        print("=" * 80)

                        out.extend(
                            cv(model=LogisticRegression(max_iter=5000, penalty="none", class_weight="balanced", random_state=seed),
                               x=X_train_selection,
                               y=Y_train,
                               cv=CV,
                               n_jobs=-1
                              )
                        )

                        # store results
                        for col, value in zip(results.columns, out):
                            results[col][i] = value

                        i += 1
                    

        # baseline: only scaling and removal of correlated features 
        out = [n_remove, scaler_name, "na", "na", "na", 0, X_t.shape[-1]]

        print("=" * 80)
        print(" | ".join(str(o).ljust(10) for o in out), f" |  (i={i})")
        print("=" * 80)

        out.extend(
            cv(model=LogisticRegression(max_iter=5000, penalty="none", class_weight="balanced", random_state=seed),
               x=X_t,
               y=Y_train,
               cv=CV,
               n_jobs=-1
              )
        )

        # store results
        for col, value in zip(results.columns, out):
            results[col][i] = value

        i += 1



0          | no_scaling | LinearSVC  | 10         | 10         | 0          | 15          |  (i=0)
test_balanced_acc              0.86 (± 0.00)       [0.86, 0.86, 0.87, 0.87]
test_mcc                       0.20 (± 0.00)       [0.2, 0.2, 0.21, 0.21]
--------------------------------------------------------------------------------
Training MCC:                  0.21 (± 0.01)       [0.21, 0.21, 0.23, 0.22]

0          | no_scaling | LinearSVC  | 10         | 10         | 10         | 25          |  (i=1)
test_balanced_acc              0.87 (± 0.03)       [0.87, 0.89, 0.9, 0.82]
test_mcc                       0.22 (± 0.02)       [0.21, 0.22, 0.24, 0.19]
--------------------------------------------------------------------------------
Training MCC:                  0.23 (± 0.01)       [0.21, 0.23, 0.24, 0.24]

0          | no_scaling | LinearSVC  | 10         | 10         | 25         | 40          |  (i=2)
test_balanced_acc              0.88 (± 0.03)       [0.91, 0.87, 0.92, 0.83]
test_mcc  



10         | no_scaling | LinearSVC  | 10         | 10         | 0          | 15          |  (i=61)
test_balanced_acc              0.86 (± 0.01)       [0.86, 0.87, 0.87, 0.85]
test_mcc                       0.21 (± 0.00)       [0.21, 0.21, 0.21, 0.2]
--------------------------------------------------------------------------------
Training MCC:                  0.21 (± 0.01)       [0.2, 0.21, 0.23, 0.22]

10         | no_scaling | LinearSVC  | 10         | 10         | 10         | 25          |  (i=62)
test_balanced_acc              0.87 (± 0.03)       [0.87, 0.89, 0.9, 0.81]
test_mcc                       0.22 (± 0.02)       [0.21, 0.23, 0.25, 0.19]
--------------------------------------------------------------------------------
Training MCC:                  0.23 (± 0.01)       [0.21, 0.23, 0.24, 0.25]

10         | no_scaling | LinearSVC  | 10         | 10         | 25         | 40          |  (i=63)
test_balanced_acc              0.88 (± 0.03)       [0.91, 0.87, 0.92, 0.84]
test_mc



100        | no_scaling | LinearSVC  | 10         | 10         | 0          | 15          |  (i=122)
test_balanced_acc              0.86 (± 0.01)       [0.86, 0.87, 0.87, 0.85]
test_mcc                       0.21 (± 0.00)       [0.21, 0.21, 0.21, 0.2]
--------------------------------------------------------------------------------
Training MCC:                  0.21 (± 0.01)       [0.2, 0.21, 0.23, 0.22]

100        | no_scaling | LinearSVC  | 10         | 10         | 10         | 25          |  (i=123)
test_balanced_acc              0.86 (± 0.03)       [0.87, 0.89, 0.88, 0.81]
test_mcc                       0.22 (± 0.02)       [0.21, 0.23, 0.23, 0.19]
--------------------------------------------------------------------------------
Training MCC:                  0.23 (± 0.01)       [0.22, 0.23, 0.24, 0.24]

100        | no_scaling | LinearSVC  | 10         | 10         | 25         | 40          |  (i=124)
test_balanced_acc              0.88 (± 0.03)       [0.91, 0.87, 0.92, 0.84]
tes



250        | no_scaling | LinearSVC  | 10         | 10         | 0          | 15          |  (i=183)
test_balanced_acc              0.86 (± 0.01)       [0.86, 0.87, 0.87, 0.85]
test_mcc                       0.21 (± 0.00)       [0.21, 0.21, 0.21, 0.2]
--------------------------------------------------------------------------------
Training MCC:                  0.21 (± 0.01)       [0.2, 0.21, 0.23, 0.22]

250        | no_scaling | LinearSVC  | 10         | 10         | 10         | 25          |  (i=184)
test_balanced_acc              0.86 (± 0.03)       [0.87, 0.89, 0.88, 0.81]
test_mcc                       0.21 (± 0.02)       [0.21, 0.22, 0.24, 0.19]
--------------------------------------------------------------------------------
Training MCC:                  0.23 (± 0.01)       [0.21, 0.23, 0.24, 0.24]

250        | no_scaling | LinearSVC  | 10         | 10         | 25         | 40          |  (i=185)
test_balanced_acc              0.88 (± 0.03)       [0.91, 0.87, 0.92, 0.84]
tes



500        | no_scaling | LinearSVC  | 10         | 10         | 0          | 15          |  (i=244)
test_balanced_acc              0.86 (± 0.02)       [0.86, 0.88, 0.85, 0.83]
test_mcc                       0.20 (± 0.01)       [0.2, 0.22, 0.2, 0.19]
--------------------------------------------------------------------------------
Training MCC:                  0.22 (± 0.01)       [0.2, 0.21, 0.23, 0.23]

500        | no_scaling | LinearSVC  | 10         | 10         | 10         | 25          |  (i=245)
test_balanced_acc              0.86 (± 0.04)       [0.85, 0.89, 0.9, 0.79]
test_mcc                       0.21 (± 0.03)       [0.2, 0.22, 0.25, 0.18]
--------------------------------------------------------------------------------
Training MCC:                  0.24 (± 0.01)       [0.22, 0.23, 0.25, 0.25]

500        | no_scaling | LinearSVC  | 10         | 10         | 25         | 40          |  (i=246)
test_balanced_acc              0.89 (± 0.03)       [0.91, 0.87, 0.93, 0.85]
test_m



750        | no_scaling | LinearSVC  | 10         | 10         | 0          | 15          |  (i=305)
test_balanced_acc              0.86 (± 0.02)       [0.88, 0.88, 0.85, 0.84]
test_mcc                       0.20 (± 0.01)       [0.21, 0.21, 0.2, 0.2]
--------------------------------------------------------------------------------
Training MCC:                  0.22 (± 0.01)       [0.21, 0.21, 0.21, 0.24]

750        | no_scaling | LinearSVC  | 10         | 10         | 10         | 25          |  (i=306)
test_balanced_acc              0.86 (± 0.03)       [0.88, 0.87, 0.89, 0.81]
test_mcc                       0.21 (± 0.02)       [0.22, 0.21, 0.23, 0.19]
--------------------------------------------------------------------------------
Training MCC:                  0.22 (± 0.01)       [0.21, 0.22, 0.23, 0.24]

750        | no_scaling | LinearSVC  | 10         | 10         | 25         | 40          |  (i=307)
test_balanced_acc              0.88 (± 0.04)       [0.91, 0.87, 0.92, 0.82]
tes

In [6]:
results = results.dropna(axis=0)
results.sort_values("val_mcc", ascending=False)

Unnamed: 0,remove_corr_n,scaler,fs_model,fs_m,fs_univ_k,pca_n,tot_n_features,train_mcc,val_mcc,train_balanced_acc,val_balanced_acc
25,0,no_scaling,LinearSVC,500,10,10,511,1,0.579906,1,0.79903
90,10,no_scaling,LinearSVC,500,20,25,531,1,0.565837,1,0.783159
34,0,no_scaling,LinearSVC,500,75,10,539,1,0.554758,1,0.794895
32,0,no_scaling,LinearSVC,500,50,25,544,1,0.549863,1,0.77465
153,100,no_scaling,LinearSVC,500,50,10,529,1,0.548383,1,0.775054
...,...,...,...,...,...,...,...,...,...,...,...
0,0,no_scaling,LinearSVC,10,10,0,15,0.214327,0.204679,0.882282,0.864615
305,750,no_scaling,LinearSVC,10,10,0,15,0.215483,0.203306,0.88102,0.860934
244,500,no_scaling,LinearSVC,10,10,0,15,0.215693,0.202758,0.88191,0.85781
247,500,no_scaling,LinearSVC,10,20,0,20,0.219417,0.20208,0.886858,0.85749


In [7]:
results.to_csv(f"/kaggle/working/results_{scaler_name}.csv")