In [1]:
# Global random seed; passed as random_state to the estimators built later in this notebook.
seed = 123

In [2]:
# Import helper functions.
# The helper module is first copied into the working directory under the name
# p53_helpers.py so that the import below can resolve it
# (presumably /kaggle/working is on the import path — confirm).
from shutil import copyfile

copyfile(
    src = "/kaggle/input/bdc-p53/p53_helper_functions.py", 
    dst = "/kaggle/working/p53_helpers.py"
)

# NOTE(review): this star import supplies every name used in this notebook that
# is not defined here (load_p53_ds, split_p53, and presumably np, pd and the
# scikit-learn classes) — confirm against the helper module.
from p53_helpers import *

# load dataset (no NAs)
X, Y = load_p53_ds()
# stratified 80/20 split
X_train, X_test, Y_train, Y_test = split_p53(X, Y)

Using TensorFlow backend.



Import completed after 3.3 sec

CLASS RATIOS
Training set active classes:   121/24927 (0.485 %)
Test set active classes:       30/6232 (0.481 %)

MATRIX DIMENSIONS
TRAINING SET
. Features:   (24927, 5408)
. Classes:    (24927,)
TEST SET
. Features:   (6232, 5408)
. Classes:    (6232,)


In [3]:
# Downsampling for an initial search. Currently disabled (both lines below are
# commented out), so the full training set is used by the cells that follow.
#DS = RandomUnderSampler(sampling_strategy=0.01)
#X_train, Y_train = DS.fit_resample(X_train, Y_train)

# Show the shape of the training matrix that will be passed to the search.
print(X_train.shape)

(24927, 5408)


In [4]:
# Number of cross-validation folds used by the grid search.
CV = 4

# Feature-selection stage: the outputs of two selectors are concatenated.
FU = FeatureUnion(
    transformer_list=[
        # Univariate selection: keep the 20 features ranked best by f_classif.
        ("fs_univ", SelectKBest(score_func=f_classif, k=20)),
        # Model-based selection: with threshold=-inf the importance cut-off is
        # disabled, so max_features alone decides how many features are kept.
        (
            "fs_model",
            SelectFromModel(
                estimator=ExtraTreesClassifier(n_estimators=100, random_state=seed),
                threshold=-np.inf,
                max_features=125,
            ),
        ),
    ]
)

# Full pipeline: correlated-feature removal -> scaling -> feature selection -> classifier.
pipe = Pipeline(
    steps=[
        ("remove_corr", RemoveCorrelatedFeatures()),
        ("scale", StandardScaler()),
        ("fs", FU),
        (
            "clf",
            RandomForestClassifier(
                n_estimators=500,
                max_depth=15,
                min_samples_leaf=10,
                class_weight="balanced",
                random_state=seed,
            ),
        ),
    ]
)

# Hyper-parameters searched over the entire pipeline (3 x 2 x 2 = 12 candidates).
# The values listed here replace the corresponding settings given in the
# pipeline definition above for each candidate.
param_grid = {
    "remove_corr__n_remove": [10, 25, 50],
    "fs__fs_model__max_features": [100, 125],
    "clf__n_estimators": [500, 1000],
}

# Exhaustive search scored by the Matthews correlation coefficient,
# run in parallel on all available cores.
GS = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=make_scorer(matthews_corrcoef),
    cv=CV,
    n_jobs=-1,
    verbose=10,
)
GS.fit(X_train, Y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done  46 out of  48 | elapsed: 22.9min remaining:   59.7s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 23.0min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('remove_corr',
                                        RemoveCorrelatedFeatures(n_remove=500)),
                                       ('scale',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('fs',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('fs_univ',
                                                                        SelectKBest(k=20,
                                                                                    score_func=<function f_classif at 0x7f3c5faea488>)),
                                                                       ('fs_model',
                   

In [5]:
# Collect the per-candidate cross-validation results of the grid search into a table.
res = pd.DataFrame(data=GS.cv_results_)
# Display the candidates ordered from best to worst mean test score.
# The sorted frame is only displayed; `res` itself keeps its original order.
res.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__n_estimators,param_fs__fs_model__max_features,param_remove_corr__n_remove,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
10,138.788489,3.516833,1.404862,0.10908,1000,125,25,"{'clf__n_estimators': 1000, 'fs__fs_model__max...",0.584864,0.429868,0.544054,0.499041,0.514457,0.057502,1
3,96.16353,1.494475,0.971381,0.074802,500,125,10,"{'clf__n_estimators': 500, 'fs__fs_model__max_...",0.584864,0.429868,0.528936,0.509059,0.513182,0.055553,2
9,139.841172,2.389027,1.451246,0.107059,1000,125,10,"{'clf__n_estimators': 1000, 'fs__fs_model__max...",0.584864,0.429868,0.528936,0.509059,0.513182,0.055553,2
0,92.704488,1.574339,0.98618,0.100789,500,100,10,"{'clf__n_estimators': 500, 'fs__fs_model__max_...",0.569903,0.453167,0.513534,0.509059,0.511416,0.041303,4
5,96.781841,2.359921,0.938864,0.056013,500,125,50,"{'clf__n_estimators': 500, 'fs__fs_model__max_...",0.584864,0.429868,0.497667,0.531014,0.510853,0.056161,5
11,138.038637,3.790894,1.149985,0.203299,1000,125,50,"{'clf__n_estimators': 1000, 'fs__fs_model__max...",0.584864,0.429868,0.497667,0.531014,0.510853,0.056161,5
4,94.385455,2.517367,0.942348,0.066856,500,125,25,"{'clf__n_estimators': 500, 'fs__fs_model__max_...",0.584864,0.429868,0.521143,0.499041,0.508729,0.05537,7
7,127.627467,3.739864,1.361079,0.089273,1000,100,25,"{'clf__n_estimators': 1000, 'fs__fs_model__max...",0.584864,0.453167,0.506243,0.489582,0.508464,0.048105,8
2,88.059264,2.404619,1.014674,0.145325,500,100,50,"{'clf__n_estimators': 500, 'fs__fs_model__max_...",0.584864,0.453167,0.476147,0.499041,0.503305,0.049803,9
6,127.264019,2.081508,1.394601,0.08532,1000,100,10,"{'clf__n_estimators': 1000, 'fs__fs_model__max...",0.569903,0.429868,0.49018,0.509059,0.499753,0.049958,10


In [6]:
# Write the cross-validation results table to results.csv (relative path).
# `res` was not reassigned after sorting, so rows are saved in their original order.
res.to_csv("results.csv")