In [1]:
# Notebook-wide random seed; passed as random_state to the estimators below.
seed: int = 123

In [2]:
from shutil import copyfile

# Copy the helper module out of the input dataset into the working directory,
# saving it under the module name that the import below expects.
copyfile("/kaggle/input/bdc-p53/p53_helper_functions.py", "/kaggle/working/p53_helpers.py")

# Star import: supplies load_p53_ds / split_p53 and, presumably, the
# numpy / pandas / scikit-learn / imbalanced-learn names used by later cells
# (no other import is visible in this notebook) -- TODO confirm against the helper.
from p53_helpers import *

# Load the p53 dataset (missing values removed).
X, Y = load_p53_ds()

# Stratified 80/20 train/test split.
X_train, X_test, Y_train, Y_test = split_p53(X, Y)

Using TensorFlow backend.



Import completed after 2.9 sec

CLASS RATIOS
Training set active classes:   121/24927 (0.485 %)
Test set active classes:       30/6232 (0.481 %)

MATRIX DIMENSIONS
TRAINING SET
. Features:   (24927, 5408)
. Classes:    (24927,)
TEST SET
. Features:   (6232, 5408)
. Classes:    (6232,)


In [3]:
# Downsample the majority class to speed up iterations.
# sampling_strategy=0.01 requests a minority:majority ratio of 1:100 after
# resampling. random_state pins which majority rows are kept, so the reduced
# training set -- and every CV score computed from it -- is repeatable.
# NOTE: this rebinds X_train / Y_train; the full training split from the
# previous cell is no longer reachable under these names afterwards.
DS = RandomUnderSampler(sampling_strategy=0.01, random_state=seed)
X_train, Y_train = DS.fit_resample(X_train, Y_train)

print(X_train.shape)

(12221, 5408)


In [4]:
# Number of cross-validation folds used by the grid search.
CV = 4

# Finer grid search over the dimension-reduction pipeline.
# Three feature views are computed in parallel and concatenated:
#   * pca      - principal components (n_components is tuned by the grid)
#   * fs_univ  - univariate ANOVA F-test selection (k is tuned by the grid)
#   * fs_model - features ranked by a linear SVM; threshold=-np.inf disables
#                the importance cut-off so exactly max_features=500 are kept.
FU = FeatureUnion([
    # random_state is set because PCA's default svd_solver="auto" switches to
    # the randomized solver for inputs of this size with few components;
    # without a seed the projected features differ from run to run.
    ("pca", PCA(random_state=seed)),
    ("fs_univ", SelectKBest(f_classif)),
    ("fs_model", SelectFromModel(
        LinearSVC(max_iter=15000, random_state=seed, class_weight="balanced"),
        threshold=-np.inf,
        max_features=500,
    )),
])

pipe = Pipeline([
    # RemoveCorrelatedFeatures comes from the helper module; n_remove=500 here
    # is only a placeholder, the grid below overrides it for every candidate.
    ("remove_corr", RemoveCorrelatedFeatures(n_remove=500)),
    ("ss", StandardScaler()),
    ("pp", FU),
    # Unpenalised, class-weighted logistic regression as the final classifier.
    ("clf", LogisticRegression(max_iter=5000, penalty="none", class_weight="balanced", random_state=seed)),
])

# 4 x 3 x 3 = 36 candidate settings.
param_grid = dict(
    remove_corr__n_remove=[0, 50, 100, 500],
    pp__pca__n_components=[10, 25, 50],
    pp__fs_univ__k=[10, 30, 50],
)

# Candidates are ranked by Matthews correlation coefficient; all cores are used.
GS = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=CV,
    verbose=10,
    scoring=make_scorer(matthews_corrcoef),
    n_jobs=-1,
)
GS.fit(X_train, Y_train)

Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 50.5min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 68.3min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 91.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 113.5min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 143.1min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 174.0min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 205.5min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 239.0min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 279.7min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 322.0min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 363.4min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 383.0min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('remove_corr',
                                        RemoveCorrelatedFeatures(n_remove=500)),
                                       ('ss',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pp',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('pca',
                                                                        PCA(copy=True,
                                                                            iterated_power='auto',
                                                                            n_components=None,
                                                        

In [5]:
# Tabulate the per-candidate cross-validation metrics and show them with the
# highest mean test score first.
results = pd.DataFrame(GS.cv_results_)
results.sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pp__fs_univ__k,param_pp__pca__n_components,param_remove_corr__n_remove,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
31,542.443622,98.152052,0.453263,0.037704,50,25,500,"{'pp__fs_univ__k': 50, 'pp__pca__n_components'...",0.462098,0.572572,0.450545,0.503757,0.497243,0.047783,1
32,672.412304,105.062954,0.518858,0.026265,50,50,0,"{'pp__fs_univ__k': 50, 'pp__pca__n_components'...",0.488394,0.52989,0.457167,0.432433,0.476971,0.036424,2
35,518.289045,95.032412,0.398962,0.066941,50,50,500,"{'pp__fs_univ__k': 50, 'pp__pca__n_components'...",0.452839,0.421114,0.488359,0.528705,0.472754,0.040117,3
11,543.629566,105.190868,0.439512,0.037869,10,50,500,"{'pp__fs_univ__k': 10, 'pp__pca__n_components'...",0.43493,0.467145,0.432214,0.512068,0.461589,0.03222,4
6,631.283652,86.245669,0.406623,0.005595,10,25,100,"{'pp__fs_univ__k': 10, 'pp__pca__n_components'...",0.435815,0.468036,0.432214,0.503757,0.459956,0.02888,5
19,537.438967,97.703835,0.42636,0.038049,30,25,500,"{'pp__fs_univ__k': 30, 'pp__pca__n_components'...",0.40681,0.514167,0.461783,0.439178,0.455485,0.03911,6
4,664.987519,104.56648,0.452217,0.027571,10,25,0,"{'pp__fs_univ__k': 10, 'pp__pca__n_components'...",0.420505,0.491352,0.425974,0.47884,0.454168,0.031303,7
17,671.016353,101.67068,0.499337,0.013357,30,25,50,"{'pp__fs_univ__k': 30, 'pp__pca__n_components'...",0.462098,0.461609,0.413843,0.47884,0.454097,0.024254,8
28,670.809166,97.162618,0.463168,0.019675,50,25,0,"{'pp__fs_univ__k': 50, 'pp__pca__n_components'...",0.505462,0.494589,0.438086,0.374952,0.453272,0.051951,9
23,547.68433,104.995395,0.471853,0.036043,30,50,500,"{'pp__fs_univ__k': 30, 'pp__pca__n_components'...",0.417284,0.478811,0.402645,0.512068,0.452702,0.044627,10


In [6]:
# Persist the grid-search table to the working directory.
# Plain string literal: the path has no placeholders, so no f-string is needed.
results.to_csv("/kaggle/working/results_LRC_DimRed_B.csv")