In [1]:
seed = 123

In [2]:
from shutil import copyfile

copyfile(
    src = "/kaggle/input/bdc-p53/p53_helper_functions.py", 
    dst = "/kaggle/working/p53_helpers.py"
)

from p53_helpers import *

# load dataset (no NAs)
X, Y = load_p53_ds()
# sratified 80/20 split
X_train, X_test, Y_train, Y_test = split_p53(X, Y)

Using TensorFlow backend.



Import completed after 3.6 sec

CLASS RATIOS
Training set active classes:   121/24927 (0.485 %)
Test set active classes:       30/6232 (0.481 %)

MATRIX DIMENSIONS
TRAINING SET
. Features:   (24927, 5408)
. Classes:    (24927,)
TEST SET
. Features:   (6232, 5408)
. Classes:    (6232,)


In [3]:
CV = 4

FU = FeatureUnion([
    ("pca", PCA(n_components = 10)),
    ("fs_univ", SelectKBest(f_classif, k=50)),
    ("fs_model", SelectFromModel(
        LinearSVC(max_iter=15000, random_state=seed, class_weight="balanced"), threshold=-np.inf, max_features=500)
    )
])

pipe = Pipeline([
    ("remove_corr", RemoveCorrelatedFeatures()),
    ("ss", StandardScaler()),
    ("pp", FU),
    ("resample", SVMSMOTE(k_neighbors=10, m_neighbors=10)),
    ("clf", LogisticRegression(max_iter=5000, penalty="l2", class_weight="balanced", random_state=seed))
])

cw = [None]
cw.extend({0:i, 1:1-i} for i in [0.05, 0.1, 0.15])

param_grid = dict(
    remove_corr__n_remove=[0, 1000],
    pp__pca__n_components=[20],
    pp__fs_model__max_features=[500],
    resample__sampling_strategy=[0.05, 0.1],
    clf__C=[1, 2],
    clf__class_weight=["balanced"]
)

GS = GridSearchCV(pipe, param_grid=param_grid, cv=CV, verbose=10, scoring=make_scorer(matthews_corrcoef), n_jobs=-1)
GS.fit(X_train, Y_train)

Fitting 4 folds for each of 8 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 34.9min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 49.2min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 79.9min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 100.5min
[Parallel(n_jobs=-1)]: Done  29 out of  32 | elapsed: 125.7min remaining: 13.0min
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed: 127.2min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('remove_corr',
                                        RemoveCorrelatedFeatures(n_remove=500)),
                                       ('ss',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pp',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('pca',
                                                                        PCA(copy=True,
                                                                            iterated_power='auto',
                                                                            n_components=10,
                                                          

In [4]:
results = pd.DataFrame(GS.cv_results_)

results.sort_values("mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__class_weight,param_pp__fs_model__max_features,param_pp__pca__n_components,param_remove_corr__n_remove,param_resample__sampling_strategy,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
4,1072.311664,20.494551,0.803121,0.02268,2,balanced,500,20,0,0.05,"{'clf__C': 2, 'clf__class_weight': 'balanced',...",0.423466,0.477858,0.465376,0.371807,0.434627,0.041489,1
1,1065.830589,19.145597,0.780582,0.007943,1,balanced,500,20,0,0.1,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.410192,0.482605,0.474588,0.362632,0.432504,0.049146,2
0,1084.479144,26.038175,0.798745,0.075238,1,balanced,500,20,0,0.05,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.403995,0.477858,0.460962,0.371807,0.428655,0.042733,3
2,817.023335,44.473785,0.705101,0.072216,1,balanced,500,20,1000,0.05,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.347257,0.511146,0.490214,0.350025,0.424661,0.076385,4
3,814.631992,50.113887,0.736332,0.086907,1,balanced,500,20,1000,0.1,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.361791,0.502826,0.465376,0.362632,0.423156,0.062367,5
5,1081.14061,20.296269,0.870244,0.040348,2,balanced,500,20,0,0.1,"{'clf__C': 2, 'clf__class_weight': 'balanced',...",0.403995,0.477858,0.449272,0.358285,0.422353,0.045407,6
7,814.622216,42.85567,0.561135,0.082794,2,balanced,500,20,1000,0.1,"{'clf__C': 2, 'clf__class_weight': 'balanced',...",0.351912,0.476133,0.449272,0.381691,0.414752,0.050003,7
6,827.471216,48.772674,0.764025,0.088514,2,balanced,500,20,1000,0.05,"{'clf__C': 2, 'clf__class_weight': 'balanced',...",0.347257,0.48037,0.449272,0.362632,0.409883,0.056291,8


In [5]:
results.to_csv(f"/kaggle/working/results_LRC_Classification.csv")