In [1]:
seed=123

In [2]:
from shutil import copyfile

copyfile(
    src = "/kaggle/input/bdc-p53/p53_helper_functions.py", 
    dst = "/kaggle/working/p53_helpers.py"
)

from p53_helpers import *

# load dataset (no NAs)
X, Y = load_p53_ds()
# sratified 80/20 split
X_train, X_test, Y_train, Y_test = split_p53(X, Y)

Using TensorFlow backend.



Import completed after 4.2 sec

CLASS RATIOS
Training set active classes:   121/24927 (0.485 %)
Test set active classes:       30/6232 (0.481 %)

MATRIX DIMENSIONS
TRAINING SET
. Features:   (24927, 5408)
. Classes:    (24927,)
TEST SET
. Features:   (6232, 5408)
. Classes:    (6232,)


In [3]:
CV = 4

# dim-reduction from 5408 --> ~ 550
FU = FeatureUnion([
    ("pca", PCA(n_components=25)),
    ("fs_univ", SelectKBest(f_classif, k=50)),
    ("fs_model", SelectFromModel(
        LinearSVC(max_iter=15000, random_state=seed, class_weight="balanced"), threshold=-np.inf, max_features=500)
    )
])

pipe = Pipeline([
    ("remove_corr", RemoveCorrelatedFeatures(n_remove=500)),
    ("ss", StandardScaler()),
    ("preproc", FU)
])

X_train_prepped = pipe.fit_transform(X_train, Y_train)



In [4]:
# evaluate effect of additional dim-reduction with clustering: Feature Agglomeration
pipe = Pipeline([
    ("feature_agglomeration", FeatureAgglomeration()),
    ("clf", LogisticRegression(max_iter=5000, class_weight="balanced", random_state=seed))
])

param_grid = dict(
    feature_agglomeration__n_clusters = [100, 250, 500],
    feature_agglomeration__linkage = ["ward", "single", "average"]
)

GS = GridSearchCV(pipe, param_grid=param_grid, cv=CV, verbose=10, scoring=make_scorer(matthews_corrcoef), n_jobs=-1)
GS.fit(X_train_prepped, Y_train)

Fitting 4 folds for each of 9 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  33 out of  36 | elapsed:  3.2min remaining:   17.5s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  3.4min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('feature_agglomeration',
                                        FeatureAgglomeration(affinity='euclidean',
                                                             compute_full_tree='auto',
                                                             connectivity=None,
                                                             distance_threshold=None,
                                                             linkage='ward',
                                                             memory=None,
                                                             n_clusters=2,
                                                             pooling_func=<function mean at 0x7ff0e819a9d8>)),
                                       ('clf',
                                        LogisticRegression(C=1.0,
                                                           class_wei

In [5]:
results = pd.DataFrame(GS.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_feature_agglomeration__linkage,param_feature_agglomeration__n_clusters,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,12.529678,1.839146,0.11891,0.003018,ward,100,"{'feature_agglomeration__linkage': 'ward', 'fe...",0.196374,0.208161,0.225491,0.234782,0.216202,0.01491,8
1,23.152726,2.750131,0.134641,0.003686,ward,250,"{'feature_agglomeration__linkage': 'ward', 'fe...",0.262406,0.358719,0.337176,0.345262,0.325891,0.037452,5
2,27.887336,1.366424,0.165782,0.012976,ward,500,"{'feature_agglomeration__linkage': 'ward', 'fe...",0.435468,0.502826,0.547462,0.444683,0.48261,0.045484,2
3,15.553984,2.697171,0.125771,0.006782,single,100,"{'feature_agglomeration__linkage': 'single', '...",0.20881,0.220298,0.203305,0.193217,0.206408,0.009777,9
4,22.002906,5.369148,0.145123,0.017811,single,250,"{'feature_agglomeration__linkage': 'single', '...",0.228841,0.39103,0.380073,0.365125,0.341267,0.065557,4
5,27.860238,2.186418,0.164763,0.016949,single,500,"{'feature_agglomeration__linkage': 'single', '...",0.400838,0.457905,0.567733,0.422685,0.46229,0.064192,3
6,13.568974,3.272595,0.121126,0.001087,average,100,"{'feature_agglomeration__linkage': 'average', ...",0.214691,0.250176,0.243554,0.25931,0.241933,0.016693,7
7,23.597314,3.30533,0.142376,0.013857,average,250,"{'feature_agglomeration__linkage': 'average', ...",0.264868,0.349078,0.32808,0.333479,0.318876,0.032121,6
8,24.863457,3.852753,0.143678,0.034168,average,500,"{'feature_agglomeration__linkage': 'average', ...",0.435468,0.506936,0.573825,0.440031,0.489065,0.056526,1


In [6]:
results.to_csv("/kaggle/working/results_LRC_FeatureAggl.csv")

In [7]:
# evaluate effect of KMeansClustering
pipe = Pipeline([
    ("kmeans", MiniBatchKMeans()),
    ("clf", LogisticRegression(max_iter=5000, class_weight="balanced", random_state=seed))
])

param_grid = dict(
    kmeans__n_clusters = [100, 250, 500],
)

GS = GridSearchCV(pipe, param_grid=param_grid, cv=4, verbose=10, scoring=make_scorer(matthews_corrcoef), n_jobs=-1)
GS.fit(X_train_prepped, Y_train)

Fitting 4 folds for each of 3 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  3.0min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:  5.9min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  6.4min finished
  init_size=init_size)
  init_size=init_size)
  init_size=init_size)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('kmeans',
                                        MiniBatchKMeans(batch_size=100,
                                                        compute_labels=True,
                                                        init='k-means++',
                                                        init_size=None,
                                                        max_iter=100,
                                                        max_no_improvement=10,
                                                        n_clusters=8, n_init=3,
                                                        random_state=None,
                                                        reassignment_ratio=0.01,
                                                        tol=0.0, verbose=0)),
                                       ('clf',
                                        LogisticRegression(C=1.0

In [8]:
results = pd.DataFrame(GS.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kmeans__n_clusters,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,54.73529,5.177114,0.10113,0.00474,100,{'kmeans__n_clusters': 100},0.205697,0.197746,0.195901,0.188743,0.197022,0.006033,3
1,118.597529,12.792619,0.18889,0.014517,250,{'kmeans__n_clusters': 250},0.296906,0.281182,0.303795,0.255217,0.284275,0.018671,2
2,191.944628,8.636116,0.281046,0.091113,500,{'kmeans__n_clusters': 500},0.426886,0.323476,0.375122,0.397505,0.380747,0.037819,1


In [9]:
results.to_csv("/kaggle/working/results_LRC_KMeans.csv")