In [1]:
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, ParameterGrid
import seaborn as sns
from sklearn.model_selection import RepeatedStratifiedKFold
import warnings
warnings.filterwarnings("ignore")


In [2]:
# training dataset loading
Training_csv_file_path = './data/TrainingSet/mRNA_sublocation_TrainingSet_NC-BERTdata.csv'
Training_data= pd.read_csv(Training_csv_file_path)
# separate sequence features and labels
X_Train = Training_data.drop(columns=['label']).values
y_Train = Training_data['label'].values
# the training data and test data are standardized
sc = StandardScaler()
sc.fit(X_Train)
X_Train = sc.transform(X_Train)

In [3]:
from sklearn.decomposition import PCA
import numpy as np
# Initialize PCA and set the number of principal components
pca = PCA(n_components=72)
# Fit PCA on the training data and transform the training data to its principal components
X_Train = pca.fit_transform(X_Train)


In [4]:
# Define hyperparameter ranges
param_grid = {
    'learning_rate':[0.08,0.1],
    'depth':[5, 6],
    'rsm':[0.9, 1],
    'subsample':[0.6, 1.0],
    'min_data_in_leaf':[140,150]
}

# Create all combinations of hyperparameters
grid = list(ParameterGrid(param_grid))

# Initialize the result list
results = []

# 100 times 5-fold cross-validation
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=100, random_state=42)
rskf.get_n_splits(X_Train, y_Train)

# Open the CSV file for writing
with open('./result/CatBoost-100times-5-fold cv.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['learning_rate', 'depth','rsm','subsample','min_data_in_leaf', 'accuracy', 'precision', 'recall', 'f1'])
    
    # Perform a grid search
    for params in tqdm(grid, desc="Hyperparameter search"):
        acc_scores = []
        prec_scores = []
        rec_scores = []
        f1_scores = []
        
        for i, (train_index, test_index) in enumerate(rskf.split(X_Train, y_Train)):
            X_train, X_test = X_Train[train_index], X_Train[test_index]
            y_train, y_test = y_Train[train_index], y_Train[test_index]
            clf = CatBoostClassifier(iterations=1000,
                                     learning_rate=params['learning_rate'],
                                     depth=params['depth'],
                                     rsm=params['rsm'],
                                     subsample=params['subsample'],
                                     min_data_in_leaf=params['min_data_in_leaf'],
                                     logging_level='Silent',
                                     early_stopping_rounds=50)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            acc_scores.append(accuracy_score(y_test, y_pred))
            prec_scores.append(precision_score(y_test, y_pred))
            rec_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))
        
        # Save scores for this hyperparameter combination
        for acc, prec, rec, f1 in zip(acc_scores, prec_scores, rec_scores, f1_scores):
            writer.writerow([params['learning_rate'], params['depth'],params['rsm'],params['subsample'],params['min_data_in_leaf'], acc, prec, rec, f1])

        acc_mean, acc_std = np.mean(acc_scores), np.std(acc_scores)
        prec_mean, prec_std = np.mean(prec_scores), np.std(prec_scores)
        rec_mean, rec_std = np.mean(rec_scores), np.std(rec_scores)
        f1_mean, f1_std = np.mean(f1_scores), np.std(f1_scores)
        print(f" params: {params}")
        print(f" acc_mean: {acc_mean}, acc_std: {acc_std}")
        print(f" prec_mean: {prec_mean}, prec_std: {prec_std}")
        print(f" rec_mean: {rec_mean}, rec_std: {rec_std}")
        print(f" f1_mean: {f1_mean}, f1_std: {f1_std}")

Hyperparameter search:   3%|█▋                                                     | 1/32 [54:51<28:20:31, 3291.35s/it]

 params: {'depth': 5, 'learning_rate': 0.08, 'min_data_in_leaf': 140, 'rsm': 0.9, 'subsample': 0.6}
 acc_mean: 0.6893538196504883, acc_std: 0.012872690689444868
 prec_mean: 0.6653079843264181, prec_std: 0.018484217250265302
 rec_mean: 0.5760327069269598, rec_std: 0.023030538749052178
 f1_mean: 0.6172004237506367, f1_std: 0.017419098465156428


Hyperparameter search:   6%|███▎                                                 | 2/32 [1:49:16<27:17:57, 3275.92s/it]

 params: {'depth': 5, 'learning_rate': 0.08, 'min_data_in_leaf': 140, 'rsm': 0.9, 'subsample': 1.0}
 acc_mean: 0.68674112075588, acc_std: 0.01389146812078485
 prec_mean: 0.6617377047231503, prec_std: 0.02029833151076825
 rec_mean: 0.5735337225567111, rec_std: 0.023090832258132406
 f1_mean: 0.6142118110342081, f1_std: 0.01799072221372561


Hyperparameter search:   9%|████▉                                                | 3/32 [2:30:01<23:19:59, 2896.52s/it]

 params: {'depth': 5, 'learning_rate': 0.08, 'min_data_in_leaf': 140, 'rsm': 1, 'subsample': 0.6}
 acc_mean: 0.6895276025925874, acc_std: 0.013073430274429622
 prec_mean: 0.6656673700564802, prec_std: 0.019118001691938235
 rec_mean: 0.5759885908897404, rec_std: 0.022027539566723563
 f1_mean: 0.6173424384343537, f1_std: 0.01711276939082414


Hyperparameter search:  12%|██████▋                                              | 4/32 [3:10:08<21:01:26, 2703.09s/it]

 params: {'depth': 5, 'learning_rate': 0.08, 'min_data_in_leaf': 140, 'rsm': 1, 'subsample': 1.0}
 acc_mean: 0.6859372070343946, acc_std: 0.013192371735499779
 prec_mean: 0.6604174178040089, prec_std: 0.01882936634354255
 rec_mean: 0.5730451742382777, rec_std: 0.022711490101357
 f1_mean: 0.6133928701151308, f1_std: 0.017584908850228773


Hyperparameter search:  16%|████████▎                                            | 5/32 [3:57:30<20:39:00, 2753.34s/it]

 params: {'depth': 5, 'learning_rate': 0.08, 'min_data_in_leaf': 150, 'rsm': 0.9, 'subsample': 0.6}
 acc_mean: 0.6893538196504883, acc_std: 0.012872690689444868
 prec_mean: 0.6653079843264181, prec_std: 0.018484217250265302
 rec_mean: 0.5760327069269598, rec_std: 0.023030538749052178
 f1_mean: 0.6172004237506367, f1_std: 0.017419098465156428


Hyperparameter search:  19%|█████████▉                                           | 6/32 [4:43:10<19:51:08, 2748.80s/it]

 params: {'depth': 5, 'learning_rate': 0.08, 'min_data_in_leaf': 150, 'rsm': 0.9, 'subsample': 1.0}
 acc_mean: 0.68674112075588, acc_std: 0.01389146812078485
 prec_mean: 0.6617377047231503, prec_std: 0.02029833151076825
 rec_mean: 0.5735337225567111, rec_std: 0.023090832258132406
 f1_mean: 0.6142118110342081, f1_std: 0.01799072221372561


Hyperparameter search:  22%|███████████▌                                         | 7/32 [5:20:48<17:58:29, 2588.38s/it]

 params: {'depth': 5, 'learning_rate': 0.08, 'min_data_in_leaf': 150, 'rsm': 1, 'subsample': 0.6}
 acc_mean: 0.6895276025925874, acc_std: 0.013073430274429622
 prec_mean: 0.6656673700564802, prec_std: 0.019118001691938235
 rec_mean: 0.5759885908897404, rec_std: 0.022027539566723563
 f1_mean: 0.6173424384343537, f1_std: 0.01711276939082414


Hyperparameter search:  25%|█████████████▎                                       | 8/32 [5:57:58<16:29:45, 2474.39s/it]

 params: {'depth': 5, 'learning_rate': 0.08, 'min_data_in_leaf': 150, 'rsm': 1, 'subsample': 1.0}
 acc_mean: 0.6859372070343946, acc_std: 0.013192371735499779
 prec_mean: 0.6604174178040089, prec_std: 0.01882936634354255
 rec_mean: 0.5730451742382777, rec_std: 0.022711490101357
 f1_mean: 0.6133928701151308, f1_std: 0.017584908850228773


Hyperparameter search:  28%|██████████████▉                                      | 9/32 [6:44:08<16:23:50, 2566.54s/it]

 params: {'depth': 5, 'learning_rate': 0.1, 'min_data_in_leaf': 140, 'rsm': 0.9, 'subsample': 0.6}
 acc_mean: 0.6879190253416685, acc_std: 0.012852014165947676
 prec_mean: 0.6628294608216793, prec_std: 0.018488592126768587
 rec_mean: 0.5759097609925197, rec_std: 0.022379746505200297
 f1_mean: 0.6160755632343586, f1_std: 0.017131932520163918


Hyperparameter search:  31%|████████████████▎                                   | 10/32 [7:29:46<16:00:28, 2619.46s/it]

 params: {'depth': 5, 'learning_rate': 0.1, 'min_data_in_leaf': 140, 'rsm': 0.9, 'subsample': 1.0}
 acc_mean: 0.6855530845626963, acc_std: 0.012812534034007799
 prec_mean: 0.6597167878060188, prec_std: 0.018649965437409658
 rec_mean: 0.5732572401629873, rec_std: 0.022683627621120018
 f1_mean: 0.6131874291050542, f1_std: 0.01707414843659629


Hyperparameter search:  34%|█████████████████▉                                  | 11/32 [8:05:06<14:23:19, 2466.65s/it]

 params: {'depth': 5, 'learning_rate': 0.1, 'min_data_in_leaf': 140, 'rsm': 1, 'subsample': 0.6}
 acc_mean: 0.6884016302572806, acc_std: 0.012881602619661016
 prec_mean: 0.6636012617154252, prec_std: 0.018602994532595923
 rec_mean: 0.5761707474305175, rec_std: 0.022513868118635903
 f1_mean: 0.6165469724026408, f1_std: 0.01713170118542724


Hyperparameter search:  38%|███████████████████▌                                | 12/32 [8:39:34<13:01:49, 2345.46s/it]

 params: {'depth': 5, 'learning_rate': 0.1, 'min_data_in_leaf': 140, 'rsm': 1, 'subsample': 1.0}
 acc_mean: 0.6849311096697626, acc_std: 0.013251141517472458
 prec_mean: 0.6587634146933999, prec_std: 0.019079021528068806
 rec_mean: 0.5728331083135682, rec_std: 0.022314706396948783
 f1_mean: 0.6125576319330901, f1_std: 0.017398793175948824


Hyperparameter search:  41%|█████████████████████▏                              | 13/32 [9:22:37<12:45:28, 2417.29s/it]

 params: {'depth': 5, 'learning_rate': 0.1, 'min_data_in_leaf': 150, 'rsm': 0.9, 'subsample': 0.6}
 acc_mean: 0.6879190253416685, acc_std: 0.012852014165947676
 prec_mean: 0.6628294608216793, prec_std: 0.018488592126768587
 rec_mean: 0.5759097609925197, rec_std: 0.022379746505200297
 f1_mean: 0.6160755632343586, f1_std: 0.017131932520163918


Hyperparameter search:  44%|██████████████████████▎                            | 14/32 [10:05:05<12:17:03, 2456.87s/it]

 params: {'depth': 5, 'learning_rate': 0.1, 'min_data_in_leaf': 150, 'rsm': 0.9, 'subsample': 1.0}
 acc_mean: 0.6855530845626963, acc_std: 0.012812534034007799
 prec_mean: 0.6597167878060188, prec_std: 0.018649965437409658
 rec_mean: 0.5732572401629873, rec_std: 0.022683627621120018
 f1_mean: 0.6131874291050542, f1_std: 0.01707414843659629


Hyperparameter search:  47%|███████████████████████▉                           | 15/32 [10:40:03<11:05:27, 2348.69s/it]

 params: {'depth': 5, 'learning_rate': 0.1, 'min_data_in_leaf': 150, 'rsm': 1, 'subsample': 0.6}
 acc_mean: 0.6884016302572806, acc_std: 0.012881602619661016
 prec_mean: 0.6636012617154252, prec_std: 0.018602994532595923
 rec_mean: 0.5761707474305175, rec_std: 0.022513868118635903
 f1_mean: 0.6165469724026408, f1_std: 0.01713170118542724


Hyperparameter search:  50%|█████████████████████████▌                         | 16/32 [11:14:33<10:03:56, 2264.81s/it]

 params: {'depth': 5, 'learning_rate': 0.1, 'min_data_in_leaf': 150, 'rsm': 1, 'subsample': 1.0}
 acc_mean: 0.6849311096697626, acc_std: 0.013251141517472458
 prec_mean: 0.6587634146933999, prec_std: 0.019079021528068806
 rec_mean: 0.5728331083135682, rec_std: 0.022314706396948783
 f1_mean: 0.6125576319330901, f1_std: 0.017398793175948824


Hyperparameter search:  53%|███████████████████████████                        | 17/32 [12:17:46<11:21:05, 2724.39s/it]

 params: {'depth': 6, 'learning_rate': 0.08, 'min_data_in_leaf': 140, 'rsm': 0.9, 'subsample': 0.6}
 acc_mean: 0.6910398686226074, acc_std: 0.013396063005637568
 prec_mean: 0.6697525306389003, prec_std: 0.019930452120840825
 rec_mean: 0.5724575442437511, rec_std: 0.021601394615681183
 f1_mean: 0.6170571842549382, f1_std: 0.017348988366805894


Hyperparameter search:  56%|████████████████████████████▋                      | 18/32 [13:24:33<12:05:35, 3109.70s/it]

 params: {'depth': 6, 'learning_rate': 0.08, 'min_data_in_leaf': 140, 'rsm': 0.9, 'subsample': 1.0}
 acc_mean: 0.6891307655861151, acc_std: 0.013088172333047982
 prec_mean: 0.666876267355353, prec_std: 0.01891880753708556
 rec_mean: 0.5709202456972571, rec_std: 0.023016537814598104
 f1_mean: 0.6149232598991866, f1_std: 0.017755448210438087


Hyperparameter search:  59%|██████████████████████████████▎                    | 19/32 [14:19:20<11:25:18, 3162.95s/it]

 params: {'depth': 6, 'learning_rate': 0.08, 'min_data_in_leaf': 140, 'rsm': 1, 'subsample': 0.6}
 acc_mean: 0.6904819701088831, acc_std: 0.013030464437615203
 prec_mean: 0.6688911680787656, prec_std: 0.019475616440476823
 rec_mean: 0.5721381742990939, rec_std: 0.021914068086440898
 f1_mean: 0.6164863346682637, f1_std: 0.017070865894980525


Hyperparameter search:  62%|███████████████████████████████▉                   | 20/32 [15:12:00<10:32:26, 3162.23s/it]

 params: {'depth': 6, 'learning_rate': 0.08, 'min_data_in_leaf': 140, 'rsm': 1, 'subsample': 1.0}
 acc_mean: 0.6886843745543703, acc_std: 0.01380509847447354
 prec_mean: 0.6664325370690096, prec_std: 0.01996846204438474
 rec_mean: 0.5700425956334002, rec_std: 0.023428148627482732
 f1_mean: 0.6142235240145995, f1_std: 0.018473455405727618


Hyperparameter search:  66%|█████████████████████████████████▍                 | 21/32 [16:20:51<10:33:02, 3452.92s/it]

 params: {'depth': 6, 'learning_rate': 0.08, 'min_data_in_leaf': 150, 'rsm': 0.9, 'subsample': 0.6}
 acc_mean: 0.6910398686226074, acc_std: 0.013396063005637568
 prec_mean: 0.6697525306389003, prec_std: 0.019930452120840825
 rec_mean: 0.5724575442437511, rec_std: 0.021601394615681183
 f1_mean: 0.6170571842549382, f1_std: 0.017348988366805894


Hyperparameter search:  69%|███████████████████████████████████                | 22/32 [17:35:27<10:26:41, 3760.12s/it]

 params: {'depth': 6, 'learning_rate': 0.08, 'min_data_in_leaf': 150, 'rsm': 0.9, 'subsample': 1.0}
 acc_mean: 0.6891307655861151, acc_std: 0.013088172333047982
 prec_mean: 0.666876267355353, prec_std: 0.01891880753708556
 rec_mean: 0.5709202456972571, rec_std: 0.023016537814598104
 f1_mean: 0.6149232598991866, f1_std: 0.017755448210438087


Hyperparameter search:  72%|█████████████████████████████████████▍              | 23/32 [18:30:47<9:04:12, 3628.05s/it]

 params: {'depth': 6, 'learning_rate': 0.08, 'min_data_in_leaf': 150, 'rsm': 1, 'subsample': 0.6}
 acc_mean: 0.6904819701088831, acc_std: 0.013030464437615203
 prec_mean: 0.6688911680787656, prec_std: 0.019475616440476823
 rec_mean: 0.5721381742990939, rec_std: 0.021914068086440898
 f1_mean: 0.6164863346682637, f1_std: 0.017070865894980525


Hyperparameter search:  75%|███████████████████████████████████████             | 24/32 [19:27:58<7:55:50, 3568.81s/it]

 params: {'depth': 6, 'learning_rate': 0.08, 'min_data_in_leaf': 150, 'rsm': 1, 'subsample': 1.0}
 acc_mean: 0.6886843745543703, acc_std: 0.01380509847447354
 prec_mean: 0.6664325370690096, prec_std: 0.01996846204438474
 rec_mean: 0.5700425956334002, rec_std: 0.023428148627482732
 f1_mean: 0.6142235240145995, f1_std: 0.018473455405727618


Hyperparameter search:  78%|███████████████████████████████████████           | 25/32 [35:54:29<39:23:24, 20257.77s/it]

 params: {'depth': 6, 'learning_rate': 0.1, 'min_data_in_leaf': 140, 'rsm': 0.9, 'subsample': 0.6}
 acc_mean: 0.6901001844619553, acc_std: 0.012661881200131874
 prec_mean: 0.6679745022379343, prec_std: 0.018707269444968904
 rec_mean: 0.572646037827647, rec_std: 0.021919836908540227
 f1_mean: 0.6163970156003677, f1_std: 0.01680110310306133


Hyperparameter search:  81%|████████████████████████████████████████▋         | 26/32 [36:55:52<25:28:28, 15284.72s/it]

 params: {'depth': 6, 'learning_rate': 0.1, 'min_data_in_leaf': 140, 'rsm': 0.9, 'subsample': 1.0}
 acc_mean: 0.6878567544815974, acc_std: 0.01332297138133157
 prec_mean: 0.6648492103067123, prec_std: 0.019687736824896915
 rec_mean: 0.5704328650489571, rec_std: 0.022123501027825253
 f1_mean: 0.613780136599175, f1_std: 0.017409246720770006


Hyperparameter search:  84%|██████████████████████████████████████████▏       | 27/32 [37:41:47<16:00:28, 11525.68s/it]

 params: {'depth': 6, 'learning_rate': 0.1, 'min_data_in_leaf': 140, 'rsm': 1, 'subsample': 0.6}
 acc_mean: 0.6901177359480011, acc_std: 0.013142871948553288
 prec_mean: 0.6681291647458178, prec_std: 0.01928368548379546
 rec_mean: 0.5723257191510065, rec_std: 0.022250082085278945
 f1_mean: 0.616277904614149, f1_std: 0.01740615680982774


Hyperparameter search:  88%|█████████████████████████████████████████████▌      | 28/32 [38:23:25<9:47:49, 8817.31s/it]

 params: {'depth': 6, 'learning_rate': 0.1, 'min_data_in_leaf': 140, 'rsm': 1, 'subsample': 1.0}
 acc_mean: 0.6878738390626941, acc_std: 0.013137657810906388
 prec_mean: 0.665074904196652, prec_std: 0.01931631231038253
 rec_mean: 0.5698833911086785, rec_std: 0.022364000032063652
 f1_mean: 0.6135549255926471, f1_std: 0.01738384070420942


Hyperparameter search:  91%|███████████████████████████████████████████████▏    | 29/32 [39:15:43<5:55:39, 7113.32s/it]

 params: {'depth': 6, 'learning_rate': 0.1, 'min_data_in_leaf': 150, 'rsm': 0.9, 'subsample': 0.6}
 acc_mean: 0.6901001844619553, acc_std: 0.012661881200131874
 prec_mean: 0.6679745022379343, prec_std: 0.018707269444968904
 rec_mean: 0.572646037827647, rec_std: 0.021919836908540227
 f1_mean: 0.6163970156003677, f1_std: 0.01680110310306133


Hyperparameter search:  94%|████████████████████████████████████████████████▊   | 30/32 [40:09:35<3:18:17, 5948.86s/it]

 params: {'depth': 6, 'learning_rate': 0.1, 'min_data_in_leaf': 150, 'rsm': 0.9, 'subsample': 1.0}
 acc_mean: 0.6878567544815974, acc_std: 0.01332297138133157
 prec_mean: 0.6648492103067123, prec_std: 0.019687736824896915
 rec_mean: 0.5704328650489571, rec_std: 0.022123501027825253
 f1_mean: 0.613780136599175, f1_std: 0.017409246720770006


Hyperparameter search:  97%|██████████████████████████████████████████████████▍ | 31/32 [40:51:25<1:21:57, 4917.35s/it]

 params: {'depth': 6, 'learning_rate': 0.1, 'min_data_in_leaf': 150, 'rsm': 1, 'subsample': 0.6}
 acc_mean: 0.6901177359480011, acc_std: 0.013142871948553288
 prec_mean: 0.6681291647458178, prec_std: 0.01928368548379546
 rec_mean: 0.5723257191510065, rec_std: 0.022250082085278945
 f1_mean: 0.616277904614149, f1_std: 0.01740615680982774


Hyperparameter search: 100%|██████████████████████████████████████████████████████| 32/32 [41:31:30<00:00, 4671.58s/it]

 params: {'depth': 6, 'learning_rate': 0.1, 'min_data_in_leaf': 150, 'rsm': 1, 'subsample': 1.0}
 acc_mean: 0.6878738390626941, acc_std: 0.013137657810906388
 prec_mean: 0.665074904196652, prec_std: 0.01931631231038253
 rec_mean: 0.5698833911086785, rec_std: 0.022364000032063652
 f1_mean: 0.6135549255926471, f1_std: 0.01738384070420942



