In [24]:
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import svm
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


In [25]:
# training dataset loading
Training_csv_file_path = './data/TrainingSet/mRNA_sublocation_TrainingSet_NC-BERTdata.csv'
Training_data= pd.read_csv(Training_csv_file_path)
Training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,label
0,0.008038,0.005472,0.007248,0.007894,0.007398,0.004448,0.000894,0.007232,0.009605,0.007154,...,-0.047920,-0.044799,0.009836,-0.027684,-0.060108,0.051503,0.074873,0.094912,0.142718,1
1,0.006421,0.004846,0.007124,0.006833,0.007555,0.005529,0.002641,0.006684,0.008229,0.006792,...,-0.063734,-0.007245,0.007870,-0.022348,-0.066921,0.064694,0.101658,0.116233,0.136837,1
2,0.012833,0.005971,0.006511,0.007626,0.007715,0.005825,0.001119,0.006599,0.007285,0.005525,...,-0.080484,-0.063357,0.037052,-0.023793,-0.052618,0.047666,0.088847,0.100192,0.156532,1
3,0.013917,0.006805,0.005778,0.013393,0.008006,0.004290,0.000706,0.007448,0.006812,0.002914,...,-0.044457,-0.043053,0.013077,-0.001783,-0.055099,0.052132,0.093488,0.079941,0.162534,1
4,0.017324,0.006722,0.007164,0.007420,0.007784,0.003431,0.001093,0.007214,0.006707,0.006244,...,-0.056244,-0.031527,0.027563,-0.006933,-0.035199,0.053570,0.106644,0.127546,0.129778,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,0.020823,0.006709,0.006760,0.009948,0.007638,0.003582,0.001047,0.007103,0.007427,0.004873,...,-0.071451,-0.064951,0.026554,-0.024152,-0.040399,0.054827,0.108673,0.107649,0.108352,0
4658,0.002121,0.007038,0.006064,0.003244,0.007038,0.005526,0.002388,0.005519,0.008397,0.006210,...,-0.069669,-0.118523,0.001044,-0.004282,-0.044469,0.064367,0.085585,0.090057,0.146606,0
4659,0.018389,0.005796,0.007591,0.005788,0.005100,0.006153,0.002046,0.007090,0.005993,0.006751,...,-0.073933,-0.074885,0.044387,-0.000744,-0.048722,-0.010145,0.098738,0.119175,0.117543,0
4660,0.001512,0.001544,0.005322,0.000000,0.003088,0.007880,0.001362,0.001574,0.003991,0.010899,...,-0.071051,-0.068419,-0.018095,0.009988,-0.081945,0.103257,-0.006743,0.098912,0.106491,0


In [26]:
# separate sequence features and labels
X_Train = Training_data.drop(columns=['label']).values
y_Train = Training_data['label'].values

In [27]:
# the training data and test data are standardized
sc = StandardScaler()
sc.fit(X_Train)
X_Train = sc.transform(X_Train)

In [28]:
from sklearn.decomposition import PCA
import numpy as np
# Initialize PCA and set the number of principal components
pca = PCA(n_components=72)
# Fit PCA on the training data and transform the training data to its principal components
X_Train = pca.fit_transform(X_Train)


In [29]:
# Define hyperparameter ranges
param_grid = {
    'C':[0.1, 1, 10],
    'gamma':[0.1, 0.01],
    'degree':[2, 3],
    'coef0':[-1, 0, 1]
}

# Create all combinations of hyperparameters
grid = list(ParameterGrid(param_grid))

# Initialize the result list
results = []

# 100 times 5-fold cross-validation
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=100, random_state=42)
rskf.get_n_splits(X_Train, y_Train)

# Open the CSV file for writing
with open('./result/SVM-100times-5-fold cv.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['C', 'gamma', 'degree', 'coef0','accuracy', 'precision', 'recall', 'f1'])
    
    # Perform a grid search
    for params in tqdm(grid, desc="Hyperparameter search"):
        acc_scores = []
        prec_scores = []
        rec_scores = []
        f1_scores = []
        
        for i, (train_index, test_index) in enumerate(rskf.split(X_Train, y_Train)):
            X_train, X_test = X_Train[train_index], X_Train[test_index]
            y_train, y_test = y_Train[train_index], y_Train[test_index]
            clf = svm.SVC(kernel='rbf',C=params['C'],gamma=params['gamma'],degree=params['degree'],coef0=params['coef0'])
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            acc_scores.append(accuracy_score(y_test, y_pred))
            prec_scores.append(precision_score(y_test, y_pred))
            rec_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))
        
        # Save scores for this hyperparameter combination
        for acc, prec, rec, f1 in zip(acc_scores, prec_scores, rec_scores, f1_scores):
            writer.writerow([params['C'], params['gamma'],params['degree'],params['coef0'], acc, prec, rec, f1])

        acc_mean, acc_std = np.mean(acc_scores), np.std(acc_scores)
        prec_mean, prec_std = np.mean(prec_scores), np.std(prec_scores)
        rec_mean, rec_std = np.mean(rec_scores), np.std(rec_scores)
        f1_mean, f1_std = np.mean(f1_scores), np.std(f1_scores)
        print(f" params: {params}")
        print(f" acc_mean: {acc_mean}, acc_std: {acc_std}")
        print(f" prec_mean: {prec_mean}, prec_std: {prec_std}")
        print(f" rec_mean: {rec_mean}, rec_std: {rec_std}")
        print(f" f1_mean: {f1_mean}, f1_std: {f1_std}")

Hyperparameter search:   3%|█▌                                                       | 1/36 [13:18<7:46:00, 798.87s/it]

 params: {'C': 0.1, 'coef0': -1, 'degree': 2, 'gamma': 0.1}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:   6%|███▏                                                     | 2/36 [25:50<7:17:01, 771.22s/it]

 params: {'C': 0.1, 'coef0': -1, 'degree': 2, 'gamma': 0.01}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:   8%|████▊                                                    | 3/36 [38:56<7:07:49, 777.86s/it]

 params: {'C': 0.1, 'coef0': -1, 'degree': 3, 'gamma': 0.1}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:  11%|██████▎                                                  | 4/36 [51:39<6:51:44, 772.03s/it]

 params: {'C': 0.1, 'coef0': -1, 'degree': 3, 'gamma': 0.01}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:  14%|███████▋                                               | 5/36 [1:04:53<6:42:57, 779.92s/it]

 params: {'C': 0.1, 'coef0': 0, 'degree': 2, 'gamma': 0.1}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:  17%|█████████▏                                             | 6/36 [1:17:34<6:26:45, 773.52s/it]

 params: {'C': 0.1, 'coef0': 0, 'degree': 2, 'gamma': 0.01}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:  19%|██████████▋                                            | 7/36 [1:30:27<6:13:42, 773.20s/it]

 params: {'C': 0.1, 'coef0': 0, 'degree': 3, 'gamma': 0.1}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:  22%|████████████▏                                          | 8/36 [1:42:52<5:56:40, 764.31s/it]

 params: {'C': 0.1, 'coef0': 0, 'degree': 3, 'gamma': 0.01}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:  25%|█████████████▊                                         | 9/36 [1:56:01<5:47:21, 771.90s/it]

 params: {'C': 0.1, 'coef0': 1, 'degree': 2, 'gamma': 0.1}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:  28%|███████████████                                       | 10/36 [2:08:21<5:30:14, 762.10s/it]

 params: {'C': 0.1, 'coef0': 1, 'degree': 2, 'gamma': 0.01}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:  31%|████████████████▌                                     | 11/36 [2:21:13<5:18:47, 765.09s/it]

 params: {'C': 0.1, 'coef0': 1, 'degree': 3, 'gamma': 0.1}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:  33%|██████████████████                                    | 12/36 [2:33:35<5:03:18, 758.27s/it]

 params: {'C': 0.1, 'coef0': 1, 'degree': 3, 'gamma': 0.01}
 acc_mean: 0.5649936289324667, acc_std: 0.0004102540792233376
 prec_mean: 0.0, prec_std: 0.0
 rec_mean: 0.0, rec_std: 0.0
 f1_mean: 0.0, f1_std: 0.0


Hyperparameter search:  36%|███████████████████▌                                  | 13/36 [2:47:05<4:56:36, 773.76s/it]

 params: {'C': 1, 'coef0': -1, 'degree': 2, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  39%|█████████████████████                                 | 14/36 [3:00:23<4:46:28, 781.30s/it]

 params: {'C': 1, 'coef0': -1, 'degree': 2, 'gamma': 0.01}
 acc_mean: 0.670992881424543, acc_std: 0.012449408012700544
 prec_mean: 0.6945042934320366, prec_std: 0.0237640814168273
 rec_mean: 0.43569326765188837, rec_std: 0.023252699863501702
 f1_mean: 0.5351042485690182, f1_std: 0.0209556399748628


Hyperparameter search:  42%|██████████████████████▌                               | 15/36 [3:13:14<4:32:18, 778.00s/it]

 params: {'C': 1, 'coef0': -1, 'degree': 3, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  44%|████████████████████████                              | 16/36 [3:25:24<4:14:31, 763.57s/it]

 params: {'C': 1, 'coef0': -1, 'degree': 3, 'gamma': 0.01}
 acc_mean: 0.670992881424543, acc_std: 0.012449408012700544
 prec_mean: 0.6945042934320366, prec_std: 0.0237640814168273
 rec_mean: 0.43569326765188837, rec_std: 0.023252699863501702
 f1_mean: 0.5351042485690182, f1_std: 0.0209556399748628


Hyperparameter search:  47%|█████████████████████████▌                            | 17/36 [3:37:18<3:57:08, 748.87s/it]

 params: {'C': 1, 'coef0': 0, 'degree': 2, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  50%|███████████████████████████                           | 18/36 [3:48:57<3:40:08, 733.79s/it]

 params: {'C': 1, 'coef0': 0, 'degree': 2, 'gamma': 0.01}
 acc_mean: 0.670992881424543, acc_std: 0.012449408012700544
 prec_mean: 0.6945042934320366, prec_std: 0.0237640814168273
 rec_mean: 0.43569326765188837, rec_std: 0.023252699863501702
 f1_mean: 0.5351042485690182, f1_std: 0.0209556399748628


Hyperparameter search:  53%|████████████████████████████▌                         | 19/36 [4:00:49<3:26:04, 727.33s/it]

 params: {'C': 1, 'coef0': 0, 'degree': 3, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  56%|██████████████████████████████                        | 20/36 [4:12:30<3:11:46, 719.18s/it]

 params: {'C': 1, 'coef0': 0, 'degree': 3, 'gamma': 0.01}
 acc_mean: 0.670992881424543, acc_std: 0.012449408012700544
 prec_mean: 0.6945042934320366, prec_std: 0.0237640814168273
 rec_mean: 0.43569326765188837, rec_std: 0.023252699863501702
 f1_mean: 0.5351042485690182, f1_std: 0.0209556399748628


Hyperparameter search:  58%|███████████████████████████████▌                      | 21/36 [4:24:22<2:59:16, 717.10s/it]

 params: {'C': 1, 'coef0': 1, 'degree': 2, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  61%|█████████████████████████████████                     | 22/36 [4:36:09<2:46:39, 714.26s/it]

 params: {'C': 1, 'coef0': 1, 'degree': 2, 'gamma': 0.01}
 acc_mean: 0.670992881424543, acc_std: 0.012449408012700544
 prec_mean: 0.6945042934320366, prec_std: 0.0237640814168273
 rec_mean: 0.43569326765188837, rec_std: 0.023252699863501702
 f1_mean: 0.5351042485690182, f1_std: 0.0209556399748628


Hyperparameter search:  64%|██████████████████████████████████▌                   | 23/36 [4:48:40<2:37:05, 725.01s/it]

 params: {'C': 1, 'coef0': 1, 'degree': 3, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  67%|████████████████████████████████████                  | 24/36 [5:00:58<2:25:47, 728.94s/it]

 params: {'C': 1, 'coef0': 1, 'degree': 3, 'gamma': 0.01}
 acc_mean: 0.670992881424543, acc_std: 0.012449408012700544
 prec_mean: 0.6945042934320366, prec_std: 0.0237640814168273
 rec_mean: 0.43569326765188837, rec_std: 0.023252699863501702
 f1_mean: 0.5351042485690182, f1_std: 0.0209556399748628


Hyperparameter search:  69%|█████████████████████████████████████▌                | 25/36 [5:14:04<2:16:48, 746.27s/it]

 params: {'C': 10, 'coef0': -1, 'degree': 2, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  72%|███████████████████████████████████████               | 26/36 [5:26:17<2:03:42, 742.22s/it]

 params: {'C': 10, 'coef0': -1, 'degree': 2, 'gamma': 0.01}
 acc_mean: 0.6692660783204301, acc_std: 0.012482294540286164
 prec_mean: 0.6654669268847602, prec_std: 0.02062866873498607
 rec_mean: 0.48267027914614125, rec_std: 0.02337116398951598
 f1_mean: 0.5592055045231556, f1_std: 0.019466170211360306


Hyperparameter search:  75%|████████████████████████████████████████▌             | 27/36 [5:39:13<1:52:49, 752.18s/it]

 params: {'C': 10, 'coef0': -1, 'degree': 3, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  78%|██████████████████████████████████████████            | 28/36 [5:51:16<1:39:09, 743.70s/it]

 params: {'C': 10, 'coef0': -1, 'degree': 3, 'gamma': 0.01}
 acc_mean: 0.6692660783204301, acc_std: 0.012482294540286164
 prec_mean: 0.6654669268847602, prec_std: 0.02062866873498607
 rec_mean: 0.48267027914614125, rec_std: 0.02337116398951598
 f1_mean: 0.5592055045231556, f1_std: 0.019466170211360306


Hyperparameter search:  81%|███████████████████████████████████████████▌          | 29/36 [6:03:38<1:26:41, 743.02s/it]

 params: {'C': 10, 'coef0': 0, 'degree': 2, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  83%|█████████████████████████████████████████████         | 30/36 [6:15:14<1:12:53, 728.98s/it]

 params: {'C': 10, 'coef0': 0, 'degree': 2, 'gamma': 0.01}
 acc_mean: 0.6692660783204301, acc_std: 0.012482294540286164
 prec_mean: 0.6654669268847602, prec_std: 0.02062866873498607
 rec_mean: 0.48267027914614125, rec_std: 0.02337116398951598
 f1_mean: 0.5592055045231556, f1_std: 0.019466170211360306


Hyperparameter search:  86%|██████████████████████████████████████████████▌       | 31/36 [6:27:37<1:01:05, 733.19s/it]

 params: {'C': 10, 'coef0': 0, 'degree': 3, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  89%|█████████████████████████████████████████████████▊      | 32/36 [6:39:15<48:10, 722.57s/it]

 params: {'C': 10, 'coef0': 0, 'degree': 3, 'gamma': 0.01}
 acc_mean: 0.6692660783204301, acc_std: 0.012482294540286164
 prec_mean: 0.6654669268847602, prec_std: 0.02062866873498607
 rec_mean: 0.48267027914614125, rec_std: 0.02337116398951598
 f1_mean: 0.5592055045231556, f1_std: 0.019466170211360306


Hyperparameter search:  92%|███████████████████████████████████████████████████▎    | 33/36 [6:51:37<36:24, 728.28s/it]

 params: {'C': 10, 'coef0': 1, 'degree': 2, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search:  94%|████████████████████████████████████████████████████▉   | 34/36 [7:03:08<23:54, 717.16s/it]

 params: {'C': 10, 'coef0': 1, 'degree': 2, 'gamma': 0.01}
 acc_mean: 0.6692660783204301, acc_std: 0.012482294540286164
 prec_mean: 0.6654669268847602, prec_std: 0.02062866873498607
 rec_mean: 0.48267027914614125, rec_std: 0.02337116398951598
 f1_mean: 0.5592055045231556, f1_std: 0.019466170211360306


Hyperparameter search:  97%|██████████████████████████████████████████████████████▍ | 35/36 [7:15:35<12:06, 726.14s/it]

 params: {'C': 10, 'coef0': 1, 'degree': 3, 'gamma': 0.1}
 acc_mean: 0.5653368431705376, acc_std: 0.0006659439707549743
 prec_mean: 0.32, prec_std: 0.466476151587624
 rec_mean: 0.0007890530925013684, rec_std: 0.0011502339295616814
 f1_mean: 0.001574224470776195, f1_std: 0.0022948093181428507


Hyperparameter search: 100%|████████████████████████████████████████████████████████| 36/36 [7:27:26<00:00, 745.74s/it]

 params: {'C': 10, 'coef0': 1, 'degree': 3, 'gamma': 0.01}
 acc_mean: 0.6692660783204301, acc_std: 0.012482294540286164
 prec_mean: 0.6654669268847602, prec_std: 0.02062866873498607
 rec_mean: 0.48267027914614125, rec_std: 0.02337116398951598
 f1_mean: 0.5592055045231556, f1_std: 0.019466170211360306



