In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from catboost import CatBoostClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV
from itertools import product
import random
from tqdm import tqdm

In [5]:
# training dataset loading
Training_csv_file_path = './data/TrainingSet/mRNA_sublocation_TrainingSet_NC-BERTdata.csv'
Training_data= pd.read_csv(Training_csv_file_path)
Training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,label
0,0.008038,0.005472,0.007248,0.007894,0.007398,0.004448,0.000894,0.007232,0.009605,0.007154,...,-0.047920,-0.044799,0.009836,-0.027684,-0.060108,0.051503,0.074873,0.094912,0.142718,1
1,0.006421,0.004846,0.007124,0.006833,0.007555,0.005529,0.002641,0.006684,0.008229,0.006792,...,-0.063734,-0.007245,0.007870,-0.022348,-0.066921,0.064694,0.101658,0.116233,0.136837,1
2,0.012833,0.005971,0.006511,0.007626,0.007715,0.005825,0.001119,0.006599,0.007285,0.005525,...,-0.080484,-0.063357,0.037052,-0.023793,-0.052618,0.047666,0.088847,0.100192,0.156532,1
3,0.013917,0.006805,0.005778,0.013393,0.008006,0.004290,0.000706,0.007448,0.006812,0.002914,...,-0.044457,-0.043053,0.013077,-0.001783,-0.055099,0.052132,0.093488,0.079941,0.162534,1
4,0.017324,0.006722,0.007164,0.007420,0.007784,0.003431,0.001093,0.007214,0.006707,0.006244,...,-0.056244,-0.031527,0.027563,-0.006933,-0.035199,0.053570,0.106644,0.127546,0.129778,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,0.020823,0.006709,0.006760,0.009948,0.007638,0.003582,0.001047,0.007103,0.007427,0.004873,...,-0.071451,-0.064951,0.026554,-0.024152,-0.040399,0.054827,0.108673,0.107649,0.108352,0
4658,0.002121,0.007038,0.006064,0.003244,0.007038,0.005526,0.002388,0.005519,0.008397,0.006210,...,-0.069669,-0.118523,0.001044,-0.004282,-0.044469,0.064367,0.085585,0.090057,0.146606,0
4659,0.018389,0.005796,0.007591,0.005788,0.005100,0.006153,0.002046,0.007090,0.005993,0.006751,...,-0.073933,-0.074885,0.044387,-0.000744,-0.048722,-0.010145,0.098738,0.119175,0.117543,0
4660,0.001512,0.001544,0.005322,0.000000,0.003088,0.007880,0.001362,0.001574,0.003991,0.010899,...,-0.071051,-0.068419,-0.018095,0.009988,-0.081945,0.103257,-0.006743,0.098912,0.106491,0


In [7]:
# test dataset loading
Test_csv_file_path = './data/TestSet/mRNA_sublocation_TestSet_NC-BERTdata.csv'
Test_data= pd.read_csv(Test_csv_file_path)
Test_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,label
0,0.011010,0.005166,0.007069,0.009104,0.006584,0.004962,0.000983,0.006195,0.007331,0.005005,...,-0.053218,-0.046008,0.024874,-0.005043,-0.057149,0.022523,0.123339,0.146821,0.165012,1
1,0.001138,0.002840,0.004449,0.000387,0.005293,0.008433,0.003076,0.004343,0.004783,0.009227,...,-0.060201,-0.053725,0.023453,0.007158,-0.104437,0.018186,0.066489,0.168612,0.168197,1
2,0.003841,0.003923,0.006374,0.003134,0.005604,0.007207,0.001879,0.005027,0.005505,0.008505,...,-0.086440,-0.074154,0.037891,0.010463,-0.048493,0.094847,0.098022,0.145162,0.164766,1
3,0.004768,0.003772,0.005141,0.003835,0.004800,0.007349,0.002178,0.004614,0.005968,0.006595,...,-0.072469,-0.075094,0.079060,-0.053585,-0.033657,-0.002991,0.147035,0.143220,0.139921,1
4,0.014964,0.005276,0.007248,0.011185,0.005625,0.003709,0.000834,0.006393,0.007677,0.003865,...,-0.060701,-0.105159,0.082175,-0.042828,-0.013750,-0.040331,0.205573,0.173816,0.187638,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,0.002883,0.003271,0.005074,0.002614,0.003925,0.005676,0.004618,0.003668,0.004228,0.007505,...,-0.083052,-0.081839,0.025406,0.034709,-0.057688,0.085145,0.059397,0.119146,0.114285,0
514,0.008520,0.004350,0.004851,0.005622,0.003838,0.005746,0.001919,0.007434,0.004079,0.004629,...,-0.050953,-0.018395,-0.001208,-0.011797,-0.064337,0.017005,0.087882,0.118566,0.137715,0
515,0.003307,0.003241,0.005121,0.002833,0.005537,0.005652,0.001430,0.006058,0.006633,0.008223,...,-0.077407,-0.028342,0.029947,-0.024831,-0.072220,-0.001702,0.085262,0.091931,0.144309,0
516,0.004880,0.002683,0.006275,0.003445,0.004216,0.005869,0.002368,0.002345,0.005615,0.008456,...,-0.115380,-0.046133,0.049324,-0.001430,-0.059596,0.059717,0.050933,0.101675,0.132865,0


In [11]:
# separate sequence features and labels
X_train = Training_data.drop(columns=['label']).values
y_train = Training_data['label'].values
X_test = Test_data.drop(columns=['label']).values
y_test = Test_data['label'].values
# the training data and test data are standardized
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [12]:
from sklearn.decomposition import PCA
import numpy as np
# Initialize PCA and set the number of principal components
pca = PCA(n_components=72)
# Fit PCA on the training data and transform the training data to its principal components
X_train = pca.fit_transform(X_train)
# Get the transformation matrix (principal components)
transformation_matrix = pca.components_
# Apply the same transformation matrix to the test data
# This ensures the test data is transformed in the same way as the training data
X_test = np.dot(X_test, transformation_matrix.T)

In [None]:
# Random search was used, and the number of hyperparameters searched was defined
num = 100 
# Define the hyperparameter search space
learning_rate = [0.08,0.1]
depth = [4, 5, 6]
rsm = [0.9, 1]
subsample = [0.6, 0.8, 1.0]
min_data_in_leaf = [140,150,160]

seed_value = 42
random.seed(seed_value)
# Hyperparameter tuning
hyperparameter_space = list(product(learning_rate,depth,rsm,subsample,min_data_in_leaf))
hyperparameters = [random.choice(hyperparameter_space) for i in range(num)]
results = []
best_acc = 0
for hyperparameter in tqdm(hyperparameters, desc="Hyperparameter Search"):
    learning_rate,depth,rsm,subsample,min_data_in_leaf = hyperparameter
    val_accuracy_scores = []
    val_precision_scores = []
    val_recall_scores = []
    val_f1_scores = []
    
#     5-fold cross-validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train,y_train), 1):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
        clf = CatBoostClassifier(iterations=1000,learning_rate=learning_rate,depth=depth,rsm=rsm,subsample=subsample,min_data_in_leaf=min_data_in_leaf, logging_level='Silent',early_stopping_rounds=50)
        
        clf.fit(X_train_fold,y_train_fold)
        val_predictions = clf.predict(X_val_fold)
        val_accuracy = accuracy_score(y_val_fold, val_predictions)
        val_precision = precision_score(y_val_fold, val_predictions)
        val_recall = recall_score(y_val_fold, val_predictions)
        val_f1 = f1_score(y_val_fold, val_predictions)

#         Saving metrics
        val_accuracy_scores.append(val_accuracy)
        val_precision_scores.append(val_precision)
        val_recall_scores.append(val_recall)
        val_f1_scores.append(val_f1)
#   The average value of each metric was calculated
    val_ACC = np.mean(val_accuracy_scores)
    val_Precision = np.mean(val_precision_scores)
    val_Recall = np.mean(val_recall_scores)
    val_F1 = np.mean(val_f1_scores)
    
# Independent testing
    clf.fit(X_train,y_train)
    test_predictions = clf.predict(X_test)
    cm = confusion_matrix(y_test,test_predictions)
    TP = cm[1, 1]
    TN = cm[0, 0]
    FP = cm[0, 1]
    FN = cm[1, 0]
# Calculating test metrics
    test_ACC = accuracy_score(y_test, test_predictions)
    test_Precision = precision_score(y_test, test_predictions)
    test_Recall = recall_score(y_test, test_predictions)
    test_F1 = f1_score(y_test, test_predictions)
    mcc = (TP * TN - FP * FN) / ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))**0.5
    fpr, tpr, thresholds = roc_curve(y_test, test_predictions, pos_label=1)
    roc_auc = auc(fpr, tpr)
    results.append({
        "超参数组合": hyperparameter,
        "val_ACC": val_ACC,
        "val_Precision": val_Precision,
        "val_Recall": val_Recall,
        "val_F1":val_F1,
        "test_ACC":test_ACC,      
        "test_Precision":test_Precision,
        "test_Recall":test_Recall,
        "test_F1":test_F1,
        "test_MCC":mcc,
        "test_roc_auc":roc_auc
    })
    

In [14]:
# result ranking
sorted_results = sorted(results, key=lambda x: x["test_ACC"], reverse=True)
for i,result in enumerate(sorted_results):
    print("超参数: learning_rate,depth,rsm,subsample,min_data_in_leaf:", result["超参数组合"])
    print("val_ACC:", result["val_ACC"])
    print("val_Precision:", result["val_Precision"])
    print("val_Recall:", result["val_Recall"])
    print("val_F1:", result["val_F1"])
    print("test_ACC:", result["test_ACC"])
    print("test_Precision:", result["test_Precision"])
    print("test_Recall:", result["test_Recall"])
    print("test_F1:", result["test_F1"])
    print("test_MCC:", result["test_MCC"])
    print("test_roc_auc:", result["test_roc_auc"])
    print("-" * 60)

超参数: learning_rate,depth,rsm,subsample,min_data_in_leaf: (0.08, 6, 0.9, 0.6, 150)
val_ACC: 0.6911189158605081
val_Precision: 0.6691544568475355
val_Recall: 0.5739378458918689
val_F1: 0.6173392854959217
test_ACC: 0.7606177606177607
test_Precision: 0.76
test_Recall: 0.6666666666666666
test_F1: 0.7102803738317756
test_MCC: 0.510981411068828
test_roc_auc: 0.7505747126436781
------------------------------------------------------------
超参数: learning_rate,depth,rsm,subsample,min_data_in_leaf: (0.08, 6, 0.9, 0.6, 150)
val_ACC: 0.6911189158605081
val_Precision: 0.6691544568475355
val_Recall: 0.5739378458918689
val_F1: 0.6173392854959217
test_ACC: 0.7606177606177607
test_Precision: 0.76
test_Recall: 0.6666666666666666
test_F1: 0.7102803738317756
test_MCC: 0.510981411068828
test_roc_auc: 0.7505747126436781
------------------------------------------------------------
超参数: learning_rate,depth,rsm,subsample,min_data_in_leaf: (0.1, 5, 0.9, 0.8, 140)
val_ACC: 0.6896172299426374
val_Precision: 0.666841