In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from catboost import CatBoostClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV
from itertools import product
import random
from tqdm import tqdm

In [8]:
# training dataset loading
Training_csv_file_path = './data/TrainingSet/mRNA_sublocation_TrainingSet_hfeature.csv'
Training_data= pd.read_csv(Training_csv_file_path)
Training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,Z-Curve_X,Z-Curve_Y,Z-Curve_Z,gcContent,AT/GC,GC skew,AT skew,label
0,0.008038,0.005472,0.007248,0.007894,0.007398,0.004448,0.000894,0.007232,0.009605,0.007154,...,0.007301,0.012933,183,-31,381,45.001312,0.062391,0.036260,1.222157,1
1,0.006421,0.004846,0.007124,0.006833,0.007555,0.005529,0.002641,0.006684,0.008229,0.006792,...,0.007317,0.014494,20,-16,122,47.749077,0.013910,0.001412,1.094281,1
2,0.012833,0.005971,0.006511,0.007626,0.007715,0.005825,0.001119,0.006599,0.007285,0.005525,...,0.006162,0.012829,248,152,1736,44.059677,0.007456,0.024468,1.269649,1
3,0.013917,0.006805,0.005778,0.013393,0.008006,0.004290,0.000706,0.007448,0.006812,0.002914,...,0.006759,0.023986,-211,-43,1183,34.672195,-0.062780,-0.050377,1.884155,1
4,0.017324,0.006722,0.007164,0.007420,0.007784,0.003431,0.001093,0.007214,0.006707,0.006244,...,0.006850,0.013950,178,-46,320,42.673993,0.120172,0.052716,1.343348,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,0.020823,0.006709,0.006760,0.009948,0.007638,0.003582,0.001047,0.007103,0.007427,0.004873,...,0.007436,0.016279,267,57,1591,39.366395,0.035654,0.035714,1.540238,0
4658,0.002121,0.007038,0.006064,0.003244,0.007038,0.005526,0.002388,0.005519,0.008397,0.006210,...,0.003900,0.006741,-1,-1,-1,50.069930,0.000000,-0.002801,0.997207,0
4659,0.018389,0.005796,0.007591,0.005788,0.005100,0.006153,0.002046,0.007090,0.005993,0.006751,...,0.004802,0.011305,29,73,91,47.270546,-0.027919,0.058020,1.115482,0
4660,0.001512,0.001544,0.005322,0.000000,0.003088,0.007880,0.001362,0.001574,0.003991,0.010899,...,0.000000,0.000000,-2,10,-108,71.428571,-0.033333,0.055556,0.400000,0


In [9]:
# test dataset loading
Test_csv_file_path = './data/TestSet/mRNA_sublocation_TestSet_hfeature.csv'
Test_data= pd.read_csv(Test_csv_file_path)
Test_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,Z-Curve_X,Z-Curve_Y,Z-Curve_Z,gcContent,AT/GC,GC skew,AT skew,label
0,0.011010,0.005166,0.007069,0.009104,0.006584,0.004962,0.000983,0.006195,0.007331,0.005005,...,0.007570,0.013872,167,-101,433,44.322056,0.079290,0.015544,1.256213,1
1,0.001138,0.002840,0.004449,0.000387,0.005293,0.008433,0.003076,0.004343,0.004783,0.009227,...,0.002325,0.003215,-150,68,-770,62.867647,-0.057948,-0.036904,0.590643,1
2,0.003841,0.003923,0.006374,0.003134,0.005604,0.007207,0.001879,0.005027,0.005505,0.008505,...,0.003533,0.003954,-72,62,-512,57.428903,-0.033855,-0.003408,0.741283,1
3,0.004768,0.003772,0.005141,0.003835,0.004800,0.007349,0.002178,0.004614,0.005968,0.006595,...,0.005558,0.009890,-345,-101,-377,53.347540,-0.040613,-0.084888,0.874501,1
4,0.014964,0.005276,0.007248,0.011185,0.005625,0.003709,0.000834,0.006393,0.007677,0.003865,...,0.007978,0.021691,-181,-467,2243,35.542091,0.051868,-0.064800,1.813565,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,0.002883,0.003271,0.005074,0.002614,0.003925,0.005676,0.004618,0.003668,0.004228,0.007505,...,0.003535,0.002715,4,-8,-338,64.297800,0.007895,-0.004739,0.555263,0
514,0.008520,0.004350,0.004851,0.005622,0.003838,0.005746,0.001919,0.007434,0.004079,0.004629,...,0.006452,0.013806,-211,-169,97,48.393508,-0.014374,-0.121951,1.066393,0
515,0.003307,0.003241,0.005121,0.002833,0.005537,0.005652,0.001430,0.006058,0.006633,0.008223,...,0.005108,0.005465,12,-96,-424,57.412587,0.032887,-0.034483,0.741778,0
516,0.004880,0.002683,0.006275,0.003445,0.004216,0.005869,0.002368,0.002345,0.005615,0.008456,...,0.003107,0.001193,45,13,-295,64.618434,0.024540,0.081232,0.547546,0


In [10]:
# separate sequence features and labels
X_train = Training_data.drop(columns=['label']).values
y_train = Training_data['label'].values

X_test = Test_data.drop(columns=['label']).values
y_test = Test_data['label'].values

# the training data and test data are standardized
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [11]:
# Random search was used, and the number of hyperparameters searched was defined
num = 100
# Define the hyperparameter search space
learning_rate = [0.08,0.1]
depth = [4, 5, 6]
rsm = [0.9, 1]
subsample = [0.6, 0.8, 1.0]
min_data_in_leaf = [140,150,160]

seed_value = 42
random.seed(seed_value)
# Hyperparameter tuning
hyperparameter_space = list(product(learning_rate,depth,rsm,subsample,min_data_in_leaf))
hyperparameters = [random.choice(hyperparameter_space) for i in range(num)]
results = []
best_acc = 0
for hyperparameter in tqdm(hyperparameters, desc="Hyperparameter Search"):
    learning_rate,depth,rsm,subsample,min_data_in_leaf = hyperparameter
 
    val_accuracy_scores = []
    val_precision_scores = []
    val_recall_scores = []
    val_f1_scores = []
    
    #     5-fold cross-validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train,y_train), 1):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
        clf = CatBoostClassifier(iterations=1000,learning_rate=learning_rate,depth=depth,rsm=rsm,subsample=subsample,min_data_in_leaf=min_data_in_leaf, logging_level='Silent',early_stopping_rounds=50)
        
        clf.fit(X_train_fold,y_train_fold)
     
        val_predictions = clf.predict(X_val_fold)
        val_accuracy = accuracy_score(y_val_fold, val_predictions)
        val_precision = precision_score(y_val_fold, val_predictions)
        val_recall = recall_score(y_val_fold, val_predictions)
        val_f1 = f1_score(y_val_fold, val_predictions)

#         Saving metrics
        val_accuracy_scores.append(val_accuracy)
        val_precision_scores.append(val_precision)
        val_recall_scores.append(val_recall)
        val_f1_scores.append(val_f1)
#   The average value of each metric was calculated
    val_ACC = np.mean(val_accuracy_scores)
    val_Precision = np.mean(val_precision_scores)
    val_Recall = np.mean(val_recall_scores)
    val_F1 = np.mean(val_f1_scores)
    
# Independent testing
    clf.fit(X_train,y_train)
    test_predictions = clf.predict(X_test)
    cm = confusion_matrix(y_test,test_predictions)
    TP = cm[1, 1]
    TN = cm[0, 0]
    FP = cm[0, 1]
    FN = cm[1, 0]

    test_ACC = accuracy_score(y_test, test_predictions)
    test_Precision = precision_score(y_test, test_predictions)
    test_Recall = recall_score(y_test, test_predictions)
    test_F1 = f1_score(y_test, test_predictions)
    mcc = (TP * TN - FP * FN) / ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))**0.5

    fpr, tpr, thresholds = roc_curve(y_test, test_predictions, pos_label=1)
    roc_auc = auc(fpr, tpr)
    
    
   
    

    results.append({
        "超参数组合": hyperparameter,
        "val_ACC": val_ACC,
        "val_Precision": val_Precision,
        "val_Recall": val_Recall,
        "val_F1":val_F1,
        "test_ACC":test_ACC,      
        "test_Precision":test_Precision,
        "test_Recall":test_Recall,
        "test_F1":test_F1,
        "test_MCC":mcc,
        "test_roc_auc":roc_auc
    })
    
    

Hyperparameter Search: 100%|█████████████████████████████████████████████████████████| 100/100 [58:16<00:00, 34.97s/it]


In [12]:
# result ranking
sorted_results = sorted(results, key=lambda x: x["val_ACC"], reverse=True)
for i,result in enumerate(sorted_results):
    print("超参数: learning_rate,depth,rsm,subsample,min_data_in_leaf :", result["超参数组合"])
    print("val_ACC:", result["val_ACC"])
    print("val_Precision:", result["val_Precision"])
    print("val_Recall:", result["val_Recall"])
    print("val_F1:", result["val_F1"])
    print("test_ACC:", result["test_ACC"])
    print("test_Precision:", result["test_Precision"])
    print("test_Recall:", result["test_Recall"])
    print("test_F1:", result["test_F1"])
    print("test_MCC:", result["test_MCC"])
    print("test_roc_auc:", result["test_roc_auc"])
    print("-" * 60)

超参数: learning_rate,depth,rsm,subsample,min_data_in_leaf : (0.08, 4, 0.9, 0.6, 140)
val_ACC: 0.6799665576455111
val_Precision: 0.6591396352796444
val_Recall: 0.5483062701453505
val_F1: 0.5983976020099446
test_ACC: 0.7142857142857143
test_Precision: 0.7061855670103093
test_Recall: 0.6008771929824561
test_F1: 0.6492890995260664
test_MCC: 0.41469056189449516
test_roc_auc: 0.7021627344222625
------------------------------------------------------------
超参数: learning_rate,depth,rsm,subsample,min_data_in_leaf : (0.1, 5, 0.9, 0.6, 150)
val_ACC: 0.6786748639535579
val_Precision: 0.6528507159112944
val_Recall: 0.55816091954023
val_F1: 0.6015936707087581
test_ACC: 0.7200772200772201
test_Precision: 0.6948356807511737
test_Recall: 0.6491228070175439
test_F1: 0.671201814058957
test_MCC: 0.4287460935470322
test_roc_auc: 0.7124924379915305
------------------------------------------------------------
超参数: learning_rate,depth,rsm,subsample,min_data_in_leaf : (0.08, 6, 1, 0.6, 140)
val_ACC: 0.67867440394