In [37]:
import csv
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter presumably also hides warnings emitted by
# scikit-learn while scoring folds — confirm that hiding them is intended.
warnings.filterwarnings("ignore")


In [38]:
# Read the training set from disk
Training_csv_file_path = './data/TrainingSet/mRNA_sublocation_TrainingSet_NC-BERTdata.csv'
Training_data = pd.read_csv(Training_csv_file_path)

# The 'label' column is the target; every other column is used as a feature
y_Train = Training_data['label'].values
X_Train = Training_data.drop(columns=['label']).values

# Standardize the training features; `sc` stays fitted on these rows afterwards.
# NOTE(review): the scaler is fit on all rows, i.e. before the cross-validation
# split performed further down, so held-out folds influence the scaling statistics.
sc = StandardScaler()
X_Train = sc.fit_transform(X_Train)

In [39]:
from sklearn.decomposition import PCA

# Reduce the standardized features to 72 principal components.
# random_state pins the decomposition whenever scikit-learn selects a randomized
# SVD solver, so a Restart & Run All reproduces the same projection.
pca = PCA(n_components=72, random_state=42)

# Fit PCA on the training data and replace X_Train with its projection.
# NOTE: this overwrites X_Train, so the cell is not idempotent — re-run the
# loading/standardization cell before running this one again.
# NOTE(review): PCA is fit on all rows before the cross-validation split, so
# held-out folds contribute to the learned components.
X_Train = pca.fit_transform(X_Train)


In [40]:
# Hyperparameter ranges: every (n_estimators, max_depth) pair is evaluated
param_grid = {
    'n_estimators': [50, 80],
    'max_depth': [5, 10, 15, 20, 25]
}

# Create all combinations of hyperparameters
grid = list(ParameterGrid(param_grid))

# Initialize the result list
# NOTE(review): nothing in this cell fills `results`; it is kept only in case a
# later cell relies on the name existing.
results = []

# Seed for every forest so the reported means/stds can be reproduced exactly
RF_RANDOM_STATE = 42

# 100 times 5-fold cross-validation (500 fits per hyperparameter combination).
# With an integer random_state each call to split() yields the same folds, so
# all combinations are compared on identical train/validation partitions.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=100, random_state=42)

# NOTE(review): X_Train was standardized and PCA-projected using all rows in
# earlier cells, so each validation fold has already influenced the features it
# is scored on. Fitting those steps inside each fold would remove that leakage.

# Create the output directory up front; open() would otherwise fail on a fresh checkout
results_csv_path = './result/RF-100times-5-fold cv.csv'
os.makedirs(os.path.dirname(results_csv_path), exist_ok=True)

# Open the CSV file for writing
with open(results_csv_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['n_estimators', 'max_depth', 'accuracy', 'precision', 'recall', 'f1'])

    # Perform a grid search
    for params in tqdm(grid, desc="Hyperparameter search"):
        acc_scores = []
        prec_scores = []
        rec_scores = []
        f1_scores = []

        for train_index, test_index in rskf.split(X_Train, y_Train):
            X_train, X_test = X_Train[train_index], X_Train[test_index]
            y_train, y_test = y_Train[train_index], y_Train[test_index]
            clf = RandomForestClassifier(
                n_estimators=params['n_estimators'],
                max_depth=params['max_depth'],
                random_state=RF_RANDOM_STATE,
            )

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            acc_scores.append(acc)
            prec_scores.append(prec)
            rec_scores.append(rec)
            f1_scores.append(f1)

            # One CSV row per fold, written as soon as the fold is scored
            writer.writerow([params['n_estimators'], params['max_depth'], acc, prec, rec, f1])

        # Push this combination's rows to disk so an interruption of the
        # multi-hour search does not lose completed work
        file.flush()

        # Summary over all folds for this combination (population std, numpy default)
        acc_mean, acc_std = np.mean(acc_scores), np.std(acc_scores)
        prec_mean, prec_std = np.mean(prec_scores), np.std(prec_scores)
        rec_mean, rec_std = np.mean(rec_scores), np.std(rec_scores)
        f1_mean, f1_std = np.mean(f1_scores), np.std(f1_scores)
        print(f" params: {params}")
        print(f" acc_mean: {acc_mean}, acc_std: {acc_std}")
        print(f" prec_mean: {prec_mean}, prec_std: {prec_std}")
        print(f" rec_mean: {rec_mean}, rec_std: {rec_std}")
        print(f" f1_mean: {f1_mean}, f1_std: {f1_std}")

Hyperparameter search:  10%|█████▉                                                     | 1/10 [05:10<46:31, 310.17s/it]

 params: {'max_depth': 5, 'n_estimators': 50}
 acc_mean: 0.658358288597859, acc_std: 0.012212119434470332
 prec_mean: 0.7119380971684051, prec_std: 0.028830973230437214
 rec_mean: 0.3617541446208113, rec_std: 0.031178895653059365
 f1_mean: 0.47879793278114163, f1_std: 0.02845879691092134


Hyperparameter search:  20%|███████████▊                                               | 2/10 [13:45<57:28, 431.06s/it]

 params: {'max_depth': 5, 'n_estimators': 80}
 acc_mean: 0.6601885260983767, acc_std: 0.011919557525254264
 prec_mean: 0.718661596775079, prec_std: 0.02916325019938911
 rec_mean: 0.3608575928966734, rec_std: 0.028923034407500176
 f1_mean: 0.4796287107843417, f1_std: 0.026746160602167154


Hyperparameter search:  30%|█████████████████▋                                         | 3/10 [22:38<55:42, 477.46s/it]

 params: {'max_depth': 10, 'n_estimators': 50}
 acc_mean: 0.6807760604262405, acc_std: 0.012939041614802886
 prec_mean: 0.6855215721425278, prec_std: 0.022189389693586765
 rec_mean: 0.49254310040746824, rec_std: 0.02483301829507969
 f1_mean: 0.5728472464780701, f1_std: 0.0204246989071107


Hyperparameter search:  40%|██████████████████████▊                                  | 4/10 [37:00<1:02:56, 629.40s/it]

 params: {'max_depth': 10, 'n_estimators': 80}
 acc_mean: 0.6857481795307031, acc_std: 0.01289460733934781
 prec_mean: 0.6953657613552195, prec_std: 0.022019008390334528
 rec_mean: 0.4947176306026881, rec_std: 0.02489906435830323
 f1_mean: 0.5777561962790826, f1_std: 0.020481293427120986


Hyperparameter search:  50%|█████████████████████████████▌                             | 5/10 [47:40<52:45, 633.12s/it]

 params: {'max_depth': 15, 'n_estimators': 50}
 acc_mean: 0.6797206367387494, acc_std: 0.013213318613803097
 prec_mean: 0.6726322914570247, prec_std: 0.021498115713302655
 rec_mean: 0.5146336799854041, rec_std: 0.023703412254780107
 f1_mean: 0.5828004239042268, f1_std: 0.01932274197559182


Hyperparameter search:  60%|██████████████████████████████████▏                      | 6/10 [1:04:43<51:03, 765.78s/it]

 params: {'max_depth': 15, 'n_estimators': 80}
 acc_mean: 0.6846584164792147, acc_std: 0.013410273358275163
 prec_mean: 0.6823378706988337, prec_std: 0.022215826596749415
 rec_mean: 0.5155695067810011, rec_std: 0.023270045335365915
 f1_mean: 0.5870287245567961, f1_std: 0.01931967474441705


Hyperparameter search:  70%|███████████████████████████████████████▉                 | 7/10 [1:15:55<36:44, 734.85s/it]

 params: {'max_depth': 20, 'n_estimators': 50}
 acc_mean: 0.6768330826306758, acc_std: 0.01331169425283554
 prec_mean: 0.6660277577515253, prec_std: 0.020473686381539808
 rec_mean: 0.5162696101684607, rec_std: 0.024164991822968187
 f1_mean: 0.5813760876898212, f1_std: 0.019766773679590847


Hyperparameter search:  80%|█████████████████████████████████████████████▌           | 8/10 [1:33:44<28:02, 841.24s/it]

 params: {'max_depth': 20, 'n_estimators': 80}
 acc_mean: 0.6834443877105097, acc_std: 0.012921481941369557
 prec_mean: 0.6771588805101956, prec_std: 0.020674635135530486
 rec_mean: 0.5211905978227817, rec_std: 0.02480479237844303
 f1_mean: 0.5886749278817004, f1_std: 0.019506477809056157


Hyperparameter search:  90%|███████████████████████████████████████████████████▎     | 9/10 [1:44:57<13:08, 788.80s/it]

 params: {'max_depth': 25, 'n_estimators': 50}
 acc_mean: 0.6759689749711347, acc_std: 0.01341922449930488
 prec_mean: 0.6715154497943717, prec_std: 0.021887825033578837
 rec_mean: 0.5001453019521985, rec_std: 0.02422156334850037
 f1_mean: 0.5729742905314397, f1_std: 0.02017952669849549


Hyperparameter search: 100%|████████████████████████████████████████████████████████| 10/10 [2:02:36<00:00, 735.64s/it]

 params: {'max_depth': 25, 'n_estimators': 80}
 acc_mean: 0.6837038166604565, acc_std: 0.012608760344309584
 prec_mean: 0.6819786976302601, prec_std: 0.020779399301133795
 rec_mean: 0.5120840965760506, rec_std: 0.02345756139256024
 f1_mean: 0.5846259489603606, f1_std: 0.018919337268705243



