In [None]:
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.neural_network import MLPClassifier
from imblearn.combine import SMOTETomek
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef
from imblearn.metrics import geometric_mean_score

# Single seed reused by the train/test split, the resampler and the estimators
random_seed = 2

# Folder scanned for input data; only entries whose name ends in '.csv' are kept
data_folder = '****'
files = [entry for entry in os.listdir(data_folder) if entry.endswith('.csv')]

# Table of previously selected hyper-parameters
best_params = pd.read_csv('../merged.csv')

# Empty results table: identifying columns first, then the train metrics,
# then the same metrics for the test split
_id_columns = ['Fingerprint', 'Algorithm', 'Best Parameters']
_train_columns = ['Train AUC', 'Train ACC', 'Train SE', 'Train SP',
                  'Train F1', 'Train MCC', 'Train G-mean']
_test_columns = ['Test AUC', 'Test ACC', 'Test SE', 'Test SP',
                 'Test F1', 'Test MCC', 'Test G-mean']
evaluation_results = pd.DataFrame(columns=_id_columns + _train_columns + _test_columns)

def _specificity(y_true, y_pred, positive_label=1):
    """Return the true-negative rate TN / (TN + FP).

    Every label other than ``positive_label`` counts as negative. Returns 0.0
    when ``y_true`` contains no negative samples, so the ratio is never 0/0.
    """
    actual = pd.Series(y_true).to_numpy()
    predicted = pd.Series(y_pred).to_numpy()
    negatives = actual != positive_label
    if not negatives.any():
        return 0.0
    return float((predicted[negatives] != positive_label).mean())


def _positive_class_scores(model, features):
    """Return continuous scores for the positive class, for use in ROC AUC.

    Prefers class probabilities, then decision-function values, and only falls
    back to hard predictions when the estimator exposes neither.
    """
    if hasattr(model, 'predict_proba'):
        # Column 1 is the larger of the two class labels, which is the class
        # roc_auc_score treats as positive in the binary case.
        return model.predict_proba(features)[:, 1]
    if hasattr(model, 'decision_function'):
        return model.decision_function(features)
    return model.predict(features)


def _split_metrics(model, features, labels, prefix):
    """Evaluate a fitted ``model`` on one data split.

    Returns a dict keyed ``'<prefix> AUC'``, ``'<prefix> ACC'``, ... matching
    the column names of the results table.
    """
    predictions = model.predict(features)
    return {
        # AUC needs continuous scores; thresholded predictions collapse the ROC
        # curve to a single operating point.
        f'{prefix} AUC': roc_auc_score(labels, _positive_class_scores(model, features)),
        f'{prefix} ACC': accuracy_score(labels, predictions),
        f'{prefix} SE': recall_score(labels, predictions),
        # SP is reported as specificity (true-negative rate), not precision.
        f'{prefix} SP': _specificity(labels, predictions),
        f'{prefix} F1': f1_score(labels, predictions),
        f'{prefix} MCC': matthews_corrcoef(labels, predictions),
        f'{prefix} G-mean': geometric_mean_score(labels, predictions),
    }


# Rows are collected in a plain list and turned into a DataFrame once at the
# end; DataFrame.append was removed in pandas 2.0.
result_rows = []

# Iterate through each file (sorted, because os.listdir order is arbitrary)
for file in sorted(files):
    # Read CSV file
    data = pd.read_csv(os.path.join(data_folder, file))

    # Split features / target, then make a stratified 80/20 train/test split
    X = data.drop(['Name', 'Activity'], axis=1)
    y = data['Activity']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=random_seed, stratify=y)

    # The part of the filename before the first '.' is used as the fingerprint
    fingerprint = file.split('.')[0]

    # Resample the training data once per file: the input and the seed are the
    # same for every model, so repeating this inside the model loop only
    # recomputes an identical result.
    smotetomek = SMOTETomek(random_state=random_seed)
    X_train_resampled, y_train_resampled = smotetomek.fit_resample(X_train, y_train)

    # Initialize models
    models = [('RF', RandomForestClassifier(random_state=random_seed)),
              ('SVM', SVC(random_state=random_seed)),
              ('LightGBM', LGBMClassifier(random_state=random_seed)),
              ('DT', DecisionTreeClassifier(random_state=random_seed)),
              ('KNN', KNeighborsClassifier()),
              ('LR', LogisticRegression(random_state=random_seed, max_iter=1000)),
              ('XGBoost', XGBClassifier(random_state=random_seed)),
              ('ANN', MLPClassifier(random_state=random_seed))]

    for name, model in models:
        # Look up the stored parameters for this fingerprint/algorithm pair
        matching = best_params[(best_params['Fingerprint'] == fingerprint)
                               & (best_params['Algorithm'] == name)]['Best Parameters']
        if matching.empty:
            raise IndexError(
                f"No best parameters found for fingerprint {fingerprint!r} "
                f"and algorithm {name!r} in '../merged.csv'")

        # NOTE(security): eval() executes arbitrary code found in the CSV cell.
        # Only run this on a parameter file you trust; if every stored value is
        # a plain literal, ast.literal_eval is the safer replacement.
        best_param = eval(matching.values[0])  # Convert the string to a dictionary

        # Configure and fit the model on the resampled training data
        model.set_params(**best_param)
        model.fit(X_train_resampled, y_train_resampled)

        # Train metrics are computed on the resampled training set, test
        # metrics on the untouched hold-out split.
        result_rows.append({
            'Fingerprint': fingerprint,
            'Algorithm': name,
            'Best Parameters': best_param,
            **_split_metrics(model, X_train_resampled, y_train_resampled, 'Train'),
            **_split_metrics(model, X_test, y_test, 'Test'),
        })

# Build the results table in the established column order and save it as CSV
evaluation_results = pd.DataFrame(result_rows, columns=evaluation_results.columns)
evaluation_results.to_csv('TrainingResults.csv', index=False)
