In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the four datasets
dataset_names = ['C:/Users/awzma/Testosterone Deficiency/Datasets/dataset.xlsx', 
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + ROS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + RUS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/ROS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx']

datasets = [pd.read_excel(name) for name in dataset_names]

# Train and test the Random Forest on each dataset
for i, dataset in enumerate(datasets):
    # Split the dataset into features and target variable
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create the Random Forest classifier with 100 trees
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the Random Forest classifier using the training set
    rf.fit(X_train, y_train)

    # Make predictions on the testing set
    predictions = rf.predict(X_test)

    # Calculate the accuracy on the testing set
    accuracy = accuracy_score(y_test, predictions)

    # Print the accuracy for this dataset
    print(f"Accuracy for dataset {i+1} = {accuracy:.4%}")


Accuracy for dataset 1 = 81.4706%
Accuracy for dataset 2 = 85.4251%
Accuracy for dataset 3 = 65.7795%
Accuracy for dataset 4 = 86.5031%
Accuracy for dataset 5 = 65.7795%
Accuracy for dataset 6 = 68.2243%
Accuracy for dataset 7 = 82.8571%
Accuracy for dataset 8 = 93.4783%
Accuracy for dataset 9 = 84.3249%


In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Load the nine datasets
dataset_names = ['C:/Users/awzma/Testosterone Deficiency/Datasets/dataset.xlsx', 
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/ROS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + ROS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + RUS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + SMOTE balanced_dataset.xlsx']

# Define the number of folds for cross-validation
n_folds = 10

# Initialize lists to store accuracy scores
accuracy_scores = []
accuracy_scores_mean = []

# Train and test the Random Forest on each dataset using stratified 10-fold cross-validation
for i, dataset_name in enumerate(dataset_names):
    # Load the dataset
    dataset = pd.read_excel(dataset_name)

    # Split the dataset into features and target variable
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Create the Random Forest classifier with 100 trees
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Perform stratified 10-fold cross-validation and get accuracy scores
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_accuracy_scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        rf.fit(X_train, y_train)
        predictions = rf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        fold_accuracy_scores.append(accuracy)
    accuracy_scores.append(fold_accuracy_scores)
    accuracy_scores_mean.append(np.mean(fold_accuracy_scores))

    # Print the accuracy for this dataset
    #print(f"Accuracy scores for dataset {i+1}: {fold_accuracy_scores}")
    print(f"Average accuracy for dataset {i+1}: {accuracy_scores_mean[i]:.4%}")
  
    # Print the mean accuracy across all datasets
print(f"Mean accuracy: {np.mean(accuracy_scores_mean):.4%}")


Average accuracy for dataset 1: 80.8076%
Average accuracy for dataset 2: 66.4230%
Average accuracy for dataset 3: 82.9777%
Average accuracy for dataset 4: 93.3843%
Average accuracy for dataset 5: 84.8910%
Average accuracy for dataset 6: 86.2348%
Average accuracy for dataset 7: 65.0121%
Average accuracy for dataset 8: 86.9362%
Average accuracy for dataset 9: 65.0121%
Mean accuracy: 79.0754%
