In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the four datasets
dataset_names = ['C:/Users/awzma/Testosterone Deficiency/Datasets/dataset.xlsx', 
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + ROS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + RUS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/ROS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx']

datasets = [pd.read_excel(name) for name in dataset_names]

# Train and test the Naive Bayes classifier on each dataset
for i, dataset in enumerate(datasets):
    # Split the dataset into features and target variable
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create the Naive Bayes classifier
    nb = GaussianNB()

    # Train the classifier using the training set
    nb.fit(X_train, y_train)

    # Make predictions on the testing set
    predictions = nb.predict(X_test)

    # Calculate the accuracy on the testing set
    accuracy = accuracy_score(y_test, predictions)

    # Print the accuracy for this dataset
    print(f"Accuracy for dataset {i+1} = {accuracy:.4%}")


Accuracy for dataset 1 = 80.8824%
Accuracy for dataset 2 = 75.9109%
Accuracy for dataset 3 = 65.7795%
Accuracy for dataset 4 = 81.5951%
Accuracy for dataset 5 = 65.7795%
Accuracy for dataset 6 = 67.2897%
Accuracy for dataset 7 = 80.2597%
Accuracy for dataset 8 = 68.4211%
Accuracy for dataset 9 = 72.3112%


In [3]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Load the four datasets
dataset_names = ['C:/Users/awzma/Testosterone Deficiency/Datasets/dataset.xlsx', 
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/ROS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + ROS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + RUS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + SMOTE balanced_dataset.xlsx']

# Define the number of folds for cross-validation
num_folds = 10

# Create an empty list to store the accuracies for each dataset
accuracies = []

# Loop over each dataset
for i, dataset_name in enumerate(dataset_names):
    # Load the dataset
    dataset = pd.read_excel(dataset_name)

    # Split the dataset into features and target variable
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Create the Naive Bayes classifier
    nb = GaussianNB()

    # Create a stratified k-fold cross-validator
    cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Create an empty list to store the accuracy for each fold
    fold_accuracies = []

    # Loop over each fold
    for train_index, test_index in cv.split(X, y):
        # Split the dataset into training and testing sets for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the classifier using the training set
        nb.fit(X_train, y_train)

        # Make predictions on the testing set
        predictions = nb.predict(X_test)

        # Calculate the accuracy on the testing set
        accuracy = accuracy_score(y_test, predictions)

        # Append the accuracy to the list for this fold
        fold_accuracies.append(accuracy)

    # Calculate the mean accuracy across all folds for this dataset
    mean_accuracy = np.mean(fold_accuracies)

    # Print the accuracy for each fold for this dataset
    #print(f"Dataset {i+1} - Fold accuracies: {fold_accuracies}")

    # Print the mean accuracy for this dataset
    print(f"Average accuracy for dataset {i+1} = {mean_accuracy:.4%}")

    # Append the mean accuracy for this dataset to the list of accuracies
    accuracies.append(mean_accuracy)

# Calculate the mean accuracy across all datasets
mean_accuracy_all_datasets = np.mean(accuracies)

# Print the mean accuracy for all datasets
print(f"Mean accuracy: {mean_accuracy_all_datasets:.4%}")


Average accuracy for dataset 1 = 80.2776%
Average accuracy for dataset 2 = 65.6789%
Average accuracy for dataset 3 = 80.4272%
Average accuracy for dataset 4 = 66.3469%
Average accuracy for dataset 5 = 68.8869%
Average accuracy for dataset 6 = 78.0972%
Average accuracy for dataset 7 = 65.9305%
Average accuracy for dataset 8 = 82.3803%
Average accuracy for dataset 9 = 65.9305%
Mean accuracy: 72.6618%
