In [7]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the four datasets
dataset_names = ['C:/Users/awzma/Testosterone Deficiency/Datasets/dataset.xlsx', 
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/ROS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + ROS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + RUS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + SMOTE balanced_dataset.xlsx']
datasets = [pd.read_excel(name) for name in dataset_names]

# Train and test the XGBoost classifier on each dataset
for i, dataset in enumerate(datasets):
    # Split the dataset into features and target variable
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create the XGBoost classifier with default hyperparameters
    xgb_classifier = xgb.XGBClassifier()

    # Train the XGBoost classifier using the training set
    xgb_classifier.fit(X_train, y_train)

    # Make predictions on the testing set
    predictions = xgb_classifier.predict(X_test)

    # Calculate the accuracy on the testing set
    accuracy = accuracy_score(y_test, predictions)

    # Print the accuracy for this dataset
    print(f"Accuracy for dataset {i+1} = {accuracy:.4%}")


Accuracy for dataset 1 = 77.6471%
Accuracy for dataset 2 = 86.4372%
Accuracy for dataset 3 = 61.2167%
Accuracy for dataset 4 = 87.7301%
Accuracy for dataset 5 = 61.2167%
Accuracy for dataset 6 = 64.4860%
Accuracy for dataset 7 = 82.8571%
Accuracy for dataset 8 = 90.1602%
Accuracy for dataset 9 = 88.4439%


In [5]:
pip install xgboost


Collecting xgboost
  Using cached xgboost-1.7.5-py3-none-win_amd64.whl (70.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Load the four datasets
dataset_names = ['C:/Users/awzma/Testosterone Deficiency/Datasets/dataset.xlsx', 
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/ROS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + ROS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + RUS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + SMOTE balanced_dataset.xlsx']
datasets = [pd.read_excel(name) for name in dataset_names]

# Train and test the XGBoost classifier on each dataset using 10-fold stratified cross-validation
mean_accuracy = 0
for i, dataset in enumerate(datasets):
    # Split the dataset into features and target variable
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Create the stratified 10-fold cross-validator
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Create an empty list to store the accuracy scores for each fold
    fold_accuracies = []

    # Iterate over the folds
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        # Get the training and testing sets for this fold
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Create the XGBoost classifier with default hyperparameters
        xgb_classifier = xgb.XGBClassifier()

        # Train the XGBoost classifier using the training set
        xgb_classifier.fit(X_train, y_train)

        # Make predictions on the testing set
        predictions = xgb_classifier.predict(X_test)

        # Calculate the accuracy on the testing set
        accuracy = accuracy_score(y_test, predictions)

        # Store the accuracy score for this fold
        fold_accuracies.append(accuracy)

        # Print the accuracy for this fold
        #print(f"Accuracy for dataset {i+1}, fold {fold+1} = {accuracy:.4%}")

    # Calculate the mean accuracy for all folds
    mean_fold_accuracy = np.mean(fold_accuracies)

    # Add the mean fold accuracy to the total mean accuracy
    mean_accuracy += mean_fold_accuracy

    # Print the mean accuracy for this dataset
    print(f"Average accuracy for dataset {i+1} = {mean_fold_accuracy:.4%}")

# Calculate the mean accuracy for all datasets
mean_accuracy /= len(datasets)

# Print the mean accuracy for all datasets
print(f"Mean accuracy = {mean_accuracy:.4%}")


Average accuracy for dataset 1 = 79.4230%
Average accuracy for dataset 2 = 65.0123%
Average accuracy for dataset 3 = 82.4566%
Average accuracy for dataset 4 = 89.8124%
Average accuracy for dataset 5 = 87.4090%
Average accuracy for dataset 6 = 88.9069%
Average accuracy for dataset 7 = 63.4906%
Average accuracy for dataset 8 = 87.3065%
Average accuracy for dataset 9 = 63.4906%
Mean accuracy = 78.5898%
