In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")


# Load the four datasets
dataset_names = ['C:/Users/awzma/Testosterone Deficiency/Datasets/dataset.xlsx', 
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + ROS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + RUS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/ROS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx']

datasets = [pd.read_excel(name) for name in dataset_names]

# Train and test the KNN classifier on each dataset
for i, dataset in enumerate(datasets):
    # Split the dataset into features and target variable
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create the KNN classifier with 5 neighbors
    knn = KNeighborsClassifier(n_neighbors=5)

    # Train the KNN classifier using the training set
    knn.fit(X_train, y_train)

    # Make predictions on the testing set
    predictions = knn.predict(X_test)

    # Calculate the accuracy on the testing set
    accuracy = accuracy_score(y_test, predictions)

    # Print the accuracy for this dataset
    print(f"Accuracy for dataset {i+1} = {accuracy:.4%}")


Accuracy for dataset 1 = 79.2647%
Accuracy for dataset 2 = 89.8785%
Accuracy for dataset 3 = 63.8783%
Accuracy for dataset 4 = 84.6626%
Accuracy for dataset 5 = 63.8783%
Accuracy for dataset 6 = 64.4860%
Accuracy for dataset 7 = 83.3766%
Accuracy for dataset 8 = 73.4554%
Accuracy for dataset 9 = 76.6590%


In [4]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")


# Load the four datasets
dataset_names = ['C:/Users/awzma/Testosterone Deficiency/Datasets/dataset.xlsx', 
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/ROS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + SMOTE balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + ROS balanced_dataset.xlsx',
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RENN + RUS balanced_dataset.xlsx',  
                 'C:/Users/awzma/Testosterone Deficiency/Datasets/RUS + SMOTE balanced_dataset.xlsx']

datasets = [pd.read_excel(name) for name in dataset_names]

# Define an empty list to store the accuracies for each dataset
datasets_accuracies = []

# Train and test the KNN classifier on each dataset using stratified 10-fold cross validation
for i, dataset in enumerate(datasets):
    # Split the dataset into features and target variable
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Define the stratified 10-fold cross validator
    skf = StratifiedKFold(n_splits=10)

    # Define an empty list to store the accuracies for this dataset
    accuracies = []

    # Train and test the KNN classifier using the cross validator
    for train_index, test_index in skf.split(X, y):
        # Split the dataset into training and testing sets
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Create the KNN classifier with 5 neighbors
        knn = KNeighborsClassifier(n_neighbors=5)

        # Train the KNN classifier using the training set
        knn.fit(X_train, y_train)

        # Make predictions on the testing set
        predictions = knn.predict(X_test)

        # Calculate the accuracy on the testing set
        accuracy = accuracy_score(y_test, predictions)

        # Append the accuracy to the list of accuracies for this dataset
        accuracies.append(accuracy)

    # Append the list of accuracies for this dataset to the list of all accuracies
    datasets_accuracies.append(accuracies)

    # Calculate the average accuracy for this dataset
    avg_accuracy = np.mean(accuracies)

    # Print the average accuracy for this dataset
    print(f"Average accuracy for dataset {i+1} = {avg_accuracy:.4%}")

# Calculate the mean accuracy across all datasets
all_accuracies = [np.mean(accuracies) for accuracies in datasets_accuracies]
mean_accuracy = np.mean(all_accuracies)
print(f"Mean accuracy = {mean_accuracy:.4%}")


Average accuracy for dataset 1 = 79.4521%
Average accuracy for dataset 2 = 65.8499%
Average accuracy for dataset 3 = 83.8102%
Average accuracy for dataset 4 = 77.8164%
Average accuracy for dataset 5 = 79.9450%
Average accuracy for dataset 6 = 92.6721%
Average accuracy for dataset 7 = 62.6550%
Average accuracy for dataset 8 = 86.5763%
Average accuracy for dataset 9 = 62.6550%
Mean accuracy = 76.8258%
