In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, make_scorer
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_excel(r'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx')

# Split the dataset into features and target variable
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Resample the data to address class imbalance using SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)

# Create the stratified 10-fold cross-validator
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define the hyperparameter grid for the XGBoost model
param_grid = {'max_depth': [3, 4, 5],
              'learning_rate': [0.05, 0.1, 0.15],
              'n_estimators': [100, 200, 300],
              'min_child_weight': [1, 3, 5],
              'gamma': [0, 0.1, 0.2]}

# Create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Define the scoring metric for the randomized search
scoring_metric = make_scorer(accuracy_score)

# Create the randomized search object
random_search = RandomizedSearchCV(xgb_classifier, param_grid, n_iter=50, cv=skf, scoring=scoring_metric,
                                   random_state=42, verbose=1, n_jobs=-1)

# Fit the randomized search object to the resampled data
random_search.fit(X_resampled, y_resampled)

# Print the best hyperparameters and the associated mean cross-validation accuracy score
print(f"Best hyperparameters: {random_search.best_params_}")
print(f"Best mean cross-validation score: {random_search.best_score_:.4%}")

# Use the best model to make predictions on the test set
best_model = random_search.best_estimator_
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test set accuracy: {accuracy:.4%}")



Fitting 10 folds for each of 50 candidates, totalling 500 fits
Best hyperparameters: {'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.2}
Best mean cross-validation score: 87.0886%


NameError: name 'X_test' is not defined

In [4]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, make_scorer
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_excel(r'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx')

# Split the dataset into features and target variable
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Resample the data to address class imbalance using SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)

# Split the resampled dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Create the stratified 10-fold cross-validator
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define the hyperparameter grid for the XGBoost model
param_grid = {'max_depth': [3, 4, 5],
              'learning_rate': [0.05, 0.1, 0.15],
              'n_estimators': [100, 200, 300],
              'min_child_weight': [1, 3, 5],
              'gamma': [0, 0.1, 0.2]}

# Create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Define the scoring metric for the randomized search
scoring_metric = make_scorer(accuracy_score)

# Create the randomized search object
random_search = RandomizedSearchCV(xgb_classifier, param_grid, n_iter=50, cv=skf, scoring=scoring_metric,
                                   random_state=42, verbose=1, n_jobs=-1)

# Fit the randomized search object to the training data
random_search.fit(X_train, y_train)

# Print the best hyperparameters and the associated mean cross-validation accuracy score
print(f"Best hyperparameters: {random_search.best_params_}")
print(f"Best mean cross-validation score: {random_search.best_score_:.2%}")

# Use the best model to make predictions on the test set
best_model = random_search.best_estimator_
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test set accuracy: {accuracy:.2%}")


Fitting 10 folds for each of 50 candidates, totalling 500 fits
Best hyperparameters: {'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.2}
Best mean cross-validation score: 85.92%
Test set accuracy: 89.02%


In [5]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, make_scorer
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_excel(r'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx')

# Split the dataset into features and target variable
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Resample the data to address class imbalance using SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)

# Create the stratified 10-fold cross-validator
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define the hyperparameter grid for the XGBoost model
param_grid = {'max_depth': [3, 4, 5],
              'learning_rate': [0.05, 0.1, 0.15],
              'n_estimators': [100, 200, 300],
              'min_child_weight': [1, 3, 5],
              'gamma': [0, 0.1, 0.2]}

# Create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Define the scoring metric for the randomized search
scoring_metric = make_scorer(accuracy_score)

# Create the randomized search object
random_search = RandomizedSearchCV(xgb_classifier, param_grid, n_iter=50, cv=skf, scoring=scoring_metric,
                                   random_state=42, verbose=1, n_jobs=-1)

# Fit the randomized search object to the resampled data
random_search.fit(X_resampled, y_resampled)

# Print the best hyperparameters and the associated mean cross-validation accuracy score
print(f"Best hyperparameters: {random_search.best_params_}")
print(f"Best mean cross-validation score: {random_search.best_score_:.4%}")

# Use the best model to make predictions on the test set
best_model = random_search.best_estimator_
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test set accuracy: {accuracy:.4%}")



Fitting 10 folds for each of 50 candidates, totalling 500 fits
Best hyperparameters: {'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.2}
Best mean cross-validation score: 87.0886%
Test set accuracy: 96.3387%


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, make_scorer
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_excel(r'C:/Users/awzma/Testosterone Deficiency/Datasets/SMOTE resampled_dataset.xlsx')

# Split the dataset into features and target variable
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Resample the data to address class imbalance using SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)

# Create the stratified 10-fold cross-validator
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define the hyperparameter grid for the XGBoost model
param_grid = {'max_depth': [3, 4, 5],
              'learning_rate': [0.05, 0.1, 0.15],
              'n_estimators': [100, 200, 300],
              'min_child_weight': [1, 3, 5],
              'gamma': [0, 0.1, 0.2]}

# Create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Define the scoring metric for the randomized search
scoring_metric = make_scorer(accuracy_score)

# Create the randomized search object
random_search = RandomizedSearchCV(xgb_classifier, param_grid, n_iter=50, cv=skf, scoring=scoring_metric,
                                   random_state=42, verbose=1, n_jobs=-1)

# Fit the randomized search object to the resampled data
random_search.fit(X_resampled, y_resampled)

print(f"Best hyperparameters: {random_search.best_params_}")

# Use the best model to make predictions on the original dataset
best_model = random_search.best_estimator_
accuracy_scores = best_model.predict_proba(X)[:,1]

# Convert the accuracy scores to percentages and round to two decimal places
accuracy_percents = np.round(accuracy_scores * 100, 2)

# Add a new column 'accuracy' to the original dataset
df['Level'] = accuracy_percents

# Add a new column 'abnormality_level' based on the accuracy score
df['Abnormality_Level'] = pd.cut(df['Level'], bins=[0, 40, 70, 100], labels=['Low', 'Medium', 'High'])

# Filter the dataset to include only rows with target=1
df_filtered = df.loc[df['T'] == 1, df.columns != 'T']

# Save the filtered dataset to a new Excel file
df_filtered.to_excel(r'C:/Users/awzma/Testosterone Deficiency/Proposed/FINAL_OUTPUT_DATASET4.xlsx', index=False)


Fitting 10 folds for each of 50 candidates, totalling 500 fits
