In [3]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# load the imbalanced dataset
data = pd.read_excel(r'C:\Users\awzma\Testosterone Deficiency\dataset.xlsx')

# separate the features and target variable
X = data.drop('T', axis=1)
y = data['T']

# apply Random Under Sampling to balance the dataset
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

# apply Synthetic Minority Over Sampling (SMOTE) to balance the dataset further
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_rus, y_rus)

# concatenate the balanced features and target variable into a new dataframe
df_balanced = pd.concat([pd.DataFrame(X_smote), pd.DataFrame(y_smote)], axis=1)

# save the balanced dataset to a CSV file on the local device
df_balanced.to_excel('RUS + SMOTE balanced_dataset.xlsx', index=False)

# Print the number of rows and affected/unaffected rows in the balanced dataset
print('Total rows in balanced dataset:', len(df_balanced))
print('Affected rows in balanced dataset:', sum(df_balanced['T'] == 1))
print('Unaffected rows in balanced dataset:', sum(df_balanced['T'] == 0))


Total rows in balanced dataset: 1312
Affected rows in balanced dataset: 656
Unaffected rows in balanced dataset: 656
