In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import RandomOverSampler 
from imblearn.combine import SMOTETomek, SMOTEENN
import joblib

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')
print(dataset.shape)
print(len(dataset[dataset['MetabolicSyndrome'] == 0]))
print(dataset.head())

(2401, 13)
1579
   Age     Sex  Income   Race  WaistCirc   BMI  Albuminuria  UrAlbCr  \
0   22    Male  8200.0  White       81.0  23.3            0     3.88   
1   44  Female  4500.0  White       80.1  23.2            0     8.55   
2   21    Male   800.0  Asian       69.6  20.1            0     5.07   
3   43  Female  2000.0  Black      120.4  33.3            0     5.22   
4   51    Male     NaN  Asian       81.1  20.1            0     8.13   

   UricAcid  BloodGlucose  HDL  Triglycerides  MetabolicSyndrome  
0       4.9            92   41             84                  0  
1       4.5            82   28             56                  0  
2       5.4           107   43             78                  0  
3       5.0           104   73            141                  0  
4       5.0            95   43            126                  0  


In [2]:
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

# This is the incorrect implementation
'''
dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)
'''
# Fill NaN values in column with index 2
dataset.iloc[:, 2] = dataset.iloc[:, 2].fillna(dataset.iloc[:, 2].mean())

# Fill NaN values in column with index 4
dataset.iloc[:, 4] = dataset.iloc[:, 4].fillna(dataset.iloc[:, 4].mean())

# Fill NaN values in column with index 5
dataset.iloc[:, 5] = dataset.iloc[:, 5].fillna(dataset.iloc[:, 5].mean())
print(dataset.head())

   Age  Sex      Income  Race  WaistCirc   BMI  Albuminuria  UrAlbCr  \
0   22    0  8200.00000     0       81.0  23.3            0     3.88   
1   44    1  4500.00000     0       80.1  23.2            0     8.55   
2   21    0   800.00000     1       69.6  20.1            0     5.07   
3   43    1  2000.00000     2      120.4  33.3            0     5.22   
4   51    0  4005.25394     1       81.1  20.1            0     8.13   

   UricAcid  BloodGlucose  HDL  Triglycerides  MetabolicSyndrome  
0       4.9            92   41             84                  0  
1       4.5            82   28             56                  0  
2       5.4           107   43             78                  0  
3       5.0           104   73            141                  0  
4       5.0            95   43            126                  0  


  dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
  dataset['Race'] = dataset['Race'].replace(race_mapping)


In [3]:
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]


test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
#print(test_1)
test_data = pd.concat([test_0, test_1])

# Remove the test set rows from the original dataset to create the training set
train_data = dataset.drop(test_data.index)


x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values
from imblearn.over_sampling import SMOTE, ADASYN
from ctgan import CTGAN
smote = SMOTE()

In [4]:
def generate_synthetic_samples(train_data, method='combined', weights=(0.5, 0.5)):
    """
    Generate synthetic samples using a weighted combination of ADASYN and CTGAN.
    """
    train_data = train_data.dropna(subset=['MetabolicSyndrome'])

    X = train_data.drop('MetabolicSyndrome', axis=1)
    y = train_data['MetabolicSyndrome']

    # Identify discrete (categorical) columns in your dataset
    discrete_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # Generate synthetic samples using ADASYN
    adasyn = ADASYN(random_state=42)
    X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
    adasyn_samples = pd.DataFrame(X_adasyn[len(X):], columns=X.columns)

    # Generate synthetic samples using CTGAN
    ctgan = CTGAN(epochs=300)
    ctgan.fit(X, discrete_columns)
    ctgan_samples = ctgan.sample(len(adasyn_samples))
    ctgan_samples.columns = X.columns  # Ensure columns match original data

    if method == 'adasyn':
        final_synthetic = adasyn_samples
    elif method == 'ctgan':
        final_synthetic = ctgan_samples
    else:  # Combined method
        n_adasyn = int(weights[0] * len(adasyn_samples))
        n_ctgan = len(adasyn_samples) - n_adasyn

        final_synthetic = pd.concat([
            adasyn_samples.sample(n=n_adasyn, random_state=42, replace=True),
            ctgan_samples.sample(n=n_ctgan, random_state=42, replace=True)
        ])

    final_synthetic['MetabolicSyndrome'] = 1  # Assuming synthetic samples are for the minority class
    balanced_data = pd.concat([train_data, final_synthetic])

    return balanced_data


In [5]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming train_data and test_data are pandas DataFrames
# Generate synthetic data using the custom function
# 0.14289794020845348, 0.8571020597915465
balanced_train_data = generate_synthetic_samples(train_data, method='combined', weights=(0.14289794020845348, 0.8571020597915465))

# Split features and labels
X_train = balanced_train_data.drop('MetabolicSyndrome', axis=1)
y_train = balanced_train_data['MetabolicSyndrome']

X_test = test_data.drop('MetabolicSyndrome', axis=1)
y_test = test_data['MetabolicSyndrome']

# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on test data
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Display evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.8600
Precision: 0.8892
Recall: 0.8225
F1 Score: 0.8545


In [None]:
import joblib
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import beta

# Function to evaluate the model
def evaluate_model(train_data, test_data, weights):
    # Generate synthetic data with the current weight combination
    balanced_train_data = generate_synthetic_samples(train_data, method='combined', weights=weights)

    # Split features and labels
    X_train = balanced_train_data.drop('MetabolicSyndrome', axis=1)
    y_train = balanced_train_data['MetabolicSyndrome']
    X_test = test_data.drop('MetabolicSyndrome', axis=1)
    y_test = test_data['MetabolicSyndrome']

    # Create and train the XGBoost model
    xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = xgb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return accuracy, precision, recall, f1, xgb_model

# Function to find the best weight combination
def find_best_weight_combination_beta(train_data, test_data, num_samples=100, alpha=2, beta_param=2):
    best_accuracy = 0
    best_weights = (0, 0)
    best_results = {}
    best_model = None

    # Sample weights from a Beta distribution
    for _ in range(num_samples):
        # Randomly sample a weight for ADASYN and CTGAN from a Beta distribution
        weight_adasyn = beta.rvs(alpha, beta_param)
        weight_ctgan = 1 - weight_adasyn

        weights = (weight_adasyn, weight_ctgan)

        # Evaluate the model with the current weights
        accuracy, precision, recall, f1, xgb_model = evaluate_model(train_data, test_data, weights)

        # If this combination performs better, update best weights and results
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_weights = weights
            best_results = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1
            }
            best_model = xgb_model

    return best_weights, best_results, best_model

# Run the weight optimization
optimal_weights, optimal_results, best_model = find_best_weight_combination_beta(train_data, test_data)

# Save the best model
joblib.dump(best_model, 'best_xgboost_model.pkl')

# Display the best weights and corresponding evaluation metrics
print(f"\nOptimal Weight Combination (ADASYN, CTGAN): {optimal_weights}")
print(f"\nOptimal Combined Method Results:")
print(optimal_results)
