In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import RandomOverSampler 
from imblearn.combine import SMOTETomek, SMOTEENN
import joblib

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')
print(dataset.shape)
print(len(dataset[dataset['MetabolicSyndrome'] == 0]))
print(dataset.head())

(2401, 13)
1579
   Age     Sex  Income   Race  WaistCirc   BMI  Albuminuria  UrAlbCr  \
0   22    Male  8200.0  White       81.0  23.3            0     3.88   
1   44  Female  4500.0  White       80.1  23.2            0     8.55   
2   21    Male   800.0  Asian       69.6  20.1            0     5.07   
3   43  Female  2000.0  Black      120.4  33.3            0     5.22   
4   51    Male     NaN  Asian       81.1  20.1            0     8.13   

   UricAcid  BloodGlucose  HDL  Triglycerides  MetabolicSyndrome  
0       4.9            92   41             84                  0  
1       4.5            82   28             56                  0  
2       5.4           107   43             78                  0  
3       5.0           104   73            141                  0  
4       5.0            95   43            126                  0  


In [2]:
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

# This is the incorrect implementation
'''
dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)
'''
# Fill NaN values in column with index 2
dataset.iloc[:, 2] = dataset.iloc[:, 2].fillna(dataset.iloc[:, 2].mean())

# Fill NaN values in column with index 4
dataset.iloc[:, 4] = dataset.iloc[:, 4].fillna(dataset.iloc[:, 4].mean())

# Fill NaN values in column with index 5
dataset.iloc[:, 5] = dataset.iloc[:, 5].fillna(dataset.iloc[:, 5].mean())
print(dataset.head())

   Age  Sex      Income  Race  WaistCirc   BMI  Albuminuria  UrAlbCr  \
0   22    0  8200.00000     0       81.0  23.3            0     3.88   
1   44    1  4500.00000     0       80.1  23.2            0     8.55   
2   21    0   800.00000     1       69.6  20.1            0     5.07   
3   43    1  2000.00000     2      120.4  33.3            0     5.22   
4   51    0  4005.25394     1       81.1  20.1            0     8.13   

   UricAcid  BloodGlucose  HDL  Triglycerides  MetabolicSyndrome  
0       4.9            92   41             84                  0  
1       4.5            82   28             56                  0  
2       5.4           107   43             78                  0  
3       5.0           104   73            141                  0  
4       5.0            95   43            126                  0  


  dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
  dataset['Race'] = dataset['Race'].replace(race_mapping)


In [3]:
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]


test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
#print(test_1)
test_data = pd.concat([test_0, test_1])

# Remove the test set rows from the original dataset to create the training set
train_data = dataset.drop(test_data.index)


x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values
from imblearn.over_sampling import SMOTE, ADASYN
from ctgan import CTGAN
smote = SMOTE()

In [4]:
from imblearn.over_sampling import SMOTE, ADASYN
from ctgan import CTGAN
import numpy as np
from sklearn.neighbors import NearestNeighbors

def generate_synthetic_samples(train_data, method='combined', weights=(0.25, 0.30, 0.45)):
    """
    Generate synthetic samples using a weighted combination of SMOTE, ADASYN, and CTGAN
    """
    # Clean NaN values from the target variable first
    train_data = train_data.dropna(subset=['MetabolicSyndrome'])
    
    # Separate majority and minority classes
    train_0 = train_data[train_data['MetabolicSyndrome'] == 0]
    train_1 = train_data[train_data['MetabolicSyndrome'] == 1]
    
    # Determine minority and majority class
    if len(train_0) > len(train_1):
        majority_size = len(train_0)
        samples_needed = len(train_0) - len(train_1)
        minority_class = 1
    else:
        majority_size = len(train_1)
        samples_needed = len(train_1) - len(train_0)
        minority_class = 0
    
    X = train_data.drop('MetabolicSyndrome', axis=1)
    y = train_data['MetabolicSyndrome']
    
    # Generate synthetic samples using each method
    smote = SMOTE(random_state=42)
    adasyn = ADASYN(random_state=42)
    ctgan = CTGAN(epochs=300)
    
    # SMOTE samples
    X_smote, y_smote = smote.fit_resample(X, y)
    smote_samples = pd.DataFrame(X_smote[len(X):], columns=X.columns)
    
    # ADASYN samples
    X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
    adasyn_samples = pd.DataFrame(X_adasyn[len(X):], columns=X.columns)
    
    # CTGAN samples
    ctgan.fit(train_data, discrete_columns=['Sex', 'Race', 'MetabolicSyndrome'])
    synthetic_data = ctgan.sample(samples_needed)
    ctgan_samples = synthetic_data.drop('MetabolicSyndrome', axis=1)
    
    if method == 'smote':
        final_synthetic = smote_samples
    elif method == 'adasyn':
        final_synthetic = adasyn_samples
    elif method == 'ctgan':
        final_synthetic = ctgan_samples
    else:  # Combined method
        # Calculate number of samples needed from each method
        n_smote = int(samples_needed * weights[0])
        n_adasyn = int(samples_needed * weights[1])
        n_ctgan = samples_needed - n_smote - n_adasyn  # Ensure we get exact number needed
        
        # Take weighted samples from each method
        final_synthetic = pd.concat([
            smote_samples.sample(n=n_smote, random_state=42),
            adasyn_samples.sample(n=n_adasyn, random_state=42),
            ctgan_samples.sample(n=n_ctgan, random_state=42)
        ])
    
    # Add MetabolicSyndrome column to synthetic data
    final_synthetic['MetabolicSyndrome'] = minority_class
    
    # Create balanced dataset
    balanced_data = pd.concat([train_data, final_synthetic])
    
    return balanced_data

# Clean the dataset before splitting
dataset = dataset.dropna(subset=['MetabolicSyndrome'])

# Rest of your existing code for splitting the data
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]
test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
test_data = pd.concat([test_0, test_1])
train_data = dataset.drop(test_data.index)

# Generate balanced datasets using different methods
balanced_data_smote = generate_synthetic_samples(train_data, method='smote')
balanced_data_adasyn = generate_synthetic_samples(train_data, method='adasyn')
balanced_data_ctgan = generate_synthetic_samples(train_data, method='ctgan')

# Generate balanced dataset using combined method with custom weights
weights=(0.25, 0.45, 0.30)  # 40% SMOTE, 30% ADASYN, 30% CTGAN
balanced_data_combined = generate_synthetic_samples(train_data, method='combined', weights=weights)

# Function to train and evaluate model
def evaluate_model(train_data, test_data):
    X_train = train_data.drop('MetabolicSyndrome', axis=1).values
    y_train = train_data['MetabolicSyndrome'].values
    X_test = test_data.drop('MetabolicSyndrome', axis=1).values
    y_test = test_data['MetabolicSyndrome'].values
    
    model = XGBClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred)
    }

# Evaluate all methods
results_smote = evaluate_model(balanced_data_smote, test_data)
results_adasyn = evaluate_model(balanced_data_adasyn, test_data)
results_ctgan = evaluate_model(balanced_data_ctgan, test_data)
results_combined = evaluate_model(balanced_data_combined, test_data)

# Print results
print("\nSMOTE Results:")
print(results_smote)
print("\nADASYN Results:")
print(results_adasyn)
print("\nCTGAN Results:")
print(results_ctgan)
print("\nCombined Method Results:")
print(results_combined)


SMOTE Results:
{'accuracy': 0.85, 'precision': 0.8804347826086957, 'recall': 0.81, 'f1': 0.84375}

ADASYN Results:
{'accuracy': 0.84375, 'precision': 0.8590078328981723, 'recall': 0.8225, 'f1': 0.8403575989782885}

CTGAN Results:
{'accuracy': 0.85625, 'precision': 0.9014084507042254, 'recall': 0.8, 'f1': 0.8476821192052981}

Combined Method Results:
{'accuracy': 0.84625, 'precision': 0.8733153638814016, 'recall': 0.81, 'f1': 0.8404669260700389}


In [5]:
def generate_synthetic_samples(train_data, method='combined', weights=(0.5, 0.5)):
    """
    Generate synthetic samples using a weighted combination of ADASYN and CTGAN.
    """
    train_data = train_data.dropna(subset=['MetabolicSyndrome'])

    X = train_data.drop('MetabolicSyndrome', axis=1)
    y = train_data['MetabolicSyndrome']

    # Identify discrete (categorical) columns in your dataset
    discrete_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # Generate synthetic samples using ADASYN
    adasyn = ADASYN(random_state=42)
    X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
    adasyn_samples = pd.DataFrame(X_adasyn[len(X):], columns=X.columns)

    # Generate synthetic samples using CTGAN
    ctgan = CTGAN(epochs=300)
    ctgan.fit(X, discrete_columns)
    ctgan_samples = ctgan.sample(len(adasyn_samples))
    ctgan_samples.columns = X.columns  # Ensure columns match original data

    if method == 'adasyn':
        final_synthetic = adasyn_samples
    elif method == 'ctgan':
        final_synthetic = ctgan_samples
    else:  # Combined method
        n_adasyn = int(weights[0] * len(adasyn_samples))
        n_ctgan = len(adasyn_samples) - n_adasyn

        final_synthetic = pd.concat([
            adasyn_samples.sample(n=n_adasyn, random_state=42, replace=True),
            ctgan_samples.sample(n=n_ctgan, random_state=42, replace=True)
        ])

    final_synthetic['MetabolicSyndrome'] = 1  # Assuming synthetic samples are for the minority class
    balanced_data = pd.concat([train_data, final_synthetic])

    return balanced_data
