In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import RandomOverSampler 
from imblearn.combine import SMOTETomek, SMOTEENN
import joblib
from ctgan import CTGAN

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')
print(dataset.shape)
print(len(dataset[dataset['MetabolicSyndrome'] == 0]))
print(dataset.head())

In [None]:
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

# This is the incorrect implementation
'''
dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)
'''
# Fill NaN values in column with index 2
dataset.iloc[:, 2] = dataset.iloc[:, 2].fillna(dataset.iloc[:, 2].mean())

# Fill NaN values in column with index 4
dataset.iloc[:, 4] = dataset.iloc[:, 4].fillna(dataset.iloc[:, 4].mean())

# Fill NaN values in column with index 5
dataset.iloc[:, 5] = dataset.iloc[:, 5].fillna(dataset.iloc[:, 5].mean())
print(dataset.head())

In [3]:
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]


test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
#print(test_1)
test_data = pd.concat([test_0, test_1])

# Remove the test set rows from the original dataset to create the training set
train_data = dataset.drop(test_data.index)


x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values
from imblearn.over_sampling import SMOTE, ADASYN
from ctgan import CTGAN
smote = SMOTE()

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN
from ctgan import CTGAN
import numpy as np
from sklearn.neighbors import NearestNeighbors

def generate_synthetic_samples(train_data, method='combined', weights=(0.25, 0.30, 0.45)):
    """
    Generate synthetic samples using a weighted combination of SMOTE, ADASYN, and CTGAN
    """
    # Clean NaN values from the target variable first
    train_data = train_data.dropna(subset=['MetabolicSyndrome'])
    
    # Separate majority and minority classes
    train_0 = train_data[train_data['MetabolicSyndrome'] == 0]
    train_1 = train_data[train_data['MetabolicSyndrome'] == 1]
    
    # Determine minority and majority class
    if len(train_0) > len(train_1):
        majority_size = len(train_0)
        samples_needed = len(train_0) - len(train_1)
        minority_class = 1
    else:
        majority_size = len(train_1)
        samples_needed = len(train_1) - len(train_0)
        minority_class = 0
    
    X = train_data.drop('MetabolicSyndrome', axis=1)
    y = train_data['MetabolicSyndrome']
    
    # Generate synthetic samples using each method
    smote = SMOTE(random_state=42)
    adasyn = ADASYN(random_state=42)
    ctgan = CTGAN(epochs=300)
    
    # SMOTE samples
    X_smote, y_smote = smote.fit_resample(X, y)
    smote_samples = pd.DataFrame(X_smote[len(X):], columns=X.columns)
    
    # ADASYN samples
    X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
    adasyn_samples = pd.DataFrame(X_adasyn[len(X):], columns=X.columns)
    
    # CTGAN samples
    ctgan.fit(train_data, discrete_columns=['Sex', 'Race', 'MetabolicSyndrome'])
    synthetic_data = ctgan.sample(samples_needed)
    ctgan_samples = synthetic_data.drop('MetabolicSyndrome', axis=1)
    
    if method == 'smote':
        final_synthetic = smote_samples
    elif method == 'adasyn':
        final_synthetic = adasyn_samples
    elif method == 'ctgan':
        final_synthetic = ctgan_samples
    else:  # Combined method
        # Calculate number of samples needed from each method
        n_smote = int(samples_needed * weights[0])
        n_adasyn = int(samples_needed * weights[1])
        n_ctgan = samples_needed - n_smote - n_adasyn  # Ensure we get exact number needed
        
        # Take weighted samples from each method
        final_synthetic = pd.concat([
            smote_samples.sample(n=n_smote, random_state=42),
            adasyn_samples.sample(n=n_adasyn, random_state=42),
            ctgan_samples.sample(n=n_ctgan, random_state=42)
        ])
    
    # Add MetabolicSyndrome column to synthetic data
    final_synthetic['MetabolicSyndrome'] = minority_class
    
    # Create balanced dataset
    balanced_data = pd.concat([train_data, final_synthetic])
    
    return balanced_data

# Clean the dataset before splitting
dataset = dataset.dropna(subset=['MetabolicSyndrome'])

# Rest of your existing code for splitting the data
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]
test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
test_data = pd.concat([test_0, test_1])
train_data = dataset.drop(test_data.index)

# Generate balanced datasets using different methods
balanced_data_smote = generate_synthetic_samples(train_data, method='smote')
balanced_data_adasyn = generate_synthetic_samples(train_data, method='adasyn')
balanced_data_ctgan = generate_synthetic_samples(train_data, method='ctgan')

# Generate balanced dataset using combined method with custom weights
weights=(0.25, 0.45, 0.30)  # 40% SMOTE, 30% ADASYN, 30% CTGAN
balanced_data_combined = generate_synthetic_samples(train_data, method='combined', weights=weights)

# Function to train and evaluate model
def evaluate_model(train_data, test_data):
    X_train = train_data.drop('MetabolicSyndrome', axis=1).values
    y_train = train_data['MetabolicSyndrome'].values
    X_test = test_data.drop('MetabolicSyndrome', axis=1).values
    y_test = test_data['MetabolicSyndrome'].values
    
    model = XGBClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred)
    }

# Evaluate all methods
results_smote = evaluate_model(balanced_data_smote, test_data)
results_adasyn = evaluate_model(balanced_data_adasyn, test_data)
results_ctgan = evaluate_model(balanced_data_ctgan, test_data)
results_combined = evaluate_model(balanced_data_combined, test_data)

# Print results
print("\nSMOTE Results:")
print(results_smote)
print("\nADASYN Results:")
print(results_adasyn)
print("\nCTGAN Results:")
print(results_ctgan)
print("\nCombined Method Results:")
print(results_combined)

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from ctgan import CTGAN
import os

# Load the dataset
path = os.path.join("dataset.csv")
dataset = pd.read_csv(path)

# Drop unnecessary columns
dataset = dataset.drop(['seqn', 'Marital'], axis='columns')

# Map categorical variables to numerical values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}
dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

# Fill NaN values with the mean of the respective columns
dataset.iloc[:, 2] = dataset.iloc[:, 2].fillna(dataset.iloc[:, 2].mean())
dataset.iloc[:, 4] = dataset.iloc[:, 4].fillna(dataset.iloc[:, 4].mean())
dataset.iloc[:, 5] = dataset.iloc[:, 5].fillna(dataset.iloc[:, 5].mean())

# Clean the dataset before splitting
dataset = dataset.dropna(subset=['MetabolicSyndrome'])

# Split the data into training and test sets
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]
test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
test_data = pd.concat([test_0, test_1])
train_data = dataset.drop(test_data.index)

# Define a function to generate synthetic samples using SMOTE, CTGAN, and ADASYN
def generate_synthetic_samples(train_data, weights=(0.33, 0.33, 0.34)):
    train_data = train_data.dropna(subset=['MetabolicSyndrome'])

    X = train_data.drop('MetabolicSyndrome', axis=1)
    y = train_data['MetabolicSyndrome']

    # Identify discrete (categorical) columns in your dataset
    discrete_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # Generate synthetic samples using SMOTE
    smote = SMOTE(random_state=42)
    X_smote, y_smote = smote.fit_resample(X, y)
    smote_samples = pd.DataFrame(X_smote[len(X):], columns=X.columns)

    # Generate synthetic samples using ADASYN
    adasyn = ADASYN(random_state=42)
    X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
    adasyn_samples = pd.DataFrame(X_adasyn[len(X):], columns=X.columns)

    # Generate synthetic samples using CTGAN
    ctgan = CTGAN(epochs=100)
    ctgan.fit(X, discrete_columns)
    ctgan_samples = ctgan.sample(len(adasyn_samples))
    ctgan_samples.columns = X.columns  # Ensure columns match original data

    # Combine synthetic samples based on weights
    n_smote = int(weights[0] * len(smote_samples))
    n_ctgan = int(weights[1] * len(ctgan_samples))
    n_adasyn = max(0, len(smote_samples) - n_smote - n_ctgan)

    final_synthetic = pd.concat([
        smote_samples.sample(n=n_smote, random_state=42, replace=True),
        ctgan_samples.sample(n=n_ctgan, random_state=42, replace=True),
        adasyn_samples.sample(n=n_adasyn, random_state=42, replace=True)
    ])

    final_synthetic['MetabolicSyndrome'] = 1  # Assuming synthetic samples are for the minority class
    balanced_data = pd.concat([train_data, final_synthetic])

    return balanced_data

# Function to train and evaluate model
def evaluate_model(train_data, test_data):
    X_train = train_data.drop('MetabolicSyndrome', axis=1).values
    y_train = train_data['MetabolicSyndrome'].values
    X_test = test_data.drop('MetabolicSyndrome', axis=1).values
    y_test = test_data['MetabolicSyndrome'].values
    
    model = XGBClassifier(random_state=42, n_estimators=100, learning_rate=0.3, max_depth=3)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred)
    }

# Define a function to evaluate multiple weight combinations and store results in a CSV file
def evaluate_and_store_results(train_data, test_data):
    results_list = []

    # Iterate over lambda values for three methods
    for lambda_smote in np.arange(0, 1.05, 0.05):
        for lambda_ctgan in np.arange(0, 1.05 - lambda_smote, 0.05):
            lambda_adasyn = 1 - lambda_smote - lambda_ctgan
            weights = (lambda_smote, lambda_ctgan, lambda_adasyn)
            balanced_data_combined = generate_synthetic_samples(train_data, weights=weights)

            # Ensure sample size does not exceed population
            if len(balanced_data_combined) > len(train_data):
                balanced_data_combined = balanced_data_combined.sample(n=len(train_data), random_state=42, replace=True)

            results = evaluate_model(balanced_data_combined, test_data)
            print(results)

            # Store the results along with the lambda values and weights
            results_list.append({
                'lambda_smote': lambda_smote,
                'lambda_ctgan': lambda_ctgan,
                'lambda_adasyn': lambda_adasyn,
                'weights': weights,
                'accuracy': results['accuracy'],
                'precision': results['precision'],
                'recall': results['recall'],
                'f1': results['f1']
            })

    # Convert the results list to a DataFrame and save it as a CSV file
    results_df = pd.DataFrame(results_list)
    results_df.to_csv('smote_ctgan_adasyn_evaluation_results.csv', index=False)

# Evaluate and store the results in a CSV file
evaluate_and_store_results(train_data, test_data)

print("Evaluation results have been stored in evaluation_results.csv")

{'accuracy': 0.85625, 'precision': 0.8740157480314961, 'recall': 0.8325, 'f1': 0.852752880921895}
{'accuracy': 0.85875, 'precision': 0.8727272727272727, 'recall': 0.84, 'f1': 0.8560509554140127}
{'accuracy': 0.85875, 'precision': 0.8670076726342711, 'recall': 0.8475, 'f1': 0.8571428571428572}
{'accuracy': 0.8575, 'precision': 0.8685567010309279, 'recall': 0.8425, 'f1': 0.8553299492385786}
{'accuracy': 0.8625, 'precision': 0.8737113402061856, 'recall': 0.8475, 'f1': 0.8604060913705583}
{'accuracy': 0.8625, 'precision': 0.8756476683937824, 'recall': 0.845, 'f1': 0.8600508905852416}
{'accuracy': 0.85625, 'precision': 0.8607594936708861, 'recall': 0.85, 'f1': 0.8553459119496856}
{'accuracy': 0.845, 'precision': 0.8631578947368421, 'recall': 0.82, 'f1': 0.8410256410256411}
{'accuracy': 0.8475, 'precision': 0.8677248677248677, 'recall': 0.82, 'f1': 0.8431876606683804}
{'accuracy': 0.855, 'precision': 0.8717277486910995, 'recall': 0.8325, 'f1': 0.8516624040920717}
{'accuracy': 0.8625, 'precis

In [None]:
results_df.to_csv('smote_ctgan_adasyn_evaluation_results.csv', index=False)

In [5]:
def generate_synthetic_samples(train_data, method='combined', weights=(0.5, 0.5)):
    """
    Generate synthetic samples using a weighted combination of ADASYN and CTGAN.
    """
    train_data = train_data.dropna(subset=['MetabolicSyndrome'])

    X = train_data.drop('MetabolicSyndrome', axis=1)
    y = train_data['MetabolicSyndrome']

    # Identify discrete (categorical) columns in your dataset
    discrete_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # Generate synthetic samples using ADASYN
    adasyn = ADASYN(random_state=42)
    X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
    adasyn_samples = pd.DataFrame(X_adasyn[len(X):], columns=X.columns)

    # Generate synthetic samples using CTGAN
    ctgan = CTGAN(epochs=300)
    ctgan.fit(X, discrete_columns)
    ctgan_samples = ctgan.sample(len(adasyn_samples))
    ctgan_samples.columns = X.columns  # Ensure columns match original data

    if method == 'adasyn':
        final_synthetic = adasyn_samples
    elif method == 'ctgan':
        final_synthetic = ctgan_samples
    else:  # Combined method
        n_adasyn = int(weights[0] * len(adasyn_samples))
        n_ctgan = len(adasyn_samples) - n_adasyn

        final_synthetic = pd.concat([
            adasyn_samples.sample(n=n_adasyn, random_state=42, replace=True),
            ctgan_samples.sample(n=n_ctgan, random_state=42, replace=True)
        ])

    final_synthetic['MetabolicSyndrome'] = 1  # Assuming synthetic samples are for the minority class
    balanced_data = pd.concat([train_data, final_synthetic])

    return balanced_data
