In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler 
from imblearn.combine import SMOTETomek, SMOTEENN

import os

# Location of the input CSV, resolved relative to the current working directory
path = os.path.join("dataset.csv")

# Read the table and discard the two columns that are not used further down
dataset = pd.read_csv(path).drop(columns=['seqn', 'Marital'])

# Report the table dimensions, then the number of rows whose label is 0
print(dataset.shape)
label_zero_rows = dataset.loc[dataset['MetabolicSyndrome'] == 0]
print(len(label_zero_rows))

In [None]:
# Encode the categorical columns as integer codes
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

# Impute missing values column by column with that column's mean.
# Chaining dataset.fillna(2), dataset.fillna(4), dataset.fillna(5) would be wrong:
# the first call fills every NaN in the frame with the constant 2 and the later
# calls find nothing left to fill.
# NOTE(review): the means are computed before the train/test split, so held-out
# rows contribute to the imputation values - consider deriving them from the
# training rows only.
for column_position in (2, 4, 5):
    column_values = dataset.iloc[:, column_position]
    dataset.iloc[:, column_position] = column_values.fillna(column_values.mean())

# Split the rows by label so the test set can be drawn with equal class counts
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]

# Hold out the same number of rows from each class (fixed seed for reproducibility)
test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
test_data = pd.concat([test_0, test_1])

# Every row that was not sampled into the test set becomes training data
train_data = dataset.drop(test_data.index)

x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values

# Random over-sampling of the training split to counter class imbalance
ros = RandomOverSampler(random_state=0)
x_resampled, y_resampled = ros.fit_resample(x_train, y_train)
print("x_train dtypes:", x_train.dtype)
print("y_train dtype:", y_train.dtype)

# XGBoost classifier trained on the over-sampled data.
# (A previous n_estimators=100 / max_depth=3 configuration was constructed and
# immediately overwritten; only the configuration that is actually fitted is kept.)
classifier_xgboost = XGBClassifier(n_estimators=412, max_depth=8, learning_rate=0.5)
classifier_xgboost.fit(x_resampled, y_resampled)
y_pred = classifier_xgboost.predict(x_test)

# Confusion matrix is kept for optional reporting further down the cell
cm = confusion_matrix(y_test, y_pred)
print("Accuracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
from imblearn.over_sampling import SMOTE, ADASYN
from ctgan import CTGAN


# Function to evaluate model
def evaluate_model(x_train, y_train, x_test, y_test, model):
    """Fit ``model`` on the training data and return its accuracy on the test data.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to ``model.fit``.
    x_test, y_test : evaluation features and the labels they are scored against.
    model : any object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return accuracy_score(y_test, predictions)

# Compare several re-sampling strategies. Each one re-samples the training split,
# fits a fresh XGBoost model and is scored on the same held-out test set.

# SMOTE
# NOTE(review): this run uses learning_rate=0.3 while the others use 0.5 - confirm
# that the difference is intentional before comparing the numbers.
smote = SMOTE(random_state=0)
x_smote, y_smote = smote.fit_resample(x_train, y_train)
smote_accuracy = evaluate_model(x_smote, y_smote, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3))

# SMOTE followed by Tomek-link cleaning
smote_t = SMOTETomek(random_state=0)
x_smote_t, y_smote_t = smote_t.fit_resample(x_train, y_train)
smote_t_accuracy = evaluate_model(x_smote_t, y_smote_t, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5))

# SMOTE followed by Edited-Nearest-Neighbours cleaning
smote_tn = SMOTEENN(random_state=0)
x_smote_tn, y_smote_tn = smote_tn.fit_resample(x_train, y_train)
smote_tn_accuracy = evaluate_model(x_smote_tn, y_smote_tn, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5))

# ADASYN
adasyn = ADASYN(random_state=0)
x_adasyn, y_adasyn = adasyn.fit_resample(x_train, y_train)
adasyn_accuracy = evaluate_model(x_adasyn, y_adasyn, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5))

# CTGAN
# CTGAN.fit(train_data, discrete_columns) has no separate target argument: passing
# y_train as the second argument made its 0/1 values be read as discrete column
# indices, and re-using y_train as labels for freshly sampled rows paired each
# synthetic row with an unrelated label. Instead, model features and label jointly
# and split the sampled table afterwards.
ctgan_feature_names = train_data.drop('MetabolicSyndrome', axis=1).columns
ctgan_table = np.column_stack([x_train, y_train])
ctgan_label_column = ctgan_table.shape[1] - 1
# Integer-coded categoricals and the label must be declared discrete so CTGAN
# samples valid category values rather than interpolated numbers.
ctgan_discrete_columns = [int(ctgan_feature_names.get_loc(name)) for name in ('Sex', 'Race')]
ctgan_discrete_columns.append(ctgan_label_column)

ctgan = CTGAN(epochs=10)
ctgan.fit(ctgan_table, discrete_columns=ctgan_discrete_columns)
ctgan_samples = ctgan.sample(len(x_train))
x_ctgan = ctgan_samples[:, :-1]
y_ctgan = ctgan_samples[:, -1].astype(int)
ctgan_accuracy = evaluate_model(x_ctgan, y_ctgan, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5))

# Print the accuracies
print(f'SMOTE Accuracy: {smote_accuracy * 100:.2f}%')
print(f'SMOTETomek Accuracy: {smote_t_accuracy * 100:.2f}%')
print(f'SMOTEENN Accuracy: {smote_tn_accuracy * 100:.2f}%')
print(f'ADASYN Accuracy: {adasyn_accuracy * 100:.2f}%')
print(f'CTGAN Accuracy: {ctgan_accuracy * 100:.2f}%')

# Optional extra metrics for the RandomOverSampler model fitted earlier in this
# cell (disabled). These are real comments rather than a string literal so the
# cell does not echo the block as its output.
# print("Confusion matrix: ")
# print(cm, "\n")
# print("Precision Score: ")
# print("\t", precision_score(y_test, y_pred), "\n")
# print("Recall: ")
# print("\t", recall_score(y_test, y_pred), "\n")
# print("F1 Score: ")
# print("\t", f1_score(y_test, y_pred), "\n")

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

def augment_dataset(x_train, y_train, target_size_multiplier=5, random_state=42):
    """Grow and balance a binary (0/1) dataset with ADASYN followed by SMOTE.

    Phase 1 runs ADASYN with its default sampling strategy. Phase 2 runs SMOTE
    with an explicit per-class target so that both classes end up with the same
    number of rows.

    Parameters
    ----------
    x_train : feature matrix handed to the samplers.
    y_train : labels; rows equal to 1 are reported as "Positive", rows equal to 0
        as "Negative".
    target_size_multiplier : the requested per-class size is the larger original
        class count times this factor (rounded to an integer). Must be positive.
    random_state : seed passed to both samplers (default 42).

    Returns
    -------
    (X_final, y_final) as returned by the SMOTE phase.

    Raises
    ------
    ValueError
        If ``target_size_multiplier`` is not positive.
    """
    if target_size_multiplier <= 0:
        raise ValueError("target_size_multiplier must be positive")

    print("Starting data augmentation pipeline...")

    # Get initial class distribution
    labels = np.asarray(y_train)
    initial_size = len(labels)
    pos_samples = int(np.sum(labels == 1))
    neg_samples = int(np.sum(labels == 0))
    # Samplers need whole-number targets, so a fractional multiplier is rounded
    requested_per_class = int(round(max(pos_samples, neg_samples) * target_size_multiplier))

    print(f"Initial dataset size: {initial_size}")
    print(f"Initial class distribution - Positive: {pos_samples}, Negative: {neg_samples}")

    # Phase 1: ADASYN over-samples with its default strategy
    print("\nPhase 1: ADASYN augmentation")
    adasyn = ADASYN(random_state=random_state)
    X_adasyn, y_adasyn = adasyn.fit_resample(x_train, y_train)

    # Phase 2: SMOTE grows both classes to a common target.
    # An over-sampler cannot shrink a class, so the target is never allowed to
    # fall below what either class already holds after phase 1 (this matters for
    # small multipliers, where ADASYN may overshoot the requested size).
    print("\nPhase 2: SMOTE augmentation")
    adasyn_labels = np.asarray(y_adasyn)
    target_samples = max(
        requested_per_class,
        int(np.sum(adasyn_labels == 0)),
        int(np.sum(adasyn_labels == 1)),
    )
    sampling_strategy = {
        0: target_samples,
        1: target_samples
    }

    smote = SMOTE(sampling_strategy=sampling_strategy, random_state=random_state)
    X_final, y_final = smote.fit_resample(X_adasyn, y_adasyn)

    # Print final statistics
    final_pos = np.sum(y_final == 1)
    final_neg = np.sum(y_final == 0)
    print("\nAugmentation complete!")
    print(f"Final dataset size: {len(y_final)}")
    print(f"Final class distribution - Positive: {final_pos}, Negative: {final_neg}")

    return X_final, y_final

def evaluate_augmentation_methods(x_train, y_train, x_test, y_test):
    """Train one XGBoost model per augmentation strategy and score each on the test set.

    The strategies, in order, are: no augmentation ('baseline'), SMOTE ('smote'),
    ADASYN ('adasyn') and the two-phase pipeline from ``augment_dataset``
    ('combined'). All models share the same hyper-parameters.

    Returns
    -------
    (models, results) : two dicts keyed by strategy name, holding the fitted
        classifier and its accuracy on ``x_test`` / ``y_test`` respectively.
    """
    # Hyper-parameters shared by every classifier in the comparison
    xgb_settings = dict(n_estimators=100, max_depth=3, learning_rate=0.5)

    models, results = {}, {}

    def _fit_and_score(label, features, targets):
        """Register a fresh classifier under ``label``, fit it and store its accuracy."""
        models[label] = XGBClassifier(**xgb_settings)
        models[label].fit(features, targets)
        results[label] = accuracy_score(y_test, models[label].predict(x_test))

    # Reference point: the untouched training data
    _fit_and_score('baseline', x_train, y_train)

    print("\nEvaluating SMOTE...")
    X_smote, y_smote = SMOTE(random_state=42).fit_resample(x_train, y_train)
    _fit_and_score('smote', X_smote, y_smote)

    print("\nEvaluating ADASYN...")
    X_adasyn, y_adasyn = ADASYN(random_state=42).fit_resample(x_train, y_train)
    _fit_and_score('adasyn', X_adasyn, y_adasyn)

    print("\nEvaluating combined SMOTE-ADASYN approach...")
    X_combined, y_combined = augment_dataset(x_train, y_train)
    _fit_and_score('combined', X_combined, y_combined)

    # Summary, one line per strategy in evaluation order
    print("\nResults:")
    for method in results:
        print(f"{method.capitalize()} Accuracy: {results[method] * 100:.2f}%")

    return models, results

# Run the comparison across all augmentation strategies
print("Starting evaluation of different augmentation methods...")
models, results = evaluate_augmentation_methods(x_train, y_train, x_test, y_test)

# Select the strategy with the highest accuracy (the earliest one wins a tie)
# and re-use its already fitted model for the final predictions.
# NOTE(review): the winner is chosen on the same test set it is then reported on.
best_method = max(results, key=results.get)
best_model = models[best_method]
final_predictions = best_model.predict(x_test)

print(f"\nBest performing method: {best_method}")
print(f"Best accuracy: {results[best_method] * 100:.2f}%")

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier

def merge_synthetic_samples(synthetic_samples_list, weights=None):
    """Blend several sets of synthetic samples by weighted nearest-neighbour averaging.

    The first set is the reference. For every reference row, the closest row
    (one nearest neighbour) is looked up in each other set, and the output row is
    the weighted sum of the reference row and those neighbours.

    Parameters
    ----------
    synthetic_samples_list : sequence of 2-D array-likes, one per generation
        method, all with the same number of columns. The first entry determines
        the number of output rows.
    weights : one weight per entry of ``synthetic_samples_list`` (default: equal
        weights). The weights are applied as given; they are not normalised.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of the first sample set.

    Raises
    ------
    ValueError
        If the list is empty, any sample set is empty or not 2-D, the sets
        disagree on the number of features, or ``weights`` has the wrong length.
    """
    if len(synthetic_samples_list) == 0:
        raise ValueError("synthetic_samples_list must contain at least one sample set")

    # Float copies: writing weighted averages into an integer-typed buffer would
    # silently truncate them.
    sample_sets = [np.asarray(samples, dtype=float) for samples in synthetic_samples_list]

    if weights is None:
        weights = [1 / len(sample_sets)] * len(sample_sets)
    elif len(weights) != len(sample_sets):
        raise ValueError(
            f"Expected {len(sample_sets)} weights, got {len(weights)}"
        )

    # Ensure we have valid, compatible samples
    for i, samples in enumerate(sample_sets):
        if len(samples) == 0:
            raise ValueError(f"Synthetic samples list at index {i} is empty")
        if samples.ndim != 2:
            raise ValueError(f"Synthetic samples list at index {i} is not 2-dimensional")

    n_features = sample_sets[0].shape[1]
    for i, samples in enumerate(sample_sets):
        if samples.shape[1] != n_features:
            raise ValueError(
                f"Synthetic samples list at index {i} has {samples.shape[1]} features, "
                f"expected {n_features}"
            )

    # The first set is the reference and contributes its own rows
    base_samples = sample_sets[0]
    merged_samples = weights[0] * base_samples

    # Every other set contributes, per reference row, its single nearest row.
    # One batched query per set replaces a separate query for every row.
    for samples, weight in zip(sample_sets[1:], weights[1:]):
        nn = NearestNeighbors(n_neighbors=1)
        nn.fit(samples)
        _, indices = nn.kneighbors(base_samples)
        merged_samples = merged_samples + weight * samples[indices[:, 0]]

    return merged_samples

# Class distribution of the training split (label 1 is reported as the minority)
n_minority = np.sum(y_train == 1)
n_majority = np.sum(y_train == 0)
print(f"Original distribution - Majority: {n_majority}, Minority: {n_minority}")

# Samplers used to generate synthetic data: SMOTE, ADASYN and CTGAN
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

# CTGAN.fit(train_data, discrete_columns) has no separate target argument, so the
# label is modelled jointly with the features and split off again after sampling.
# (Passing y_train as the second argument made its 0/1 values be read as discrete
# column indices, and re-using y_train as labels for sampled rows paired each
# synthetic row with an unrelated label.)
ctgan_feature_names = train_data.drop('MetabolicSyndrome', axis=1).columns
ctgan_table = np.column_stack([x_train, y_train])
ctgan_label_column = ctgan_table.shape[1] - 1
# Integer-coded categoricals and the label are declared discrete
ctgan_discrete_columns = [int(ctgan_feature_names.get_loc(name)) for name in ('Sex', 'Race')]
ctgan_discrete_columns.append(ctgan_label_column)

ctgan = CTGAN(epochs=500)
ctgan.fit(ctgan_table, discrete_columns=ctgan_discrete_columns)
ctgan_samples = ctgan.sample(len(x_train))
x_ctgan = ctgan_samples[:, :-1]
y_ctgan = ctgan_samples[:, -1].astype(int)

# Generate synthetic samples
x_smote, y_smote = smote.fit_resample(x_train, y_train)
x_adasyn, y_adasyn = adasyn.fit_resample(x_train, y_train)

# Extract only the synthetic samples. The slices rely on the over-samplers
# returning the original rows first, followed by the generated ones.
synthetic_smote = x_smote[len(x_train):]
synthetic_smote_labels = y_smote[len(x_train):]
synthetic_adasyn = x_adasyn[len(x_train):]
# Every CTGAN row is synthetic; slicing from len(x_train) always gave an empty array
synthetic_ctgan = x_ctgan

print(f"Number of synthetic samples - SMOTE: {len(synthetic_smote)}, ADASYN: {len(synthetic_adasyn)}, CTGAN: {len(synthetic_ctgan)}")

# Only proceed if we have synthetic samples
if len(synthetic_smote) > 0 and len(synthetic_adasyn) > 0:
    # Merge synthetic samples; SMOTE is the reference set
    synthetic_samples_list = [synthetic_smote, synthetic_adasyn]
    weights = [0.8, 0.2]  # SMOTE is weighted four times as heavily as ADASYN
    merged_synthetic = merge_synthetic_samples(synthetic_samples_list, weights)

    # Combine original samples with merged synthetic samples. Each merged row is
    # built around one SMOTE row, so it inherits that row's label instead of a
    # hard-coded 1.
    x_merged = np.vstack([x_train, merged_synthetic])
    y_merged = np.concatenate([y_train, synthetic_smote_labels])

    # Evaluate the merged approach
    merged_accuracy = evaluate_model(x_merged, y_merged, x_test, y_test,
                                   XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3))

    # Individual method accuracies for comparison
    smote_accuracy = evaluate_model(x_smote, y_smote, x_test, y_test,
                                  XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3))
    adasyn_accuracy = evaluate_model(x_adasyn, y_adasyn, x_test, y_test,
                                   XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3))

    print(f'SMOTE Accuracy: {smote_accuracy * 100:.2f}%')
    print(f'ADASYN Accuracy: {adasyn_accuracy * 100:.2f}%')
    print(f'Merged Approach Accuracy: {merged_accuracy * 100:.2f}%')
else:
    print("Error: One or more synthetic sample sets are empty")
    print("Please ensure the minority class has samples to generate synthetic data")

