In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import RandomOverSampler 
from imblearn.combine import SMOTETomek, SMOTEENN
import joblib

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')
print(dataset.shape)
print(len(dataset[dataset['MetabolicSyndrome'] == 0]))

(2401, 13)
1579


In [2]:
# Encoding the categorical variables and filling in the missing values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

# This is the incorrect implementation
'''
dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)
'''
# Fill NaN values in column with index 2
dataset.iloc[:, 2] = dataset.iloc[:, 2].fillna(dataset.iloc[:, 2].mean())

# Fill NaN values in column with index 4
dataset.iloc[:, 4] = dataset.iloc[:, 4].fillna(dataset.iloc[:, 4].mean())

# Fill NaN values in column with index 5
dataset.iloc[:, 5] = dataset.iloc[:, 5].fillna(dataset.iloc[:, 5].mean())

outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]


test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
#print(test_1)
test_data = pd.concat([test_0, test_1])

# Remove the test set rows from the original dataset to create the training set
train_data = dataset.drop(test_data.index)


x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values

# Resampling the data to avoid overfitting
ros = RandomOverSampler(random_state=0)

# Resampling the data
x_resampled, y_resampled = ros.fit_resample(x_train, y_train)
print("x_train dtypes:", x_train.dtype)
print("y_train dtype:", y_train.dtype)

# XGBoost Classifier
classifier_xgboost = XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.5)
classifier_xgboost = XGBClassifier(n_estimators = 412, max_depth = 8, learning_rate = 0.5)
classifier_xgboost.fit(x_resampled, y_resampled)
# joblib.dump(classifier_xgboost, 'xgboost_classifier.pkl')
y_pred = classifier_xgboost.predict(x_test)
#for i in x_test[0]:
   # print(type(i))



cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
from imblearn.over_sampling import SMOTE, ADASYN
from ctgan import CTGAN


# Function to evaluate model
def evaluate_model(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy, f1, recall, precision

# SMOTE
smote = SMOTE(random_state=0)
x_smote, y_smote = smote.fit_resample(x_train, y_train)
smote_accuracy, smote_f1, smore_recall, smore_precision = evaluate_model(x_smote, y_smote, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3)
 )
smote_xg = XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.3)
smote_xg.fit(x_smote, y_smote)

smote_t = SMOTETomek(random_state=0)
x_smote_t, y_smote_t = smote_t.fit_resample(x_train, y_train)
smote_t_accuracy, smote_t_f1, smote_t_recall, smote_t_precision = evaluate_model(x_smote_t, y_smote_t, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5))

smote_tn = SMOTEENN(random_state=0)
x_smote_tn, y_smote_tn = smote_tn.fit_resample(x_train, y_train)
smote_tn_accuracy, smote_tn_f1, smote_tn_recall, smote_tn_precision = evaluate_model(x_smote_tn, y_smote_tn, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5))
smote_tn_model = XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.3)
smote_xg.fit(x_smote_tn, y_smote_tn) 


# ADASYN
adasyn = ADASYN(random_state=0)
x_adasyn, y_adasyn = adasyn.fit_resample(x_train, y_train)
adasyn_accuracy, adasyn_f1, adasyn_recall, adasyn_precision = evaluate_model(x_adasyn, y_adasyn, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5))
adasyn_xg = XGBClassifier(n_estimators = 412, max_depth = 8, learning_rate = 0.3)
adasyn_xg.fit(x_adasyn, y_adasyn)

# CTGAN
ctgan = CTGAN(epochs=10)
ctgan.fit(x_train, y_train)
x_ctgan = ctgan.sample(len(x_train))
y_ctgan = y_train  # Optionally reuse y_train if it makes sense for your scenario
ctgan_accuracy, ctgan_f1, ctgan_recall, ctgan_precision = evaluate_model(x_ctgan, y_ctgan, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5))


from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
dataset_imputed = imputer.fit_transform(dataset)
dataset_imputed = pd.DataFrame(dataset_imputed, columns=dataset.columns)
outcome_0 = dataset_imputed[dataset_imputed['MetabolicSyndrome'] == 0]
outcome_1 = dataset_imputed[dataset_imputed['MetabolicSyndrome'] == 1]


test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
#print(test_1)
test_data = pd.concat([test_0, test_1])

# Remove the test set rows from the original dataset to create the training set
train_data = dataset.drop(test_data.index)


x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values

# XGBoost Classifier
classifier_xgboost = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5)
classifier_xgboost.fit(x_train, y_train)
y_pred = classifier_xgboost.predict(x_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mode_merged_saved = joblib.load('merged_model.pkl')
y_pred_saved = mode_merged_saved.predict(x_test)
acc_saved = accuracy_score(y_test, y_pred)
print("Saved Accuracy", acc_saved)

ensemble_model = VotingClassifier(
    estimators=[('smote', smote_xg), ('adasyn', adasyn_xg)], voting='soft'
)
print(f'KNN Accuracy: {accuracy * 100:.2f}%')

# Print the accuracies
print(f'SMOTE Accuracy: {smote_accuracy * 100:.2f}%')
print(f'SMOTE f1: {smote_f1:.2f}')
print(f'SMOTE Recall: {smore_recall:.2f}')
print(f'SMOTE Precision: {smore_precision:.2f}')
print("\n\n")
print(f'SMOTETomek Accuracy: {smote_t_accuracy * 100:.2f}%')
print(f'SMOTETomek f1: {smote_t_f1:.2f}')
print(f'SMOTETomek Recall: {smote_t_recall:.2f}')
print(f'SMOTETomek Precision: {smote_t_precision:.2f}')
print("\n\n")
print(f'SMOTEENN Accuracy: {smote_tn_accuracy * 100:.2f}%')
print(f'SMOTEENN f1: {smote_tn_f1:.2f}')
print(f'SMOTEENN Recall: {smote_tn_recall:.2f}')
print(f'SMOTEENN Precision: {smote_tn_precision:.2f}')
print("\n\n")
print(f'ADASYN Accuracy: {adasyn_accuracy * 100:.2f}%')
print(f'ADASYN f1: {adasyn_f1:.2f}')
print(f'ADASYN Recall: {adasyn_recall:.2f}')
print(f'ADASYN Precision: {adasyn_precision:.2f}')
print("\n\n")
print(f'CTGAN Accuracy: {ctgan_accuracy * 100:.2f}%')
print(f'CTGAN f1: {ctgan_f1:.2f}')
print(f'CTGAN Recall: {ctgan_recall:.2f}')
print(f'CTGAN Precision: {ctgan_precision:.2f}')
'''
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")
'''

x_train dtypes: float64
y_train dtype: int64
Accracy: 
	86.25% is the accuracy



  dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
  dataset['Race'] = dataset['Race'].replace(race_mapping)


Saved Accuracy 0.85625
KNN Accuracy: 85.62%
SMOTE Accuracy: 86.62%
SMOTE f1: 0.86
SMOTE Recall: 0.82
SMOTE Precision: 0.90



SMOTETomek Accuracy: 85.62%
SMOTETomek f1: 0.85
SMOTETomek Recall: 0.80
SMOTETomek Precision: 0.90



SMOTEENN Accuracy: 86.38%
SMOTEENN f1: 0.87
SMOTEENN Recall: 0.88
SMOTEENN Precision: 0.85



ADASYN Accuracy: 86.62%
ADASYN f1: 0.86
ADASYN Recall: 0.83
ADASYN Precision: 0.90



CTGAN Accuracy: 55.25%
CTGAN f1: 0.37
CTGAN Recall: 0.27
CTGAN Precision: 0.62


'\nprint("Confusion matrix: ")\nprint(cm, "\n")\nprint("Precision Score: ")\nprint("\t",precision_score(y_test, y_pred), "\n")\nprint("Recall: ")\nprint("\t", recall_score(y_test, y_pred), "\n")\nprint("F1 Score: ")\nprint("\t", f1_score(y_test, y_pred), "\n")\n'

In [3]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
from xgboost import XGBClassifier
import joblib
from sklearn.model_selection import cross_val_score

def improved_merge_synthetic_samples(synthetic_samples_list, weights=None):
    """
    Improved version of merge_synthetic_samples with diversity preservation
    """
    if weights is None:
        weights = [1/len(synthetic_samples_list)] * len(synthetic_samples_list)
    
    # Ensure weights sum to 1
    weights = np.array(weights) / np.sum(weights)
    
    # Calculate number of samples to take from each method
    total_samples = len(synthetic_samples_list[0])
    samples_per_method = [int(w * total_samples) for w in weights]
    
    # Adjust the last value to ensure we get exactly total_samples
    samples_per_method[-1] = total_samples - sum(samples_per_method[:-1])
    
    # Initialize merged samples array
    merged_samples = []
    
    # Take stratified samples from each method
    for samples, n_samples in zip(synthetic_samples_list, samples_per_method):
        # Randomly select samples without replacement
        indices = np.random.choice(len(samples), n_samples, replace=False)
        merged_samples.append(samples[indices])
    
    # Concatenate all samples
    merged_samples = np.vstack(merged_samples)
    
    # Shuffle the merged samples
    np.random.shuffle(merged_samples)
    
    return merged_samples

def evaluate_model(x_train, y_train, x_test, y_test, model):
    """
    Evaluate model performance using multiple metrics
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    
    return accuracy, f1, recall, precision

def find_best_weights(synthetic_samples_list, x_train, y_train, x_test, y_test):
    """
    Find the best weights for merging synthetic samples using cross-validation
    """
    best_f1 = 0
    best_weights = None
    
    # Try different weight combinations
    for w1 in np.arange(0.1, 1.0, 0.1):
        w2 = 1 - w1
        weights = [w1, w2]
        
        merged_synthetic = improved_merge_synthetic_samples(synthetic_samples_list, weights)
        x_merged = np.vstack([x_train, merged_synthetic])
        y_merged = np.concatenate([y_train, np.ones(len(merged_synthetic))])
        
        model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.25,
                            reg_lambda=1.0, reg_alpha=0.1)
        
        # Use cross-validation to evaluate this weight combination
        cv_scores = cross_val_score(model, x_merged, y_merged, cv=5, scoring='f1')
        avg_f1 = np.mean(cv_scores)
        
        if avg_f1 > best_f1:
            best_f1 = avg_f1
            best_weights = weights
    
    return best_weights

def main(x_train, y_train, x_test, y_test):
    # Print original class distribution
    n_minority = np.sum(y_train == 1)
    n_majority = np.sum(y_train == 0)
    print(f"Original distribution - Majority: {n_majority}, Minority: {n_minority}")
    
    # Generate synthetic samples with SMOTE and ADASYN
    smote = SMOTE(random_state=42)
    adasyn = ADASYN(random_state=42)
    smote_tomek = SMOTETomek(random_state=42)
    
    # Generate synthetic samples
    x_smote, y_smote = smote.fit_resample(x_train, y_train)
    x_adasyn, y_adasyn = adasyn.fit_resample(x_train, y_train)
    x_smote_tomek, y_smote_tomek = smote_tomek.fit_resample(x_train, y_train)
    
    # Extract only the synthetic samples
    synthetic_smote = x_smote[len(x_train):]
    synthetic_adasyn = x_adasyn[len(x_train):]
    
    print(f"Number of synthetic samples - SMOTE: {len(synthetic_smote)}, ADASYN: {len(synthetic_adasyn)}")
    
    if len(synthetic_smote) > 0 and len(synthetic_adasyn) > 0:
        # Find best weights for merging
        synthetic_samples_list = [synthetic_smote, synthetic_adasyn]
        best_weights = find_best_weights(synthetic_samples_list, x_train, y_train, x_test, y_test)
        print(f"Best weights found: {best_weights}")
        
        # Merge synthetic samples with best weights
        merged_synthetic = improved_merge_synthetic_samples(synthetic_samples_list, best_weights)
        
        # Prepare datasets
        x_merged = np.vstack([x_train, merged_synthetic])
        y_merged = np.concatenate([y_train, np.ones(len(merged_synthetic))])
        
        # Initialize models with regularization
        base_model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.25)
        merged_model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.25,
                                   reg_lambda=1.0, reg_alpha=0.1)
        ensemble_model = VotingClassifier(
            estimators=[
                ('smote', XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3)),
                ('adasyn', RandomForestClassifier(n_estimators=100)),
                ('merged', merged_model),
                ('smote_tomek', XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3))
            ],
            voting='soft'
        )
        
        # Train and evaluate models
        print("\nEvaluating individual methods...")
        smote_metrics = evaluate_model(x_smote, y_smote, x_test, y_test, base_model)
        adasyn_metrics = evaluate_model(x_adasyn, y_adasyn, x_test, y_test, base_model)
        smote_tomek_metrics = evaluate_model(x_smote_tomek, y_smote_tomek, x_test, y_test, base_model)
        merged_metrics = evaluate_model(x_merged, y_merged, x_test, y_test, merged_model)
        ensemble_metrics = evaluate_model(x_merged, y_merged, x_test, y_test, ensemble_model)
        
        # Print results
        methods = ['SMOTE', 'ADASYN', 'SMOTE-Tomek', 'Merged Approach', 'Ensemble']
        metrics = [smote_metrics, adasyn_metrics, smote_tomek_metrics, merged_metrics, ensemble_metrics]
        
        for method, (acc, f1, rec, prec) in zip(methods, metrics):
            print(f"\n{method} Results:")
            print(f"Accuracy: {acc * 100:.2f}%")
            print(f"F1-Score: {f1:.2f}")
            print(f"Recall: {rec:.2f}")
            print(f"Precision: {prec:.2f}")
        
        # Save the best model
        best_f1_score = max(m[1] for m in metrics)
        best_method_idx = [m[1] for m in metrics].index(best_f1_score)
        best_method = methods[best_method_idx]
        
        print(f"\nBest performing method: {best_method} (F1-Score: {best_f1_score:.2f})")
        
        if best_method == 'Merged Approach':
            joblib.dump(merged_model, 'best_model.pkl')
        elif best_method == 'Ensemble':
            joblib.dump(ensemble_model, 'best_model.pkl')
        
    else:
        print("Error: One or more synthetic sample sets are empty")
        print("Please ensure the minority class has samples to generate synthetic data")

# Example usage:
# Assuming you have your data split into x_train, y_train, x_test, y_test
main(x_train, y_train, x_test, y_test)

Original distribution - Majority: 1179, Minority: 422
Number of synthetic samples - SMOTE: 757, ADASYN: 709
Best weights found: [0.8, 0.19999999999999996]

Evaluating individual methods...

SMOTE Results:
Accuracy: 86.62%
F1-Score: 0.86
Recall: 0.82
Precision: 0.90

ADASYN Results:
Accuracy: 87.12%
F1-Score: 0.87
Recall: 0.84
Precision: 0.90

SMOTE-Tomek Results:
Accuracy: 86.38%
F1-Score: 0.86
Recall: 0.83
Precision: 0.89

Merged Approach Results:
Accuracy: 85.50%
F1-Score: 0.85
Recall: 0.81
Precision: 0.89

Ensemble Results:
Accuracy: 86.88%
F1-Score: 0.86
Recall: 0.84
Precision: 0.89

Best performing method: ADASYN (F1-Score: 0.87)


In [8]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier

def merge_synthetic_samples(synthetic_samples_list, weights=None):
    """
    Merge multiple sets of synthetic samples using weighted nearest neighbor averaging.
    
    Parameters:
    synthetic_samples_list: List of numpy arrays, each containing synthetic samples from different methods
    weights: List of weights for each method (default: equal weights)
    
    Returns:
    numpy array: Merged synthetic samples
    """
    if weights is None:
        weights = [1/len(synthetic_samples_list)] * len(synthetic_samples_list)
    
    # Ensure we have valid samples
    for i, samples in enumerate(synthetic_samples_list):
        if len(samples) == 0:
            raise ValueError(f"Synthetic samples list at index {i} is empty")
    
    # Ensure all sample sets have the same number of features
    n_features = synthetic_samples_list[0].shape[1]
    
    # Initialize nearest neighbor models for each synthetic dataset
    nn_models = []
    for samples in synthetic_samples_list:
        nn = NearestNeighbors(n_neighbors=1)
        nn.fit(samples)
        nn_models.append(nn)
    
    # Use the first synthetic dataset as a reference
    base_samples = synthetic_samples_list[0]
    merged_samples = np.zeros_like(base_samples)
    
    # For each sample in the base dataset
    for i in range(len(base_samples)):
        sample_sum = np.zeros(n_features)
        
        # Find nearest neighbors in each synthetic dataset
        for j, (samples, nn_model, weight) in enumerate(zip(synthetic_samples_list, nn_models, weights)):
            if j == 0:
                # For the base dataset, use the sample itself
                nearest_sample = base_samples[i]
            else:
                # Find nearest neighbor in other datasets
                distances, indices = nn_model.kneighbors([base_samples[i]])
                nearest_sample = samples[indices[0][0]]
            
            sample_sum += weight * nearest_sample
        
        merged_samples[i] = sample_sum
    
    return merged_samples

# First, let's analyze the class distribution
n_minority = np.sum(y_train == 1)
n_majority = np.sum(y_train == 0)
print(f"Original distribution - Majority: {n_majority}, Minority: {n_minority}")

# Generate synthetic samples with SMOTE and ADASYN
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

# Generate synthetic samples
x_smote, y_smote = smote.fit_resample(x_train, y_train)
x_adasyn, y_adasyn = adasyn.fit_resample(x_train, y_train)

# Extract only the synthetic samples (exclude original samples)
synthetic_smote = x_smote[len(x_train):]
synthetic_adasyn = x_adasyn[len(x_train):]

print(f"Number of synthetic samples - SMOTE: {len(synthetic_smote)}, ADASYN: {len(synthetic_adasyn)}") 

# Only proceed if we have synthetic samples
if len(synthetic_smote) > 0 and len(synthetic_adasyn) > 0:
    # Merge synthetic samples
    synthetic_samples_list = [synthetic_smote, synthetic_adasyn]
    #weights = [0.45, 0.55]  # Giving slightly more weight to SMOTE
    weights = [0.35, 0.65]
    merged_synthetic = merge_synthetic_samples(synthetic_samples_list, weights)

    # Combine original samples with merged synthetic samples
    x_merged = np.vstack([x_train, merged_synthetic])
    y_merged = np.concatenate([y_train, np.ones(len(merged_synthetic))])
    merged_model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.25)
    merged_model.fit(x_merged, y_merged)
    ensemble_model.fit(x_merged, y_merged)
    joblib.dump(merged_model, 'merged_model.pkl')
    # Evaluate the merged approach
    merged_accuracy, merged_f1, merged_recall, merged_precision = evaluate_model(x_merged, y_merged, x_test, y_test, 
                                   XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.25))

    # Print individual method accuracies for comparison
    smote_accuracy, smote_f1, smote_recall, smote_precision = evaluate_model(x_smote, y_smote, x_test, y_test, 
                                  XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3))
    adasyn_accuracy, adasyn_f1, adasyn_recall, adasyn_precision = evaluate_model(x_adasyn, y_adasyn, x_test, y_test, 
                                   XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3))
    ensemble_accuracy, ensemble_f1, ensemble_recall, ensemble_precision = evaluate_model(x_merged, y_merged, x_test, y_test, 
                                     VotingClassifier(
    estimators=[('smote', smote_xg), ('adasyn', adasyn_xg), ('merged', merged_model), ('smotenn', smote_tn_model)], voting='soft'
))

    print(f'SMOTE Accuracy: {smote_accuracy * 100:.2f}%')
    print(f'SMOTE f1: {smote_f1:.2f}')
    print(f'SMOTE Recall: {smote_recall:.2f}')
    print(f'SMOTE Precision: {smote_precision:.2f}')
    print("\n\n")
    print(f'ADASYN Accuracy: {adasyn_accuracy * 100:.2f}%')
    print(f'ADASYN f1: {adasyn_f1:.2f}')
    print(f'ADASYN Recall: {adasyn_recall:.2f}')
    print(f'ADASYN Precision: {adasyn_precision:.2f}')
    print("\n\n")
    print(f'Merged Approach Accuracy: {merged_accuracy * 100:.2f}%')
    print(f'Merged Approach f1: {merged_f1:.2f}')
    print(f'Merged Approach Recall: {merged_recall:.2f}')
    print(f'Merged Approach Precision: {merged_precision:.2f}')
    print("\n\n")
    print(f"Ensemble approach accuracy: {ensemble_accuracy * 100:.2f}")
    print(f"Enseble F1: {ensemble_f1}")

    
else:
    print("Error: One or more synthetic sample sets are empty")
    print("Please ensure the minority class has samples to generate synthetic data")

from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Train base models on different datasets
model_smote = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3)
model_adasyn = RandomForestClassifier(n_estimators=100)
model_original = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3)

# Create the Voting Classifier (hard voting for majority class, soft voting for probabilities)
ensemble_model = VotingClassifier(estimators=[
    ('smote', model_smote), 
    ('adasyn', model_adasyn), 
    ('original', model_original)
], voting='hard')



Original distribution - Majority: 1179, Minority: 422
Number of synthetic samples - SMOTE: 757, ADASYN: 709
SMOTE Accuracy: 86.38%
SMOTE f1: 0.86
SMOTE Recall: 0.82
SMOTE Precision: 0.89



ADASYN Accuracy: 86.38%
ADASYN f1: 0.86
ADASYN Recall: 0.82
ADASYN Precision: 0.90



Merged Approach Accuracy: 86.75%
Merged Approach f1: 0.86
Merged Approach Recall: 0.83
Merged Approach Precision: 0.90



Ensemble approach accuracy: 87.00
Enseble F1: 0.8649350649350651


In [5]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

def augment_dataset(x_train, y_train, target_size_multiplier=700):
    """
    Augment the dataset using a combination of SMOTE and ADASYN in phases
    to achieve a balanced dataset that is target_size_multiplier times larger
    """
    print("Starting data augmentation pipeline...")
    
    # Get initial class distribution
    initial_size = len(y_train)
    pos_samples = np.sum(y_train == 1)
    neg_samples = np.sum(y_train == 0)
    target_samples = max(pos_samples, neg_samples) * target_size_multiplier
    
    print(f"Initial dataset size: {initial_size}")
    print(f"Initial class distribution - Positive: {pos_samples}, Negative: {neg_samples}")
    
    # Phase 1: Use ADASYN for initial positive class augmentation
    print("\nPhase 1: ADASYN augmentation")
    adasyn = ADASYN(random_state=42)
    X_adasyn, y_adasyn = adasyn.fit_resample(x_train, y_train)
    
    # Phase 2: Use SMOTE to further augment the dataset
    print("\nPhase 2: SMOTE augmentation")
    sampling_strategy = {
        0: target_samples,
        1: target_samples
    }
    
    smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
    X_final, y_final = smote.fit_resample(X_adasyn, y_adasyn)
    
    # Print final statistics
    final_pos = np.sum(y_final == 1)
    final_neg = np.sum(y_final == 0)
    print("\nAugmentation complete!")
    print(f"Final dataset size: {len(y_final)}")
    print(f"Final class distribution - Positive: {final_pos}, Negative: {final_neg}")
    
    return X_final, y_final

def evaluate_augmentation_methods(x_train, y_train, x_test, y_test):
    """
    Evaluate different augmentation methods and their combinations
    """
    models = {}
    results = {}
    
    # Base XGBoost configuration
    base_params = {
        'n_estimators': 100,
        'max_depth': 3,
        'learning_rate': 0.5
    }
    
    # Original data (baseline)
    models['baseline'] = XGBClassifier(**base_params)
    models['baseline'].fit(x_train, y_train)
    results['baseline'] = accuracy_score(y_test, models['baseline'].predict(x_test))
    
    # SMOTE
    print("\nEvaluating SMOTE...")
    smote = SMOTE(random_state=42)
    X_smote, y_smote = smote.fit_resample(x_train, y_train)
    models['smote'] = XGBClassifier(**base_params)
    models['smote'].fit(X_smote, y_smote)
    results['smote'] = accuracy_score(y_test, models['smote'].predict(x_test))
    
    # ADASYN
    print("\nEvaluating ADASYN...")
    adasyn = ADASYN(random_state=42)
    X_adasyn, y_adasyn = adasyn.fit_resample(x_train, y_train)
    models['adasyn'] = XGBClassifier(**base_params)
    models['adasyn'].fit(X_adasyn, y_adasyn)
    results['adasyn'] = accuracy_score(y_test, models['adasyn'].predict(x_test))
    
    # Combined approach
    print("\nEvaluating combined SMOTE-ADASYN approach...")
    X_combined, y_combined = augment_dataset(x_train, y_train)
    models['combined'] = XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.04)
    models['combined'].fit(X_combined, y_combined)
    results['combined'] = accuracy_score(y_test, models['combined'].predict(x_test))
    
    # Print results
    print("\nResults:")
    for method, acc in results.items():
        print(f"{method.capitalize()} Accuracy: {acc * 100:.2f}%")
    
    return models, results

# Execute the augmentation and evaluation
print("Starting evaluation of different augmentation methods...")
models, results = evaluate_augmentation_methods(x_train, y_train, x_test, y_test)

# Use the best performing model for final predictions
best_method = max(results.items(), key=lambda x: x[1])[0]
best_model = models[best_method]
final_predictions = best_model.predict(x_test)

print(f"\nBest performing method: {best_method}")
print(f"Best accuracy: {results[best_method] * 100:.2f}%")

Starting evaluation of different augmentation methods...

Evaluating SMOTE...

Evaluating ADASYN...

Evaluating combined SMOTE-ADASYN approach...
Starting data augmentation pipeline...
Initial dataset size: 1601
Initial class distribution - Positive: 422, Negative: 1179

Phase 1: ADASYN augmentation

Phase 2: SMOTE augmentation

Augmentation complete!
Final dataset size: 1650600
Final class distribution - Positive: 825300, Negative: 825300

Results:
Baseline Accuracy: 85.62%
Smote Accuracy: 86.50%
Adasyn Accuracy: 86.38%
Combined Accuracy: 86.88%

Best performing method: combined
Best accuracy: 86.88%
