In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')
print(dataset.shape)
print(len(dataset[dataset['MetabolicSyndrome'] == 0]))

(2401, 13)
1579


In [2]:
# Encoding the categorical variables and filling in the missing values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

# This is the incorrect implementation
'''
dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)
'''
# Fill NaN values in column with index 2
dataset.iloc[:, 2] = dataset.iloc[:, 2].fillna(dataset.iloc[:, 2].mean())

# Fill NaN values in column with index 4
dataset.iloc[:, 4] = dataset.iloc[:, 4].fillna(dataset.iloc[:, 4].mean())

# Fill NaN values in column with index 5
dataset.iloc[:, 5] = dataset.iloc[:, 5].fillna(dataset.iloc[:, 5].mean())

outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]


test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
#print(test_1)
test_data = pd.concat([test_0, test_1])

# Remove the test set rows from the original dataset to create the training set
train_data = dataset.drop(test_data.index)


x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values

# Resampling the data to avoid overfitting
ros = RandomOverSampler(random_state=0)

# Resampling the data
x_resampled, y_resampled = ros.fit_resample(x_train, y_train)
print("x_train dtypes:", x_train.dtype)
print("y_train dtype:", y_train.dtype)

# XGBoost Classifier
classifier_xgboost = XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.5)
classifier_xgboost.fit(x_resampled, y_resampled)
# joblib.dump(classifier_xgboost, 'xgboost_classifier.pkl')
y_pred = classifier_xgboost.predict(x_test)
#for i in x_test[0]:
   # print(type(i))


cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
from imblearn.over_sampling import SMOTE, ADASYN
from ctgan import CTGAN


# Function to evaluate model
def evaluate_model(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# SMOTE
smote = SMOTE(random_state=0)
x_smote, y_smote = smote.fit_resample(x_train, y_train)
smote_accuracy = evaluate_model(x_smote, y_smote, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3))

# ADASYN
adasyn = ADASYN(random_state=0)
x_adasyn, y_adasyn = adasyn.fit_resample(x_train, y_train)
adasyn_accuracy = evaluate_model(x_adasyn, y_adasyn, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5))

# CTGAN
ctgan = CTGAN(epochs=10)
ctgan.fit(x_train, y_train)
x_ctgan = ctgan.sample(len(x_train))
y_ctgan = y_train  # Optionally reuse y_train if it makes sense for your scenario
ctgan_accuracy = evaluate_model(x_ctgan, y_ctgan, x_test, y_test, XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5))

# Print the accuracies
print(f'SMOTE Accuracy: {smote_accuracy * 100:.2f}%')
print(f'ADASYN Accuracy: {adasyn_accuracy * 100:.2f}%')
print(f'CTGAN Accuracy: {ctgan_accuracy * 100:.2f}%')
'''
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")
'''

x_train dtypes: float64
y_train dtype: int64
Accracy: 
	86.50% is the accuracy



  dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
  dataset['Race'] = dataset['Race'].replace(race_mapping)


SMOTE Accuracy: 86.62%
ADASYN Accuracy: 86.62%
CTGAN Accuracy: 51.25%


'\nprint("Confusion matrix: ")\nprint(cm, "\n")\nprint("Precision Score: ")\nprint("\t",precision_score(y_test, y_pred), "\n")\nprint("Recall: ")\nprint("\t", recall_score(y_test, y_pred), "\n")\nprint("F1 Score: ")\nprint("\t", f1_score(y_test, y_pred), "\n")\n'