In [19]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')

In [20]:
# Encoding the categorical variables and filling in the missing values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)


In [21]:
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]

test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
test_data = pd.concat([test_0, test_1])

# Remove the test set rows from the original dataset to create the training set
train_data = dataset.drop(test_data.index)


x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values

# Resampling the data to avoid overfitting
ros = RandomOverSampler(random_state=0)

# Resampling the data
x_resampled, y_resampled = ros.fit_resample(x_train, y_train)
x_test, y_test = ros.fit_resample(x_test, y_test)

# Making the Random Classifier model
classifier = RandomForestClassifier(n_estimators=30, criterion='entropy', random_state=0)

# Training the model
classifier.fit(x_resampled, y_resampled)

# Predicting the results 
y_pred = classifier.predict(x_test)

# Getting accuracy results and confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	86.75% is the accuracy

Confusion matrix: 
[[373  27]
 [ 79 321]] 

Precision Score: 
	 0.9224137931034483 

Recall: 
	 0.8025 

F1 Score: 
	 0.8582887700534759 



In [22]:
# Decision Tree Classifier
classifier_decision_tree = DecisionTreeClassifier(criterion='entropy')
classifier_decision_tree.fit(x_resampled, y_resampled)
y_pred = classifier_decision_tree.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	82.62% is the accuracy

Confusion matrix: 
[[362  38]
 [101 299]] 

Precision Score: 
	 0.887240356083086 

Recall: 
	 0.7475 

F1 Score: 
	 0.8113975576662145 



In [23]:
# XGBoost Classifier
classifier_xgboost = XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.5)
classifier_xgboost.fit(x_resampled, y_resampled)
y_pred = classifier_xgboost.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	87.38% is the accuracy

Confusion matrix: 
[[369  31]
 [ 70 330]] 

Precision Score: 
	 0.9141274238227147 

Recall: 
	 0.825 

F1 Score: 
	 0.8672798948751642 



In [24]:
# Logistic Regression
classifier_logistic_regression = LogisticRegression()

classifier_logistic_regression.fit(x_resampled, y_resampled)
y_pred = classifier_logistic_regression.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	79.25% is the accuracy

Confusion matrix: 
[[299 101]
 [ 65 335]] 

Precision Score: 
	 0.768348623853211 

Recall: 
	 0.8375 

F1 Score: 
	 0.8014354066985646 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=10)
mlp.fit(x_resampled, y_resampled)
y_pred = mlp.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")
mlp_accuracy = mlp.score(x_test, y_test)

Accracy: 
	67.50% is the accuracy

Confusion matrix: 
[[383  17]
 [243 157]] 

Precision Score: 
	 0.9022988505747126 

Recall: 
	 0.3925 

F1 Score: 
	 0.5470383275261325 



In [26]:
tabnet = TabNetClassifier()
tabnet.fit(
    x_resampled, y_resampled,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)
y_pred = tabnet.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")




epoch 0  | loss: 0.75585 |  0:00:00s
epoch 1  | loss: 0.58946 |  0:00:00s
epoch 2  | loss: 0.53668 |  0:00:00s
epoch 3  | loss: 0.51754 |  0:00:00s
epoch 4  | loss: 0.46505 |  0:00:00s
epoch 5  | loss: 0.41816 |  0:00:00s
epoch 6  | loss: 0.408   |  0:00:00s
epoch 7  | loss: 0.38976 |  0:00:00s
epoch 8  | loss: 0.36445 |  0:00:00s
epoch 9  | loss: 0.35907 |  0:00:00s
epoch 10 | loss: 0.33651 |  0:00:00s
epoch 11 | loss: 0.3233  |  0:00:00s
epoch 12 | loss: 0.32401 |  0:00:00s
epoch 13 | loss: 0.31566 |  0:00:00s
epoch 14 | loss: 0.32349 |  0:00:00s
epoch 15 | loss: 0.30989 |  0:00:00s
epoch 16 | loss: 0.29761 |  0:00:00s
epoch 17 | loss: 0.31156 |  0:00:00s
epoch 18 | loss: 0.2947  |  0:00:00s
epoch 19 | loss: 0.29987 |  0:00:01s
epoch 20 | loss: 0.29186 |  0:00:01s
epoch 21 | loss: 0.29951 |  0:00:01s
epoch 22 | loss: 0.29421 |  0:00:01s
epoch 23 | loss: 0.29307 |  0:00:01s
epoch 24 | loss: 0.29163 |  0:00:01s
epoch 25 | loss: 0.29267 |  0:00:01s
epoch 26 | loss: 0.28873 |  0:00:01s
e

In [27]:
from datetime import datetime
def evaluate_and_store_metrics(models_dict, X_test, y_test, csv_filename='model_metrics.csv', spacing_rows=5):
    """
    Evaluate multiple models and store their metrics in a CSV file with spacing between different runs.
    
    Parameters:
    models_dict: Dictionary of model names and their corresponding trained model objects
    X_test: Test features
    y_test: Test labels
    csv_filename: Name of the output CSV file
    spacing_rows: Number of blank rows to add before new results
    """
    # Create a list to store results
    results = []
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    version = "ROS on both training and testing data"
    
    # Evaluate each model
    for model_name, model in models_dict.items():
        y_pred = model.predict(X_test)
        
        results.append({
            'Version': version,
            'timestamp': timestamp,
            'model_name': model_name,
            'accuracy': round(accuracy_score(y_test, y_pred) * 100, 2),
            'precision': round(precision_score(y_test, y_pred), 4),
            'recall': round(recall_score(y_test, y_pred), 4),
            'f1_score': round(f1_score(y_test, y_pred), 4)
        })
    
    results_df = pd.DataFrame(results)
    
    try:
        # Read existing CSV
        existing_df = pd.read_csv(csv_filename)
        
        # Create empty rows for spacing
        empty_rows = pd.DataFrame([{'timestamp': '', 'model_name': '', 'accuracy': '', 
                                  'precision': '', 'recall': '', 'f1_score': ''}] * spacing_rows)
        
        # Concatenate existing data, empty rows, and new results
        final_df = pd.concat([existing_df, empty_rows, results_df], ignore_index=True)
        
    except FileNotFoundError:
        final_df = results_df
    
    # Save to CSV without index
    final_df.to_csv(csv_filename, index=False)
    
    return results_df

def run_evaluation(dataset, test_size_each_class=400, random_state=42):
    """
    Run the complete evaluation pipeline including data splitting and model training.
    """
    # Split data
    outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
    outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]
    
    test_0 = outcome_0.sample(n=test_size_each_class, random_state=random_state)
    test_1 = outcome_1.sample(n=test_size_each_class, random_state=random_state)
    test_data = pd.concat([test_0, test_1])
    
    # Create training set
    train_data = dataset.drop(test_data.index)
    
    # Prepare features and labels
    X_train = train_data.drop('MetabolicSyndrome', axis=1).values
    y_train = train_data['MetabolicSyndrome'].values
    X_test = test_data.drop('MetabolicSyndrome', axis=1).values
    y_test = test_data['MetabolicSyndrome'].values
    
    # Resample training data only
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
    
    # Initialize and train models
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=30, criterion='entropy', random_state=0),
        'Decision Tree': DecisionTreeClassifier(criterion='entropy'),
        'XGBoost': XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5),
        'Logistic Regression': LogisticRegression(),
        'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=10),
        'TabNet': TabNetClassifier()
    }
    
    # Train each model
    for model_name, model in models.items():
        print(f"Training {model_name}...")
        if model_name == 'TabNet':
            model.fit(
                X_resampled, y_resampled,
                max_epochs=100,
                patience=10,
                batch_size=1024,
                virtual_batch_size=128,
                num_workers=0,
                drop_last=False
            )
        else:
            model.fit(X_resampled, y_resampled)
    
    # Evaluate and store results
    results = evaluate_and_store_metrics(models, X_test, y_test)
    
    return results

# Run the evaluation - it will automatically add results 5 rows below existing data
results = run_evaluation(dataset)
print(results)

Training Random Forest...
Training Decision Tree...
Training XGBoost...
Training Logistic Regression...
Training MLP...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training TabNet...




epoch 0  | loss: 0.75585 |  0:00:00s
epoch 1  | loss: 0.58946 |  0:00:00s
epoch 2  | loss: 0.53668 |  0:00:00s
epoch 3  | loss: 0.51754 |  0:00:00s
epoch 4  | loss: 0.46505 |  0:00:00s
epoch 5  | loss: 0.41816 |  0:00:00s
epoch 6  | loss: 0.408   |  0:00:00s
epoch 7  | loss: 0.38976 |  0:00:00s
epoch 8  | loss: 0.36445 |  0:00:00s
epoch 9  | loss: 0.35907 |  0:00:00s
epoch 10 | loss: 0.33651 |  0:00:00s
epoch 11 | loss: 0.3233  |  0:00:00s
epoch 12 | loss: 0.32401 |  0:00:00s
epoch 13 | loss: 0.31566 |  0:00:00s
epoch 14 | loss: 0.32349 |  0:00:00s
epoch 15 | loss: 0.30989 |  0:00:00s
epoch 16 | loss: 0.29761 |  0:00:00s
epoch 17 | loss: 0.31156 |  0:00:00s
epoch 18 | loss: 0.2947  |  0:00:00s
epoch 19 | loss: 0.29987 |  0:00:00s
epoch 20 | loss: 0.29186 |  0:00:01s
epoch 21 | loss: 0.29951 |  0:00:01s
epoch 22 | loss: 0.29421 |  0:00:01s
epoch 23 | loss: 0.29307 |  0:00:01s
epoch 24 | loss: 0.29163 |  0:00:01s
epoch 25 | loss: 0.29267 |  0:00:01s
epoch 26 | loss: 0.28873 |  0:00:01s
e