In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
import joblib
from pytorch_tabnet.tab_model import TabNetClassifier

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')
print(dataset.shape)
print(len(dataset[dataset['MetabolicSyndrome'] == 0]))

(2401, 13)
1579


In [2]:
# Encoding the categorical variables and filling in the missing values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)
'''
dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)
'''
# Fill NaN values in column with index 2
dataset.iloc[:, 2] = dataset.iloc[:, 2].fillna(dataset.iloc[:, 2].mean())

# Fill NaN values in column with index 4
dataset.iloc[:, 4] = dataset.iloc[:, 4].fillna(dataset.iloc[:, 4].mean())

# Fill NaN values in column with index 5
dataset.iloc[:, 5] = dataset.iloc[:, 5].fillna(dataset.iloc[:, 5].mean())

  dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
  dataset['Race'] = dataset['Race'].replace(race_mapping)


In [3]:
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]


test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
print(test_1)
test_data = pd.concat([test_0, test_1])

# Remove the test set rows from the original dataset to create the training set
train_data = dataset.drop(test_data.index)


x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values

# Resampling the data to avoid overfitting
ros = RandomOverSampler(random_state=0)

# Resampling the data
x_resampled, y_resampled = ros.fit_resample(x_train, y_train)

# Making the Random Classifier model
classifier = RandomForestClassifier(n_estimators=30, criterion='entropy', random_state=0)

# Training the model
classifier.fit(x_resampled, y_resampled)

# Predicting the results 
y_pred = classifier.predict(x_test)

# Getting accuracy results and confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")


      Age  Sex  Income  Race   WaistCirc   BMI  Albuminuria  UrAlbCr  \
1758   20    0  9000.0     0   95.100000  27.2            0     4.26   
537    78    1  1500.0     0   98.307254  35.5            0    11.63   
204    40    0  1500.0     0  120.500000  34.1            0    10.59   
514    42    0  9000.0     0  104.600000  28.7            0     3.93   
854    57    0   300.0     0   96.900000  26.8            0     5.09   
...   ...  ...     ...   ...         ...   ...          ...      ...   
697    54    0  5400.0     0  116.800000  33.1            0    14.25   
1693   33    0  1600.0     4   93.800000  27.8            0     2.96   
1271   80    0  1700.0     0  122.700000  34.1            0     5.45   
1304   38    0  1700.0     2  116.000000  34.6            0     2.35   
852    23    0   800.0     1  126.000000  38.4            0     7.75   

      UricAcid  BloodGlucose  HDL  Triglycerides  MetabolicSyndrome  
1758       5.2           111   39            239                 

In [4]:
# Decision Tree Classifier
classifier_decision_tree = DecisionTreeClassifier(criterion='entropy')
classifier_decision_tree.fit(x_resampled, y_resampled)
y_pred = classifier_decision_tree.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	82.38% is the accuracy

Confusion matrix: 
[[360  40]
 [101 299]] 

Precision Score: 
	 0.8820058997050148 

Recall: 
	 0.7475 

F1 Score: 
	 0.8092016238159675 



In [5]:
# XGBoost Classifier
classifier_xgboost = XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.5)
classifier_xgboost.fit(x_resampled, y_resampled)
joblib.dump(classifier_xgboost, 'xgboost_classifier.pkl')
y_pred = classifier_xgboost.predict(x_test)
for i in x_test[0]:
    print(type(i))



cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
Accracy: 
	86.50% is the accuracy

Confusion matrix: 
[[365  35]
 [ 73 327]] 

Precision Score: 
	 0.9033149171270718 

Recall: 
	 0.8175 

F1 Score: 
	 0.858267716535433 



In [6]:
# Logistic Regression
classifier_logistic_regression = LogisticRegression()

classifier_logistic_regression.fit(x_resampled, y_resampled)
y_pred = classifier_logistic_regression.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	79.50% is the accuracy

Confusion matrix: 
[[301  99]
 [ 65 335]] 

Precision Score: 
	 0.771889400921659 

Recall: 
	 0.8375 

F1 Score: 
	 0.8033573141486811 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=10)
mlp.fit(x_resampled, y_resampled)
y_pred = mlp.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")
mlp_accuracy = mlp.score(x_test, y_test)

Accracy: 
	76.25% is the accuracy

Confusion matrix: 
[[341  59]
 [131 269]] 

Precision Score: 
	 0.8201219512195121 

Recall: 
	 0.6725 

F1 Score: 
	 0.7390109890109889 



In [8]:
tabnet = TabNetClassifier()
tabnet.fit(
    x_resampled, y_resampled,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)
y_pred = tabnet.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")




epoch 0  | loss: 0.74137 |  0:00:00s
epoch 1  | loss: 0.57173 |  0:00:00s
epoch 2  | loss: 0.5274  |  0:00:00s
epoch 3  | loss: 0.49877 |  0:00:00s
epoch 4  | loss: 0.46252 |  0:00:00s
epoch 5  | loss: 0.43444 |  0:00:00s
epoch 6  | loss: 0.42045 |  0:00:00s
epoch 7  | loss: 0.43382 |  0:00:00s
epoch 8  | loss: 0.41097 |  0:00:00s
epoch 9  | loss: 0.39625 |  0:00:00s
epoch 10 | loss: 0.38309 |  0:00:00s
epoch 11 | loss: 0.36921 |  0:00:00s
epoch 12 | loss: 0.36428 |  0:00:00s
epoch 13 | loss: 0.35298 |  0:00:00s
epoch 14 | loss: 0.3491  |  0:00:00s
epoch 15 | loss: 0.33174 |  0:00:00s
epoch 16 | loss: 0.32006 |  0:00:00s
epoch 17 | loss: 0.30949 |  0:00:00s
epoch 18 | loss: 0.31723 |  0:00:00s
epoch 19 | loss: 0.31482 |  0:00:00s
epoch 20 | loss: 0.29885 |  0:00:00s
epoch 21 | loss: 0.2925  |  0:00:00s
epoch 22 | loss: 0.28384 |  0:00:00s
epoch 23 | loss: 0.28008 |  0:00:00s
epoch 24 | loss: 0.27746 |  0:00:00s
epoch 25 | loss: 0.27728 |  0:00:00s
epoch 26 | loss: 0.27112 |  0:00:01s
e

In [9]:
from datetime import datetime
def evaluate_and_store_metrics(models_dict, X_test, y_test, csv_filename='model_metrics_2.csv', spacing_rows=5):
    """
    Evaluate multiple models and store their metrics in a CSV file with spacing between different runs.
    
    Parameters:
    models_dict: Dictionary of model names and their corresponding trained model objects
    X_test: Test features
    y_test: Test labels
    csv_filename: Name of the output CSV file
    spacing_rows: Number of blank rows to add before new results
    """
    # Create a list to store results
    results = []
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    version = "ROS on training data"
    
    # Evaluate each model
    for model_name, model in models_dict.items():
        y_pred = model.predict(X_test)
        
        results.append({
            'version': version,
            'timestamp': timestamp,
            'model_name': model_name,
            'accuracy': round(accuracy_score(y_test, y_pred) * 100, 2),
            'precision': round(precision_score(y_test, y_pred), 4),
            'recall': round(recall_score(y_test, y_pred), 4),
            'f1_score': round(f1_score(y_test, y_pred), 4)
        })
    
    results_df = pd.DataFrame(results)
    
    try:
        # Read existing CSV
        existing_df = pd.read_csv(csv_filename)
        
        # Create empty rows for spacing
        empty_rows = pd.DataFrame([{'timestamp': '', 'model_name': '', 'accuracy': '', 
                                  'precision': '', 'recall': '', 'f1_score': ''}] * spacing_rows)
        
        # Concatenate existing data, empty rows, and new results
        final_df = pd.concat([existing_df, empty_rows, results_df], ignore_index=True)
        
    except FileNotFoundError:
        final_df = results_df
    
    # Save to CSV without index
    final_df.to_csv(csv_filename, index=False)
    
    return results_df

def run_evaluation(dataset, test_size_each_class=400, random_state=42):
    """
    Run the complete evaluation pipeline including data splitting and model training.
    """
    # Split data
    outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
    outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]
    
    test_0 = outcome_0.sample(n=test_size_each_class, random_state=random_state)
    test_1 = outcome_1.sample(n=test_size_each_class, random_state=random_state)
    test_data = pd.concat([test_0, test_1])
    
    # Create training set
    train_data = dataset.drop(test_data.index)
    
    # Prepare features and labels
    X_train = train_data.drop('MetabolicSyndrome', axis=1).values
    y_train = train_data['MetabolicSyndrome'].values
    X_test = test_data.drop('MetabolicSyndrome', axis=1).values
    y_test = test_data['MetabolicSyndrome'].values
    
    # Resample training data only
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
    
    # Initialize and train models
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=30, criterion='entropy', random_state=0),
        'Decision Tree': DecisionTreeClassifier(criterion='entropy'),
        'XGBoost': XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.5),
        'Logistic Regression': LogisticRegression(),
        'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=10),
        'TabNet': TabNetClassifier()
    }
    
    # Train each model
    for model_name, model in models.items():
        print(f"Training {model_name}...")
        if model_name == 'TabNet':
            model.fit(
                X_resampled, y_resampled,
                max_epochs=100,
                patience=10,
                batch_size=1024,
                virtual_batch_size=128,
                num_workers=0,
                drop_last=False
            )
        else:
            model.fit(X_resampled, y_resampled)
    
    # Evaluate and store results
    results = evaluate_and_store_metrics(models, X_test, y_test)
    
    return results

In [10]:
results = run_evaluation(dataset)
print(results)

Training Random Forest...
Training Decision Tree...
Training XGBoost...
Training Logistic Regression...
Training MLP...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training TabNet...




epoch 0  | loss: 0.74137 |  0:00:00s
epoch 1  | loss: 0.57173 |  0:00:00s
epoch 2  | loss: 0.5274  |  0:00:00s
epoch 3  | loss: 0.49877 |  0:00:00s
epoch 4  | loss: 0.46252 |  0:00:00s
epoch 5  | loss: 0.43444 |  0:00:00s
epoch 6  | loss: 0.42045 |  0:00:00s
epoch 7  | loss: 0.43382 |  0:00:00s
epoch 8  | loss: 0.41097 |  0:00:00s
epoch 9  | loss: 0.39625 |  0:00:00s
epoch 10 | loss: 0.38309 |  0:00:00s
epoch 11 | loss: 0.36921 |  0:00:00s
epoch 12 | loss: 0.36428 |  0:00:00s
epoch 13 | loss: 0.35298 |  0:00:00s
epoch 14 | loss: 0.3491  |  0:00:00s
epoch 15 | loss: 0.33174 |  0:00:00s
epoch 16 | loss: 0.32006 |  0:00:00s
epoch 17 | loss: 0.30949 |  0:00:00s
epoch 18 | loss: 0.31723 |  0:00:00s
epoch 19 | loss: 0.31482 |  0:00:00s
epoch 20 | loss: 0.29885 |  0:00:00s
epoch 21 | loss: 0.2925  |  0:00:00s
epoch 22 | loss: 0.28384 |  0:00:01s
epoch 23 | loss: 0.28008 |  0:00:01s
epoch 24 | loss: 0.27746 |  0:00:01s
epoch 25 | loss: 0.27728 |  0:00:01s
epoch 26 | loss: 0.27112 |  0:00:01s
e