In [9]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')
print(dataset.shape)
print(len(dataset[dataset['MetabolicSyndrome'] == 0]))

(2401, 13)
1579


In [10]:
# Encoding the categorical variables and filling in the missing values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)
dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)


In [11]:
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]

test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
test_data = pd.concat([test_0, test_1])

# Remove the test set rows from the original dataset to create the training set
train_data = dataset.drop(test_data.index)


x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values

# Resampling the data to avoid overfitting
ros = RandomOverSampler(random_state=0)

# Resampling the data
X_resampled, y_resampled = ros.fit_resample(x_train, y_train)

# Making the Random Classifier model
classifier = RandomForestClassifier(n_estimators=30, criterion='entropy', random_state=0)

# Training the model
classifier.fit(x_train, y_train)

# Predicting the results 
y_pred = classifier.predict(x_test)

# Getting accuracy results and confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")


Accracy: 
	84.38% is the accuracy

Confusion matrix: 
[[376  24]
 [101 299]] 

Precision Score: 
	 0.9256965944272446 

Recall: 
	 0.7475 

F1 Score: 
	 0.8271092669432919 



In [12]:
# Decision Tree Classifier
classifier_decision_tree = DecisionTreeClassifier(criterion='entropy')
classifier_decision_tree.fit(x_train, y_train)
y_pred = classifier_decision_tree.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	83.50% is the accuracy

Confusion matrix: 
[[362  38]
 [ 94 306]] 

Precision Score: 
	 0.8895348837209303 

Recall: 
	 0.765 

F1 Score: 
	 0.8225806451612905 



In [13]:
# XGBoost Classifier
classifier_xgboost = XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.5)
classifier_xgboost.fit(x_train, y_train)
y_pred = classifier_xgboost.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	86.50% is the accuracy

Confusion matrix: 
[[376  24]
 [ 84 316]] 

Precision Score: 
	 0.9294117647058824 

Recall: 
	 0.79 

F1 Score: 
	 0.8540540540540541 



In [14]:
# Logistic Regression
classifier_logistic_regression = LogisticRegression()

classifier_logistic_regression.fit(x_train, y_train)
y_pred = classifier_logistic_regression.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	75.00% is the accuracy

Confusion matrix: 
[[365  35]
 [165 235]] 

Precision Score: 
	 0.8703703703703703 

Recall: 
	 0.5875 

F1 Score: 
	 0.7014925373134329 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=10)
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")
mlp_accuracy = mlp.score(x_test, y_test)

Accracy: 
	75.50% is the accuracy

Confusion matrix: 
[[318  82]
 [114 286]] 

Precision Score: 
	 0.7771739130434783 

Recall: 
	 0.715 

F1 Score: 
	 0.7447916666666666 



In [17]:
tabnet = TabNetClassifier()
tabnet.fit(
    x_train, y_train,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)
y_pred = tabnet.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")




epoch 0  | loss: 0.73498 |  0:00:00s
epoch 1  | loss: 0.59064 |  0:00:00s
epoch 2  | loss: 0.50601 |  0:00:00s
epoch 3  | loss: 0.47053 |  0:00:00s
epoch 4  | loss: 0.44867 |  0:00:00s
epoch 5  | loss: 0.44047 |  0:00:00s
epoch 6  | loss: 0.40343 |  0:00:00s
epoch 7  | loss: 0.39061 |  0:00:00s
epoch 8  | loss: 0.40504 |  0:00:00s
epoch 9  | loss: 0.37485 |  0:00:00s
epoch 10 | loss: 0.36201 |  0:00:00s
epoch 11 | loss: 0.35828 |  0:00:00s
epoch 12 | loss: 0.34033 |  0:00:00s
epoch 13 | loss: 0.35459 |  0:00:00s
epoch 14 | loss: 0.34369 |  0:00:00s
epoch 15 | loss: 0.34217 |  0:00:00s
epoch 16 | loss: 0.3246  |  0:00:00s
epoch 17 | loss: 0.32525 |  0:00:00s
epoch 18 | loss: 0.32714 |  0:00:00s
epoch 19 | loss: 0.30914 |  0:00:00s
epoch 20 | loss: 0.31564 |  0:00:00s
epoch 21 | loss: 0.31014 |  0:00:00s
epoch 22 | loss: 0.30715 |  0:00:00s
epoch 23 | loss: 0.28128 |  0:00:00s
epoch 24 | loss: 0.29488 |  0:00:00s
epoch 25 | loss: 0.28843 |  0:00:00s
epoch 26 | loss: 0.2901  |  0:00:00s
e