In [33]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import RandomOverSampler 
from imblearn.combine import SMOTETomek, SMOTEENN
import joblib

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')
print(dataset.shape)
print(len(dataset[dataset['MetabolicSyndrome'] == 0]))
print(dataset.head())

(2401, 13)
1579
   Age     Sex  Income   Race  WaistCirc   BMI  Albuminuria  UrAlbCr  \
0   22    Male  8200.0  White       81.0  23.3            0     3.88   
1   44  Female  4500.0  White       80.1  23.2            0     8.55   
2   21    Male   800.0  Asian       69.6  20.1            0     5.07   
3   43  Female  2000.0  Black      120.4  33.3            0     5.22   
4   51    Male     NaN  Asian       81.1  20.1            0     8.13   

   UricAcid  BloodGlucose  HDL  Triglycerides  MetabolicSyndrome  
0       4.9            92   41             84                  0  
1       4.5            82   28             56                  0  
2       5.4           107   43             78                  0  
3       5.0           104   73            141                  0  
4       5.0            95   43            126                  0  


In [34]:
from sklearn.preprocessing import MinMaxScaler

# Defining the columns that need scaling
columns_to_scale = ['Age', 'Income', 'WaistCirc', 'BMI', 'Albuminuria', 
                    'UrAlbCr', 'UricAcid', 'BloodGlucose', 'HDL', 'Triglycerides']

# Creating a MinMaxScaler instance
scaler = MinMaxScaler()

# Scaling the selected columns
dataset[columns_to_scale] = scaler.fit_transform(dataset[columns_to_scale])

# Check the transformed dataset
print(dataset.head())


        Age     Sex    Income   Race  WaistCirc       BMI  Albuminuria  \
0  0.033333    Male  0.908046  White   0.207012  0.179024          0.0   
1  0.400000  Female  0.482759  White   0.199499  0.177215          0.0   
2  0.016667    Male  0.057471  Asian   0.111853  0.121157          0.0   
3  0.383333  Female  0.195402  Black   0.535893  0.359855          0.0   
4  0.516667    Male       NaN  Asian   0.207846  0.121157          0.0   

    UrAlbCr  UricAcid  BloodGlucose       HDL  Triglycerides  \
0  0.000418  0.326316      0.154519  0.190141       0.037760   
1  0.001206  0.284211      0.125364  0.098592       0.019531   
2  0.000619  0.378947      0.198251  0.204225       0.033854   
3  0.000645  0.336842      0.189504  0.415493       0.074870   
4  0.001136  0.336842      0.163265  0.204225       0.065104   

   MetabolicSyndrome  
0                  0  
1                  0  
2                  0  
3                  0  
4                  0  


In [35]:
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

# This is the incorrect implementation
'''
dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)
'''
# Fill NaN values in column with index 2
dataset.iloc[:, 2] = dataset.iloc[:, 2].fillna(dataset.iloc[:, 2].mean())

# Fill NaN values in column with index 4
dataset.iloc[:, 4] = dataset.iloc[:, 4].fillna(dataset.iloc[:, 4].mean())

# Fill NaN values in column with index 5
dataset.iloc[:, 5] = dataset.iloc[:, 5].fillna(dataset.iloc[:, 5].mean())
print(dataset.head())

        Age  Sex    Income  Race  WaistCirc       BMI  Albuminuria   UrAlbCr  \
0  0.033333    0  0.908046     0   0.207012  0.179024          0.0  0.000418   
1  0.400000    1  0.482759     0   0.199499  0.177215          0.0  0.001206   
2  0.016667    0  0.057471     1   0.111853  0.121157          0.0  0.000619   
3  0.383333    1  0.195402     2   0.535893  0.359855          0.0  0.000645   
4  0.516667    0  0.425891     1   0.207846  0.121157          0.0  0.001136   

   UricAcid  BloodGlucose       HDL  Triglycerides  MetabolicSyndrome  
0  0.326316      0.154519  0.190141       0.037760                  0  
1  0.284211      0.125364  0.098592       0.019531                  0  
2  0.378947      0.198251  0.204225       0.033854                  0  
3  0.336842      0.189504  0.415493       0.074870                  0  
4  0.336842      0.163265  0.204225       0.065104                  0  


  dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
  dataset['Race'] = dataset['Race'].replace(race_mapping)


In [36]:
outcome_0 = dataset[dataset['MetabolicSyndrome'] == 0]
outcome_1 = dataset[dataset['MetabolicSyndrome'] == 1]


test_size_each_class = 400
test_0 = outcome_0.sample(n=test_size_each_class, random_state=42)
test_1 = outcome_1.sample(n=test_size_each_class, random_state=42)
#print(test_1)
test_data = pd.concat([test_0, test_1])

# Remove the test set rows from the original dataset to create the training set
train_data = dataset.drop(test_data.index)


x_train = train_data.drop('MetabolicSyndrome', axis=1).values
y_train = train_data['MetabolicSyndrome'].values
x_test = test_data.drop('MetabolicSyndrome', axis=1).values
y_test = test_data['MetabolicSyndrome'].values

In [37]:
classifier_xgboost = XGBClassifier(n_estimators=100, learning_rate=0.3, max_depth=3, random_state=42)
classifier_xgboost.fit(x_train, y_train)
y_pred_xgboost = classifier_xgboost.predict(x_test)

In [38]:
accuracy = accuracy_score(y_test, y_pred_xgboost)
f1 = f1_score(y_test, y_pred_xgboost)
recall = recall_score(y_test, y_pred_xgboost)
precision = precision_score(y_test, y_pred_xgboost)
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(confusion_matrix(y_test, y_pred_xgboost))

Accuracy: 0.8625
F1 Score: 0.8497267759562842
Recall: 0.7775
Precision: 0.9367469879518072
[[379  21]
 [ 89 311]]
