In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')

In [2]:
# Encoding the categorical variables and filling in the missing values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)


In [3]:
# Seperating the database into dependent and independent variables
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values.reshape(-1, 1)

# Resampling the data to avoid overfitting
ros = RandomOverSampler(random_state=0)

# Resampling the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
x_train, y_train = ros.fit_resample(x_train, y_train)

# Making the Random Classifier model
classifier = RandomForestClassifier(n_estimators=30, criterion='entropy', random_state=0)

# Training the model
classifier.fit(x_train, y_train)

# Predicting the results 
y_pred = classifier.predict(x_test)

# Getting accuracy results and confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")


Accracy: 
	86.82% is the accuracy

Confusion matrix: 
[[406  54]
 [ 41 220]] 

Precision Score: 
	 0.8029197080291971 

Recall: 
	 0.842911877394636 

F1 Score: 
	 0.8224299065420559 



In [4]:
# Decision Tree Classifier
classifier_decision_tree = DecisionTreeClassifier(criterion='entropy')
classifier_decision_tree.fit(x_train, y_train)
y_pred = classifier_decision_tree.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	84.19% is the accuracy

Confusion matrix: 
[[403  57]
 [ 57 204]] 

Precision Score: 
	 0.7816091954022989 

Recall: 
	 0.7816091954022989 

F1 Score: 
	 0.781609195402299 



In [5]:
# XGBoost Classifier
classifier_xgboost = XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.5)
classifier_xgboost.fit(x_train, y_train)
y_pred = classifier_xgboost.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	88.63% is the accuracy

Confusion matrix: 
[[422  38]
 [ 44 217]] 

Precision Score: 
	 0.8509803921568627 

Recall: 
	 0.8314176245210728 

F1 Score: 
	 0.8410852713178295 



In [6]:
# Logistic Regression
classifier_logistic_regression = LogisticRegression()

classifier_logistic_regression.fit(x_train, y_train)
y_pred = classifier_logistic_regression.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	77.67% is the accuracy

Confusion matrix: 
[[356 104]
 [ 57 204]] 

Precision Score: 
	 0.6623376623376623 

Recall: 
	 0.7816091954022989 

F1 Score: 
	 0.7170474516695957 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=10)
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")
mlp_accuracy = mlp.score(x_test, y_test)

Accracy: 
	76.84% is the accuracy

Confusion matrix: 
[[344 116]
 [ 51 210]] 

Precision Score: 
	 0.6441717791411042 

Recall: 
	 0.8045977011494253 

F1 Score: 
	 0.7155025553662692 



In [8]:
tabnet = TabNetClassifier()
tabnet.fit(
    x_train, y_train,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)
y_pred = tabnet.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")



epoch 0  | loss: 0.76056 |  0:00:00s
epoch 1  | loss: 0.5807  |  0:00:00s
epoch 2  | loss: 0.54618 |  0:00:00s
epoch 3  | loss: 0.50926 |  0:00:00s
epoch 4  | loss: 0.46863 |  0:00:00s
epoch 5  | loss: 0.46577 |  0:00:00s
epoch 6  | loss: 0.43012 |  0:00:00s
epoch 7  | loss: 0.42085 |  0:00:00s
epoch 8  | loss: 0.41205 |  0:00:00s
epoch 9  | loss: 0.40571 |  0:00:00s
epoch 10 | loss: 0.39258 |  0:00:00s
epoch 11 | loss: 0.39747 |  0:00:00s
epoch 12 | loss: 0.38264 |  0:00:00s
epoch 13 | loss: 0.37031 |  0:00:00s
epoch 14 | loss: 0.35485 |  0:00:00s
epoch 15 | loss: 0.34877 |  0:00:00s
epoch 16 | loss: 0.34168 |  0:00:00s
epoch 17 | loss: 0.3486  |  0:00:00s
epoch 18 | loss: 0.34229 |  0:00:00s
epoch 19 | loss: 0.35217 |  0:00:00s
epoch 20 | loss: 0.32786 |  0:00:00s
epoch 21 | loss: 0.33159 |  0:00:00s
epoch 22 | loss: 0.31159 |  0:00:00s
epoch 23 | loss: 0.332   |  0:00:00s
epoch 24 | loss: 0.30819 |  0:00:00s
epoch 25 | loss: 0.31229 |  0:00:00s
epoch 26 | loss: 0.30832 |  0:00:00s
e