In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')

In [2]:
# Encoding the categorical variables and filling in the missing values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)

dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)


In [3]:
# Seperating the database into dependent and independent variables
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Resampling the data to avoid overfitting
# ros = RandomOverSampler(random_state=0)

# Resampling the data
#x_resampled, y_resampled = ros.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Making the Random Classifier model
classifier = RandomForestClassifier(n_estimators=30, criterion='entropy', random_state=0)

# Training the model
classifier.fit(x_train, y_train)

# Predicting the results 
y_pred = classifier.predict(x_test)

# Getting accuracy results and confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")


Accracy: 
	86.69% is the accuracy

Confusion matrix: 
[[422  38]
 [ 58 203]] 

Precision Score: 
	 0.8423236514522822 

Recall: 
	 0.7777777777777778 

F1 Score: 
	 0.8087649402390438 



In [4]:
# Decision Tree Classifier
classifier_decision_tree = DecisionTreeClassifier(criterion='entropy')
classifier_decision_tree.fit(x_train, y_train)
y_pred = classifier_decision_tree.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	83.77% is the accuracy

Confusion matrix: 
[[407  53]
 [ 64 197]] 

Precision Score: 
	 0.788 

Recall: 
	 0.7547892720306514 

F1 Score: 
	 0.7710371819960863 



In [5]:
# XGBoost Classifier
classifier_xgboost = XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.5)
classifier_xgboost.fit(x_train, y_train)
y_pred = classifier_xgboost.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	87.24% is the accuracy

Confusion matrix: 
[[419  41]
 [ 51 210]] 

Precision Score: 
	 0.8366533864541833 

Recall: 
	 0.8045977011494253 

F1 Score: 
	 0.8203125 



In [6]:
# Logistic Regression
classifier_logistic_regression = LogisticRegression()

classifier_logistic_regression.fit(x_train, y_train)
y_pred = classifier_logistic_regression.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")

Accracy: 
	80.86% is the accuracy

Confusion matrix: 
[[404  56]
 [ 82 179]] 

Precision Score: 
	 0.7617021276595745 

Recall: 
	 0.685823754789272 

F1 Score: 
	 0.721774193548387 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=10)
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")
mlp_accuracy = mlp.score(x_test, y_test)

Accracy: 
	54.51% is the accuracy

Confusion matrix: 
[[163 297]
 [ 31 230]] 

Precision Score: 
	 0.4364326375711575 

Recall: 
	 0.8812260536398467 

F1 Score: 
	 0.5837563451776651 



In [8]:
tabnet = TabNetClassifier()
tabnet.fit(
    x_train, y_train,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)
y_pred = tabnet.predict(x_test)
print("Accracy: ")
print(f'\t{(accuracy_score(y_test, y_pred) * 100):.2f}% is the accuracy\n')
print("Confusion matrix: ")
print(cm, "\n")
print("Precision Score: ")
print("\t",precision_score(y_test, y_pred), "\n")
print("Recall: ")
print("\t", recall_score(y_test, y_pred), "\n")
print("F1 Score: ")
print("\t", f1_score(y_test, y_pred), "\n")
print(cm)




epoch 0  | loss: 0.75833 |  0:00:00s
epoch 1  | loss: 0.61693 |  0:00:00s
epoch 2  | loss: 0.55946 |  0:00:00s
epoch 3  | loss: 0.51014 |  0:00:00s
epoch 4  | loss: 0.47495 |  0:00:00s
epoch 5  | loss: 0.46685 |  0:00:00s
epoch 6  | loss: 0.43598 |  0:00:00s
epoch 7  | loss: 0.4336  |  0:00:00s
epoch 8  | loss: 0.42073 |  0:00:00s
epoch 9  | loss: 0.41756 |  0:00:00s
epoch 10 | loss: 0.39311 |  0:00:00s
epoch 11 | loss: 0.37762 |  0:00:00s
epoch 12 | loss: 0.3821  |  0:00:00s
epoch 13 | loss: 0.35578 |  0:00:00s
epoch 14 | loss: 0.35903 |  0:00:00s
epoch 15 | loss: 0.35517 |  0:00:00s
epoch 16 | loss: 0.33663 |  0:00:00s
epoch 17 | loss: 0.32486 |  0:00:00s
epoch 18 | loss: 0.30823 |  0:00:00s
epoch 19 | loss: 0.31806 |  0:00:00s
epoch 20 | loss: 0.31956 |  0:00:00s
epoch 21 | loss: 0.31486 |  0:00:00s
epoch 22 | loss: 0.3044  |  0:00:00s
epoch 23 | loss: 0.31591 |  0:00:00s
epoch 24 | loss: 0.3062  |  0:00:00s
epoch 25 | loss: 0.28708 |  0:00:00s
epoch 26 | loss: 0.28595 |  0:00:00s
e