In [1]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import time
from sklearn.preprocessing import MinMaxScaler, Binarizer
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv("../../data/processed/cleaned_augmented_2.csv")
df

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,number_inpatient,number_diagnoses,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,readmitted_<30,readmitted_>30,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,9,0,0,1,0,0,0,1,0
1,0,0,0,25,1,0,0,1,0,0,...,5,4,0,0,1,0,0,0,0,0
2,1,1,1,28,0,0,0,0,1,0,...,0,4,0,0,1,0,0,0,0,0
3,1,0,1,27,0,0,0,1,1,1,...,1,9,0,0,1,0,0,0,0,0
4,1,1,1,24,0,0,0,1,1,1,...,0,4,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,45,0,0,0,0,1,1,...,0,5,0,0,1,0,0,0,0,0
253676,1,1,1,18,0,0,0,0,0,0,...,0,7,1,0,0,0,0,0,0,1
253677,0,0,1,28,0,0,0,1,1,0,...,2,5,0,0,1,0,0,0,1,0
253678,1,0,1,23,0,0,0,0,1,1,...,1,9,0,0,1,0,0,0,0,0


## Naive Bayes

### Preparing training and Test Data

In [3]:
X = df.drop('Diabetes_binary',axis=1)
y = df['Diabetes_binary']

In [4]:
scaler=StandardScaler()
x_std=scaler.fit_transform(X)
x_std

array([[ 1.15368814,  1.16525449,  0.19692156, ..., -0.12394236,
        -0.32714466,  1.4611329 ],
       [-0.86678537, -0.85818163, -5.07816412, ..., -0.12394236,
        -0.32714466, -0.68440044],
       [ 1.15368814,  1.16525449,  0.19692156, ..., -0.12394236,
        -0.32714466, -0.68440044],
       ...,
       [-0.86678537, -0.85818163,  0.19692156, ..., -0.12394236,
        -0.32714466,  1.4611329 ],
       [ 1.15368814, -0.85818163,  0.19692156, ..., -0.12394236,
        -0.32714466, -0.68440044],
       [ 1.15368814,  1.16525449,  0.19692156, ..., -0.12394236,
        -0.32714466, -0.68440044]])

In [5]:
# prepare data for multinomial NB
min_max_scaler = MinMaxScaler()
x_minmax = min_max_scaler.fit_transform(X)

# prepare data for bernoulli NB
binarizer = Binarizer()
x_binary = binarizer.fit_transform(X)

In [6]:
def train_test_model(model, x_train, x_test, y_train, y_test):
    start_time = time.time()
    model.fit(x_train, y_train)
    end_time = time.time()
    training_time = end_time - start_time
    
    start_time = time.time()
    prediction = model.predict(x_test)
    end_time = time.time()
    testing_time = end_time - start_time
    
    accuracy = accuracy_score(y_test, prediction)
    precision = precision_score(y_test, prediction)
    recall = recall_score(y_test, prediction)
    f1 = f1_score(y_test, prediction)
    
    return accuracy, precision, recall, f1, training_time, testing_time

In [7]:
results_data = {'Model': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1-Score': [], 'Training Time': [], 'Testing Time': []}

### Model Generation 

In [8]:
models = {'GaussianNB': GaussianNB(), 'MultinomialNB': MultinomialNB(), 'BernoulliNB': BernoulliNB()}

#### We need to convert data to be able to use for Multinomial since we can't have negative values and we will be scaling from 0 - 1

In [9]:
for model_name, model in models.items():
    if model_name == 'GaussianNB':
        x_data = x_std
    elif model_name == 'MultinomialNB':
        x_data = x_minmax  # Use MinMax scaled data for Multinomial
    elif model_name == 'BernoulliNB':
        x_data = x_binary  # Use Binarized data for Bernoulli
    else:
        raise ValueError("Invalid model name")
    
    x_train, x_test, y_train, y_test = train_test_split(x_data, y, test_size=0.2)
    accuracy, precision, recall, f1, training_time, testing_time = train_test_model(model, x_train, x_test, y_train, y_test)
    
    results_data['Model'].append(model_name)
    results_data['Accuracy'].append(accuracy)
    results_data['Precision'].append(precision)
    results_data['Recall'].append(recall)
    results_data['F1-Score'].append(f1)
    results_data['Training Time'].append(training_time)
    results_data['Testing Time'].append(testing_time)

## Store Results

In [10]:
results_df = pd.DataFrame(results_data)

In [11]:
#results_df.to_csv('NaiveBayes_trainingResults.csv', index=False)

In [12]:
print(results_df)

           Model  Accuracy  Precision    Recall  F1-Score  Training Time  \
0     GaussianNB  0.728240   0.300689  0.715517  0.423434       0.284005   
1  MultinomialNB  0.848076   0.423586  0.233962  0.301432       0.048501   
2    BernoulliNB  0.828899   0.385240  0.382461  0.383846       0.253318   

   Testing Time  
0      0.084349  
1      0.004497  
2      0.033086  


### Confusion Matrix

In [13]:
conf_matrix_gaussian = confusion_matrix(y_test, models['GaussianNB'].predict(x_test))
TN_gaussian, FP_gaussian, FN_gaussian, TP_gaussian = conf_matrix_gaussian.ravel()
print("Confusion Matrix for Gaussian NB:")
print(conf_matrix_gaussian)
print(f"True Negative (TN): {TN_gaussian}")
print(f"False Positive (FP): {FP_gaussian}")
print(f"False Negative (FN): {FN_gaussian}")
print(f"True Positive (TP): {TP_gaussian}")
print()

Confusion Matrix for Gaussian NB:
[[39444  4222]
 [ 4549  2521]]
True Negative (TN): 39444
False Positive (FP): 4222
False Negative (FN): 4549
True Positive (TP): 2521



In [14]:
conf_matrix_multinomial = confusion_matrix(y_test, models['MultinomialNB'].predict(x_test))
TN_multinomial, FP_multinomial, FN_multinomial, TP_multinomial = conf_matrix_multinomial.ravel()
print("Confusion Matrix for Multinomial NB:")
print(conf_matrix_multinomial)
print(f"True Negative (TN): {TN_multinomial}")
print(f"False Positive (FP): {FP_multinomial}")
print(f"False Negative (FN): {FN_multinomial}")
print(f"True Positive (TP): {TP_multinomial}")
print()

Confusion Matrix for Multinomial NB:
[[40180  3486]
 [ 4878  2192]]
True Negative (TN): 40180
False Positive (FP): 3486
False Negative (FN): 4878
True Positive (TP): 2192



In [15]:
conf_matrix_bernoulli = confusion_matrix(y_test, models['BernoulliNB'].predict(x_test))
TN_bernoulli, FP_bernoulli, FN_bernoulli, TP_bernoulli = conf_matrix_bernoulli.ravel()
print("Confusion Matrix for Bernoulli NB:")
print(conf_matrix_bernoulli)
print(f"True Negative (TN): {TN_bernoulli}")
print(f"False Positive (FP): {FP_bernoulli}")
print(f"False Negative (FN): {FN_bernoulli}")
print(f"True Positive (TP): {TP_bernoulli}")

Confusion Matrix for Bernoulli NB:
[[39351  4315]
 [ 4366  2704]]
True Negative (TN): 39351
False Positive (FP): 4315
False Negative (FN): 4366
True Positive (TP): 2704
