In [1]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import time
from sklearn.preprocessing import MinMaxScaler, Binarizer



In [2]:
df = pd.read_csv('cleaned_augmented_2.csv', index_col = 0)
df

Unnamed: 0_level_0,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,number_inpatient,number_diagnoses,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,readmitted_<30,readmitted_>30,Diabetes_binary
HighBP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,40,1,0,0,0,0,1,0,...,0,9,0,0,1,0,0,0,1,0
0,0,0,25,1,0,0,1,0,0,0,...,5,4,0,0,1,0,0,0,0,0
1,1,1,28,0,0,0,0,1,0,0,...,0,4,0,0,1,0,0,0,0,0
1,0,1,27,0,0,0,1,1,1,0,...,1,9,0,0,1,0,0,0,0,0
1,1,1,24,0,0,0,1,1,1,0,...,0,4,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,1,1,45,0,0,0,0,1,1,0,...,0,5,0,0,1,0,0,0,0,0
1,1,1,18,0,0,0,0,0,0,0,...,0,7,1,0,0,0,0,0,0,1
0,0,1,28,0,0,0,1,1,0,0,...,2,5,0,0,1,0,0,0,1,0
1,0,1,23,0,0,0,0,1,1,0,...,1,9,0,0,1,0,0,0,0,0


## Naive Bayes

### Preparing training and Test Data

In [3]:
X = df.drop('Diabetes_binary',axis=1)
y = df['Diabetes_binary']

In [4]:
scaler=StandardScaler()
x_std=scaler.fit_transform(X)
x_std

array([[ 1.16525449,  0.19692156,  1.75793567, ..., -0.12394236,
        -0.32714466,  1.4611329 ],
       [-0.85818163, -5.07816412, -0.51180614, ..., -0.12394236,
        -0.32714466, -0.68440044],
       [ 1.16525449,  0.19692156, -0.05785778, ..., -0.12394236,
        -0.32714466, -0.68440044],
       ...,
       [-0.85818163,  0.19692156, -0.05785778, ..., -0.12394236,
        -0.32714466,  1.4611329 ],
       [-0.85818163,  0.19692156, -0.81443838, ..., -0.12394236,
        -0.32714466, -0.68440044],
       [ 1.16525449,  0.19692156, -0.51180614, ..., -0.12394236,
        -0.32714466, -0.68440044]])

In [5]:
# prepare data for multinomial NB
min_max_scaler = MinMaxScaler()
x_minmax = min_max_scaler.fit_transform(X)

# prepare data for bernoulli NB
binarizer = Binarizer()
x_binary = binarizer.fit_transform(X)

In [6]:
def train_test_model(model, x_train, x_test, y_train, y_test):
    start_time = time.time()
    model.fit(x_train, y_train)
    end_time = time.time()
    training_time = end_time - start_time
    
    start_time = time.time()
    prediction = model.predict(x_test)
    end_time = time.time()
    testing_time = end_time - start_time
    
    accuracy = accuracy_score(y_test, prediction)
    precision = precision_score(y_test, prediction)
    recall = recall_score(y_test, prediction)
    f1 = f1_score(y_test, prediction)
    
    return accuracy, precision, recall, f1, training_time, testing_time

In [8]:
results_data = {'Model': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1-Score': [], 'Training Time': [], 'Testing Time': []}

### Model Generation 

In [9]:
models = {'GaussianNB': GaussianNB(), 'MultinomialNB': MultinomialNB(), 'BernoulliNB': BernoulliNB()}

#### We need to convert data to be able to use for Multinomial since we can't have negative values and we will be scaling from 0 - 1

In [10]:
for model_name, model in models.items():
    if model_name == 'GaussianNB':
        x_data = x_std
    elif model_name == 'MultinomialNB':
        x_data = x_minmax  # Use MinMax scaled data for Multinomial
    elif model_name == 'BernoulliNB':
        x_data = x_binary  # Use Binarized data for Bernoulli
    else:
        raise ValueError("Invalid model name")
    
    x_train, x_test, y_train, y_test = train_test_split(x_data, y, test_size=0.25, random_state=42)
    accuracy, precision, recall, f1, training_time, testing_time = train_test_model(model, x_train, x_test, y_train, y_test)
    
    results_data['Model'].append(model_name)
    results_data['Accuracy'].append(accuracy)
    results_data['Precision'].append(precision)
    results_data['Recall'].append(recall)
    results_data['F1-Score'].append(f1)
    results_data['Training Time'].append(training_time)
    results_data['Testing Time'].append(testing_time)

## Store Results

In [11]:
results_df = pd.DataFrame(results_data)

In [12]:
results_df.to_csv('NaiveBayes_trainingResults.csv', index=False)

In [13]:
print(results_df)

           Model  Accuracy  Precision    Recall  F1-Score  Training Time  \
0     GaussianNB  0.725544   0.292246  0.693712  0.411243       0.124495   
1  MultinomialNB  0.848124   0.401675  0.202556  0.269307       0.036250   
2    BernoulliNB  0.833081   0.371565  0.300924  0.332535       0.133116   

   Testing Time  
0      0.028882  
1      0.005132  
2      0.026311  
