In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.preprocessing import MinMaxScaler, Binarizer
import time



In [2]:
x_pca = pd.read_csv('pca_transformed_data.csv')

In [3]:
x_pca

Unnamed: 0.1,Unnamed: 0,num_lab_procedures_PC1,PhysHlth_PC2,num_medications_PC3,BMI_PC4,MentHlth_PC5
0,0,25.076510,20.621222,19.834983,5.323047,8.544266
1,1,-1.701741,-5.346275,0.462210,-2.412440,-0.493331
2,2,4.708536,34.899327,-9.053155,-6.016315,7.842525
3,3,29.842298,-5.972239,-4.806152,0.166638,-0.620421
4,4,11.941984,-5.181381,-11.052760,-2.345629,1.312165
...,...,...,...,...,...,...
253675,253675,-18.235291,1.475038,-1.538528,17.145959,-2.093546
253676,253676,9.785051,-6.968723,-4.429917,-8.641597,-1.307823
253677,253677,25.210340,-5.326833,-1.239819,0.689406,-0.273325
253678,253678,28.836723,-5.592002,2.885522,-4.797216,-0.440997


In [4]:
df = pd.read_csv('cleaned_augmented_2.csv')

### PCA split

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x_pca, df['Diabetes_binary'], test_size=0.2)

#### Gaussian

In [6]:
start_time = time.time()

model_gaussian = GaussianNB()
model_gaussian.fit(x_train, y_train)

training_time_gaussian = time.time() - start_time

In [7]:
start_time = time.time()
prediction_gaussian = model_gaussian.predict(x_test)
testing_time_gaussian = time.time() - start_time

In [8]:
accuracy_gaussian = accuracy_score(y_test, prediction_gaussian)
precision_gaussian = precision_score(y_test, prediction_gaussian)
recall_gaussian = recall_score(y_test, prediction_gaussian)
f1_gaussian = f1_score(y_test, prediction_gaussian)

#### Multinomial NB

In [9]:
min_max_scaler = MinMaxScaler()
x_minmax = min_max_scaler.fit_transform(df.drop('Diabetes_binary', axis=1))

In [10]:
x_train_multinomial, x_test_multinomial, y_train_multinomial, y_test_multinomial = train_test_split(
    x_minmax, df['Diabetes_binary'], test_size=0.2)

In [11]:
start_time = time.time()

model_multinomial = MultinomialNB()
model_multinomial.fit(x_train_multinomial, y_train_multinomial)

training_time_multinomial = time.time() - start_time

In [12]:
start_time = time.time()
prediction_multinomial = model_multinomial.predict(x_test_multinomial)
testing_time_multinomial = time.time() - start_time

In [13]:
accuracy_multinomial = accuracy_score(y_test_multinomial, prediction_multinomial)
precision_multinomial = precision_score(y_test_multinomial, prediction_multinomial)
recall_multinomial = recall_score(y_test_multinomial, prediction_multinomial)
f1_multinomial = f1_score(y_test_multinomial, prediction_multinomial)

#### Bernoulli NB

In [14]:
binarizer = Binarizer()
x_binary = binarizer.fit_transform(df.drop('Diabetes_binary', axis=1))

In [15]:
x_train_bernoulli, x_test_bernoulli, y_train_bernoulli, y_test_bernoulli = train_test_split(
    x_binary, df['Diabetes_binary'], test_size=0.2)

In [16]:
start_time = time.time()

model_bernoulli = BernoulliNB()
model_bernoulli.fit(x_train_bernoulli, y_train_bernoulli)

training_time_bernoulli = time.time() - start_time

In [17]:
start_time = time.time()
prediction_bernoulli = model_bernoulli.predict(x_test_bernoulli)
testing_time_bernoulli = time.time() - start_time

In [18]:
prediction_bernoulli = model_bernoulli.predict(x_test_bernoulli)
accuracy_bernoulli = accuracy_score(y_test_bernoulli, prediction_bernoulli)
precision_bernoulli = precision_score(y_test_bernoulli, prediction_bernoulli)
recall_bernoulli = recall_score(y_test_bernoulli, prediction_bernoulli)
f1_bernoulli = f1_score(y_test_bernoulli, prediction_bernoulli)

## Results

In [19]:
results_df = pd.DataFrame({
    'Model': ['PCA + GaussianNB', 'MultinomialNB', 'BernoulliNB'],
    'Accuracy': [accuracy_gaussian, accuracy_multinomial, accuracy_bernoulli],
    'Precision': [precision_gaussian, precision_multinomial, precision_bernoulli],
    'Recall': [recall_gaussian, recall_multinomial, recall_bernoulli],
    'F1-Score': [f1_gaussian, f1_multinomial, f1_bernoulli],
    'Training Time': [training_time_gaussian, training_time_multinomial, training_time_bernoulli],
    'Testing Time': [testing_time_gaussian, testing_time_multinomial, testing_time_bernoulli]
})

In [20]:
results_df.to_csv('PCA_NaiveBayes_results.csv')

In [21]:
print(results_df)

              Model  Accuracy  Precision    Recall  F1-Score  Training Time  \
0  PCA + GaussianNB  0.835659   0.353330  0.228596  0.277595       0.045696   
1     MultinomialNB  0.843858   0.404331  0.221308  0.286049       0.049371   
2       BernoulliNB  0.826849   0.376614  0.376668  0.376641       0.207953   

   Testing Time  
0      0.007868  
1      0.007581  
2      0.041969  
