In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


In [2]:
from urllib.parse import quote_plus

import psycopg2
import sqlalchemy
from sqlalchemy import create_engine

from site_key import pw

In [3]:
# Create the SQLAlchemy engine and open a connection to the local Postgres
# `heart_failure` database.
# The password is percent-encoded before it is placed in the URL so that
# reserved characters in it (e.g. '@', ':', '/') cannot be mistaken for URL
# delimiters when the connection string is parsed.
engine = create_engine(
    f'postgresql://postgres:{quote_plus(pw)}@localhost:5432/heart_failure'
)
# Connection object used by the cell that reads the table into pandas.
connection = engine.connect()

In [4]:
#read tables from heart_failure DB
heart = pd.read_sql('select * from heart_failure', connection)
heart.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,OldPeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


### Replace binary values with numbers

In [5]:
# Encode the two binary categorical columns as integers.
# NOTE: Series.map yields NaN for any value missing from the mapping, so this
# cell should be run only once per fresh load of `heart`.
binary_encodings = {
    'Sex': {'M': 0, 'F': 1},
    'ExerciseAngina': {'N': 0, 'Y': 1},
}
for column, codes in binary_encodings.items():
    heart[column] = heart[column].map(codes)

In [6]:
heart.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,OldPeak,ST_Slope,HeartDisease
0,40,0,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,1,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,0,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,1,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,0,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [7]:
# One-hot encode the multi-level categorical columns, one indicator column per
# category value, named `<prefix>_<value>`.
# The dtype is pinned to uint8 because pandas >= 2.0 returns boolean indicator
# columns by default; pinning it keeps the 0/1 integer columns shown in the
# outputs and avoids boolean arithmetic in the min-max normalization step.
ChestPainType = pd.get_dummies(heart['ChestPainType'], prefix='ChestPainType', dtype=np.uint8)
ST_Slope = pd.get_dummies(heart['ST_Slope'], prefix='ST_Slope', dtype=np.uint8)
RestingECG = pd.get_dummies(heart['RestingECG'], prefix='RestingECG', dtype=np.uint8)


In [8]:
# Append the indicator columns to the right of the original frame
# (rows are aligned on the shared index).
dfs = [heart, ChestPainType, ST_Slope, RestingECG]
heart = pd.concat(objs=dfs, axis='columns')

In [9]:
heart.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,OldPeak,...,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
0,40,0,ATA,140,289,0,Normal,172,0,0.0,...,0,1,0,0,0,0,1,0,1,0
1,49,1,NAP,160,180,0,Normal,156,0,1.0,...,0,0,1,0,0,1,0,0,1,0
2,37,0,ATA,130,283,0,ST,98,0,0.0,...,0,1,0,0,0,0,1,0,0,1
3,48,1,ASY,138,214,0,Normal,108,1,1.5,...,1,0,0,0,0,1,0,0,1,0
4,54,0,NAP,150,195,0,Normal,122,0,0.0,...,0,0,1,0,0,0,1,0,1,0


In [10]:
# The raw categorical columns are now represented by their indicator columns,
# so remove the originals.
encoded_sources = ['ChestPainType', 'ST_Slope', 'RestingECG']
heart = heart.drop(encoded_sources, axis='columns')

In [11]:
heart.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,OldPeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
0,40,0,140,289,0,172,0,0.0,0,0,1,0,0,0,0,1,0,1,0
1,49,1,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,0,1,0
2,37,0,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,0,1
3,48,1,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,0,1,0
4,54,0,150,195,0,122,0,0.0,0,0,0,1,0,0,0,1,0,1,0


In [12]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   Sex                918 non-null    int64  
 2   RestingBP          918 non-null    int64  
 3   Cholesterol        918 non-null    int64  
 4   FastingBS          918 non-null    int64  
 5   MaxHR              918 non-null    int64  
 6   ExerciseAngina     918 non-null    int64  
 7   OldPeak            918 non-null    float64
 8   HeartDisease       918 non-null    int64  
 9   ChestPainType_ASY  918 non-null    uint8  
 10  ChestPainType_ATA  918 non-null    uint8  
 11  ChestPainType_NAP  918 non-null    uint8  
 12  ChestPainType_TA   918 non-null    uint8  
 13  ST_Slope_Down      918 non-null    uint8  
 14  ST_Slope_Flat      918 non-null    uint8  
 15  ST_Slope_Up        918 non-null    uint8  
 16  RestingECG_LVH     918 non

### Separating variables

In [13]:
# The HeartDisease column is the prediction target; every remaining column is
# used as a feature.
target_column = 'HeartDisease'
y = heart[target_column]
X = heart.drop(columns=[target_column])

### Normalizing the data

In [14]:
# Min-max scale each feature column to the [0, 1] range.
# The per-column extrema are taken with DataFrame.min()/max() instead of
# np.min(X)/np.max(X): on pandas >= 2.0 the NumPy calls reduce over the whole
# frame and return one scalar, which would scale every column by the global
# range rather than by its own range.
# NOTE(review): the extrema are computed on the full data set before the
# train/test split, so test rows influence the scaling; consider deriving
# them from the training split only.
col_min = X.min()
col_max = X.max()
X = (X - col_min) / (col_max - col_min)

### Splitting data for training and testing

In [15]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=55
)

# Report the shape of each partition.
for label, part in (
    ('x_train shape', x_train),
    ('y_train shape', y_train),
    ('x_test shape ', x_test),
    ('y_test shape ', y_test),
):
    print(f'{label}: {part.shape}')

x_train shape: (688, 18)
y_train shape: (688,)
x_test shape : (230, 18)
y_test shape : (230,)


### Implementing the GNB Model

In [16]:
# Fit a Gaussian Naive Bayes classifier on the training split, then predict
# labels for the held-out test rows.
nb = GaussianNB(var_smoothing=0.05).fit(x_train, y_train)
y_pred_GNB = nb.predict(x_test)


In [17]:
# Share of test rows whose predicted label equals the true label.
# accuracy_score is declared as (y_true, y_pred), so the ground truth is passed
# first; the value is the same either way, but the documented order keeps this
# call consistent with the other sklearn metrics.
accuracy = accuracy_score(y_test, y_pred_GNB)
print(f'Gaussian Naive Bayes Accuracy: {accuracy * 100:.2f}%')

Gaussian Naive Bayes Accuracy: 88.26%
