### Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

### Data Collection and Processing

In [2]:
# loading the csv data to a Pandas DataFrame
heart_data = pd.read_csv('heart_dataset.csv')
heart_data

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,0.0,1.0,1.0,1.0,18.0,0.0,0.0,2.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


In [3]:
# print first 5 rows of the dataset
heart_data.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [4]:
# print last 5 rows of the dataset
heart_data.tail()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,0.0,1.0,1.0,1.0,18.0,0.0,0.0,2.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0
253679,1.0,1.0,1.0,1.0,25.0,0.0,0.0,2.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0,6.0,2.0


In [5]:
heart_data['HeartDiseaseorAttack'] = heart_data['HeartDiseaseorAttack'].astype('int64')
heart_data['HighBP'] = heart_data['HighBP'].astype('int64')
heart_data['HighChol'] = heart_data['HighChol'].astype('int64')
heart_data['CholCheck'] = heart_data['CholCheck'].astype('int64')
heart_data['BMI'] = heart_data['BMI'].astype('int64')
heart_data['Smoker'] = heart_data['Smoker'].astype('int64')
heart_data['Stroke'] = heart_data['Stroke'].astype('int64')
heart_data['Diabetes'] = heart_data['Diabetes'].astype('int64')
heart_data['PhysActivity'] = heart_data['PhysActivity'].astype('int64')
heart_data['Fruits'] = heart_data['Fruits'].astype('int64')
heart_data['Veggies'] = heart_data['Veggies'].astype('int64')
heart_data['HvyAlcoholConsump'] = heart_data['HvyAlcoholConsump'].astype('int64')
heart_data['AnyHealthcare'] = heart_data['AnyHealthcare'].astype('int64')
heart_data['NoDocbcCost'] = heart_data['NoDocbcCost'].astype('int64')
heart_data['GenHlth'] = heart_data['GenHlth'].astype('int64')
heart_data['MentHlth'] = heart_data['MentHlth'].astype('int64')
heart_data['PhysHlth'] = heart_data['PhysHlth'].astype('int64')
heart_data['DiffWalk'] = heart_data['DiffWalk'].astype('int64')
heart_data['Sex'] = heart_data['Sex'].astype('int64')
heart_data['Age'] = heart_data['Age'].astype('int64')
heart_data['Education'] = heart_data['Education'].astype('int64')
heart_data['Income'] = heart_data['Income'].astype('int64')


heart_data.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,7,6,1
2,0,1,1,1,28,0,0,0,0,1,...,1,1,5,30,30,1,0,9,4,8
3,0,1,0,1,27,0,0,0,1,1,...,1,0,2,0,0,0,0,11,3,6
4,0,1,1,1,24,0,0,0,1,1,...,1,0,2,3,0,0,0,11,5,4


In [6]:
# number of rows and columns in the dataset
heart_data.shape

(253680, 22)

In [7]:
# showing all the column headers
heart_data.columns

Index(['HeartDiseaseorAttack', 'HighBP', 'HighChol', 'CholCheck', 'BMI',
       'Smoker', 'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [8]:
# getting some info about the data
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   HeartDiseaseorAttack  253680 non-null  int64
 1   HighBP                253680 non-null  int64
 2   HighChol              253680 non-null  int64
 3   CholCheck             253680 non-null  int64
 4   BMI                   253680 non-null  int64
 5   Smoker                253680 non-null  int64
 6   Stroke                253680 non-null  int64
 7   Diabetes              253680 non-null  int64
 8   PhysActivity          253680 non-null  int64
 9   Fruits                253680 non-null  int64
 10  Veggies               253680 non-null  int64
 11  HvyAlcoholConsump     253680 non-null  int64
 12  AnyHealthcare         253680 non-null  int64
 13  NoDocbcCost           253680 non-null  int64
 14  GenHlth               253680 non-null  int64
 15  MentHlth              253680 non-n

In [9]:
# statistical measures about the data
heart_data.describe()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,...,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.094186,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.296921,0.756544,0.634256,...,0.951053,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875
std,0.292087,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.69816,0.429169,0.481639,...,0.215759,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,0.0,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,1.0,1.0,1.0,1.0,98.0,1.0,1.0,2.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


In [10]:
# checking for missing values
heart_data.isnull().any()

HeartDiseaseorAttack    False
HighBP                  False
HighChol                False
CholCheck               False
BMI                     False
Smoker                  False
Stroke                  False
Diabetes                False
PhysActivity            False
Fruits                  False
Veggies                 False
HvyAlcoholConsump       False
AnyHealthcare           False
NoDocbcCost             False
GenHlth                 False
MentHlth                False
PhysHlth                False
DiffWalk                False
Sex                     False
Age                     False
Education               False
Income                  False
dtype: bool

In [11]:
heart_data['Age'] = heart_data['Age']*4
heart_data

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,36,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,28,6,1
2,0,1,1,1,28,0,0,0,0,1,...,1,1,5,30,30,1,0,36,4,8
3,0,1,0,1,27,0,0,0,1,1,...,1,0,2,0,0,0,0,44,3,6
4,0,1,1,1,24,0,0,0,1,1,...,1,0,2,3,0,0,0,44,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0,1,1,1,45,0,0,0,0,1,...,1,0,3,0,5,0,1,20,6,7
253676,0,1,1,1,18,0,0,2,0,0,...,1,0,4,0,0,1,0,44,2,4
253677,0,0,0,1,28,0,0,0,1,1,...,1,0,1,0,0,0,0,8,5,2
253678,0,1,0,1,23,0,0,0,0,1,...,1,0,3,0,0,0,1,28,5,1


In [12]:
# checking the distribution of HeartDiseaseorAttack Variable
heart_data['HeartDiseaseorAttack'].value_counts()

0    229787
1     23893
Name: HeartDiseaseorAttack, dtype: int64

In [13]:
# Maximum age present in the dataset
heart_data['Age'].max()

52

In [14]:
# Minimum age present in the dataset
heart_data['Age'].min()

4

### Splitting data

In [15]:
x = heart_data.drop(['HeartDiseaseorAttack', 'Education', 'Income'], axis=1)
y = heart_data['HeartDiseaseorAttack']

In [16]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)
y_train.value_counts()

0    160830
1     16746
Name: HeartDiseaseorAttack, dtype: int64

In [17]:
sampler=SMOTE()
x_train,y_train=sampler.fit_resample(x_train,y_train)
y_train.value_counts()

0    160830
1    160830
Name: HeartDiseaseorAttack, dtype: int64

### Scaling 

In [18]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)
x_train

array([[ 0.94283603,  1.00563673,  0.17255908, ..., -0.52316271,
        -0.88588309, -0.68182713],
       [ 0.94283603, -0.99439487,  0.17255908, ...,  1.91145124,
        -0.88588309,  0.69445082],
       [-1.06062981,  1.00563673,  0.17255908, ...,  1.91145124,
        -0.88588309,  0.69445082],
       ...,
       [ 0.94283603,  1.00563673,  0.17255908, ..., -0.52316271,
         1.12881712,  0.35038133],
       [ 0.94283603,  1.00563673,  0.17255908, ..., -0.52316271,
         1.12881712, -0.33775764],
       [ 0.94283603, -0.99439487,  0.17255908, ...,  1.91145124,
        -0.88588309, -1.28394874]])

### Model Training and Evaluation

### Logisitic Regression

In [19]:
model = LogisticRegression()

In [20]:
# training the LogisticRegression model with Training data
model.fit(x_train, y_train)

In [21]:
# accuracy on training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train,x_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.7514176459615743


In [22]:
# accuracy on testing data
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test,x_test_prediction,)
print('Accuracy on Testing data : ', test_data_accuracy)

Accuracy on Testing data :  0.7213549879112793


### XGB Classifier

In [23]:
model1 = XGBClassifier()
model1.fit(x_train, y_train)

In [24]:
# accuracy on training data
x_train_prediction = model1.predict(x_train)
training_data_accuracy = accuracy_score(y_train,x_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.7898091152148231


In [25]:
# accuracy on testing data
x_test_prediction = model1.predict(x_test)
test_data_accuracy = accuracy_score(y_test,x_test_prediction,)
print('Accuracy on Testing data : ', test_data_accuracy)

Accuracy on Testing data :  0.7297776726584674


### KNeighbors Classifier

In [26]:
model2 = KNeighborsClassifier()
model2.fit(x_train, y_train)

In [27]:
# accuracy on training data
x_train_prediction = model2.predict(x_train)
training_data_accuracy = accuracy_score(y_train,x_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.8566996207175278


In [28]:
# accuracy on testing data
x_test_prediction = model2.predict(x_test)
test_data_accuracy = accuracy_score(y_test,x_test_prediction,)
print('Accuracy on Testing data : ', test_data_accuracy)

Accuracy on Testing data :  0.7510511931041732


### Random Forest Classifier

In [29]:
model3 = RandomForestClassifier()
model3.fit(x_train, y_train)

In [30]:
# accuracy on training data
x_train_prediction = model3.predict(x_train)
training_data_accuracy = accuracy_score(y_train,x_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9700429024435739


In [31]:
# accuracy on testing data
x_test_prediction = model3.predict(x_test)
test_data_accuracy = accuracy_score(y_test,x_test_prediction,)
print('Accuracy on Testing data : ', test_data_accuracy)

Accuracy on Testing data :  0.8097734678860506


### Decision Tree Classifier

In [32]:
model4 = DecisionTreeClassifier()
model4.fit(x_train, y_train)

In [33]:
# accuracy on training data
x_train_prediction = model4.predict(x_train)
training_data_accuracy = accuracy_score(y_train,x_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9700460113162968


In [34]:
# accuracy on testing data
x_test_prediction = model4.predict(x_test)
test_data_accuracy = accuracy_score(y_test,x_test_prediction,)
print('Accuracy on Testing data : ', test_data_accuracy)

Accuracy on Testing data :  0.7836513192473458


### GaussianNB

In [35]:
model5 = GaussianNB()
model5.fit(x_train, y_train)

In [36]:
# accuracy on training data
x_train_prediction = model5.predict(x_train)
training_data_accuracy = accuracy_score(y_train,x_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.7236616302928558


In [37]:
# accuracy on testing data
x_test_prediction = model5.predict(x_test)
test_data_accuracy = accuracy_score(y_test,x_test_prediction,)
print('Accuracy on Testingp data : ', test_data_accuracy)

Accuracy on Testingp data :  0.6725796278776411


### Building a Predictive System

In [41]:
#input_data = (1,1,1,40,1,0,0,0,0,1,0,1,0,5,18,15,1,0,9) # => 0
#input_data = (1,1,1,30,1,0,2,0,1,1,0,1,0,5,30,30,1,0,9) # => 1

#input_data = (0,0,0,23,0,0,0,0,0,1,0,1,0,2,15,0,0,0,2) # => 0


# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model3.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

NameError: name 'input_data' is not defined