In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
df = pd.read_csv('heart_disease.csv')

In [7]:
# https://www.kaggle.com/code/bayunova/diabetes-health-indicators/input
# 0 = female 1 = male
# 0 = no high BP 1 = high BP
# 0 = no diabetes 1 = prediabetes 2 = diabetes
# physical activity in past 30 days - not including job 0 = no 1 = yes
# Consume Fruit 1 or more times per day 0 = no 1 = yes
# Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week)
# Have any kind of health care coverage, including health insurance, prepaid plans such as HMO, etc. 0 = no 1 = yes
# Was there a time in the past 12 months when you needed to see a doctor but could not because of cost?
# Would you say that in general your health is: scale 1-5 1 = excellent 2 = very good 3 = good 4 = fair 5 = poor
# Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 
# Do you have serious difficulty walking or climbing stairs? 0 = no 1 = yes

In [8]:
df.head(2)

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,7,6,1


In [9]:
df.shape

(253680, 22)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   HeartDiseaseorAttack  253680 non-null  int64
 1   HighBP                253680 non-null  int64
 2   HighChol              253680 non-null  int64
 3   CholCheck             253680 non-null  int64
 4   BMI                   253680 non-null  int64
 5   Smoker                253680 non-null  int64
 6   Stroke                253680 non-null  int64
 7   Diabetes              253680 non-null  int64
 8   PhysActivity          253680 non-null  int64
 9   Fruits                253680 non-null  int64
 10  Veggies               253680 non-null  int64
 11  HvyAlcoholConsump     253680 non-null  int64
 12  AnyHealthcare         253680 non-null  int64
 13  NoDocbcCost           253680 non-null  int64
 14  GenHlth               253680 non-null  int64
 15  MentHlth              253680 non-n

In [11]:
df.describe()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,...,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.094186,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.296921,0.756544,0.634256,...,0.951053,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875
std,0.292087,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.69816,0.429169,0.481639,...,0.215759,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,0.0,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,1.0,1.0,1.0,1.0,98.0,1.0,1.0,2.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


In [12]:
df.isnull().sum()

HeartDiseaseorAttack    0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
Diabetes                0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

In [13]:
# 1 --> Defective Heart
# 0 --> Healthy Heart
df['HeartDiseaseorAttack'].value_counts()

HeartDiseaseorAttack
0    229787
1     23893
Name: count, dtype: int64

# Splitting the Features and Target

In [14]:
x = df.drop(columns='HeartDiseaseorAttack', axis=1)
y = df['HeartDiseaseorAttack']

In [15]:
x.head(2)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1,1,1,40,1,0,0,0,0,1,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,25,1,0,0,1,0,0,...,0,1,3,0,0,0,0,7,6,1


In [16]:
y.head(2)

0    0
1    0
Name: HeartDiseaseorAttack, dtype: int64

In [17]:
x = x.drop(['Income'], axis=1)

In [18]:
x.head(2)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education
0,1,1,1,40,1,0,0,0,0,1,0,1,0,5,18,15,1,0,9,4
1,0,0,0,25,1,0,0,1,0,0,0,0,1,3,0,0,0,0,7,6


In [19]:
df['Age'].value_counts()

Age
9     33244
10    32194
8     30832
7     26314
11    23533
6     19819
13    17363
5     16157
12    15980
4     13823
3     11123
2      7598
1      5700
Name: count, dtype: int64

In [20]:
x.shape

(253680, 20)

# Splitting the Data into Training data & Test Data

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=1)

# Model - RandomForestClassifier

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
Rclf = RandomForestClassifier()

In [24]:
Rclf.fit(x_train, y_train)

In [25]:
predict_Rclf = Rclf.predict(x_train)

In [26]:
accuracy_score(predict_Rclf, y_train)

0.9919302157949272

In [27]:
x.head(10)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education
0,1,1,1,40,1,0,0,0,0,1,0,1,0,5,18,15,1,0,9,4
1,0,0,0,25,1,0,0,1,0,0,0,0,1,3,0,0,0,0,7,6
2,1,1,1,28,0,0,0,0,1,0,0,1,1,5,30,30,1,0,9,4
3,1,0,1,27,0,0,0,1,1,1,0,1,0,2,0,0,0,0,11,3
4,1,1,1,24,0,0,0,1,1,1,0,1,0,2,3,0,0,0,11,5
5,1,1,1,25,1,0,0,1,1,1,0,1,0,2,0,2,0,1,10,6
6,1,0,1,30,1,0,0,0,0,0,0,1,0,3,0,14,0,0,9,6
7,1,1,1,25,1,0,0,1,0,1,0,1,0,3,0,0,1,0,11,4
8,1,1,1,30,1,0,2,0,1,1,0,1,0,5,30,30,1,0,9,5
9,0,0,1,24,0,0,0,0,0,1,0,1,0,2,0,0,0,1,8,4


In [28]:
Rclf.predict([[1, 1, 1,  30, 1, 1, 2, 0, 1, 1, 1, 1, 0, 5, 30, 30, 1, 1, 9, 5]])



array([1], dtype=int64)

In [29]:
Rclf.predict([[0, 0, 0,  25, 1, 0, 0, 1, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 9, 6]])



array([0], dtype=int64)

In [30]:
Rclf.predict([[0, 0, 0,  22, 0, 0, 0, 1, 1, 1, 0, 1, 0, 3, 4, 15, 1, 1, 9, 6]])



array([0], dtype=int64)

In [31]:
from joblib import dump

In [32]:
dump(Rclf, './../machine_deep_models/heart_disease_model.joblib')

['./../machine_deep_models/heart_disease_model.joblib']