In [1]:
pip list

Package                           Version
--------------------------------- --------------------
absl-py                           0.15.0
aiohttp                           3.8.1
aiosignal                         1.2.0
altair                            4.2.0
argon2-cffi                       21.3.0
argon2-cffi-bindings              21.2.0
astor                             0.8.1
asttokens                         2.0.5
astunparse                        1.6.3
async-timeout                     4.0.1
attrs                             21.4.0
backcall                          0.2.0
backports.zoneinfo                0.2.1
beautifulsoup4                    4.10.0
bleach                            4.1.0
blinker                           1.4
Bottleneck                        1.3.4
brotlipy                          0.7.0
cachetools                        4.2.2
certifi                           2021.10.8
cffi                              1.15.0
charset-normalizer                2.0.4
click          



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## Classification Libraries

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

## Data Preproessing

In [4]:
df=pd.read_csv('heartdata.csv')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [5]:
print(df['Sex'].unique())
print(df['ChestPainType'].unique())
print(df['RestingECG'].unique())
print(df['ExerciseAngina'].unique())
print(df['ST_Slope'].unique())

['M' 'F']
['ATA' 'NAP' 'ASY' 'TA']
['Normal' 'ST' 'LVH']
['N' 'Y']
['Up' 'Flat' 'Down']


In [6]:
df_copy=pd.get_dummies(df)

df_copy=df_copy.drop(["Sex_F","ChestPainType_NAP","RestingECG_Normal","ExerciseAngina_N","ST_Slope_Flat"],axis=1)
df_copy
# dropped chestpaintype_normal
# dropped sex_M
# dropped restingecg_normal
# dropped ExerciseAngina_N
# dropped ST_Slope_Flat

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_TA,RestingECG_LVH,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,0,1,0,0,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,0,0,0,0,0,0,0
2,37,130,283,0,98,0.0,0,1,0,1,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,1,0,0,0,0,1,0,0
4,54,150,195,0,122,0.0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,1,0,0,1,0,0,0,0,0
914,68,144,193,1,141,3.4,1,1,1,0,0,0,0,0,0,0
915,57,130,131,0,115,1.2,1,1,1,0,0,0,0,1,0,0
916,57,130,236,0,174,0.0,1,0,0,1,0,1,0,0,0,0


In [7]:
df_copy.columns

Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'HeartDisease', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA',
       'ChestPainType_TA', 'RestingECG_LVH', 'RestingECG_ST',
       'ExerciseAngina_Y', 'ST_Slope_Down', 'ST_Slope_Up'],
      dtype='object')

In [8]:
df_copy.info

<bound method DataFrame.info of      Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  HeartDisease  \
0     40        140          289          0    172      0.0             0   
1     49        160          180          0    156      1.0             1   
2     37        130          283          0     98      0.0             0   
3     48        138          214          0    108      1.5             1   
4     54        150          195          0    122      0.0             0   
..   ...        ...          ...        ...    ...      ...           ...   
913   45        110          264          0    132      1.2             1   
914   68        144          193          1    141      3.4             1   
915   57        130          131          0    115      1.2             1   
916   57        130          236          0    174      0.0             1   
917   38        138          175          0    173      0.0             0   

     Sex_M  ChestPainType_ASY  ChestPainTyp

In [9]:
df_copy.isnull().sum() # The function dataframe. isnull(). sum(). sum() returns the number of missing values in the data set.

Age                  0
RestingBP            0
Cholesterol          0
FastingBS            0
MaxHR                0
Oldpeak              0
HeartDisease         0
Sex_M                0
ChestPainType_ASY    0
ChestPainType_ATA    0
ChestPainType_TA     0
RestingECG_LVH       0
RestingECG_ST        0
ExerciseAngina_Y     0
ST_Slope_Down        0
ST_Slope_Up          0
dtype: int64

In [10]:

df_copy.mean()

Age                   53.510893
RestingBP            132.396514
Cholesterol          198.799564
FastingBS              0.233115
MaxHR                136.809368
Oldpeak                0.887364
HeartDisease           0.553377
Sex_M                  0.789760
ChestPainType_ASY      0.540305
ChestPainType_ATA      0.188453
ChestPainType_TA       0.050109
RestingECG_LVH         0.204793
RestingECG_ST          0.193900
ExerciseAngina_Y       0.404139
ST_Slope_Down          0.068627
ST_Slope_Up            0.430283
dtype: float64

In [11]:
df_copy.fillna(df.mean()) 
# Fill The Null valuee with column's Mean

  df_copy.fillna(df.mean())


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_TA,RestingECG_LVH,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,0,1,0,0,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,0,0,0,0,0,0,0
2,37,130,283,0,98,0.0,0,1,0,1,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,1,0,0,0,0,1,0,0
4,54,150,195,0,122,0.0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,1,0,0,1,0,0,0,0,0
914,68,144,193,1,141,3.4,1,1,1,0,0,0,0,0,0,0
915,57,130,131,0,115,1.2,1,1,1,0,0,0,0,1,0,0
916,57,130,236,0,174,0.0,1,0,0,1,0,1,0,0,0,0


In [12]:

X =df_copy.iloc[:,[0,1,2,3,4,5,7,8,9,10,11,12,13,14,15]].values # INDEPENDENT VALUES
Y=df_copy.iloc[:,6].values  # DEPENDENT VALUES 

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)


In [15]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X=StandardScaler()
X_train=sc_X.fit_transform(X_train)
X_test=sc_X.fit_transform(X_test)

# Model Building

## Random Forest

In [16]:
rfc=RandomForestClassifier(n_estimators=1000,random_state=41)
rfc.fit(X_train,Y_train)
predictions=rfc.predict(X_test)
print("Accuracy Score=",format(metrics.accuracy_score(Y_test,predictions)))

Accuracy Score= 0.8532608695652174


## Decision Tree

In [17]:
dtree=DecisionTreeClassifier(random_state=41)
dtree.fit(X_train,Y_train)
predictions=dtree.predict(X_test)
print("Accuracy Score=",format(metrics.accuracy_score(Y_test,predictions)))

Accuracy Score= 0.75


## Support Vector Machine

In [18]:
svc_model=SVC(random_state=41)
svc_model.fit(X_train,Y_train)
svc_pred=svc_model.predict(X_test)
print("Accuracy Score", format(metrics.accuracy_score(Y_test,svc_pred)))

Accuracy Score 0.8586956521739131


## GradientBoostingClassifier

In [19]:
GBC=GradientBoostingClassifier(random_state=41)
GBC.fit(X_train,Y_train)
GBC_pred=GBC.predict(X_test)
print("Accuracy Score=",format(metrics.accuracy_score(Y_test,GBC_pred)))

Accuracy Score= 0.842391304347826


## XBG Classifier


In [20]:
from xgboost import XGBClassifier
xgb_model=XGBClassifier(gamma=0)
xgb_model.fit(X_train,Y_train)
xgb_pred=xgb_model.predict(X_test)
print("Accuracy Score=",format(metrics.accuracy_score(Y_test,xgb_pred)))

  from pandas import MultiIndex, Int64Index


Accuracy Score= 0.8315217391304348


## Saving Model

In [21]:
# we can observe SVC is giving More Accuracy. So Load that classiofier
import pickle
saved_model= pickle.dumps(svc_model)
svc_from_pickle= pickle.loads(saved_model)
svc_from_pickle.predict(X_test)


array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1], dtype=int64)

In [22]:
df_copy.tail(n=3)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_TA,RestingECG_LVH,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Up
915,57,130,131,0,115,1.2,1,1,1,0,0,0,0,1,0,0
916,57,130,236,0,174,0.0,1,0,0,1,0,1,0,0,0,0
917,38,138,175,0,173,0.0,0,1,0,0,0,0,0,0,0,1


In [23]:
svc_from_pickle.predict([[38,138,175,0,173,0.0,1,0,0,0,0,0,0,0,1]])   # 917 th patient

array([1], dtype=int64)

In [25]:
import pickle
with open('model.pkl','wb') as files:
    pickle.dump(svc_model,files)