## Load Data

In [1]:
import pandas as pd
import numpy as np
# Load heart.csv from the working directory into a DataFrame and show
# the first five rows as a quick sanity check.
data = pd.read_csv(filepath_or_buffer="heart.csv")
data.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [2]:
# Summarise column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


## Creating a pipeline for data preprocessing and model training

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [4]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for testing. The fixed random_state keeps the split reproducible.
y = data["HeartDisease"].copy()
x = data.drop(columns="HeartDisease")
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=6,
)

In [5]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_features = ["Age", "RestingBP", "Cholesterol", "FastingBS", "MaxHR", "Oldpeak"]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent label, then one-hot
# encode. handle_unknown="ignore" encodes categories unseen during fit as
# all zeros instead of raising.
#
# The encoder is deliberately left at its default (sparse) output: the keyword
# that forces dense output was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so passing either
# spelling ties the notebook to a version range.
categorical_features = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# sparse_threshold=0 makes the ColumnTransformer always return a dense array,
# so the classifier still receives a dense matrix regardless of the encoder's
# output format.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Full pipeline: preprocessing followed by a linear-kernel SVM.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", svm.SVC(kernel="linear")),
    ]
)
clf.fit(x_train, y_train)

# Accuracy on the held-out test split.
print("model score: %.3f" % clf.score(x_test, y_test))

model score: 0.875


In [6]:
# Preview the first five rows of the held-out features.
x_test.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
136,43,F,ATA,120,215,0,ST,175,N,0.0,Up
273,55,M,NAP,120,220,0,LVH,134,N,0.0,Up
860,60,M,ASY,130,253,0,Normal,144,Y,1.4,Up
521,61,M,ASY,120,282,0,ST,135,Y,4.0,Down
240,55,M,ATA,145,326,0,Normal,155,N,0.0,Up


In [7]:
# Convert every categorical column to pandas' "string" dtype in a single call.
# `astype` returns a new frame, so rebinding `x_test` avoids assigning into the
# frame produced by `train_test_split` (which can raise SettingWithCopyWarning)
# and replaces five copy-pasted assignments. `categorical_features` is the
# column list defined in the preprocessing cell.
x_test = x_test.astype({column: "string" for column in categorical_features})
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184 entries, 136 to 786
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             184 non-null    int64  
 1   Sex             184 non-null    string 
 2   ChestPainType   184 non-null    string 
 3   RestingBP       184 non-null    int64  
 4   Cholesterol     184 non-null    int64  
 5   FastingBS       184 non-null    int64  
 6   RestingECG      184 non-null    string 
 7   MaxHR           184 non-null    int64  
 8   ExerciseAngina  184 non-null    string 
 9   Oldpeak         184 non-null    float64
 10  ST_Slope        184 non-null    string 
dtypes: float64(1), int64(5), string(5)
memory usage: 17.2 KB


In [8]:
# Predicted labels for the test rows, produced by the fitted pipeline.
clf.predict(x_test)

array([0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 1], dtype=int64)

In [9]:
# True labels for the test rows, as a NumPy array for side-by-side comparison
# with the predictions.
y_test.to_numpy()

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1], dtype=int64)

## Check the accuracy of the model

In [10]:
# Test-set accuracy expressed as a percentage.
100 * clf.score(x_test, y_test)

87.5

## Save Model

In [11]:
from joblib import dump, load
# Persist the fitted pipeline (preprocessing + classifier) to disk.
dump(value=clf, filename='Heart.joblib')

['Heart.joblib']

In [12]:
Age=43
Sex='F'