# Heart disease prediction using ML pipelines

## Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from hyperopt import fmin, tpe, hp


## Import data

In [2]:
data = pd.read_csv('heart_v2.csv')
data.head()

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0


## Check null values

In [3]:
data.isnull().sum()

age              0
sex              0
BP               0
cholestrol       0
heart disease    0
dtype: int64

## Define independent & target features

In [5]:
y = data.pop('heart disease')
X = data

## Train test split

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X, y, train_size= 0.7, random_state= 42)

In [9]:
# Check datat types of each column
data.dtypes

age           int64
sex           int64
BP            int64
cholestrol    int64
dtype: object

## Create column transformers for feature scaling

In [14]:
trf1 = ColumnTransformer([
    ('scale', StandardScaler(),slice(0,4))
])

## Instantiate RandomForestClassifier object as the next transformer

In [19]:
trf2 = RandomForestClassifier()

## Create Pipeline

In [20]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2)
])

## Train the model

In [22]:
pipe.fit(X_train,y_train)

## Predict on test data

In [23]:
y_pred = pipe.predict(X_test)

In [24]:
y_pred

array([1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0], dtype=int64)

## Cross validation using pipelines

In [25]:
cross_val_score(pipe,X_train,y_train,cv= 5,scoring= 'accuracy').mean()

0.6718349928876245

In [26]:
cross_val_score(pipe,X_test,y_test,cv= 5,scoring= 'accuracy').mean()

0.6419117647058823