In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.shape

(918, 12)

In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [5]:
cols_to_fix = ['RestingBP', 'Cholesterol', 'MaxHR', 'Age', 'Oldpeak', 'FastingBS']

for col in cols_to_fix:
    mean = df[col].mean()
    std = df[col].std()
    
    # Filter df to keep only rows within 3 standard deviations for THIS column
    df = df[(df[col] > (mean - 3 * std)) & (df[col] < (mean + 3 * std))]

In [6]:
df.shape

(899, 12)

In [7]:
x = df.drop(['HeartDisease'], axis='columns')
y = df.HeartDisease

In [8]:
df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='str')

In [9]:

ct = ColumnTransformer([("Model", OneHotEncoder(drop="first"), [1,2,6,8,10])], remainder="passthrough")
x = ct.fit_transform(x)
x

array([[  1. ,   1. ,   0. , ...,   0. , 172. ,   0. ],
       [  0. ,   0. ,   1. , ...,   0. , 156. ,   1. ],
       [  1. ,   1. ,   0. , ...,   0. ,  98. ,   0. ],
       ...,
       [  1. ,   0. ,   0. , ...,   0. , 115. ,   1.2],
       [  0. ,   1. ,   0. , ...,   0. , 174. ,   0. ],
       [  1. ,   0. ,   1. , ...,   0. , 173. ,   0. ]], shape=(899, 15))

In [10]:
ct.get_feature_names_out()

array(['Model__Sex_M', 'Model__ChestPainType_ATA',
       'Model__ChestPainType_NAP', 'Model__ChestPainType_TA',
       'Model__RestingECG_Normal', 'Model__RestingECG_ST',
       'Model__ExerciseAngina_Y', 'Model__ST_Slope_Flat',
       'Model__ST_Slope_Up', 'remainder__Age', 'remainder__RestingBP',
       'remainder__Cholesterol', 'remainder__FastingBS',
       'remainder__MaxHR', 'remainder__Oldpeak'], dtype=object)

In [11]:
x_transformed_df = pd.DataFrame(x, columns=ct.get_feature_names_out())
x_transformed_df.head()

Unnamed: 0,Model__Sex_M,Model__ChestPainType_ATA,Model__ChestPainType_NAP,Model__ChestPainType_TA,Model__RestingECG_Normal,Model__RestingECG_ST,Model__ExerciseAngina_Y,Model__ST_Slope_Flat,Model__ST_Slope_Up,remainder__Age,remainder__RestingBP,remainder__Cholesterol,remainder__FastingBS,remainder__MaxHR,remainder__Oldpeak
0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,40.0,140.0,289.0,0.0,172.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,49.0,160.0,180.0,0.0,156.0,1.0
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,37.0,130.0,283.0,0.0,98.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,48.0,138.0,214.0,0.0,108.0,1.5
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,54.0,150.0,195.0,0.0,122.0,0.0


In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y , train_size=0.8)

In [17]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

scores = cross_val_score(SVC(), x, y, cv=5)
scores.mean()

np.float64(0.6895779019242706)

In [19]:
from sklearn.ensemble import BaggingClassifier

model = BaggingClassifier(
    estimator=SVC(),
    random_state=0,
    max_samples=0.8,
    n_estimators=100,
)

scores = cross_val_score(model, x, y, cv=5)
scores.mean()

np.float64(0.6851334574798262)

In [21]:
from sklearn.tree import DecisionTreeClassifier

scores = cross_val_score(DecisionTreeClassifier(), x, y, cv=5)
scores.mean()

np.float64(0.7296027312228429)

In [22]:
model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=0,
    max_samples=0.8,
    n_estimators=100,
)

scores = cross_val_score(model, x, y, cv=5)
scores.mean()

np.float64(0.798578522656735)