In [51]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from scipy import stats

# Load the raw CSV into `df`; the path is relative to the notebook's working directory.
csv_path = '../assets/jobsheet-7/heart.csv'
df = pd.read_csv(csv_path)

In [52]:
# Data preparation: one-hot encoding of the categorical columns.
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder and obtain a dense array (sparse_output=False), then label
# the generated columns with the encoder's own feature names.
encoded_values = encoder.fit_transform(df[categorical_cols])
encoded_names = encoder.get_feature_names_out(categorical_cols)
encoder_df = pd.DataFrame(encoded_values, columns=encoded_names)

# Append the indicator columns to the right of the original frame. The original
# categorical columns are not dropped, so both representations coexist.
df_encoded = pd.concat([df, encoder_df], axis=1)
df_encoded

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,M,ATA,140,289,0,Normal,172,N,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,M,ATA,130,283,0,ST,98,N,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,54,M,NAP,150,195,0,Normal,122,N,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
914,68,M,ASY,144,193,1,Normal,141,N,3.4,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
916,57,F,ATA,130,236,0,LVH,174,N,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [53]:
# Data preparation: outlier removal.
# A row is kept only if, for every column in `numerical_cols`, its absolute
# z-score is below the threshold.
numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
z_threshold = 3

# Score exactly the columns listed in `numerical_cols`. The previous version
# used a separate hard-coded list that left `numerical_cols` unused, skipped
# MaxHR and Oldpeak, and included FastingBS (which appears as a 0/1 flag in the
# displayed rows).
z_scores = np.abs(stats.zscore(df_encoded[numerical_cols]))

# Boolean row mask: True where all scored columns are within the threshold.
df_clean = df_encoded[(z_scores < z_threshold).all(axis=1)]
df_clean

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,M,ATA,140,289,0,Normal,172,N,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,M,ATA,130,283,0,ST,98,N,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,54,M,NAP,150,195,0,Normal,122,N,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
914,68,M,ASY,144,193,1,Normal,141,N,3.4,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
916,57,F,ATA,130,236,0,LVH,174,N,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [54]:
# Data preparation: normalization with MinMaxScaler (default settings).
minmax = MinMaxScaler()

# Fit and transform the numerical columns of `df` (the unfiltered frame), then
# wrap the resulting array back into a DataFrame with the same column labels.
minmax_values = minmax.fit_transform(df[numerical_cols])
df_scaled = pd.DataFrame(minmax_values, columns=numerical_cols)
df_scaled

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,0.244898,0.70,0.479270,0.788732,0.295455
1,0.428571,0.80,0.298507,0.676056,0.409091
2,0.183673,0.65,0.469320,0.267606,0.295455
3,0.408163,0.69,0.354892,0.338028,0.465909
4,0.530612,0.75,0.323383,0.436620,0.295455
...,...,...,...,...,...
913,0.346939,0.55,0.437811,0.507042,0.431818
914,0.816327,0.72,0.320066,0.570423,0.681818
915,0.591837,0.65,0.217247,0.387324,0.431818
916,0.591837,0.65,0.391376,0.802817,0.295455


In [55]:
# Data preparation: standardization with StandardScaler (default settings).
standard = StandardScaler()

# Fit and transform the numerical columns of `df` (the unfiltered frame), then
# wrap the resulting array back into a DataFrame with the same column labels.
# NOTE: this rebinds `df_scaled`, replacing whatever it held before this cell.
standardized_values = standard.fit_transform(df[numerical_cols])
df_scaled = pd.DataFrame(standardized_values, columns=numerical_cols)
df_scaled

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,-1.433140,0.410909,0.825070,1.382928,-0.832432
1,-0.478484,1.491752,-0.171961,0.754157,0.105664
2,-1.751359,-0.129513,0.770188,-1.525138,-0.832432
3,-0.584556,0.302825,0.139040,-1.132156,0.574711
4,0.051881,0.951331,-0.034755,-0.581981,-0.832432
...,...,...,...,...,...
913,-0.902775,-1.210356,0.596393,-0.188999,0.293283
914,1.536902,0.627078,-0.053049,0.164684,2.357094
915,0.370100,-0.129513,-0.620168,-0.857069,0.293283
916,0.370100,-0.129513,0.340275,1.461525,-0.832432


In [56]:
# Build the modelling dataset: target `y`, feature matrix `X_final`, and an
# 80/20 train/test split.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# One-hot encode the categorical features; drop='first' omits the first
# category of each feature from the generated columns.
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_matrix = encoder.fit_transform(X[categorical_cols])
X_encoded = pd.DataFrame(
    encoded_matrix,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# The remaining columns get a fresh 0..n-1 index so that the column-wise
# concat lines up with X_encoded, which is built with a default index.
X_numeric = X.drop(columns=categorical_cols).reset_index(drop=True)
X_final = pd.concat([X_numeric, X_encoded], axis=1)

# Fixed seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size and the first rows of each partition.
separator = '\n' + '='*50 + '\n'

print(f'Ukuran data latih: {X_train.shape}')
print(X_train.head())
print(separator)

print(f'Ukuran data uji: {X_test.shape}')
print(X_test.head())

Ukuran data latih: (734, 15)
     Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  Sex_M  \
795   42        120          240          1    194      0.8    1.0   
25    36        130          209          0    178      0.0    1.0   
84    56        150          213          1    125      1.0    1.0   
10    37        130          211          0    142      0.0    0.0   
344   51        120            0          1    104      0.0    1.0   

     ChestPainType_ATA  ChestPainType_NAP  ChestPainType_TA  \
795                0.0                1.0               0.0   
25                 0.0                1.0               0.0   
84                 0.0                0.0               0.0   
10                 0.0                1.0               0.0   
344                0.0                0.0               0.0   

     RestingECG_Normal  RestingECG_ST  ExerciseAngina_Y  ST_Slope_Flat  \
795                1.0            0.0               0.0            0.0   
25                 1.0 

In [57]:
# Cross validation: score a decision tree on the full feature matrix with
# 5 folds, using the estimator's default scorer.
# The tree is seeded so the fold scores are reproducible across kernel
# restarts; 42 matches the seed used for the train/test split in this notebook.
dt = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(dt, X_final, y, cv=5)

print('Hasil cross validation:', scores)

Hasil cross validation: [0.75       0.75543478 0.77717391 0.69945355 0.68852459]
