In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from scipy import stats

df = pd.read_csv('../assets/jobsheet-7/heart.csv')

In [39]:
# Data Preparation dengan One-Hot Encoding
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
encoder = OneHotEncoder(sparse_output=False)
encoder_df = pd.DataFrame(encoder.fit_transform(df[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols))
df_encoded = pd.concat([df.drop(columns=categorical_cols).reset_index(drop=True), encoder_df], axis=1)
df_encoded

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,160,180,0,156,1.0,1,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,130,283,0,98,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,48,138,214,0,108,1.5,1,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,54,150,195,0,122,0.0,0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
914,68,144,193,1,141,3.4,1,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
915,57,130,131,0,115,1.2,1,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
916,57,130,236,0,174,0.0,1,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [40]:
# Data Preparation dengan Outlier Removal
numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
z_scores = np.abs(stats.zscore(df_encoded[numerical_cols]))
df_clean = df_encoded[(z_scores < 3).all(axis=1)]
df_clean

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,160,180,0,156,1.0,1,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,130,283,0,98,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,48,138,214,0,108,1.5,1,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,54,150,195,0,122,0.0,0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
914,68,144,193,1,141,3.4,1,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
915,57,130,131,0,115,1.2,1,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
916,57,130,236,0,174,0.0,1,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [41]:
# Data Preparation dengan Normalization
minmax = MinMaxScaler()
df_scaled = pd.DataFrame(minmax.fit_transform(df_clean[numerical_cols]), columns=numerical_cols)
df_scaled

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,0.244898,0.571429,0.557915,0.784173,0.333333
1,0.428571,0.761905,0.347490,0.669065,0.500000
2,0.183673,0.476190,0.546332,0.251799,0.333333
3,0.408163,0.552381,0.413127,0.323741,0.583333
4,0.530612,0.666667,0.376448,0.424460,0.333333
...,...,...,...,...,...
894,0.346939,0.285714,0.509653,0.496403,0.533333
895,0.816327,0.609524,0.372587,0.561151,0.900000
896,0.591837,0.476190,0.252896,0.374101,0.533333
897,0.591837,0.476190,0.455598,0.798561,0.333333


In [42]:
# Data Preparation dengan Standarization
standard = StandardScaler()
df_scaled = pd.DataFrame(standard.fit_transform(df_clean[numerical_cols]), columns=numerical_cols)
df_scaled

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,-1.428154,0.465900,0.849636,1.384320,-0.855469
1,-0.475855,1.634714,-0.168122,0.752973,0.137516
2,-1.745588,-0.118507,0.793612,-1.535661,-0.855469
3,-0.581666,0.349019,0.149344,-1.141069,0.634008
4,0.053200,1.050307,-0.028064,-0.588640,-0.855469
...,...,...,...,...,...
894,-0.899099,-1.287320,0.616205,-0.194048,0.336112
895,1.534554,0.699663,-0.046738,0.161085,2.520678
896,0.370633,-0.118507,-0.625646,-0.864854,0.336112
897,0.370633,-0.118507,0.354763,1.463238,-0.855469


In [43]:
# Pembuatan Dataset
X = df.drop(columns='HeartDisease')
y = df['HeartDisease']

categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]),columns=encoder.get_feature_names_out(categorical_cols))

X_numeric = X.drop(columns=categorical_cols).reset_index(drop=True)
X_final = pd.concat([X_numeric, X_encoded], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

print(f'Ukuran data latih: {X_train.shape}')
print(X_train.head())
print('\n' + '='*50 + '\n')

print(f'Ukuran data uji: {X_test.shape}')
print(X_test.head())

Ukuran data latih: (734, 15)
     Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  Sex_M  \
795   42        120          240          1    194      0.8    1.0   
25    36        130          209          0    178      0.0    1.0   
84    56        150          213          1    125      1.0    1.0   
10    37        130          211          0    142      0.0    0.0   
344   51        120            0          1    104      0.0    1.0   

     ChestPainType_ATA  ChestPainType_NAP  ChestPainType_TA  \
795                0.0                1.0               0.0   
25                 0.0                1.0               0.0   
84                 0.0                0.0               0.0   
10                 0.0                1.0               0.0   
344                0.0                0.0               0.0   

     RestingECG_Normal  RestingECG_ST  ExerciseAngina_Y  ST_Slope_Flat  \
795                1.0            0.0               0.0            0.0   
25                 1.0 

In [44]:
# Cross validation
dt = DecisionTreeClassifier()
scores = cross_val_score(dt, X_final, y, cv=5)

print('Hasil cross validation:', scores)

Hasil cross validation: [0.75       0.72826087 0.7826087  0.68852459 0.68306011]
