In [None]:
from collections import Counter

import joblib  # Used to persist the trained models and artifacts
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Reaching this line means every import above resolved without error.
print("Library berhasil di-load!")

Library berhasil di-load!


Load & Clean Data

In [2]:
# 1. Load the data. The raw CSV marks missing values with '?', so let the
#    parser turn them into NaN instead of patching the frame in place afterwards.
df = pd.read_csv('Dataset Penyakit Jantung.csv', na_values=['?'])

# 2. Force the clinical columns to numeric; any remaining non-numeric token
#    becomes NaN and is filled by the imputation step below.
cols_numeric = ['trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
df[cols_numeric] = df[cols_numeric].apply(pd.to_numeric, errors='coerce')

# 3. Imputation (fill the missing values)
# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split done later in the notebook, so test rows influence the
# fill values. Consider fitting the imputation on the training split only.
mean_cols = ['trestbps', 'chol', 'thalach', 'oldpeak']      # continuous -> mean
mode_cols = ['thal', 'fbs', 'restecg', 'exang', 'slope']    # discrete codes -> mode
fill_values = {'ca': df['ca'].median()}                      # median keeps 'ca' a whole number
fill_values.update({col: df[col].mean() for col in mean_cols})
fill_values.update({col: df[col].mode()[0] for col in mode_cols})
df = df.fillna(value=fill_values)

# 4. Collapse the target 'num' to binary (0 = healthy, 1 = diseased)
df['target'] = (df['num'] > 0).astype('int64')
df = df.drop(columns=['num'])

print("Data bersih siap digunakan!")
# DataFrame.info() writes its report to stdout and returns None, so it must
# not be wrapped in print() (that would append a stray "None" line).
df.info()

Data bersih siap digunakan!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    int64  
 1   sex       920 non-null    int64  
 2   cp        920 non-null    int64  
 3   trestbps  920 non-null    float64
 4   chol      920 non-null    float64
 5   fbs       920 non-null    float64
 6   restecg   920 non-null    float64
 7   thalach   920 non-null    float64
 8   exang     920 non-null    float64
 9   oldpeak   920 non-null    float64
 10  slope     920 non-null    float64
 11  ca        920 non-null    float64
 12  thal      920 non-null    float64
 13  target    920 non-null    int64  
dtypes: float64(10), int64(4)
memory usage: 100.8 KB
None


Preprocessing & Splitting

In [None]:
# Feature definitions: which columns are scaled (numeric) and which are
# one-hot encoded (categorical).
# Note: 'sex' is already 0/1, but it is treated as a category to be safer
# when the preprocessor is reused in the Streamlit app.
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

X = df.drop(columns=['target'])
y = df['target']

# ColumnTransformer drops every column that is not listed in a transformer
# (remainder='drop' is the default), so fail loudly if a feature was forgotten.
unassigned = set(X.columns) - set(numeric_features) - set(categorical_features)
assert not unassigned, f"Columns missing from the feature lists: {sorted(unassigned)}"

# Split the data (80% train, 20% test), stratified on the target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessing pipelines
# Numeric: standard scaling
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical: one-hot encoding; unseen categories at inference time are ignored
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine both in a ColumnTransformer.
# sparse_threshold=0 guarantees a dense ndarray regardless of how sparse the
# one-hot block turns out to be, so downstream slicing always sees the same type.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# Fit on the training data only, then apply the fitted transform to the test data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Keep the output feature names (needed for SHAP). The order mirrors the
# transformer order above: numeric columns first, then the one-hot columns.
ohe_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
feature_names = np.concatenate([numeric_features, ohe_feature_names])
assert len(feature_names) == X_train_processed.shape[1], "feature_names is out of sync with the transformed matrix"

print("Preprocessing selesai.")

Preprocessing selesai.


SMOTE & Training Model (Decision Tree vs XGBoost)

In [4]:
# 1. Balance the classes with SMOTE. Only the training split is resampled;
#    the test split is left untouched.
# NOTE(review): SMOTE interpolates between neighbours, so synthetic rows can
# carry fractional values in the one-hot columns. imbalanced-learn's SMOTENC
# is designed for mixed numeric/categorical data -- consider whether it fits.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

print(f"Distribusi Target Sebelum SMOTE: {Counter(y_train)}")
print(f"Distribusi Target Setelah SMOTE: {Counter(y_train_resampled)}")

# 2. Train the Decision Tree (baseline); depth is capped at 5
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5)
dt_model.fit(X_train_resampled, y_train_resampled)

# 3. Train XGBoost (advanced).
# `use_label_encoder` is intentionally not passed: the installed XGBoost does
# not recognise it and only emits a "Parameters ... are not used" warning.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

print("Model selesai dilatih!")

Distribusi Target Sebelum SMOTE: Counter({1: 407, 0: 329})
Distribusi Target Setelah SMOTE: Counter({1: 407, 0: 407})
Model selesai dilatih!


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluasi & Simpan Model

In [None]:
# Evaluate XGBoost on the held-out test data
y_pred = xgb_model.predict(X_test_processed)
print("--- Classification Report (XGBoost) ---")
print(classification_report(y_test, y_pred))

# 1. Collect one row of test-set metrics per model (a list of dicts)
results = []
models = {'Decision Tree': dt_model, 'XGBoost': xgb_model}

for name, model in models.items():
    y_pred = model.predict(X_test_processed)
    results.append({
        'Model': name,
        'Akurasi': accuracy_score(y_test, y_pred),
        'Recall (Sensitivitas)': recall_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred)
    })

# 2. Turn the rows into a DataFrame for a tidy comparison table
df_comparison = pd.DataFrame(results)
print("\n--- Tabel Perbandingan Model ---")
print(df_comparison)

# 3. Bundle everything the Streamlit app needs into a single dictionary.
# NOTE(review): a later cell writes this same file again with an extended
# dictionary, and labels the recall column 'Recall' there -- keep any consumer
# of 'comparison_metrics' in sync with whichever version is persisted last.
artifacts = {
    'model_xgb': xgb_model,
    'model_dt': dt_model,
    'preprocessor': preprocessor,
    'feature_names': feature_names,
    'X_train_sample': X_train_resampled[:100],  # small sample used as SHAP background
    'comparison_metrics': df_comparison
}

joblib.dump(artifacts, 'model_jantung.pkl')
print("✅ Model dan Metrics berhasil disimpan!")

--- Classification Report (XGBoost) ---
              precision    recall  f1-score   support

           0       0.85      0.77      0.81        82
           1       0.83      0.89      0.86       102

    accuracy                           0.84       184
   macro avg       0.84      0.83      0.83       184
weighted avg       0.84      0.84      0.84       184


--- Tabel Perbandingan Model ---
           Model   Akurasi  Recall (Sensitivitas)  Precision  F1-Score
0  Decision Tree  0.809783               0.813725   0.838384  0.825871
1        XGBoost  0.836957               0.892157   0.827273  0.858491
✅ Model dan Metrics berhasil disimpan!


In [6]:
# --- Updated save step: also persist the confusion matrices ---

# 1. Compute the metrics and the confusion matrix for each model
results = []
confusion_matrices = {}  # model name -> confusion matrix array
models = {'Decision Tree': dt_model, 'XGBoost': xgb_model}

for name, model in models.items():
    y_pred = model.predict(X_test_processed)

    # Scalar metrics
    results.append({
        'Model': name,
        'Akurasi': accuracy_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred)
    })

    # Confusion matrix. Pinning the labels keeps the array 2x2 with rows and
    # columns ordered [0, 1] even if a class were absent from y_test or y_pred.
    confusion_matrices[name] = confusion_matrix(y_test, y_pred, labels=[0, 1])

# 2. Build the comparison DataFrame
df_comparison = pd.DataFrame(results)
print("\n--- Tabel Perbandingan Model ---")
print(df_comparison)

# 3. Rebuild the artifacts dictionary, now including 'confusion_matrices'
artifacts = {
    'model_xgb': xgb_model,
    'model_dt': dt_model,
    'preprocessor': preprocessor,
    'feature_names': feature_names,
    'X_train_sample': X_train_resampled[:100],  # small sample used as SHAP background
    'comparison_metrics': df_comparison,
    'confusion_matrices': confusion_matrices
}

# Overwrites any file of the same name written earlier
joblib.dump(artifacts, 'model_jantung.pkl')
print("✅ Model, Metrics, dan Confusion Matrix berhasil disimpan!")


--- Tabel Perbandingan Model ---
           Model   Akurasi    Recall  Precision  F1-Score
0  Decision Tree  0.809783  0.813725   0.838384  0.825871
1        XGBoost  0.836957  0.892157   0.827273  0.858491
✅ Model, Metrics, dan Confusion Matrix berhasil disimpan!
