# Prediksi EF Perawatan (%) - Dataset Pasien Jantung 2021
Notebook ini sudah diperbaiki agar:
- Path Google Drive sesuai: `/content/drive/MyDrive/Jantung/Pasien Jantung 2021 Lengkap.xlsx`
- Target `EF perawatan (%)` dibersihkan dari format range seperti `30-35`
- Kolom kategorikal aman dari campuran `int` dan `str`

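Jalankan setiap cell secara berurutan dari atas ke bawah, karena setiap cell memakai variabel yang dibuat oleh cell sebelumnya.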


In [None]:
# 1) Mount Google Drive (Colab)
# The `google.colab` package is presumably only importable inside a Colab
# runtime -- confirm before running this cell elsewhere.
from google.colab import drive
# Mount point; the dataset path used when loading the workbook lives below it.
drive.mount('/content/drive')

In [None]:
# 2) Import library
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt

In [None]:
# 3) Load the dataset
# Location of the Excel workbook under the mounted Drive folder.
file_path = r"/content/drive/MyDrive/Jantung/Pasien Jantung 2021 Lengkap.xlsx"
# Keep the untouched sheet in `df_raw`; later cells work on a copy of it.
df_raw = pd.read_excel(file_path)
print("Shape raw:", df_raw.shape)
# Last expression of the cell: shows a preview of the first rows.
df_raw.head()

In [None]:
# 4) Initial cleaning (drop the label row if present)
df = df_raw.copy()

# If the first data row holds a label such as 'ya/tidak', drop it.
# The length check keeps an empty sheet from raising IndexError on iloc[0].
first_row_is_label = (
    len(df) > 0
    and df.iloc[0].astype(str).str.contains('ya/tidak', case=False, na=False).any()
)
if first_row_is_label:
    df = df.iloc[1:].reset_index(drop=True)

# Tidy the column names: trim, collapse runs of whitespace, and mark the
# auto-generated 'Unnamed...' headers with None so they can be renamed below.
new_cols = []
for c in df.columns:
    c2 = re.sub(r'\s+', ' ', str(c).strip())
    new_cols.append(None if c2.startswith('Unnamed') else c2)

# Replace each placeholder with a sequential 'extra_N' name.
# The plain list is iterated directly (instead of assigning it to df.columns
# and reading it back) because a pandas Index may coerce None to NaN, which
# would make the `is None` test miss the placeholders.
final_cols = []
unnamed_count = 0
for c in new_cols:
    if c is None:
        unnamed_count += 1
        final_cols.append(f"extra_{unnamed_count}")
    else:
        final_cols.append(c)

df.columns = final_cols

print('Shape setelah cleaning awal:', df.shape)
df.head()

In [None]:
# 5) Convert Ya/Tidak columns -> 1/0 (when present)
# Lower-cased tokens treated as "yes" and "no" respectively.
ya_vals = {'ya', 'y', 'yes', '1', 'true', 'iya'}
tidak_vals = {'tidak', 't', 'no', '0', 'false'}


def map_ya_tidak(x):
    """Map a single yes/no style value to 1 or 0.

    Missing values come back as ``np.nan``. The value is compared after
    converting to ``str``, trimming and lower-casing; anything that is not a
    known yes/no token is returned unchanged.
    """
    if pd.isna(x):
        return np.nan

    token = str(x).strip().lower()
    is_yes = token in ya_vals
    if is_yes or token in tidak_vals:
        return 1 if is_yes else 0

    # Unknown token: hand the original value back untouched.
    return x

# Every token that counts as a yes/no answer, built once for the whole loop.
known_tokens = ya_vals | tidak_vals

for col in df.columns:
    series = df[col]
    # Only object-typed columns are candidates for conversion.
    if series.dtype != 'object':
        continue

    tokens = set(series.dropna().astype(str).str.strip().str.lower().unique())
    # Skip columns that are entirely missing or contain any non yes/no value.
    if not tokens or not tokens <= known_tokens:
        continue

    df[col] = series.apply(map_ya_tidak).astype('float')

df.head()

In [None]:
# 6) Clean the TARGET: EF perawatan (%)
# Turns range notation such as '30-35' into its midpoint, 32.5

target_col = 'EF perawatan (%)'

def convert_ef(x):
    """Convert one raw EF cell into a float percentage.

    Accepted forms (after trimming and removing any '%' sign):
      * plain numbers, with '.' or ',' as the decimal separator
      * ranges such as '30-35' or '30 – 35', returned as the midpoint
      * bounded values such as '>50', '<30', '>=50', '≤30', returned as the
        bound itself

    Returns ``np.nan`` for missing input and for text that matches none of
    the forms above.
    """
    if pd.isna(x):
        return np.nan

    s = str(x).strip()

    # Normalise dash variants (en dash, em dash, Unicode minus) to '-'.
    s = s.replace('–', '-').replace('—', '-').replace('−', '-')

    # Drop the percent sign if present.
    s = s.replace('%', '').strip()

    # Accept a decimal comma ('32,5') by treating it as a decimal point.
    s = s.replace(',', '.')

    # Range such as '30-35' or '30 - 35' -> midpoint.
    m = re.match(r"^(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)$", s)
    if m:
        a = float(m.group(1))
        b = float(m.group(2))
        return (a + b) / 2

    # Bounded value such as '>50', '<30', '>=50' or '≤30' -> the bound.
    m2 = re.match(r"^(?:[<>]=?|[≤≥])\s*(\d+(?:\.\d+)?)$", s)
    if m2:
        return float(m2.group(1))

    # Plain number; float() on a string only raises ValueError.
    try:
        return float(s)
    except ValueError:
        return np.nan

# Stop early when the expected target column is absent.
if target_col not in df.columns:
    raise ValueError(f"Kolom target '{target_col}' tidak ditemukan. Cek nama kolom df.columns")

# Clean the target in place, then report its dtype, missing count and a sample.
df[target_col] = df[target_col].apply(convert_ef)
cleaned_target = df[target_col]
print('Target dtype:', cleaned_target.dtype)
print('Jumlah NaN target:', cleaned_target.isna().sum())
print('Contoh target unik (10):', cleaned_target.dropna().unique()[:10])

In [None]:
# 7) Train a RandomForestRegressor


def _to_str_array(values):
    """Cast every entry to ``str`` so mixed int/str categories encode uniformly.

    This is a named function rather than a lambda because lambdas cannot be
    pickled, which would make saving the fitted pipeline with joblib fail.
    Loading the saved pipeline elsewhere requires this function to be defined
    under the same name first.
    """
    return values.astype(str)


X = df.drop(columns=[target_col]).copy()
y = df[target_col].copy()

# Drop rows whose target is NaN.
mask = y.notna()
X = X.loc[mask]
y = y.loc[mask]

# Split the features into numeric and categorical groups.
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

print('Jumlah fitur numerik:', len(num_features))
print('Jumlah fitur kategorikal:', len(cat_features))

# Numeric columns: fill gaps with the column median.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Categorical columns: fill gaps with the most frequent value, force every
# entry to str, then one-hot encode (unseen categories are ignored at predict).
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('to_str', FunctionTransformer(_to_str_array)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ]
)

model = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)

# Preprocessing and model live in one pipeline so both are fitted and saved together.
clf = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', model)
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Debug: confirm the target is already numeric.
print('y_train dtype:', y_train.dtype)

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

mae = mean_absolute_error(y_test, preds)
# Take the square root of the MSE directly: the `squared=False` argument is
# deprecated and has been removed from recent scikit-learn releases.
rmse = mean_squared_error(y_test, preds) ** 0.5
r2 = r2_score(y_test, preds)

print('MAE :', mae)
print('RMSE:', rmse)
print('R2  :', r2)

In [None]:
# 8) Plot predictions against actual values
# Square canvas with actual values on x and predictions on y.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, preds)
ax.set_xlabel('Aktual')
ax.set_ylabel('Prediksi')
ax.set_title('Aktual vs Prediksi - EF perawatan (%)')
fig.tight_layout()
plt.show()

In [None]:
# 9) Save the model (optional)
import joblib

# Destination file on the local /content folder.
out_model = '/content/model_rf_ef_perawatan.pkl'
# Persist `clf`, i.e. the whole pipeline (preprocessing + model), in one file.
joblib.dump(clf, out_model)
print('Model tersimpan:', out_model)