In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from utils import automated_pipeline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [2]:
file_raw = r'E:\airta drafts\PREDIKSI KADAR HB\data\raw\erm_hd.xlsx'
df_raw = pd.read_excel(file_raw)

df = automated_pipeline(df_raw)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643 entries, 0 to 642
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id_pasien        643 non-null    int64         
 1   tgl_pemeriksaan  643 non-null    datetime64[ns]
 2   usia             643 non-null    Int64         
 3   jk               643 non-null    Int64         
 4   eritrosit        643 non-null    float64       
 5   hematokrit       643 non-null    float64       
 6   MCHC             643 non-null    float64       
 7   MCH              643 non-null    float64       
 8   MCV              643 non-null    float64       
 9   hemoglobin       643 non-null    float64       
 10  leukosit         643 non-null    Int64         
 11  trombosit        643 non-null    Int64         
 12  epo              643 non-null    Int64         
dtypes: Int64(5), datetime64[ns](1), float64(6), int64(1)
memory usage: 68.6 KB


In [3]:
df = df.sort_values(by=['tgl_pemeriksaan', 'id_pasien']).reset_index(drop=True)

#LAG FEATURE untuk TSCV
df['hb_now'] = df.groupby('id_pasien')['hemoglobin'].shift(1)
df = df.dropna(subset=['hb_now']).reset_index(drop=True)

#MENENTUKAN FITUR X DAN TARGET Y
X = df.drop(columns=['id_pasien', 'tgl_pemeriksaan', 'hemoglobin', 'hematokrit', 'eritrosit'])
y = df['hemoglobin']

#SPLIT 80:20 BERDASARKAN KRONOLOGIS
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    shuffle=False
)

print(f"Total baris data: {len(df)}")
print(f"Jumlah Data Latih (80%): {len(X_train)}")
print(f"Jumlah Data Uji (20%): {len(X_test)}")
print("-" * 30)
print("Fitur yang digunakan (X):", X.columns.tolist())

Total baris data: 571
Jumlah Data Latih (80%): 456
Jumlah Data Uji (20%): 115
------------------------------
Fitur yang digunakan (X): ['usia', 'jk', 'MCHC', 'MCH', 'MCV', 'leukosit', 'trombosit', 'epo', 'hb_now']


In [4]:
# Tabel untuk verifikasi lag feature
lag_table = df[
    ['id_pasien', 'tgl_pemeriksaan', 'hb_now', 'hemoglobin']
].sort_values(by=['id_pasien', 'tgl_pemeriksaan'])

print(lag_table.head(20))


     id_pasien tgl_pemeriksaan    hb_now  hemoglobin
28           1      2025-04-01  8.600000    8.100000
90           1      2025-05-01  8.100000    8.300000
151          1      2025-06-01  8.300000    7.900000
213          1      2025-07-01  7.900000    7.500000
267          1      2025-08-01  7.500000    7.900000
320          1      2025-09-01  7.900000    9.500000
372          1      2025-10-01  9.500000    9.100000
424          1      2025-11-01  9.100000    8.700000
475          1      2025-12-01  8.700000    8.600000
0            2      2025-03-01  7.900000    8.300000
29           2      2025-04-01  8.300000    7.700000
91           2      2025-05-01  7.700000    7.800000
152          2      2025-06-01  7.800000    7.900000
214          2      2025-07-01  7.900000    7.200000
268          2      2025-08-01  7.200000    8.000000
321          2      2025-09-01  8.000000    7.833333
373          2      2025-10-01  7.833333    8.000000
425          2      2025-11-01  8.000000    8.

In [5]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)

# Cek bagaimana TSCV membagi data X_train kamu
print(f"Total baris di X_train: {len(X_train)}")
print("-" * 30)

for i, (train_index, val_index) in enumerate(tscv.split(X_train)):
    print(f"Iterasi ke-{i+1}:")
    print(f"  Jumlah Data Latih: {len(train_index)} baris (Indeks {train_index[0]} s/d {train_index[-1]})")
    print(f"  Jumlah Data Validasi: {len(val_index)} baris (Indeks {val_index[0]} s/d {val_index[-1]})")
    print("-" * 30)

Total baris di X_train: 456
------------------------------
Iterasi ke-1:
  Jumlah Data Latih: 76 baris (Indeks 0 s/d 75)
  Jumlah Data Validasi: 76 baris (Indeks 76 s/d 151)
------------------------------
Iterasi ke-2:
  Jumlah Data Latih: 152 baris (Indeks 0 s/d 151)
  Jumlah Data Validasi: 76 baris (Indeks 152 s/d 227)
------------------------------
Iterasi ke-3:
  Jumlah Data Latih: 228 baris (Indeks 0 s/d 227)
  Jumlah Data Validasi: 76 baris (Indeks 228 s/d 303)
------------------------------
Iterasi ke-4:
  Jumlah Data Latih: 304 baris (Indeks 0 s/d 303)
  Jumlah Data Validasi: 76 baris (Indeks 304 s/d 379)
------------------------------
Iterasi ke-5:
  Jumlah Data Latih: 380 baris (Indeks 0 s/d 379)
  Jumlah Data Validasi: 76 baris (Indeks 380 s/d 455)
------------------------------


In [None]:
df_check[['id_pasien', 'tgl_pemeriksaan', 'hb_now', 'hemoglobin', 'set']].iloc[
    len(X_train)-5 : len(X_train)+5
]


Unnamed: 0,id_pasien,tgl_pemeriksaan,hb_now,hemoglobin,set
451,39,2025-11-01,7.4,6.7,train
452,40,2025-11-01,8.9,7.15,train
453,41,2025-11-01,6.1,6.9,train
454,42,2025-11-01,9.0,8.4,train
455,46,2025-11-01,8.5,7.5,train
456,47,2025-11-01,7.9,8.2,test
457,49,2025-11-01,6.25,6.9,test
458,50,2025-11-01,6.6,6.2,test
459,52,2025-11-01,8.3,8.1,test
460,53,2025-11-01,8.6,7.0,test


In [5]:
import optuna
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# --- FUNGSI OBJECTIVE UNTUK MASING-MASING MODEL ---

def obj_svr(trial):
    # Parameter SVR (C dan Epsilon sangat krusial di sini)
    c = trial.suggest_float('C', 0.1, 100.0, log=True)
    epsilon = trial.suggest_float('epsilon', 0.01, 1.0, log=True)
    
    # SVR WAJIB Scaling (StandardScaler)
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVR(C=c, epsilon=epsilon))
    ])
    score = cross_val_score(pipe, X_train, y_train, cv=tscv, scoring='neg_mean_absolute_error')
    return -score.mean()

def obj_rf(trial):
    n_est = trial.suggest_int('n_estimators', 50, 300)
    depth = trial.suggest_int('max_depth', 3, 15)
    model = RandomForestRegressor(n_estimators=n_est, max_depth=depth, random_state=42)
    # RF tidak wajib scaling karena berbasis pohon
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_absolute_error')
    return -score.mean()

def obj_lgbm(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'verbose': -1
    }
    model = LGBMRegressor(**param, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_absolute_error')
    return -score.mean()

def obj_xgb(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10)
    }
    model = XGBRegressor(**param, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_absolute_error')
    return -score.mean()

# --- EKSEKUSI TUNING (Bisa memakan waktu beberapa menit) ---

print("Menjalankan Tuning SVR...")
study_svr = optuna.create_study(direction='minimize')
study_svr.optimize(obj_svr, n_trials=30)

print("Menjalankan Tuning Random Forest...")
study_rf = optuna.create_study(direction='minimize')
study_rf.optimize(obj_rf, n_trials=30)

print("Menjalankan Tuning LightGBM...")
study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(obj_lgbm, n_trials=30)

print("Menjalankan Tuning XGBoost...")
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(obj_xgb, n_trials=30)

print("\n--- TUNING SELESAI ---")
print(f"SVR MAE: {-study_svr.best_value:.4f}")
print(f"RF MAE: {-study_rf.best_value:.4f}")
print(f"LGBM MAE: {-study_lgbm.best_value:.4f}")
print(f"XGB MAE: {-study_xgb.best_value:.4f}")

[I 2026-02-05 00:11:40,906] A new study created in memory with name: no-name-2cfc6af7-4b70-4fa7-91b1-a6f7fb05ae6a


Menjalankan Tuning SVR...


[I 2026-02-05 00:11:41,339] Trial 0 finished with value: 0.8030019328600329 and parameters: {'C': 31.378271590607863, 'epsilon': 0.0891542805459877}. Best is trial 0 with value: 0.8030019328600329.
[I 2026-02-05 00:11:41,488] Trial 1 finished with value: 0.6668779716534728 and parameters: {'C': 0.6681316265014525, 'epsilon': 0.02813076064860852}. Best is trial 1 with value: 0.6668779716534728.
[I 2026-02-05 00:11:41,636] Trial 2 finished with value: 0.6918954264290572 and parameters: {'C': 3.9191496109293844, 'epsilon': 0.08971824395480109}. Best is trial 1 with value: 0.6668779716534728.
[I 2026-02-05 00:11:41,846] Trial 3 finished with value: 0.7528013205577773 and parameters: {'C': 22.4832884222223, 'epsilon': 0.2791425788102094}. Best is trial 1 with value: 0.6668779716534728.
[I 2026-02-05 00:11:42,022] Trial 4 finished with value: 0.6819665734745155 and parameters: {'C': 2.947041508893749, 'epsilon': 0.08241072772255462}. Best is trial 1 with value: 0.6668779716534728.
[I 2026-02

Menjalankan Tuning Random Forest...


[I 2026-02-05 00:11:47,174] Trial 0 finished with value: 0.6652211909136737 and parameters: {'n_estimators': 69, 'max_depth': 3}. Best is trial 0 with value: 0.6652211909136737.
[I 2026-02-05 00:11:52,587] Trial 1 finished with value: 0.6767365353421788 and parameters: {'n_estimators': 270, 'max_depth': 15}. Best is trial 0 with value: 0.6652211909136737.
[I 2026-02-05 00:11:54,523] Trial 2 finished with value: 0.6657394878398811 and parameters: {'n_estimators': 176, 'max_depth': 4}. Best is trial 0 with value: 0.6652211909136737.
[I 2026-02-05 00:11:58,012] Trial 3 finished with value: 0.6758261567782293 and parameters: {'n_estimators': 271, 'max_depth': 13}. Best is trial 0 with value: 0.6652211909136737.
[I 2026-02-05 00:11:59,586] Trial 4 finished with value: 0.674468155076047 and parameters: {'n_estimators': 95, 'max_depth': 6}. Best is trial 0 with value: 0.6652211909136737.
[I 2026-02-05 00:12:03,148] Trial 5 finished with value: 0.675229771941542 and parameters: {'n_estimators'

Menjalankan Tuning LightGBM...


[I 2026-02-05 00:12:37,887] Trial 0 finished with value: 0.695717874155897 and parameters: {'n_estimators': 267, 'learning_rate': 0.01690577724914156, 'max_depth': 5, 'num_leaves': 51}. Best is trial 0 with value: 0.695717874155897.
[I 2026-02-05 00:12:38,185] Trial 1 finished with value: 0.6858774422837781 and parameters: {'n_estimators': 254, 'learning_rate': 0.01583485111782879, 'max_depth': 3, 'num_leaves': 27}. Best is trial 1 with value: 0.6858774422837781.
[I 2026-02-05 00:12:38,354] Trial 2 finished with value: 0.7022994656620181 and parameters: {'n_estimators': 181, 'learning_rate': 0.005245433280664591, 'max_depth': 3, 'num_leaves': 76}. Best is trial 1 with value: 0.6858774422837781.
[I 2026-02-05 00:12:38,616] Trial 3 finished with value: 0.6878222997343262 and parameters: {'n_estimators': 238, 'learning_rate': 0.013210716864384507, 'max_depth': 10, 'num_leaves': 76}. Best is trial 1 with value: 0.6858774422837781.
[I 2026-02-05 00:12:38,776] Trial 4 finished with value: 0.

Menjalankan Tuning XGBoost...


[I 2026-02-05 00:12:45,998] Trial 0 finished with value: 0.7308228649264127 and parameters: {'n_estimators': 198, 'learning_rate': 0.005457089276286392, 'max_depth': 8}. Best is trial 0 with value: 0.7308228649264127.
[I 2026-02-05 00:12:46,955] Trial 1 finished with value: 0.7252700133665897 and parameters: {'n_estimators': 164, 'learning_rate': 0.008987220017350646, 'max_depth': 7}. Best is trial 1 with value: 0.7252700133665897.
[I 2026-02-05 00:12:47,277] Trial 2 finished with value: 0.6828521869650299 and parameters: {'n_estimators': 89, 'learning_rate': 0.0372710295162903, 'max_depth': 4}. Best is trial 2 with value: 0.6828521869650299.
[I 2026-02-05 00:12:47,846] Trial 3 finished with value: 0.7045128144472599 and parameters: {'n_estimators': 145, 'learning_rate': 0.007671405441855429, 'max_depth': 5}. Best is trial 2 with value: 0.6828521869650299.
[I 2026-02-05 00:12:48,419] Trial 4 finished with value: 0.6932363966648281 and parameters: {'n_estimators': 104, 'learning_rate': 


--- TUNING SELESAI ---
SVR MAE: -0.6580
RF MAE: -0.6634
LGBM MAE: -0.6803
XGB MAE: -0.6660


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# 1. Inisialisasi Model Final dengan Best Parameters dari Optuna
models_final = {
    "Linear Regression (Baseline)": LinearRegression(),
    "SVR": Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVR(**study_svr.best_params))
    ]),
    "Random Forest": RandomForestRegressor(**study_rf.best_params, random_state=42),
    "LightGBM": LGBMRegressor(**study_lgbm.best_params, random_state=42, verbose=-1),
    "XGBoost": XGBRegressor(**study_xgb.best_params, random_state=42)
}

# 2. Proses Training Final & Evaluasi di Data Test (20%)
final_results = []

for name, model in models_final.items():
    # Training menggunakan 80% data
    model.fit(X_train, y_train)
    
    # Prediksi menggunakan 20% data uji
    y_pred = model.predict(X_test)
    
    # Perhitungan Metrik Evaluasi
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    final_results.append({
        "Model": name,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2
    })

# 3. Menampilkan Tabel Perbandingan Akhir
df_final = pd.DataFrame(final_results).sort_values(by="MAE")
print(df_final)

                          Model       MAE      RMSE        R2
1                           SVR  0.586881  0.769638  0.468928
3                      LightGBM  0.637461  0.816735  0.401942
0  Linear Regression (Baseline)  0.638546  0.789638  0.440969
2                 Random Forest  0.651614  0.815175  0.404225
4                       XGBoost  0.651787  0.824313  0.390793


In [7]:
from sklearn.inspection import permutation_importance

# Analisis untuk SVR
result = permutation_importance(models_final["SVR"], X_test, y_test, n_repeats=10, random_state=42)

importance_svr = pd.DataFrame({
    'Fitur': X_train.columns,
    'Importance_Mean': result.importances_mean
}).sort_values(by='Importance_Mean', ascending=False)

print(importance_svr)

       Fitur  Importance_Mean
8     hb_now         0.435953
2       MCHC         0.070664
7        epo         0.064215
1         jk         0.062081
4        MCV         0.061514
6  trombosit         0.046443
5   leukosit         0.042322
0       usia         0.039836
3        MCH         0.027223
