In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from utils import automated_pipeline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [5]:
file_raw = r'E:\airta drafts\PREDIKSI KADAR HB\data\raw\erm_hd.xlsx'
df_raw = pd.read_excel(file_raw)

df = automated_pipeline(df_raw)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643 entries, 0 to 642
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id_pasien        643 non-null    int64         
 1   tgl_pemeriksaan  643 non-null    datetime64[ns]
 2   usia             643 non-null    Int64         
 3   jk               643 non-null    Int64         
 4   eritrosit        643 non-null    float64       
 5   hematokrit       643 non-null    float64       
 6   MCHC             643 non-null    float64       
 7   MCH              643 non-null    float64       
 8   MCV              643 non-null    float64       
 9   hemoglobin       643 non-null    float64       
 10  leukosit         643 non-null    Int64         
 11  trombosit        643 non-null    Int64         
 12  epo              643 non-null    Int64         
dtypes: Int64(5), datetime64[ns](1), float64(6), int64(1)
memory usage: 68.6 KB


In [6]:
df = df.sort_values(by=['tgl_pemeriksaan', 'id_pasien']).reset_index(drop=True)

#LAG FEATURE untuk TSCV
df['hb_now'] = df.groupby('id_pasien')['hemoglobin'].shift(1)
df = df.dropna(subset=['hb_now']).reset_index(drop=True)

#MENENTUKAN FITUR X DAN TARGET Y
X = df.drop(columns=['id_pasien', 'tgl_pemeriksaan', 'hemoglobin'])
y = df['hemoglobin']

#SPLIT 80:20 BERDASARKAN KRONOLOGIS
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    shuffle=False
)

print(f"Total baris data: {len(df)}")
print(f"Jumlah Data Latih (80%): {len(X_train)}")
print(f"Jumlah Data Uji (20%): {len(X_test)}")
print("-" * 30)
print("Fitur yang digunakan (X):", X.columns.tolist())

Total baris data: 571
Jumlah Data Latih (80%): 456
Jumlah Data Uji (20%): 115
------------------------------
Fitur yang digunakan (X): ['usia', 'jk', 'eritrosit', 'hematokrit', 'MCHC', 'MCH', 'MCV', 'leukosit', 'trombosit', 'epo', 'hb_now']


In [7]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)

# Cek bagaimana TSCV membagi data X_train kamu
print(f"Total baris di X_train: {len(X_train)}")
print("-" * 30)

for i, (train_index, val_index) in enumerate(tscv.split(X_train)):
    print(f"Iterasi ke-{i+1}:")
    print(f"  Jumlah Data Latih: {len(train_index)} baris (Indeks {train_index[0]} s/d {train_index[-1]})")
    print(f"  Jumlah Data Validasi: {len(val_index)} baris (Indeks {val_index[0]} s/d {val_index[-1]})")
    print("-" * 30)

Total baris di X_train: 456
------------------------------
Iterasi ke-1:
  Jumlah Data Latih: 76 baris (Indeks 0 s/d 75)
  Jumlah Data Validasi: 76 baris (Indeks 76 s/d 151)
------------------------------
Iterasi ke-2:
  Jumlah Data Latih: 152 baris (Indeks 0 s/d 151)
  Jumlah Data Validasi: 76 baris (Indeks 152 s/d 227)
------------------------------
Iterasi ke-3:
  Jumlah Data Latih: 228 baris (Indeks 0 s/d 227)
  Jumlah Data Validasi: 76 baris (Indeks 228 s/d 303)
------------------------------
Iterasi ke-4:
  Jumlah Data Latih: 304 baris (Indeks 0 s/d 303)
  Jumlah Data Validasi: 76 baris (Indeks 304 s/d 379)
------------------------------
Iterasi ke-5:
  Jumlah Data Latih: 380 baris (Indeks 0 s/d 379)
  Jumlah Data Validasi: 76 baris (Indeks 380 s/d 455)
------------------------------


In [8]:
import optuna
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# --- FUNGSI OBJECTIVE UNTUK MASING-MASING MODEL ---

def obj_svr(trial):
    # Parameter SVR (C dan Epsilon sangat krusial di sini)
    c = trial.suggest_float('C', 0.1, 100.0, log=True)
    epsilon = trial.suggest_float('epsilon', 0.01, 1.0, log=True)
    
    # SVR WAJIB Scaling (StandardScaler)
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVR(C=c, epsilon=epsilon))
    ])
    score = cross_val_score(pipe, X_train, y_train, cv=tscv, scoring='neg_mean_absolute_error')
    return -score.mean()

def obj_rf(trial):
    n_est = trial.suggest_int('n_estimators', 50, 300)
    depth = trial.suggest_int('max_depth', 3, 15)
    model = RandomForestRegressor(n_estimators=n_est, max_depth=depth, random_state=42)
    # RF tidak wajib scaling karena berbasis pohon
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_absolute_error')
    return -score.mean()

def obj_lgbm(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'verbose': -1
    }
    model = LGBMRegressor(**param, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_absolute_error')
    return -score.mean()

def obj_xgb(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10)
    }
    model = XGBRegressor(**param, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_absolute_error')
    return -score.mean()

# --- EKSEKUSI TUNING (Bisa memakan waktu beberapa menit) ---

print("Menjalankan Tuning SVR...")
study_svr = optuna.create_study(direction='minimize')
study_svr.optimize(obj_svr, n_trials=30)

print("Menjalankan Tuning Random Forest...")
study_rf = optuna.create_study(direction='minimize')
study_rf.optimize(obj_rf, n_trials=30)

print("Menjalankan Tuning LightGBM...")
study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(obj_lgbm, n_trials=30)

print("Menjalankan Tuning XGBoost...")
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(obj_xgb, n_trials=30)

print("\n--- TUNING SELESAI ---")
print(f"SVR MAE: {-study_svr.best_value:.4f}")
print(f"RF MAE: {-study_rf.best_value:.4f}")
print(f"LGBM MAE: {-study_lgbm.best_value:.4f}")
print(f"XGB MAE: {-study_xgb.best_value:.4f}")

[I 2026-02-04 23:31:39,146] A new study created in memory with name: no-name-756105ed-5bc3-46e3-b333-84aa2c5b04e0


Menjalankan Tuning SVR...


[I 2026-02-04 23:31:39,463] Trial 0 finished with value: 0.39625795859430707 and parameters: {'C': 1.6329705839213176, 'epsilon': 0.7881922204834645}. Best is trial 0 with value: 0.39625795859430707.
[I 2026-02-04 23:31:39,676] Trial 1 finished with value: 0.32544036273629506 and parameters: {'C': 0.1526543388475578, 'epsilon': 0.01148173123662436}. Best is trial 1 with value: 0.32544036273629506.
[I 2026-02-04 23:31:39,881] Trial 2 finished with value: 0.32042740490777605 and parameters: {'C': 0.3247763056603174, 'epsilon': 0.32067519918442866}. Best is trial 2 with value: 0.32042740490777605.
[I 2026-02-04 23:31:40,072] Trial 3 finished with value: 0.2220361210144563 and parameters: {'C': 1.2622858277301936, 'epsilon': 0.19407377312522817}. Best is trial 3 with value: 0.2220361210144563.
[I 2026-02-04 23:31:40,164] Trial 4 finished with value: 0.3417658665625399 and parameters: {'C': 78.65062661094423, 'epsilon': 0.6071855077589446}. Best is trial 3 with value: 0.2220361210144563.
[I

Menjalankan Tuning Random Forest...


[I 2026-02-04 23:31:51,234] Trial 0 finished with value: 0.14672661461594144 and parameters: {'n_estimators': 295, 'max_depth': 6}. Best is trial 0 with value: 0.14672661461594144.
[I 2026-02-04 23:31:53,155] Trial 1 finished with value: 0.1419316510466085 and parameters: {'n_estimators': 95, 'max_depth': 11}. Best is trial 1 with value: 0.1419316510466085.
[I 2026-02-04 23:31:55,868] Trial 2 finished with value: 0.14123869610496162 and parameters: {'n_estimators': 122, 'max_depth': 14}. Best is trial 2 with value: 0.14123869610496162.
[I 2026-02-04 23:31:58,645] Trial 3 finished with value: 0.14143815349061692 and parameters: {'n_estimators': 106, 'max_depth': 10}. Best is trial 2 with value: 0.14123869610496162.
[I 2026-02-04 23:31:59,848] Trial 4 finished with value: 0.14407733412055784 and parameters: {'n_estimators': 50, 'max_depth': 12}. Best is trial 2 with value: 0.14123869610496162.
[I 2026-02-04 23:32:02,859] Trial 5 finished with value: 0.14149271931927485 and parameters: {'

Menjalankan Tuning LightGBM...


[I 2026-02-04 23:33:21,814] Trial 0 finished with value: 0.4490123706827525 and parameters: {'n_estimators': 149, 'learning_rate': 0.005120374389916627, 'max_depth': 9, 'num_leaves': 87}. Best is trial 0 with value: 0.4490123706827525.
[I 2026-02-04 23:33:21,987] Trial 1 finished with value: 0.3199410656703777 and parameters: {'n_estimators': 117, 'learning_rate': 0.010644553759084052, 'max_depth': 10, 'num_leaves': 22}. Best is trial 1 with value: 0.3199410656703777.
[I 2026-02-04 23:33:22,324] Trial 2 finished with value: 0.19105111511828993 and parameters: {'n_estimators': 270, 'learning_rate': 0.04756686303321451, 'max_depth': 7, 'num_leaves': 24}. Best is trial 2 with value: 0.19105111511828993.
[I 2026-02-04 23:33:22,608] Trial 3 finished with value: 0.1927366284842246 and parameters: {'n_estimators': 226, 'learning_rate': 0.061303936754439944, 'max_depth': 9, 'num_leaves': 83}. Best is trial 2 with value: 0.19105111511828993.
[I 2026-02-04 23:33:22,811] Trial 4 finished with val

Menjalankan Tuning XGBoost...


[I 2026-02-04 23:33:28,808] Trial 0 finished with value: 0.48939607696441945 and parameters: {'n_estimators': 64, 'learning_rate': 0.010576487376901384, 'max_depth': 4}. Best is trial 0 with value: 0.48939607696441945.
[I 2026-02-04 23:33:29,155] Trial 1 finished with value: 0.3906325405613658 and parameters: {'n_estimators': 50, 'learning_rate': 0.019435640366601392, 'max_depth': 6}. Best is trial 1 with value: 0.3906325405613658.
[I 2026-02-04 23:33:29,617] Trial 2 finished with value: 0.14904507860460545 and parameters: {'n_estimators': 75, 'learning_rate': 0.04553156787287357, 'max_depth': 6}. Best is trial 2 with value: 0.14904507860460545.
[I 2026-02-04 23:33:30,917] Trial 3 finished with value: 0.13668827455998234 and parameters: {'n_estimators': 236, 'learning_rate': 0.053838317414357226, 'max_depth': 7}. Best is trial 3 with value: 0.13668827455998234.
[I 2026-02-04 23:33:31,628] Trial 4 finished with value: 0.1709755960354965 and parameters: {'n_estimators': 169, 'learning_ra


--- TUNING SELESAI ---
SVR MAE: -0.1564
RF MAE: -0.1404
LGBM MAE: -0.1827
XGB MAE: -0.1165


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# 1. Inisialisasi Model Final dengan Best Parameters dari Optuna
models_final = {
    "Linear Regression (Baseline)": LinearRegression(),
    "SVR": Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVR(**study_svr.best_params))
    ]),
    "Random Forest": RandomForestRegressor(**study_rf.best_params, random_state=42),
    "LightGBM": LGBMRegressor(**study_lgbm.best_params, random_state=42, verbose=-1),
    "XGBoost": XGBRegressor(**study_xgb.best_params, random_state=42)
}

# 2. Proses Training Final & Evaluasi di Data Test (20%)
final_results = []

for name, model in models_final.items():
    # Training menggunakan 80% data
    model.fit(X_train, y_train)
    
    # Prediksi menggunakan 20% data uji
    y_pred = model.predict(X_test)
    
    # Perhitungan Metrik Evaluasi
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    final_results.append({
        "Model": name,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2
    })

# 3. Menampilkan Tabel Perbandingan Akhir
df_final = pd.DataFrame(final_results).sort_values(by="MAE")
print(df_final)

                          Model       MAE      RMSE        R2
0  Linear Regression (Baseline)  0.061925  0.092850  0.992271
4                       XGBoost  0.071231  0.111690  0.988816
1                           SVR  0.072682  0.120805  0.986916
2                 Random Forest  0.076262  0.127864  0.985342
3                      LightGBM  0.098769  0.152281  0.979209
