## 파일 목적
Surrogate Model(single-objective)을 구현하고,
Surrogate Model(LightGBM, CatBoost, TabPFN)로 multi-objective를 구현하기 위해 end-to-end로 잘 동작하는지 확인한다.

## Surrogate Model(LightGBM)으로 multi-objective 구현

In [None]:
import src.datasets as datasets
import src.surrogate as surrogate
import src.search as search

import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split

In [None]:
import fireducks.pandas as pd
df = pd.read_csv('./data/concrete_processed.csv')
df

In [None]:
target_cols = ['strength', 'cement', 'water']

In [None]:
X = df.drop(columns=target_cols)
y = df[target_cols]    

In [None]:
X

In [None]:
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True

    
)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

In [None]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print(X_train.shape, type(X_train))
print(y_train.shape, type(y_train))
print(X_test.shape, type(X_test))
print(y_test.shape, type(y_test))

In [None]:
def lightgbm_multi_train(X_train: np.ndarray,
                         y_train: np.ndarray,
                         params: dict = None):
    """Fit one LightGBM regressor per target column via MultiOutputRegressor.

    Args:
        X_train: Training features passed straight to ``fit``.
        y_train: Training targets passed straight to ``fit``
            (assumed 2-D, one column per target -- confirm against callers).
        params: Keyword arguments for ``LGBMRegressor``. When ``None`` the
            defaults below are used. The dict is copied, never mutated.
            ``n_estimators`` defaults to 1000 unless the caller sets it.

    Returns:
        The fitted ``MultiOutputRegressor``.
    """
    if params is None:
        params = {
            "objective": "regression",
            "boosting_type": "gbdt",
            "learning_rate": 0.05,
            "num_leaves": 31,
            "max_depth": -1,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "n_jobs": -1,
            "random_state": 42,
        }

    # Work on a copy so the caller's dict stays untouched.
    model_params = dict(params)
    # Previously n_estimators was passed as a second keyword next to **params,
    # which raised "got multiple values for keyword argument" as soon as the
    # caller supplied their own n_estimators.
    model_params.setdefault("n_estimators", 1000)

    base_estimator = LGBMRegressor(**model_params)
    multi_model = MultiOutputRegressor(base_estimator)
    # NOTE(review): no validation split / early stopping is wired in here.
    # MultiOutputRegressor appears to forward the same fit kwargs to every
    # per-target estimator, so a 2-D y_val could not be used as eval_set
    # as-is -- confirm before adding it.
    multi_model.fit(X_train, y_train)

    return multi_model

In [None]:
model = lightgbm_multi_train(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
y_pred.shape

In [None]:
y_train.shape

In [None]:
# 모델 저장
import joblib

def lightgbm_save(model, path):
    """Persist ``model`` to ``path`` with a single ``joblib.dump`` call.

    The object is dumped exactly as passed in (the whole estimator); it is
    not split into per-target sub-models.
    """
    joblib.dump(value=model, filename=path)

In [None]:
lightgbm_save(model, './model_save/multi_lightgbm.pkl')

In [None]:
def lightgbm_load(path):
    """Read back an object previously written with ``joblib.dump``.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    restored = joblib.load(filename=path)
    return restored

In [None]:
loaded_model = lightgbm_load('./model_save/multi_lightgbm.pkl')

In [None]:
y_pred = loaded_model.predict(X_test)
y_pred.shape

In [None]:
import numpy as np

def lightgbm_multi_evaluate(model, y_train, y_pred, y_test, target_cols):
    """Print and return RMSE / MAE / R2 for multi-target predictions.

    One line is printed per target, followed by a line with the averages.

    Args:
        model: Unused; kept so existing call sites keep working.
        y_train: 2-D array-like (n_train, n_targets). Only its per-column mean
            is used, as the baseline of the R2 denominator.
        y_pred: 2-D array-like (n_test, n_targets) of predictions.
        y_test: 2-D array-like (n_test, n_targets) of ground-truth values.
        target_cols: Target names; ``target_cols[idx]`` labels column ``idx``.

    Returns:
        Tuple ``(rmse_mean, mae_mean, r2_mean)`` averaged over the targets.
    """
    y_train = np.asarray(y_train, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_test = np.asarray(y_test, dtype=float)

    rmse_list, mae_list, r2_list = [], [], []

    for idx, col in enumerate(target_cols):
        # Position in target_cols doubles as the column index of the arrays.
        y_true_i = y_test[:, idx]
        residual_i = y_true_i - y_pred[:, idx]

        mae_i = np.mean(np.abs(residual_i))
        rmse_i = np.sqrt(np.mean(residual_i ** 2))

        sse_i = np.sum(residual_i ** 2)
        # Baseline is the training mean of THIS target, and the squaring must
        # wrap the whole deviation. The old expression squared the global
        # mean of all targets and summed un-squared differences.
        baseline_i = np.mean(y_train[:, idx])
        sst_i = np.sum((y_true_i - baseline_i) ** 2)

        r2_i = 1 - sse_i / sst_i

        rmse_list.append(rmse_i)
        mae_list.append(mae_i)
        r2_list.append(r2_i)

        print(f"Target '{col}' - RMSE: {rmse_i:.4f}, MAE: {mae_i:.4f}, R2: {r2_i:.4f}")

    rmse_mean = np.mean(rmse_list)
    mae_mean = np.mean(mae_list)
    r2_mean = np.mean(r2_list)

    print(f"[Average Metrics] RMSE: {rmse_mean:.4f}, MAE: {mae_mean:.4f}, R2: {r2_mean:.4f}")
    return rmse_mean, mae_mean, r2_mean

In [None]:
rmse_mean, mae_mean, r2_mean = lightgbm_multi_evaluate(model,
                        y_train, 
                        y_pred,
                        y_test,
                        target_cols
                        )

print(f'lightGBM 모델 mulit-objective RMSE: {rmse_mean:.4f}')
print(f'lightGBM 모델 mulit-objective MAE: {mae_mean:.4f}')
print(f'lightGBM 모델 mulit-objective R^2: {r2_mean:.4f}')

## Surrogate Model(CatBoost)으로 single-objective 구현

In [None]:
import src.datasets as datasets
import src.surrogate as surrogate
import src.search as search

In [None]:
# The loader name is a constant, so plain attribute access replaces the
# getattr call with a placeholder-free f-string.
load_data_func = datasets.cement_data
# presumably returns an (X_train, X_test, y_train, y_test) split -- the loader
# is defined in src.datasets; verify there.
X_train, X_test, y_train, y_test = load_data_func('./data/concrete_processed.csv')

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# 비교를 위한 선형회귀
from sklearn.linear_model import LinearRegression
baseline_model = LinearRegression()
baseline_model.fit(X_train, y_train)
baseline_pred = baseline_model.predict(X_test)
baseline_r2 = 1 - np.sum((y_test - baseline_pred) ** 2) / (np.sum((y_test - y_train.mean()) ** 2) + 1e-10)
print(f'선형회귀 모델 R^2: {baseline_r2.item() :.4f}')

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Single-target CatBoost surrogate trained on the split loaded above.
model = CatBoostRegressor(
    iterations=2000,        # number of boosting iterations
    depth=7,                # tree depth
    learning_rate=0.05,     # learning rate
    bagging_temperature=1, # controls ensemble diversity (author's note: 1~3 recommended)
    # l2_leaf_reg=5,         # L2 regularization (author's note: tunable within 3~10)
    loss_function='RMSE',   # loss function (RMSE, since this is a regression task)
    # eval_metric='RMSE',     # evaluation metric
    random_seed=42,
    verbose=100,            # print training progress
    # NOTE(review): fit() below receives no eval_set, so early stopping
    # presumably never triggers -- confirm against the CatBoost docs.
    early_stopping_rounds=100  # early stopping
)

model.fit(X_train, y_train)

In [None]:
def eval_surrogate_model(y_train, y_pred, y_test):
    """Compute RMSE, MAE and R2 of ``y_pred`` against ``y_test``.

    Args:
        y_train: Training targets; only their overall mean is used, as the
            baseline of the R2 denominator.
        y_pred: Predictions. May be ``(n,)`` while ``y_test`` is ``(n, 1)``
            (or the reverse); it is reshaped to match ``y_test`` when both
            hold the same number of elements.
        y_test: Ground-truth targets.

    Returns:
        Tuple ``(rmse, mae, r2)``.
    """
    y_train = np.asarray(y_train, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_test = np.asarray(y_test, dtype=float)

    # (n,) minus (n, 1) broadcasts to an (n, n) matrix and silently yields
    # wrong metrics; align the shapes when the element counts agree.
    if y_pred.shape != y_test.shape and y_pred.size == y_test.size:
        y_pred = y_pred.reshape(y_test.shape)

    residual = y_test - y_pred
    rmse = np.sqrt(np.mean(residual ** 2))
    mae = np.mean(np.abs(residual))
    sse = np.sum(np.square(residual))
    sst = np.sum(np.square(y_test - y_train.mean()))
    r2 = 1 - sse / sst
    return rmse, mae, r2

In [None]:
y_pred = model.predict(X_test)
print(y_pred.shape)
if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)
print(y_pred.shape)

In [None]:
rmse, mae, r2 = eval_surrogate_model(y_train, y_pred, y_test)
print(f'catboost 모델 RMSE: {rmse:.4f}')
print(f'catboost 모델 MAE: {mae:.4f}')
print(f'catboost 모델 R^2: {r2:.4f}')

In [None]:
# 모델저장
def catboost_save(model, path):
    """Write ``model`` to ``path`` via its ``save_model`` method, in "cbm" format."""
    # "cbm" is CatBoost's own model format.
    output_format = "cbm"
    model.save_model(path, format=output_format)

In [None]:
catboost_save(model,'./model_save/catboost_model.cbm')

In [None]:
# 모델불러오기
def catboost_load(path):
    """Create a fresh ``CatBoostRegressor`` and load the model file at ``path`` into it.

    A regressor is used here; per the original note, a classification model
    would use ``CatBoostClassifier`` instead.
    """
    regressor = CatBoostRegressor()
    regressor.load_model(path)
    return regressor

In [None]:
loaded_model = catboost_load('./model_save/catboost_model.cbm')

In [None]:
y_pred = loaded_model.predict(X_test)

In [None]:
y_pred.shape

## Surrogate Model(CatBoost)으로 multi-objective 구현

In [None]:
import fireducks.pandas as pd
df = pd.read_csv('./data/concrete_processed.csv')
df

In [None]:
target_cols = ['strength', 'cement', 'water']
X = df.drop(columns=target_cols)
y = df[target_cols]    

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

In [None]:
print(X_train.shape, type(X_train))
print(y_train.shape, type(y_train))
print(X_test.shape, type(X_test))
print(y_test.shape, type(y_test))

In [None]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print(X_train.shape, type(X_train))
print(y_train.shape, type(y_train))
print(X_test.shape, type(X_test))
print(y_test.shape, type(y_test))

### 방법 1 : 개별모델학습해서 multi-objective 최적화

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Approach 1: train one independent CatBoost regressor per target column.
models = []
# Iterate over the columns of y_train (the array actually indexed below)
# rather than the un-split DataFrame ``y``.
for i in range(y_train.shape[1]):
    model = CatBoostRegressor(
        iterations=2000,
        depth=7,
        learning_rate=0.05,
        loss_function='RMSE',
        random_seed=42,
        verbose=200
    )
    # NOTE(review): no eval_set is passed, so early_stopping_rounds presumably
    # has no effect here -- confirm against the CatBoost docs.
    model.fit(X_train, y_train[:, i], early_stopping_rounds=100)
    models.append(model)

In [None]:
y_preds = []
for m in models:
    y_pred = m.predict(X_test)
    print(y_pred.shape)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)
    print(y_pred.shape)
    y_preds.append(y_pred)
y_preds = np.column_stack(y_preds)
print(y_preds.shape)

# y_preds = np.column_stack([m.predict(X_test) for m in models])

In [None]:
import numpy as np

def catboost_multi_evaluate(model, y_train, y_pred, y_test, target_cols):
    """Print and return RMSE / MAE / R2 for multi-target predictions.

    One line is printed per target, followed by a line with the averages.

    Args:
        model: Unused; kept so existing call sites keep working.
        y_train: 2-D array-like (n_train, n_targets). Only its per-column mean
            is used, as the baseline of the R2 denominator.
        y_pred: 2-D array-like (n_test, n_targets) of predictions.
        y_test: 2-D array-like (n_test, n_targets) of ground-truth values.
        target_cols: Target names; ``target_cols[idx]`` labels column ``idx``.

    Returns:
        Tuple ``(rmse_mean, mae_mean, r2_mean)`` averaged over the targets.
    """
    y_train = np.asarray(y_train, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_test = np.asarray(y_test, dtype=float)

    rmse_list, mae_list, r2_list = [], [], []

    for idx, col in enumerate(target_cols):
        # Position in target_cols doubles as the column index of the arrays.
        y_true_i = y_test[:, idx]
        residual_i = y_true_i - y_pred[:, idx]

        mae_i = np.mean(np.abs(residual_i))
        rmse_i = np.sqrt(np.mean(residual_i ** 2))

        sse_i = np.sum(residual_i ** 2)
        # Baseline is the training mean of THIS target, and the squaring must
        # wrap the whole deviation. The old expression squared the global
        # mean of all targets and summed un-squared differences.
        baseline_i = np.mean(y_train[:, idx])
        sst_i = np.sum((y_true_i - baseline_i) ** 2)

        r2_i = 1 - sse_i / sst_i

        rmse_list.append(rmse_i)
        mae_list.append(mae_i)
        r2_list.append(r2_i)

        print(f"Target '{col}' - RMSE: {rmse_i:.4f}, MAE: {mae_i:.4f}, R2: {r2_i:.4f}")

    rmse_mean = np.mean(rmse_list)
    mae_mean = np.mean(mae_list)
    r2_mean = np.mean(r2_list)

    print(f"[Average Metrics] RMSE: {rmse_mean:.4f}, MAE: {mae_mean:.4f}, R2: {r2_mean:.4f}")
    return rmse_mean, mae_mean, r2_mean

In [None]:
rmse_mean, mae_mean, r2_mean = catboost_multi_evaluate(model,
                            y_train,    
                            y_preds,  
                            y_test,   
                            target_cols)
print(f'catboost 모델 mulit-objective RMSE: {rmse_mean:.4f}')
print(f'catboost 모델 mulit-objective MAE: {mae_mean:.4f}')
print(f'catboost 모델 mulit-objective R^2: {r2_mean:.4f}')

### 방법 2 : custom loss function 활용 - 실패

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
class MultiObjectiveLoss:
    """Custom CatBoost objective blending a squared-error and an absolute-error term.

    Per-sample loss: ``alpha * 0.5 * (approx - target)**2 + beta * |approx - target|``.

    NOTE(review): ``calc_ders_range`` is CatBoost's single-target hook. Fitting
    on a 2-D target presumably needs a multi-target objective
    (``calc_ders_multi``) instead -- confirm in the CatBoost documentation.
    """

    def __init__(self, alpha=0.5, beta=0.5):
        """Store the term weights.

        Args:
            alpha: Weight of the squared-error (RMSE-like) term.
            beta: Weight of the absolute-error (MAE-like) term.
        """
        self.alpha = alpha
        self.beta = beta

    def calc_ders_range(self, approxes, targets, weights):
        """Return per-sample first and second derivatives for CatBoost.

        CatBoost expects derivatives of the function being *maximised*, so
        both values are those of the negated loss (same convention as the
        RMSE example in the CatBoost docs: ``der1 = target - approx``,
        ``der2 = -1``).

        Args:
            approxes: Current predictions, one per sample.
            targets: Ground-truth values, one per sample.
            weights: Optional per-sample weights, or ``None``.

        Returns:
            List of ``(der1, der2)`` tuples, one per sample.

        Raises:
            ValueError: If the input lengths disagree.
        """
        if len(approxes) != len(targets):
            raise ValueError("approxes and targets must have the same length")
        if weights is not None and len(weights) != len(approxes):
            raise ValueError("weights must have the same length as approxes")

        result = []
        for index, (approx, target) in enumerate(zip(approxes, targets)):
            error = approx - target
            # Gradient of the loss w.r.t. the prediction.
            loss_grad = self.alpha * error + self.beta * np.sign(error)

            der1 = -float(loss_grad)
            # Constant unit curvature as in the original code, sign-flipped
            # to match the maximisation convention.
            der2 = -1.0

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))

        return result

In [None]:
model = CatBoostRegressor(
    iterations=2000,
    depth=7,
    learning_rate=0.05,
    loss_function=MultiObjectiveLoss(alpha=0.7, beta=0.3),  # 가중치 조정 가능
    random_seed=42,
    verbose=200
)

In [None]:
model.fit(X_train, y_train)

### 방법 3 : CatBoost의 MultiRegression 모드

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
model = CatBoostRegressor(
    iterations=2000,
    depth=7,
    learning_rate=0.05,
    loss_function="MultiRMSE",  # MultiRegression을 위한 손실 함수
    random_seed=42,
    verbose=200
)

In [None]:
model.fit(X_train, y_train)


In [None]:
y_preds = model.predict(X_test)
print(y_preds.shape)
print(type(y_preds))

In [None]:
print(y_test.shape)
print(type(y_test))

In [None]:
import numpy as np

def catboost_multi_evaluate(model, y_train, y_pred, y_test, target_cols):
    """Print and return RMSE / MAE / R2 for multi-target predictions.

    One line is printed per target, followed by a line with the averages.

    Args:
        model: Unused; kept so existing call sites keep working.
        y_train: 2-D array-like (n_train, n_targets). Only its per-column mean
            is used, as the baseline of the R2 denominator.
        y_pred: 2-D array-like (n_test, n_targets) of predictions.
        y_test: 2-D array-like (n_test, n_targets) of ground-truth values.
        target_cols: Target names; ``target_cols[idx]`` labels column ``idx``.

    Returns:
        Tuple ``(rmse_mean, mae_mean, r2_mean)`` averaged over the targets.
    """
    y_train = np.asarray(y_train, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_test = np.asarray(y_test, dtype=float)

    rmse_list, mae_list, r2_list = [], [], []

    for idx, col in enumerate(target_cols):
        # Position in target_cols doubles as the column index of the arrays.
        y_true_i = y_test[:, idx]
        residual_i = y_true_i - y_pred[:, idx]

        mae_i = np.mean(np.abs(residual_i))
        rmse_i = np.sqrt(np.mean(residual_i ** 2))

        sse_i = np.sum(residual_i ** 2)
        # Baseline is the training mean of THIS target, and the squaring must
        # wrap the whole deviation. The old expression squared the global
        # mean of all targets and summed un-squared differences.
        baseline_i = np.mean(y_train[:, idx])
        sst_i = np.sum((y_true_i - baseline_i) ** 2)

        r2_i = 1 - sse_i / sst_i

        rmse_list.append(rmse_i)
        mae_list.append(mae_i)
        r2_list.append(r2_i)

        print(f"Target '{col}' - RMSE: {rmse_i:.4f}, MAE: {mae_i:.4f}, R2: {r2_i:.4f}")

    rmse_mean = np.mean(rmse_list)
    mae_mean = np.mean(mae_list)
    r2_mean = np.mean(r2_list)

    print(f"[Average Metrics] RMSE: {rmse_mean:.4f}, MAE: {mae_mean:.4f}, R2: {r2_mean:.4f}")
    return rmse_mean, mae_mean, r2_mean

In [None]:
rmse_mean, mae_mean, r2_mean = catboost_multi_evaluate(model,
                            y_train,    
                            y_preds,  
                            y_test,   
                            target_cols)
print(f'catboost 모델 mulit-objective RMSE: {rmse_mean:.4f}')
print(f'catboost 모델 mulit-objective MAE: {mae_mean:.4f}')
print(f'catboost 모델 mulit-objective R^2: {r2_mean:.4f}')

In [None]:
# 모델저장
def catboost_save(model, path):
    """Write a CatBoost model to the given file path.

    Args:
        model: Model object exposing ``save_model`` (a ``CatBoostRegressor``
            in this notebook).
        path (str): Destination file path.

    Returns:
        None
    """
    # "cbm" is CatBoost's own model format.
    output_format = 'cbm'
    model.save_model(path, format=output_format)

In [None]:
catboost_save(model,'./model_save/catboost_multi_model.cbm')

In [None]:
# 모델불러오기
def catboost_load(path):
    """Create a fresh ``CatBoostRegressor`` and load the model file at ``path`` into it.

    A regressor is used here; per the original note, a classification model
    would use ``CatBoostClassifier`` instead.
    """
    regressor = CatBoostRegressor()
    regressor.load_model(path)
    return regressor

In [None]:
loaded_model = catboost_load('./model_save/catboost_multi_model.cbm')

In [None]:
y_pred = loaded_model.predict(X_test)

## Surrogate Model(TabPFN)으로 multi-objective 구현

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
from tabpfn import TabPFNRegressor

In [None]:
models = []
for i in range(y_train.shape[1]):
    model = TabPFNRegressor()
    # model.fit(X_train, y_train.iloc[:, i])
    model.fit(X_train, y_train[:, i])
    models.append(model)

In [None]:
y_preds = []
for m in models:
    y_pred = m.predict(X_test)
    print(y_pred.shape)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)
    print(y_pred.shape)
    y_preds.append(y_pred)
y_preds = np.column_stack(y_preds)
print(y_preds.shape, type(y_preds))

In [None]:
import numpy as np

def tabpfn_multi_evaluate(model, y_train, y_pred, y_test, target_cols):
    """Print and return RMSE / MAE / R2 for multi-target predictions.

    One line is printed per target, followed by a line with the averages.

    Args:
        model: Unused; kept so existing call sites keep working.
        y_train: 2-D array-like (n_train, n_targets). Only its per-column mean
            is used, as the baseline of the R2 denominator.
        y_pred: 2-D array-like (n_test, n_targets) of predictions.
        y_test: 2-D array-like (n_test, n_targets) of ground-truth values.
        target_cols: Target names; ``target_cols[idx]`` labels column ``idx``.

    Returns:
        Tuple ``(rmse_mean, mae_mean, r2_mean)`` averaged over the targets.
    """
    y_train = np.asarray(y_train, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_test = np.asarray(y_test, dtype=float)

    rmse_list, mae_list, r2_list = [], [], []

    for idx, col in enumerate(target_cols):
        # Position in target_cols doubles as the column index of the arrays.
        y_true_i = y_test[:, idx]
        residual_i = y_true_i - y_pred[:, idx]

        mae_i = np.mean(np.abs(residual_i))
        rmse_i = np.sqrt(np.mean(residual_i ** 2))

        sse_i = np.sum(residual_i ** 2)
        # Baseline is the training mean of THIS target, and the squaring must
        # wrap the whole deviation. The old expression squared the global
        # mean of all targets and summed un-squared differences.
        baseline_i = np.mean(y_train[:, idx])
        sst_i = np.sum((y_true_i - baseline_i) ** 2)

        r2_i = 1 - sse_i / sst_i

        rmse_list.append(rmse_i)
        mae_list.append(mae_i)
        r2_list.append(r2_i)

        print(f"Target '{col}' - RMSE: {rmse_i:.4f}, MAE: {mae_i:.4f}, R2: {r2_i:.4f}")

    rmse_mean = np.mean(rmse_list)
    mae_mean = np.mean(mae_list)
    r2_mean = np.mean(r2_list)

    print(f"[Average Metrics] RMSE: {rmse_mean:.4f}, MAE: {mae_mean:.4f}, R2: {r2_mean:.4f}")
    return rmse_mean, mae_mean, r2_mean

In [None]:
rmse_mean, mae_mean, r2_mean = tabpfn_multi_evaluate(model,
                            y_train,    
                            y_preds,  
                            y_test,   
                            target_cols)
print(f'tabpfn 모델 mulit-objective RMSE: {rmse_mean:.4f}')
print(f'tabpfn 모델 mulit-objective MAE: {mae_mean:.4f}')
print(f'tabpfn 모델 mulit-objective R^2: {r2_mean:.4f}')