### Library

In [1]:
from data.load_dataset import load_dataset
from data.merge_dataset import merge_dataset
from data.feature_engineering import *
from model.inference import save_csv
from model.feature_select import select_features
from model.data_split import split_features_and_target
from model.model_train import set_model, optuna_train
#from model.TreeModel import XGBoost
from pytorch_tabnet.tab_model import TabNetRegressor
import optuna
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Data load

In [2]:
# 기존 데이터 불러오기
train_data, test_data, sample_submission, interest_data, subway_data, school_data, park_data = load_dataset()
# 기존 데이터에 새로운 feature들을 병합한 데이터프레임 불러오기
train_data, test_data = merge_dataset(train_data, test_data, interest_data, subway_data, school_data, park_data)

### Data Preprocessing

In [3]:
# 위치 중복도 낮은 행 삭제
groups = train_data.groupby(["latitude", "longitude"])["index"].count()
conditioned_groups_index = groups[(groups >= 2) & (groups <= 5)].index # 이 범위를 파라미터로 조정하는걸로
small_groups = train_data[
    train_data["latitude"].isin(conditioned_groups_index.get_level_values(0)) &
    train_data["longitude"].isin(conditioned_groups_index.get_level_values(1))
]
train_data.drop(small_groups.index, axis=0, inplace=True)

# built_year > 2024 행 삭제
train_data = train_data[train_data["built_year"] < 2024]
train_data.reset_index(drop=True, inplace=True)

### Feature engineering

**Clustering**

In [4]:
# 데이터 준비
feature_columns = ["latitude", "longitude"]
coords = train_data[feature_columns]

# ClusteringModel 클래스 인스턴스 생성
clustering_model = ClusteringModel(data=coords)
kmeans_model = clustering_model.kmeans_clustering(n_clusters=25, 
                                                train_data=train_data, 
                                                test_data=test_data, 
                                                feature_columns=feature_columns, 
                                                label_column="region")

region_mean_prices = train_data.groupby("region")["deposit"].mean().reset_index()
region_mean_prices.columns = ["region", "mean_deposit"]
region_mean_prices["mean_deposit_category"] = region_mean_prices["mean_deposit"] // 10000

# train_data와 region_mean_prices 병합
train_data = train_data.merge(region_mean_prices, on="region", how="left")
test_data = test_data.merge(region_mean_prices, on="region", how="left")

**Log변환**

In [5]:
train_data, test_data = apply_log_transformation(train_data, test_data)

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


**train_data split**

In [6]:
X, y = split_features_and_target(train_data)

**Feature select**

In [7]:
#X, test_data = select_features(X, y, test_data)

In [8]:
X.columns

Index(['index', 'area_m2', 'contract_year_month', 'contract_day',
       'contract_type', 'floor', 'built_year', 'latitude', 'longitude', 'age',
       'interest_rate', 'nearest_subway_distance', 'nearest_subway_latitude',
       'nearest_subway_longitude', 'nearest_school_distance',
       'nearest_school_latitude', 'nearest_school_longitude',
       'nearest_park_distance', 'nearest_park_latitude',
       'nearest_park_longitude', 'nearest_subway_num', 'nearest_school_num',
       'nearest_park_num', 'num_of_subways_within_radius',
       'num_of_schools_within_radius', 'num_of_parks_within_radius',
       'park_exists', 'region', 'region_mean', 'nearest_leader_distance',
       'nearest_leader_latitude', 'nearest_leader_longitude', 'mean_deposit',
       'mean_deposit_category', 'log_area_m2', 'log_subway_distance',
       'log_school_distance', 'log_park_distance', 'log_leader_distance'],
      dtype='object')

In [9]:
y.columns

Index(['deposit', 'log_deposit'], dtype='object')

In [10]:
X.shape

(1790125, 39)

In [11]:
y.shape

(1790125, 2)

### Model Train and Evaluate

**Tabnet**
- 테이블 데이터에서도 딥러닝이 잘 동작할 수 있게 만들어진 모델
- 자동으로 중요한 features를 선택하기 떄문에 feature select부분은 제외

model_train.py 따라한 ver

In [12]:
# from sklearn.metrics import mean_absolute_error
# from sklearn.model_selection import KFold

# def cv_train(model, X: pd.DataFrame, y: pd.DataFrame, verbose: bool = True) -> float:
#     """K-Fold를 이용하여 Cross Validation을 수행하는 함수입니다.

#     Args:
#         model: 수행하려는 모델
#         X (pd.DataFrame): 독립 변수
#         y (pd.DataFrame): 예측 변수. deposit과 log_deposit 열로 나뉨.
#         verbose (bool, optional): Fold별 진행상황을 출력할지 여부. Defaults to True.

#     Returns:
#         float: 평균 MAE
#     """
#     cv = 5
#     kfold = KFold(n_splits=cv, shuffle=True, random_state=42)

#     mae_list = []
#     for i, (train_idx, valid_idx) in enumerate(kfold.split(X, y), start=1):
#         if verbose: print(f"training...[{i}/{cv}]")

#         X_train, y_train = X.loc[train_idx, :].values, y.loc[train_idx, "log_deposit"].values.reshape(-1, 1)
#         X_valid, y_valid = X.loc[valid_idx, :].values, y.loc[valid_idx, "deposit"].values.reshape(-1, 1)

#         model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric=["mae"])

#         y_pred = model.predict(X_valid)
#         y_pred = np.expm1(y_pred)
#         fold_mae = mean_absolute_error(y_valid, y_pred)
#         if verbose: print(f"Valid MAE: {fold_mae:.4f}")
#         mae_list.append(fold_mae)

#     mae = np.mean(mae_list)
#     if verbose:
#         print("### K-fold Result ###")
#         print(f"Valid MAE: {mae:.4f}")
    
#     return mae

In [13]:
# def objective(trial):
#     params = {
#         "n_d": trial.suggest_int("n_d", 16, 64),
#         "n_steps": trial.suggest_int("n_steps", 3, 10),
#         "gamma": trial.suggest_float("gamma", 1.0, 2.0),
#         "lambda_sparse": trial.suggest_float("lambda_sparse", 0.0001, 0.01),
#         "optimizer_fn": torch.optim.Adam,
#         "optimizer_params": dict(lr=trial.suggest_float("learning_rate", 0.01, 0.1)),
#     }
#     model = TabNetRegressor(**params)
#     return cv_train(model, X, y, verbose=False)

합친 ver

In [14]:
# from sklearn.metrics import mean_absolute_error
# from sklearn.model_selection import KFold, cross_val_predict

# def objective(trial):
#     params = {
#         "n_d": trial.suggest_int("n_d", 8, 64),
#         "n_a": trial.suggest_int("n_a", 8, 64),
#         "n_steps": trial.suggest_int("n_steps", 3, 10),
#         "gamma": trial.suggest_float("gamma", 1.0, 2.0),
#         "lambda_sparse": trial.suggest_float("lambda_sparse", 0.0001, 0.01),
#         "optimizer_params": dict(lr=trial.suggest_float("learning_rate", 0.01, 0.1)),
#     }
    
#     # K-Fold 교차 검증
#     cv = 5
#     kfold = KFold(n_splits=cv, shuffle=True, random_state=42)
#     mae_list = []
    
#     for i, (train_idx, valid_idx) in enumerate(kfold.split(X, y), start=1):
#         if True: print(f"training...[{i}/{cv}]")
#         X_train, y_train = X.loc[train_idx, :].values, y.loc[train_idx, "log_deposit"].values.reshape(-1, 1)
#         X_valid, y_valid = X.loc[valid_idx, :].values, y.loc[valid_idx, "deposit"].values.reshape(-1, 1)
        
#         model = TabNetRegressor(**params)
#         # 모델 학습 (patience : 성능 개선되지않으면 early stopping)
#         model.fit(
#             X_train, y_train, 
#             eval_set=[(X_valid, y_valid)], 
#             eval_metric=["mae"], 
#             max_epochs=100,
#             patience=10,
#             batch_size=1024,
#             virtual_batch_size=128,
#         )
        
        
#         # 검증 데이터에 대한 예측
#         y_pred = model.predict(X_valid.values)
#         y_pred = np.expm1(y_pred)  # 로그 변환의 역변환
        
#         # MAE 계산
#         mae = mean_absolute_error(y_valid, y_pred) 
#         mae_list.append(mae)

#     # 교차 검증 후 MAE 평균값 반환
#     return np.mean(mae_list)

In [15]:
# import pandas as pd
# import numpy as np
# from sklearn.metrics import mean_absolute_error
# from sklearn.model_selection import KFold
# from model.TreeModel import XGBoost, LightGBM, CatBoost
# import optuna
# RANDOM_SEED = 42

# def set_model(model_name: str, **params):
#     """주어진 모델 이름에 따라 모델을 생성하고 반환하는 함수입니다.

#     Args:
#         model_name (str): 생성하려는 모델 이름
#         **params (dict): 모델 생성 시 사용할 하이퍼파라미터

#     Returns:
#         model (object): 생성된 모델 객체
#     """
#     match model_name:
#         case "xgboost":
#             model = XGBoost(**params)
#         case "lightgbm":
#             model = LightGBM(**params)
#         case "catboost":
#             model = CatBoost(**params)
#         case "tabnet":
#             model = TabNetRegressor(**params)
#     return model

# def cv_train(model, X: pd.DataFrame, y: pd.DataFrame, verbose: bool = True) -> float:
#     """K-Fold를 이용하여 Cross Validation을 수행하는 함수입니다.

#     Args:
#         model: 수행하려는 모델
#         X (pd.DataFrame): 독립 변수
#         y (pd.DataFrame): 예측 변수. deposit과 log_deposit 열로 나뉨.
#         verbose (bool, optional): Fold별 진행상황을 출력할지 여부. Defaults to True.

#     Returns:
#         float: 평균 MAE
#     """
#     cv = 5
#     kfold = KFold(n_splits=cv, shuffle=True, random_state=42)

#     mae_list = []
#     for i, (train_idx, valid_idx) in enumerate(kfold.split(X, y), start=1):
#         if verbose: print(f"training...[{i}/{cv}]")

#         X_train, y_train = X.loc[train_idx, :].values, y.loc[train_idx, "log_deposit"].values.reshape(-1,1)
#         X_valid, y_valid = X.loc[valid_idx, :].values, y.loc[valid_idx, "deposit"].values.reshape(-1,1)

#         model.fit(
#             X_train, y_train, 
#             eval_set=[(X_valid, y_valid)], 
#             eval_metric=["mae"], 
#             max_epochs=100,
#             patience=10
#         )

#         y_pred = model.predict(X_valid)
#         y_pred = np.expm1(y_pred)
#         fold_mae = mean_absolute_error(y_valid, y_pred)
#         if verbose: print(f"Valid MAE: {fold_mae:.4f}")
#         mae_list.append(fold_mae)

#     mae = np.mean(mae_list)
#     if verbose:
#         print("### K-fold Result ###")
#         print(f"Valid MAE: {mae:.4f}")
    
#     return mae

# def optuna_train(model_name: str, X: pd.DataFrame, y: pd.DataFrame) -> tuple[dict, float]:
#     """Optuna를 사용하여 주어진 모델의 하이퍼파라미터를 최적하는 함수

#     Args:
#         model_name (str): 최적화할 모델의 이름
#         X (pd.DataFrame): 독립 변수
#         y (pd.DataFrame): 예측 변수

#     Returns:
#         tuple[dict, float]:
#             - dict: 최적의 하이퍼파라미터
#             - float: 최적의 하이퍼파라미터에 대한 성능 지표(MAE)
#     """
#     def objective(trial):
#         match model_name:
#             case "xgboost":
#                 params = {
#                     "n_estimators": trial.suggest_int("n_estimators", 50, 300),
#                     "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
#                     "max_depth": trial.suggest_int("max_depth", 5, 12),
#                     "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#                 }
#             case "lightgbm":
#                 params = {
#                     "verbose": -1,
#                     "n_estimators": trial.suggest_int("n_estimators", 50, 300),
#                     "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
#                     "max_depth": trial.suggest_int("max_depth", 5, 12),
#                     "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#                     "num_leaves": trial.suggest_int("num_leaves", 20, 150),
#                     "objective": "regression_l1"
#             }
#             case "catboost":
#                 params = {
#                     "verbose": 0,
#                     "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
#                     "iterations": trial.suggest_int("iterations", 50, 500),
#                     "depth": trial.suggest_int("depth", 3, 10),
#                     "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 1, 10),
#                     # "bagging_temperature": trial.suggest_loguniform("bagging_temperature", 0.01, 1),
#                     # "border_count": trial.suggest_int("border_count", 32, 255),
#                     "cat_features": ["contract_day"],
#                     "task_type": "GPU",
#                     "devices": "cuda",
#                 }
#             case "tabnet":
#                 params = {
#                     "n_d": trial.suggest_int("n_d", 8, 64),
#                     "n_steps": trial.suggest_int("n_steps", 3, 10),
#                     "gamma": trial.suggest_float("gamma", 1.0, 2.0),
#                     "lambda_sparse": trial.suggest_float("lambda_sparse", 0.0001, 0.01),
#                     "optimizer_params": dict(lr=trial.suggest_float("learning_rate", 0.01, 0.1)),
#                 }
#         model = set_model(model_name, **params)
#         return cv_train(model, X, y, verbose=False)
    
#     sampler = optuna.samplers.TPESampler(seed=42)
#     study = optuna.create_study(direction="minimize", sampler=sampler)
#     study.optimize(objective, n_trials=50)
#     return study.best_params, study.best_value

값이 너무 커서 hyperparameter 범위 조정 & kfold train,valid split부분 수정

In [16]:
# import numpy as np
# import pandas as pd
# from sklearn.metrics import mean_absolute_error
# from sklearn.model_selection import KFold
# import optuna
# import torch
# from pytorch_tabnet.tab_model import TabNetRegressor

# def cv_train(model, X: pd.DataFrame, y: pd.DataFrame, verbose: bool = True) -> float:
#     """K-Fold를 이용하여 Cross Validation을 수행하는 함수입니다.

#     Args:
#         model: 수행하려는 모델
#         X (pd.DataFrame): 독립 변수
#         y (pd.DataFrame): 예측 변수. deposit과 log_deposit 열로 나뉨.
#         verbose (bool, optional): Fold별 진행상황을 출력할지 여부. Defaults to True.

#     Returns:
#         float: 평균 MAE
#     """
#     cv = 5
#     kfold = KFold(n_splits=cv, shuffle=True, random_state=42)

#     mae_list = []
#     for i, (train_idx, valid_idx) in enumerate(kfold.split(X, y), start=1):
#         if verbose: print(f"training...[{i}/{cv}]")

#         X_train, y_train = X.iloc[train_idx, :].values, y.iloc[train_idx, 1].values.reshape(-1, 1)
#         X_valid, y_valid = X.iloc[valid_idx, :].values, y.iloc[valid_idx, 0].values.reshape(-1, 1)


#         model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric=["mae"])

#         y_pred = model.predict(X_valid)
#         y_pred = np.expm1(y_pred)  # 로그 변환 복구
        
#         fold_mae = mean_absolute_error(y_valid, y_pred)
#         if verbose: print(f"Valid MAE: {fold_mae:.4f}")
#         mae_list.append(fold_mae)

#     mae = np.mean(mae_list)
#     if verbose:
#         print("### K-fold Result ###")
#         print(f"Valid MAE: {mae:.4f}")
    
#     return mae

# def objective(trial):
#     params = {
#         "n_d": trial.suggest_int("n_d", 8, 64),
#         'n_a': trial.suggest_int('n_a', 8, 64),
#         "n_steps": trial.suggest_int("n_steps", 3, 10),
#         "gamma": trial.suggest_float("gamma", 1.0, 2.0),
#         'n_independent': trial.suggest_int('n_independent', 1, 5),
#         'n_shared': trial.suggest_int('n_shared', 1, 5),
#         "lambda_sparse": trial.suggest_float("lambda_sparse", 0.0001, 0.01),
#         "optimizer_fn": torch.optim.Adam,
#         "optimizer_params": dict(lr=trial.suggest_float("learning_rate", 0.001, 0.01)),
#     }
#     model = TabNetRegressor(**params)
#     return cv_train(model, X, y, verbose=False)


In [17]:
# sampler = optuna.samplers.TPESampler(seed=42)
# study = optuna.create_study(direction="minimize", sampler=sampler)
# study.optimize(objective, n_trials=50)

kfold 제거 ver 그러나 실패

In [18]:
# import numpy as np
# import pandas as pd
# from sklearn.metrics import mean_absolute_error
# import optuna
# import torch
# from pytorch_tabnet.tab_model import TabNetRegressor

# def train_model(model, X: pd.DataFrame, y: pd.DataFrame) -> float:
#     """모델을 학습하고 검증 MAE를 계산하는 함수입니다.

#     Args:
#         model: 수행하려는 모델
#         X (pd.DataFrame): 독립 변수
#         y (pd.DataFrame): 예측 변수. deposit과 log_deposit 열로 나뉨.

#     Returns:
#         float: 검증 MAE
#     """
#     # 모델 학습
#     model.fit(X.values, y.iloc[:, 0].values.reshape(-1, 1), eval_metric=["mae"])

#     # 예측 및 로그 변환 복구
#     y_pred = model.predict(X.values)
#     y_pred = np.expm1(y_pred)  # log_deposit의 inverse log 처리

#     # 검증 MAE 계산
#     mae = mean_absolute_error(y.iloc[:, 1], y_pred)  # deposit 열을 사용
#     return mae

# def objective(trial):
#     """Optuna를 이용하여 Hyperparameter 튜닝을 수행하는 함수입니다."""
#     params = {
#         "n_d": trial.suggest_int("n_d", 8, 64),
#         "n_a": trial.suggest_int("n_a", 8, 64),
#         "n_steps": trial.suggest_int("n_steps", 3, 10),
#         "gamma": trial.suggest_float("gamma", 1.0, 2.0),
#         "n_independent": trial.suggest_int("n_independent", 1, 5),
#         "n_shared": trial.suggest_int("n_shared", 1, 5),
#         "lambda_sparse": trial.suggest_float("lambda_sparse", 0.0001, 0.01),
#         "optimizer_fn": torch.optim.Adam,
#         "optimizer_params": dict(lr=trial.suggest_float("learning_rate", 0.001, 0.01)),
#     }

#     # TabNet 모델 생성
#     model = TabNetRegressor(**params)
    
#     # 모델 학습 및 MAE 계산
#     mae = train_model(model, X, y)
    
#     return mae

# # Optuna 실험 세팅 및 실행
# sampler = optuna.samplers.TPESampler(seed=42)
# study = optuna.create_study(direction="minimize", sampler=sampler)
# study.optimize(objective, n_trials=50)

# # 최적 하이퍼파라미터 출력
# best_params = study.best_params
# print("Best hyperparameters: ", best_params)
# print("Best MAE: ", study.best_value)


kfold 없애고 max_epoch=100 + GPU

In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
#from pytorch_tabnet.callbacks import Callback

X_train, X_val, y_train, y_val = train_test_split(X, y["log_deposit"], test_size=0.2, shuffle=True ,random_state=42)

In [20]:
#class MAECallback(Callback):
    # def on_epoch_end(self, epoch, logs=None):
    #     logs = logs or {}
    #     if 'valid_mae' in logs:
    #         print(f"Epoch {epoch+1} - Valid MAE: {logs['valid_mae']}")

def train_model(model, X: pd.DataFrame, y: pd.DataFrame) -> float:
    """모델을 학습하고 검증 MAE를 계산하는 함수입니다.

    Args:
        model: 수행하려는 모델
        X (pd.DataFrame): 독립 변수
        y (pd.DataFrame): 예측 변수. deposit과 log_deposit 열로 나뉨.

    Returns:
        float: 검증 MAE
    """
    # 모델 학습
    model.fit(
            X_train.values, y_train.values.reshape(-1, 1),
            eval_set=[(X_train.values, y_train.values.reshape(-1, 1)),(X_val.values, y_val.values.reshape(-1, 1))],
            eval_name=['train', 'valid'],
            eval_metric=["mae"],
            loss_fn=torch.nn.L1Loss(),
            max_epochs=30, 
            patience=10,
            batch_size=8192,
            drop_last=False,
            warm_start=True  # warm start 활성화
    )

    # 예측 및 로그 변환 복구
    y_pred_log = model.predict(X_val.values)
    y_pred = np.expm1(y_pred_log)  # log_deposit의 inverse log 처리

    # 검증 MAE 계산
    y_val_actual = np.expm1(y_val.values)
    mae = mean_absolute_error(y_val_actual, y_pred)  # deposit 열을 사용
    return mae

def objective(trial):
    """Optuna를 이용하여 Hyperparameter 튜닝을 수행하는 함수입니다."""
    # n_d를 먼저 제안합니다.
    n_d = trial.suggest_int("n_d", 8, 64)
    params = {
            "n_d": n_d,
            "n_a": n_d,  # n_a는 n_d와 동일하게 설정
            "n_steps": trial.suggest_int("n_steps", 3, 10),
            "gamma": trial.suggest_float("gamma", 1.0, 2.0),
            "n_independent": 2, # 필요하면 3, 4로 늘려본다
            "n_shared": 2, # 필요하면 3, 4로 늘려본다
            "lambda_sparse": trial.suggest_float("lambda_sparse", 0.001, 0.01),
            "optimizer_fn": torch.optim.Adam,
            "optimizer_params": dict(lr=trial.suggest_float("learning_rate", 0.001, 0.01)),
            "verbose": 1,
            "device_name" : "cuda" if torch.cuda.is_available() else "cpu",
            "seed" : 42
    }

    # TabNet 모델 생성
    model = TabNetRegressor(**params)
    
    # 모델 학습 및 MAE 계산
    mae = train_model(model, X, y)
    print(f"Trial {trial.number}: MAE = {mae}")
    
    return mae

In [None]:
# Optuna 실험 세팅 및 실행
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=10)

# 최적 하이퍼파라미터 출력
best_params = study.best_params
print("Best hyperparameters: ", best_params)
print("Best MAE: ", study.best_value)


In [22]:
# best_params = study.best_params
# print("Best parameters for Tabnet: ", best_params)

In [20]:
best_params = {
    'n_d': 42, 
    "n_a": 48,  # n_a는 n_d와 동일하게 설정
    'n_steps': 3,
    'gamma': 1.9699098521619942, 
    'lambda_sparse': 0.0019000671753502962, 
    'optimizer_params': {"lr" : 0.0026506405886809045},
    "n_independent": 5, # 필요하면 3, 4로 늘려본다
    "n_shared": 2, # 필요하면 3, 4로 늘려본다
    "optimizer_fn": torch.optim.Adam,
    "verbose": 1,
    "device_name" : "cuda" if torch.cuda.is_available() else "cpu",
    "seed" : 42
}
best_model = TabNetRegressor(**best_params)
best_model.fit(
            X_train.values, y_train.values.reshape(-1, 1),
            eval_set=[(X_train.values, y_train.values.reshape(-1, 1)),(X_val.values, y_val.values.reshape(-1, 1))],
            eval_name=['train', 'valid'],
            eval_metric=["mae"],
            loss_fn=torch.nn.L1Loss(),
            max_epochs=200, 
            patience=5,
            batch_size=8192,
            drop_last=False,
            warm_start=True
)



epoch 0  | loss: 1.50878 | train_mae: 138.04847| valid_mae: 138.0774|  0:01:49s
epoch 1  | loss: 0.28786 | train_mae: 16.48729| valid_mae: 16.49411|  0:03:37s
epoch 2  | loss: 0.25177 | train_mae: 1.72534 | valid_mae: 1.71847 |  0:05:24s
epoch 3  | loss: 0.2341  | train_mae: 0.30367 | valid_mae: 0.30375 |  0:07:12s
epoch 4  | loss: 0.22398 | train_mae: 0.21018 | valid_mae: 0.2099  |  0:08:59s
epoch 5  | loss: 0.21568 | train_mae: 0.19938 | valid_mae: 0.19905 |  0:10:46s
epoch 6  | loss: 0.2095  | train_mae: 0.19614 | valid_mae: 0.19607 |  0:12:33s
epoch 7  | loss: 0.20657 | train_mae: 0.1894  | valid_mae: 0.18946 |  0:14:20s
epoch 8  | loss: 0.19955 | train_mae: 0.18907 | valid_mae: 0.1892  |  0:16:08s
epoch 9  | loss: 0.19978 | train_mae: 0.18605 | valid_mae: 0.18594 |  0:17:55s
epoch 10 | loss: 0.19775 | train_mae: 0.18399 | valid_mae: 0.18397 |  0:19:44s
epoch 11 | loss: 0.19124 | train_mae: 0.17838 | valid_mae: 0.1786  |  0:21:34s
epoch 12 | loss: 0.18928 | train_mae: 0.17997 | val



model_train.py에 합친다면 ver

In [24]:
# best_params, mae = optuna_train("tabnet", X, y)
# best_model = set_model("tabnet", **best_params)
# best_model = best_model.fit(X.values, y["log_deposit"].values.reshape(-1,1))

### Inference

In [26]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.backends.cuda.matmul.allow_tf32 = True

In [None]:
test_data.isnull().sum()

In [None]:
# interest_rate 컬럼의 결측값을 평균값으로 대체
test_data["interest_rate"].fillna(test_data["interest_rate"].mean(), inplace=True)

In [None]:
test_data.columns

In [30]:
X_test = test_data.values

In [31]:
y_pred = best_model.predict(X_test)
y_pred = np.expm1(y_pred)

In [33]:
sample_submission["deposit"] = y_pred
sample_submission.to_csv("output.csv", index=False)

In [None]:
# save_csv(best_model, test_data, sample_submission)