### Library

In [1]:
from data.load_dataset import load_dataset
from data.merge_dataset import merge_dataset
from data.feature_engineering import *
from model.inference import save_csv
from model.feature_select import select_features
from model.data_split import split_features_and_target
from model.log_transformation import apply_log_transformation
from model.model_train import set_model, optuna_train
#from model.TreeModel import XGBoost
from pytorch_tabnet.tab_model import TabNetRegressor
import optuna
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Data load

In [2]:
# 기존 데이터 불러오기
train_data, test_data, sample_submission, interest_data, subway_data, school_data, park_data = load_dataset()
# 기존 데이터에 새로운 feature들을 병합한 데이터프레임 불러오기
train_data, test_data = merge_dataset(train_data, test_data, interest_data, subway_data, school_data, park_data)

### Data Preprocessing

In [3]:
# 위치 중복도 낮은 행 삭제
groups = train_data.groupby(["latitude", "longitude"])["index"].count()
conditioned_groups_index = groups[(groups >= 2) & (groups <= 5)].index # 이 범위를 파라미터로 조정하는걸로
small_groups = train_data[
    train_data["latitude"].isin(conditioned_groups_index.get_level_values(0)) &
    train_data["longitude"].isin(conditioned_groups_index.get_level_values(1))
]
train_data.drop(small_groups.index, axis=0, inplace=True)

# built_year > 2024 행 삭제
train_data = train_data[train_data["built_year"] < 2024]
train_data.reset_index(drop=True, inplace=True)

### Feature engineering

**Clustering**

In [4]:
# 데이터 준비
feature_columns = ["latitude", "longitude"]
coords = train_data[feature_columns]

# ClusteringModel 클래스 인스턴스 생성
clustering_model = ClusteringModel(data=coords)
kmeans_model = clustering_model.kmeans_clustering(n_clusters=25, 
                                                train_data=train_data, 
                                                test_data=test_data, 
                                                feature_columns=feature_columns, 
                                                label_column="region")

region_mean_prices = train_data.groupby("region")["deposit"].mean().reset_index()
region_mean_prices.columns = ["region", "mean_deposit"]
region_mean_prices["mean_deposit_category"] = region_mean_prices["mean_deposit"] // 10000

# train_data와 region_mean_prices 병합
train_data = train_data.merge(region_mean_prices, on="region", how="left")
test_data = test_data.merge(region_mean_prices, on="region", how="left")

**Log변환**

In [5]:
train_data, test_data = apply_log_transformation(train_data, test_data)

**Feature select**

In [6]:
#train_data, test_data = select_features(train_data, test_data)

**train_data split**

In [7]:
X, y = split_features_and_target(train_data)

In [8]:
X.columns

Index(['index', 'area_m2', 'contract_year_month', 'contract_day',
       'contract_type', 'floor', 'built_year', 'latitude', 'longitude', 'age',
       'interest_rate', 'nearest_subway_distance', 'nearest_subway_latitude',
       'nearest_subway_longitude', 'nearest_school_distance',
       'nearest_school_latitude', 'nearest_school_longitude',
       'nearest_park_distance', 'nearest_park_latitude',
       'nearest_park_longitude', 'nearest_subway_num', 'nearest_school_num',
       'nearest_park_num', 'num_of_subways_within_radius',
       'num_of_schools_within_radius', 'num_of_parks_within_radius', 'region',
       'mean_deposit', 'mean_deposit_category', 'log_area_m2',
       'log_school_distance', 'log_park_distance', 'log_subway_distance'],
      dtype='object')

In [9]:
y.columns

Index(['deposit', 'log_deposit'], dtype='object')

### Model Train and Evaluate

**Tabnet**
- 테이블 데이터에서도 딥러닝이 잘 동작할 수 있게 만들어진 모델
- 자동으로 중요한 features를 선택하기 떄문에 feature select부분은 제외

optuna + kfold

In [13]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, cross_val_predict

def objective(trial):
    params = {
        "n_d": trial.suggest_int("n_d", 8, 64),
        "n_a": trial.suggest_int("n_a", 8, 64),
        "n_steps": trial.suggest_int("n_steps", 3, 10),
        "gamma": trial.suggest_float("gamma", 1.0, 2.0),
        "lambda_sparse": trial.suggest_float("lambda_sparse", 0.0001, 0.01),
        "optimizer_params": dict(lr=trial.suggest_float("learning_rate", 0.001, 0.01))
    }
    
    # K-Fold 교차 검증
    cv = 5
    kfold = KFold(n_splits=cv, shuffle=True, random_state=42)
    
    model = TabNetRegressor(**params)
    y_pred = cross_val_predict(
        model,
        X.values, 
        y["log_deposit"].values.reshape(-1, 1),
        cv = kfold,
        method="predict",
        fit_params={"max_epochs": 5}
    )
    
    y_pred = np.expm1(y_pred)
        
    # MAE 계산
    mae = mean_absolute_error(y["deposit"].values, y_pred) 
    print(f"Trial {trial.number}: MAE = {mae}")

    # 교차 검증 후 MAE 반환
    return mae

In [14]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

[I 2024-10-22 00:40:04,725] A new study created in memory with name: no-name-dad22c9f-d7f3-4031-8448-182b9d41f357


epoch 0  | loss: 0.92112 |  0:01:24s
epoch 1  | loss: 0.11554 |  0:02:50s
epoch 2  | loss: 0.1012  |  0:04:14s
epoch 3  | loss: 0.09596 |  0:05:39s
epoch 4  | loss: 0.08746 |  0:07:04s




epoch 0  | loss: 0.90485 |  0:01:28s
epoch 1  | loss: 0.12519 |  0:02:57s
epoch 2  | loss: 0.12009 |  0:04:29s
epoch 3  | loss: 0.1069  |  0:05:56s
epoch 4  | loss: 0.09982 |  0:07:23s




epoch 0  | loss: 0.90966 |  0:01:33s
epoch 1  | loss: 0.11886 |  0:03:06s
epoch 2  | loss: 0.11324 |  0:04:41s
epoch 3  | loss: 0.10262 |  0:06:15s
epoch 4  | loss: 0.09738 |  0:07:50s




epoch 0  | loss: 0.93688 |  0:01:36s
epoch 1  | loss: 0.12266 |  0:03:10s
epoch 2  | loss: 0.10829 |  0:04:49s
epoch 3  | loss: 0.0978  |  0:06:28s
epoch 4  | loss: 0.08609 |  0:08:05s




epoch 0  | loss: 0.90702 |  0:01:35s
epoch 1  | loss: 0.11977 |  0:03:11s
epoch 2  | loss: 0.10518 |  0:04:48s
epoch 3  | loss: 0.09764 |  0:06:27s
epoch 4  | loss: 0.09478 |  0:08:03s


[I 2024-10-22 02:14:40,477] Trial 0 finished with value: 7750.06456263911 and parameters: {'n_d': 40, 'n_a': 19, 'n_steps': 5, 'gamma': 1.5785615064234828, 'lambda_sparse': 0.008560115234202274, 'learning_rate': 0.0069468457058344275}. Best is trial 0 with value: 7750.06456263911.


Trial 0: MAE = 7750.06456263911




epoch 0  | loss: 4.38441 |  0:01:48s
epoch 1  | loss: 0.16498 |  0:03:39s
epoch 2  | loss: 0.13353 |  0:05:31s
epoch 3  | loss: 0.11883 |  0:07:21s
epoch 4  | loss: 0.11044 |  0:09:10s




epoch 0  | loss: 4.53358 |  0:01:50s
epoch 1  | loss: 0.16427 |  0:03:41s
epoch 2  | loss: 0.13299 |  0:05:30s
epoch 3  | loss: 0.11952 |  0:07:19s
epoch 4  | loss: 0.11037 |  0:09:08s




epoch 0  | loss: 4.45401 |  0:01:51s
epoch 1  | loss: 0.15249 |  0:03:44s
epoch 2  | loss: 0.11946 |  0:05:36s
epoch 3  | loss: 0.10604 |  0:07:27s
epoch 4  | loss: 0.09998 |  0:09:18s




epoch 0  | loss: 4.55481 |  0:01:50s
epoch 1  | loss: 0.16603 |  0:03:42s
epoch 2  | loss: 0.13025 |  0:05:35s
epoch 3  | loss: 0.11369 |  0:07:26s
epoch 4  | loss: 0.10509 |  0:09:18s




epoch 0  | loss: 4.44643 |  0:01:53s
epoch 1  | loss: 0.17421 |  0:03:46s
epoch 2  | loss: 0.13583 |  0:05:38s
epoch 3  | loss: 0.11938 |  0:07:29s
epoch 4  | loss: 0.10901 |  0:09:20s


[I 2024-10-22 04:08:38,984] Trial 1 finished with value: 8239.989720503087 and parameters: {'n_d': 40, 'n_a': 10, 'n_steps': 6, 'gamma': 1.188300870384936, 'lambda_sparse': 0.004283262095175114, 'learning_rate': 0.001213602712779988}. Best is trial 0 with value: 7750.06456263911.


Trial 1: MAE = 8239.989720503087




epoch 0  | loss: 0.72445 |  0:02:28s
epoch 1  | loss: 0.11601 |  0:04:55s
epoch 2  | loss: 0.10661 |  0:07:22s
epoch 3  | loss: 0.10076 |  0:09:48s
epoch 4  | loss: 0.09797 |  0:12:14s




epoch 0  | loss: 0.69661 |  0:02:25s
epoch 1  | loss: 0.13635 |  0:04:52s
epoch 2  | loss: 0.12509 |  0:07:17s
epoch 3  | loss: 0.11437 |  0:09:42s
epoch 4  | loss: 0.10854 |  0:12:09s




epoch 0  | loss: 0.74685 |  0:02:28s
epoch 1  | loss: 0.12597 |  0:04:50s
epoch 2  | loss: 0.11045 |  0:07:14s
epoch 3  | loss: 0.10319 |  0:09:40s
epoch 4  | loss: 0.09803 |  0:12:04s




epoch 0  | loss: 0.75037 |  0:02:24s
epoch 1  | loss: 0.12164 |  0:04:51s
epoch 2  | loss: 0.11366 |  0:07:14s
epoch 3  | loss: 0.11239 |  0:09:39s
epoch 4  | loss: 0.10944 |  0:12:05s




epoch 0  | loss: 0.75739 |  0:02:28s
epoch 1  | loss: 0.13436 |  0:04:57s
epoch 2  | loss: 0.12026 |  0:07:27s
epoch 3  | loss: 0.11578 |  0:09:54s
epoch 4  | loss: 0.11193 |  0:12:22s


[I 2024-10-22 06:50:07,892] Trial 2 finished with value: 8483.747154519178 and parameters: {'n_d': 10, 'n_a': 17, 'n_steps': 9, 'gamma': 1.2906004321273796, 'lambda_sparse': 0.004605229074960597, 'learning_rate': 0.005984639058373303}. Best is trial 0 with value: 7750.06456263911.


Trial 2: MAE = 8483.747154519178




epoch 0  | loss: 0.66096 |  0:02:05s
epoch 1  | loss: 0.11584 |  0:04:10s
epoch 2  | loss: 0.10405 |  0:06:15s
epoch 3  | loss: 0.09893 |  0:08:20s
epoch 4  | loss: 0.09309 |  0:10:25s




epoch 0  | loss: 0.70508 |  0:02:04s
epoch 1  | loss: 0.12352 |  0:04:09s
epoch 2  | loss: 0.11187 |  0:06:14s
epoch 3  | loss: 0.10644 |  0:08:18s
epoch 4  | loss: 0.10211 |  0:10:22s




epoch 0  | loss: 0.64561 |  0:02:05s
epoch 1  | loss: 0.12907 |  0:04:10s
epoch 2  | loss: 0.1259  |  0:06:15s
epoch 3  | loss: 0.11858 |  0:08:20s
epoch 4  | loss: 0.11101 |  0:10:25s




epoch 0  | loss: 0.67115 |  0:02:05s
epoch 1  | loss: 0.12304 |  0:04:10s
epoch 2  | loss: 0.11718 |  0:06:13s
epoch 3  | loss: 0.10692 |  0:08:19s
epoch 4  | loss: 0.09808 |  0:10:23s




epoch 0  | loss: 0.62963 |  0:02:04s
epoch 1  | loss: 0.12405 |  0:04:09s
epoch 2  | loss: 0.10507 |  0:06:14s
epoch 3  | loss: 0.09665 |  0:08:18s
epoch 4  | loss: 0.09787 |  0:10:23s


[I 2024-10-22 09:00:44,997] Trial 3 finished with value: 8244.085053029368 and parameters: {'n_d': 53, 'n_a': 37, 'n_steps': 7, 'gamma': 1.2899020676345958, 'lambda_sparse': 0.009606517486402084, 'learning_rate': 0.009422190239997318}. Best is trial 0 with value: 7750.06456263911.


Trial 3: MAE = 8244.085053029368




epoch 0  | loss: 1.06432 |  0:01:26s
epoch 1  | loss: 0.13761 |  0:02:51s
epoch 2  | loss: 0.12141 |  0:04:17s
epoch 3  | loss: 0.1118  |  0:05:45s
epoch 4  | loss: 0.10479 |  0:07:11s




epoch 0  | loss: 1.06266 |  0:01:24s
epoch 1  | loss: 0.12595 |  0:02:50s
epoch 2  | loss: 0.11303 |  0:04:15s
epoch 3  | loss: 0.10446 |  0:05:41s
epoch 4  | loss: 0.09725 |  0:07:04s




epoch 0  | loss: 1.06034 |  0:01:26s
epoch 1  | loss: 0.12533 |  0:02:50s
epoch 2  | loss: 0.1095  |  0:04:13s
epoch 3  | loss: 0.10147 |  0:05:34s
epoch 4  | loss: 0.09636 |  0:06:58s




epoch 0  | loss: 1.07665 |  0:01:25s
epoch 1  | loss: 0.13304 |  0:02:51s


[W 2024-10-22 09:53:30,996] Trial 4 failed with parameters: {'n_d': 35, 'n_a': 22, 'n_steps': 4, 'gamma': 1.9231837903415596, 'lambda_sparse': 0.0067503971171585, 'learning_rate': 0.0026968850072040243} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_2986082/3040363673.py", line 19, in objective
    y_pred = cross_val_predict(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 1282, in cross_val_predict
    predictions = parallel(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py", line 67, in __call__
    return super().__call__(iterable_with_config)
  File "/opt/conda/lib/python3.10/site-

KeyboardInterrupt: 

In [None]:
best_params = study.best_params
print("Best parameters for Tabnet: ", best_params)

In [None]:
best_model = TabNetRegressor(**best_params)
best_model.fit(X.values, y["log_deposit"].values.reshape(-1,1))

### Inference

In [15]:
save_csv(best_model, test_data, sample_submission)