### Library

In [1]:
import numpy as np
import optuna
import torch
from pytorch_tabnet.tab_model import TabNetRegressor

from data.load_dataset import load_dataset
from data.merge_dataset import merge_dataset
from data.feature_engineering import *
from model.inference import save_csv
from model.feature_select import select_features
from model.data_split import split_features_and_target
from model.log_transformation import apply_log_transformation
from model.model_train import set_model, optuna_train
#from model.TreeModel import XGBoost

  from .autonotebook import tqdm as notebook_tqdm


### Data load

In [2]:
# Load the original datasets (train/test, the sample submission, and the
# auxiliary interest-rate, subway, school and park tables).
train_data, test_data, sample_submission, interest_data, subway_data, school_data, park_data = load_dataset()
# Rebind train/test to the frames returned by merge_dataset, which (per the
# original note) are the original data with the new features merged in.
train_data, test_data = merge_dataset(train_data, test_data, interest_data, subway_data, school_data, park_data)

### Data Preprocessing

In [3]:
# Drop rows at sparsely repeated locations.
# A location is one exact (latitude, longitude) pair. Pairs that occur between
# LOCATION_COUNT_MIN and LOCATION_COUNT_MAX times (inclusive) are removed.
# The bounds are tuning knobs; the original note asks for them to be parameters.
LOCATION_COUNT_MIN = 2
LOCATION_COUNT_MAX = 5

# Per-row size of the row's own (latitude, longitude) group. Matching on the
# pair matters: testing latitude and longitude with two independent `isin`
# calls would also drop rows whose latitude belongs to one small group and
# whose longitude belongs to a different one.
location_counts = train_data.groupby(["latitude", "longitude"])["index"].transform("count")
is_small_group = location_counts.between(LOCATION_COUNT_MIN, LOCATION_COUNT_MAX)

# Kept under its original name so the dropped rows can still be inspected.
small_groups = train_data[is_small_group]

# Keep only rows built before 2024 (rows with built_year == 2024 are removed too).
# NOTE(review): the original comment said "drop built_year > 2024" — confirm
# whether 2024 itself is meant to be excluded.
is_valid_built_year = train_data["built_year"] < 2024

# Build a new frame instead of mutating in place, so re-running this cell on
# the same input gives the same result.
train_data = train_data[~is_small_group & is_valid_built_year].reset_index(drop=True)

### Feature engineering

**Clustering**

In [4]:
# Coordinates that drive the clustering
feature_columns = ["latitude", "longitude"]
coords = train_data[feature_columns]

# Fit K-means through the project's ClusteringModel helper.
# The groupby/merge below rely on a "region" column existing in both frames
# afterwards, so kmeans_clustering presumably writes the cluster label into
# train_data and test_data in place — TODO confirm in data/feature_engineering.
clustering_model = ClusteringModel(data=coords)
kmeans_model = clustering_model.kmeans_clustering(
    n_clusters=25,
    train_data=train_data,
    test_data=test_data,
    feature_columns=feature_columns,
    label_column="region",
)

# Per-region mean deposit plus a coarse bucket (floor-divided by 10000).
# NOTE(review): the mean is taken over the full training target, so it is
# visible to every fold of any later cross-validation.
region_mean_prices = (
    train_data.groupby("region", as_index=False)["deposit"]
    .mean()
    .rename(columns={"deposit": "mean_deposit"})
    .assign(mean_deposit_category=lambda frame: frame["mean_deposit"] // 10000)
)

# Attach the region statistics to both train and test
train_data, test_data = [
    frame.merge(region_mean_prices, on="region", how="left")
    for frame in (train_data, test_data)
]

**Log변환**

In [5]:
# Run the project's log-transformation helper on both frames and rebind them.
# Judging by the column listings printed later in this notebook, it presumably
# adds the log_* feature columns and a log_deposit target — TODO confirm in
# model/log_transformation.py (including whether it uses log or log1p).
train_data, test_data = apply_log_transformation(train_data, test_data)

**Feature select**

In [6]:
#train_data, test_data = select_features(train_data, test_data)

**train_data split**

In [7]:
# Split the training frame into a feature frame X and a target frame y
# (the later cells read y["deposit"] and y["log_deposit"]).
X, y = split_features_and_target(train_data)

In [8]:
# Inspect which columns ended up in the feature frame.
# NOTE(review): the listing includes 'index', which looks like a row identifier
# rather than a predictor — confirm it should be fed to the model.
X.columns

Index(['index', 'area_m2', 'contract_year_month', 'contract_day',
       'contract_type', 'floor', 'built_year', 'latitude', 'longitude', 'age',
       'interest_rate', 'nearest_subway_distance', 'nearest_subway_latitude',
       'nearest_subway_longitude', 'nearest_school_distance',
       'nearest_school_latitude', 'nearest_school_longitude',
       'nearest_park_distance', 'nearest_park_latitude',
       'nearest_park_longitude', 'nearest_subway_num', 'nearest_school_num',
       'nearest_park_num', 'num_of_subways_within_radius',
       'num_of_schools_within_radius', 'num_of_parks_within_radius', 'region',
       'mean_deposit', 'mean_deposit_category', 'log_area_m2',
       'log_school_distance', 'log_park_distance', 'log_subway_distance'],
      dtype='object')

In [9]:
# Inspect which columns ended up in the target frame.
y.columns

Index(['deposit', 'log_deposit'], dtype='object')

### Model Train and Evaluate

**Tabnet**
- 테이블 데이터에서도 딥러닝이 잘 동작할 수 있게 만들어진 모델
- 자동으로 중요한 feature를 선택하기 때문에 feature select 부분은 제외

optuna + kfold

In [13]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, cross_val_predict

def objective(trial, n_splits=5, max_epochs=5):
    """Optuna objective: out-of-fold MAE of a TabNet regressor on the deposit scale.

    A TabNet hyper-parameter set is sampled from ``trial``. A fresh
    ``TabNetRegressor`` is trained on each of ``n_splits`` shuffled folds using
    the notebook-level ``X`` / ``y`` frames, with ``log_deposit`` as the target.
    Held-out predictions from all folds are collected, mapped back with
    ``np.expm1`` and scored against the raw ``deposit`` column.

    The folds are run with an explicit loop rather than
    ``cross_val_predict(..., fit_params=...)``: that keyword was deprecated in
    scikit-learn 1.4 and later removed, so the loop keeps this cell working
    regardless of the installed version.

    Args:
        trial: The ``optuna`` trial used to sample hyper-parameters.
        n_splits: Number of K-fold splits (default 5).
        max_epochs: Training epochs per fold (default 5).

    Returns:
        Mean absolute error between ``y["deposit"]`` and the back-transformed
        out-of-fold predictions. Lower is better.
    """
    params = {
        "n_d": trial.suggest_int("n_d", 8, 64),
        "n_a": trial.suggest_int("n_a", 8, 64),
        "n_steps": trial.suggest_int("n_steps", 3, 10),
        "gamma": trial.suggest_float("gamma", 1.0, 2.0),
        "lambda_sparse": trial.suggest_float("lambda_sparse", 0.0001, 0.01),
        # Sampled under the name "learning_rate", so that is the key that ends
        # up in study.best_params (not "optimizer_params").
        "optimizer_params": dict(lr=trial.suggest_float("learning_rate", 0.001, 0.01)),
    }

    features = X.values
    # The target is passed to fit() as a 2-D column, as in the original cell.
    log_target = y["log_deposit"].values.reshape(-1, 1)

    # K-fold cross-validation with a fixed shuffle so every trial sees the same folds.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Every row lands in exactly one validation fold, so the buffer is fully
    # overwritten by the loop below.
    oof_log_pred = np.empty(len(features), dtype=float)
    for train_idx, valid_idx in kfold.split(features):
        model = TabNetRegressor(**params)
        model.fit(features[train_idx], log_target[train_idx], max_epochs=max_epochs)
        # predict() output is flattened so the score compares 1-D with 1-D.
        oof_log_pred[valid_idx] = model.predict(features[valid_idx]).ravel()

    # Back to the deposit scale.
    # NOTE(review): expm1 assumes log_deposit was built with log1p — confirm
    # against apply_log_transformation.
    y_pred = np.expm1(oof_log_pred)

    # MAE on the original deposit scale
    mae = mean_absolute_error(y["deposit"].values, y_pred)
    print(f"Trial {trial.number}: MAE = {mae}")

    return mae

In [None]:
# Seed the sampler so the hyper-parameter search is reproducible after a
# kernel restart. TPE is Optuna's default sampler, so only the seed changes.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2024-10-22 00:40:04,725] A new study created in memory with name: no-name-dad22c9f-d7f3-4031-8448-182b9d41f357


epoch 0  | loss: 0.92112 |  0:01:24s
epoch 1  | loss: 0.11554 |  0:02:50s
epoch 2  | loss: 0.1012  |  0:04:14s
epoch 3  | loss: 0.09596 |  0:05:39s
epoch 4  | loss: 0.08746 |  0:07:04s




epoch 0  | loss: 0.90485 |  0:01:28s
epoch 1  | loss: 0.12519 |  0:02:57s
epoch 2  | loss: 0.12009 |  0:04:29s
epoch 3  | loss: 0.1069  |  0:05:56s
epoch 4  | loss: 0.09982 |  0:07:23s




epoch 0  | loss: 0.90966 |  0:01:33s
epoch 1  | loss: 0.11886 |  0:03:06s
epoch 2  | loss: 0.11324 |  0:04:41s
epoch 3  | loss: 0.10262 |  0:06:15s
epoch 4  | loss: 0.09738 |  0:07:50s


In [None]:
# Pull the winning hyper-parameter set out of the finished study and show it
best_params = study.best_params
print(f"Best parameters for Tabnet:  {best_params}")

In [None]:
# Rebuild constructor kwargs from the tuned values.
# The objective samples the learning rate under the name "learning_rate", which
# is not a TabNetRegressor argument; it has to go back into optimizer_params,
# otherwise TabNetRegressor(**best_params) fails with an unexpected-keyword error.
tabnet_params = dict(best_params)  # copy so best_params itself stays untouched
learning_rate = tabnet_params.pop("learning_rate", None)
if learning_rate is not None:
    tabnet_params["optimizer_params"] = dict(lr=learning_rate)

# Refit on the full training set with the log-scale target.
best_model = TabNetRegressor(**tabnet_params)
best_model.fit(X.values, y["log_deposit"].values.reshape(-1, 1))

### Inference

In [15]:
# Hand the fitted model, the test frame and the sample submission to the
# project's save_csv helper.
# NOTE(review): best_model was trained on log_deposit; confirm that save_csv
# maps predictions back to the deposit scale before writing them out.
save_csv(best_model, test_data, sample_submission)