### Library

In [33]:
from data.load_dataset import load_dataset
from data.merge_dataset import merge_dataset
from data.data_preprocessing import *
from data.feature_engineering import *
from model.inference import save_csv
#from model.feature_select import select_features
from model.data_split import split_features_and_target
#from model.model_train import set_model, optuna_train
#from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
import os
import numpy as np
import pandas as pd
import optuna
import torch

### Data load

In [34]:
# Load the original tables provided by load_dataset().
(
    train_data,
    test_data,
    sample_submission,
    interest_data,
    subway_data,
    school_data,
    park_data,
) = load_dataset()

# Merge the auxiliary tables into the train/test frames as additional features.
train_data, test_data = merge_dataset(
    train_data,
    test_data,
    interest_data,
    subway_data,
    school_data,
    park_data,
)

### Data Preprocessing

In [35]:
# Drop rows whose location has low duplication. The thresholds (2, 6) are
# passed straight to delete_low_density; their exact meaning lives in that helper.
train_data = delete_low_density(train_data, 2, 6)

# Keep only rows with built_year earlier than 2024, then renumber the index
# from 0 so positional and label-based access agree.
train_data = (
    train_data
    .loc[lambda frame: frame["built_year"] < 2024]
    .reset_index(drop=True)
)

### Feature engineering

**Log변환**

In [37]:
# Apply the project's log transformation helper to both frames and rebind them.
# Presumably this produces the log_* columns selected later — verify in
# data.feature_engineering.
# NOTE(review): the saved output shows four `result = func(self.values, **kwargs)`
# warning lines; likely a NumPy RuntimeWarning from taking the log of
# zero/negative values — confirm and handle inside the helper if so.
train_data, test_data = apply_log_transformation(train_data, test_data)

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


**train_data split**

In [38]:
# Separate the training frame into features (X) and target (y).
# y is treated as a DataFrame downstream (y.columns, y["log_deposit"]).
X, y = split_features_and_target(train_data)

**Feature select**

In [39]:
#train_data, test_data = select_features(X, y, test_data)

In [40]:
# Hand-picked feature subset; the same list is reused later to build X_test.
selected_cols = [
    "log_area_m2",
    "built_year",
    "latitude",
    "longitude",
    "log_subway_distance",
    "contract_year_month",
    "num_of_subways_within_radius",
    "park_exists",
    "region",
    "region_mean",
    "log_leader_distance",
]

# Restrict the feature frame to the chosen columns.
X = X[selected_cols]

# Disabled on purpose: y is left untouched here.
# y.drop(columns="deposit", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.drop(columns="deposit", inplace=True)


In [41]:
# Sanity check: display the feature columns currently held in X.
X.columns

Index(['log_area_m2', 'built_year', 'latitude', 'longitude',
       'log_subway_distance', 'contract_year_month',
       'num_of_subways_within_radius', 'park_exists', 'region', 'region_mean',
       'log_leader_distance'],
      dtype='object')

In [42]:
# Sanity check: display the columns of the target frame.
# NOTE(review): the saved output lists only 'log_deposit', but the statement
# that dropped "deposit" from y is now commented out, so a fresh
# Restart & Run All may show an extra column here — confirm.
y.columns

Index(['log_deposit'], dtype='object')

### Model Train and Evaluate

**GradientBoostingRegressor**

In [46]:
# Hold out 20% of the rows for validation; the model target is log_deposit.
# Shuffling with a fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y["log_deposit"],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

**optuna off**

In [47]:
# Fixed hyperparameters for the gradient boosting regressor (the Optuna search
# further down is disabled). Presumably the fractional values came from an
# earlier tuning run — TODO confirm provenance.
gb_params = {
    "n_estimators": 276,
    "learning_rate": 0.1688538348076114,
    "max_depth": 12,
    "subsample": 0.796862768617941,
    "min_samples_split": 4,  # additional tuning
    "min_samples_leaf": 3,  # additional tuning
    "random_state": 42,
}
gb_model = GradientBoostingRegressor(**gb_params)

# Train on the 80% split; the target is in log scale.
gb_model.fit(X_train, y_train)

In [49]:
# Display the target frame's columns again (y is not modified by model training).
y.columns

Index(['log_deposit'], dtype='object')

In [50]:
# The model predicts log_deposit, so map predictions and targets back with
# expm1 before scoring (assumes the forward transform was log1p — TODO confirm
# against apply_log_transformation).
gb_val_pred = np.expm1(gb_model.predict(X_val))
y_val_exp = np.expm1(y_val)

# MAE is reported on the back-transformed (original) scale.
val_mae = mean_absolute_error(y_val_exp, gb_val_pred)
print(f"Validation MAE: {val_mae:.4f}")

Validation MAE: 4431.5030


**optuna on**

In [None]:
""" def objective(trial):
    params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "max_depth": trial.suggest_int("max_depth", 5, 12),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
    }

    model = GradientBoostingRegressor(**params)
    model.fit(X_train, y_train)

    y_val_pred = model.predict(X_val)
    y_val_pred = np.expm1(y_val_pred)

    val_mae = mean_absolute_error(y["deposit"].iloc[X_val.index], y_val_pred)
    print(f"Validation MAE: {val_mae:.4f}")
    return val_mae """

In [None]:
""" sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=50)
best_params = study.best_params
print("Best hyperparameters: ", best_params)
print("Best MAE: ", study.best_value) """

In [19]:
# best_model = GradientBoostingRegressor(**best_params)
# best_model.fit(X, y["log_deposit"])

**Kfold**

In [20]:
# gb_model = GradientBoostingRegressor(
#     n_estimators=276,
#     learning_rate=0.1688538348076114,
#     max_depth=12,
#     subsample=0.796862768617941,
#     min_samples_split=4,  # 추가 튜닝
#     min_samples_leaf=3,  # 추가 튜닝
#     random_state=42
# )
# from sklearn.model_selection import KFold, cross_val_score

# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# cv_scores = cross_val_score(gb_model, X_train, y_train, cv=kf, scoring='neg_mean_absolute_error')
# print(f"Cross-validation MAE: {-np.mean(cv_scores):.4f}")

# gb_model.fit(X_train, y_train)

In [21]:
# # 검증 데이터 예측 및 평가
# gb_val_pred = gb_model.predict(X_val)
# val_mae = mean_absolute_error(y_val, gb_val_pred)
# print(f"Validation MAE: {val_mae:.4f}")

### Inference

In [51]:
# Build the test feature matrix with the same columns, in the same order,
# that the model was trained on.
X_test = test_data[selected_cols]

In [52]:
# Predict in log scale and immediately invert with expm1 to get deposits
# (assumes the training target was produced with log1p — TODO confirm).
y_pred = np.expm1(gb_model.predict(X_test))

In [53]:
# Clamp any negative prediction to zero; all other values pass through unchanged.
is_negative = y_pred < 0
y_pred = np.where(is_negative, 0, y_pred)

In [54]:
# Store the predictions in the submission frame's "deposit" column (mutates
# sample_submission in place) and write it to output.csv in the current
# working directory, without the index column.
sample_submission["deposit"] = y_pred
sample_submission.to_csv("output.csv", index=False)

In [55]:
# Quick sanity check: average predicted deposit over the test set.
y_pred.mean()

38988.959592372325

In [56]:
# Summary statistics (count, mean, std, quartiles, min/max) of the predictions.
pred_distribution = pd.Series(y_pred.flatten()).describe()
pred_distribution

count    150172.000000
mean      38988.959592
std       24571.850075
min        2583.692958
25%       22970.143238
50%       33300.480920
75%       48716.693874
max      643831.916562
dtype: float64

In [57]:
# List any predictions that are still negative. Build the Series once and
# reuse it for both the mask and the selection.
pred_series = pd.Series(y_pred.flatten())
negative_values = pred_series[pred_series < 0]
print(negative_values)

Series([], dtype: float64)
