# 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold

import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
import wandb
from wandb.integration.lightgbm import log_summary
from wandb.integration.catboost import WandbCallback
import optuna
optuna.logging.set_verbosity(optuna.logging.INFO)  # optuna log 설정

import warnings
warnings.filterwarnings('ignore')

import func.features as ft
from func.utils import lgb_wandb_callback

# WandB 설정

In [2]:
# Authenticate this session with Weights & Biases before any run is created
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


True

## 올바른 팀에 올리는지 확인!

In [3]:
# Print the default W&B entity so the target team can be checked before logging
print(wandb.api.default_entity)

recsys008-naver-boostcamp


# 랜덤 시드 설정

In [4]:
# Seed shared across the notebook; this cell seeds NumPy's global RNG only
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 데이터 불러오기

In [5]:
# Directory holding the competition CSV files; adjust to the local layout
path = "../../../data/"
train_data = pd.read_csv(f"{path}train.csv")
test_data = pd.read_csv(f"{path}test.csv")
sample_submission = pd.read_csv(f"{path}sample_submission.csv")

# Auxiliary tables: interest rates, parks, schools and subway stations
interest = pd.read_csv(f"{path}interestRate.csv")
park = pd.read_csv(f"{path}parkInfo.csv")
school = pd.read_csv(f"{path}schoolinfo.csv")
subway = pd.read_csv(f"{path}subwayInfo.csv")


## 중복 값 확인 및 처리

In [6]:
# Flag every row that has an exact twin once the 'index' column is ignored;
# keep=False marks all members of a duplicate group, not only the repeats
dup_mask = train_data.drop(columns=['index']).duplicated(keep=False)
duplicates = train_data[dup_mask]
duplicates

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit
15,15,84.9342,201907,31,2,7,2016,36.965423,127.048779,3,18000.0
16,16,84.9342,201907,31,2,7,2016,36.965423,127.048779,3,18000.0
28,28,146.4005,201911,21,2,5,2016,36.965423,127.048779,3,37000.0
29,29,146.4005,201911,21,2,5,2016,36.965423,127.048779,3,37000.0
33,33,84.9342,201912,14,2,14,2016,36.965423,127.048779,3,19000.0
...,...,...,...,...,...,...,...,...,...,...,...
1801197,1801197,101.9088,202308,22,2,11,2010,37.528394,126.659398,13,33000.0
1801198,1801198,114.9285,202308,28,1,18,2010,37.528394,126.659398,13,30000.0
1801199,1801199,114.9285,202308,28,1,18,2010,37.528394,126.659398,13,30000.0
1801210,1801210,114.9285,202310,26,2,9,2010,37.528394,126.659398,13,39000.0


In [7]:
# Remove duplicate rows, comparing on every column except 'index' and keeping
# the first occurrence of each duplicate group
dedup_columns = train_data.columns.drop('index')
train_data = train_data.drop_duplicates(subset=dedup_columns, keep='first')
train_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0
3,3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,5000.0
4,4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,1800.0
...,...,...,...,...,...,...,...,...,...,...,...
1801223,1801223,114.8126,202311,25,0,5,2010,37.528394,126.659398,13,39000.0
1801224,1801224,101.9088,202311,28,0,6,2010,37.528394,126.659398,13,38000.0
1801225,1801225,114.7900,202312,3,0,19,2010,37.528394,126.659398,13,37000.0
1801226,1801226,101.9088,202312,4,1,15,2010,37.528394,126.659398,13,34400.0


# 가장 가까운 지하철 거리 추가

In [8]:
# Latitude/longitude pairs of the subway stations as a NumPy array
subway_coords = subway[['latitude', 'longitude']].to_numpy()

# Latitude/longitude pairs of the apartments in the train and test frames
# (these arrays are reused by the nearest-school cell that follows)
apart_coords_train = train_data[['latitude', 'longitude']].to_numpy()
apart_coords_test = test_data[['latitude', 'longitude']].to_numpy()

# Store the value returned by the project helper as 'nearest_subway_distance'
# NOTE(review): distance unit is defined inside ft.calculate_nearest_subway_distance — confirm
train_data.loc[:, 'nearest_subway_distance'] = ft.calculate_nearest_subway_distance(apart_coords_train, subway_coords)
test_data.loc[:, 'nearest_subway_distance'] = ft.calculate_nearest_subway_distance(apart_coords_test, subway_coords)

# 학교 레벨별로 가장 가까운 학교 추가

In [9]:
# Add one 'nearest_<level>_distance' column per school level to the train frame
# NOTE(review): presumably the helper returns a mapping keyed by school level
# (elementary / middle / high) — confirm in func.features
nearest_school_distances = ft.calculate_nearest_school_distance(apart_coords_train, school)
for level in nearest_school_distances:
    train_data[f'nearest_{level}_distance'] = nearest_school_distances[level]

# Same columns for the test frame, computed from the test coordinates
nearest_school_distances = ft.calculate_nearest_school_distance(apart_coords_test, school)
for level in nearest_school_distances:
    test_data[f'nearest_{level}_distance'] = nearest_school_distances[level]

# 특정 반경(radius) 내 공원 밀도

In [10]:
# Search radius and item label handed to the project helper below
radius_km = 3
item_name = 'park'

# Per the original note: park count/density is computed on unique apartment
# coordinates and then mapped back onto the original rows
# NOTE(review): exact output columns come from ft.map_item_count_or_density_with_area — confirm
train_data = ft.map_item_count_or_density_with_area(train_data, park, radius_km, item_name)
test_data = ft.map_item_count_or_density_with_area(test_data, park, radius_km, item_name)

100%|██████████| 18491/18491 [00:11<00:00, 1613.59it/s]
100%|██████████| 11885/11885 [00:06<00:00, 1900.16it/s]


# 특정 거리(distance) 내 레벨별 학교 개수

In [11]:
# Distance threshold (km) used for counting schools, set separately per level
# NOTE(review): the original comment on 'middle' said "within 3 km" while the
# value is 5 — confirm which of the two was intended
distance_kms = {
    'elementary': 1,  # within 1 km
    'middle': 5,      # within 5 km
    'high': 5         # within 5 km
}

# Add per-level school counts through the project helper (n_jobs=8 is passed through)
train_data = ft.map_school_level_counts(train_data, school, distance_kms, n_jobs=8)
test_data = ft.map_school_level_counts(test_data, school, distance_kms, n_jobs=8)

100%|██████████| 18491/18491 [00:12<00:00, 1467.98it/s]
100%|██████████| 11885/11885 [00:08<00:00, 1478.01it/s]


# Holdout 데이터셋 설정 (예: 2023년 7월부터 12월까지의 데이터)

In [12]:
# Keep an untouched copy of the full training frame for the final refit
all_data = train_data.copy()

# Hold-out window: contracts from 2023-07 through 2023-12, both ends inclusive
holdout_start = 202307
holdout_end = 202312
contract_month = train_data['contract_year_month']
in_holdout = (contract_month >= holdout_start) & (contract_month <= holdout_end)

# Rows inside the window become the hold-out set; everything else stays in train
holdout_data = train_data[in_holdout]
train_data = train_data[~in_holdout]

# 학습 데이터와 정답 데이터 분리

In [13]:
# Separate the 'deposit' target (y) from the remaining feature columns (X)
X_train, y_train = train_data.drop(columns=['deposit']), train_data['deposit']
X_holdout, y_holdout = holdout_data.drop(columns=['deposit']), holdout_data['deposit']

# Full training set, used for cross-validation and the final refit
X_all, y_all = all_data.drop(columns=['deposit']), all_data['deposit']

# Independent copy of the test frame
X_test = test_data.copy()

# DBSCAN 클러스터링 
* 경도, 위도 스케일링 후 DBSCAN으로 train 기반하여 클러스터 생성
* holdout(=validation)은 경도,위도 기준 가장 가까운 train 샘플의 라벨을 할당

In [14]:
# # Train 데이터에 DBSCAN 적용
# # 클러스터 정보를 포함한 데이터셋 생성
# X_train['cluster'], X_holdout['cluster'] = ft.apply_dbscan_clustering(X_train, X_holdout)

In [15]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,nearest_subway_distance,nearest_elementary_distance,nearest_middle_distance,nearest_high_distance,park_density,elementary,middle,high
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,0.857916,0.144165,0.577958,1.131958,411203.841208,1,8,7
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,0.857916,0.144165,0.577958,1.131958,411203.841208,1,8,7
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,0.857916,0.144165,0.577958,1.131958,411203.841208,1,8,7
3,3,59.34,201907,15,2,1,1986,36.964647,127.055847,33,3.698483,0.169996,0.819788,0.749925,5255.402324,2,3,3
4,4,59.81,201904,12,2,6,1995,36.97239,127.084514,24,1.231527,1.032098,2.471571,2.548584,19038.552853,0,9,9


In [16]:
# Preview the first rows of the hold-out features
X_holdout.head()

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,nearest_subway_distance,nearest_elementary_distance,nearest_middle_distance,nearest_high_distance,park_density,elementary,middle,high
736113,774291,102.6654,202307,8,0,7,2023,37.200075,126.820401,0,8.553991,0.148796,0.539747,0.623334,28980.668595,2,3,1
736114,774292,102.9485,202307,19,0,8,2023,37.200075,126.820401,0,8.553991,0.148796,0.539747,0.623334,28980.668595,2,3,1
736115,774293,102.9485,202307,27,0,12,2023,37.200075,126.820401,0,8.553991,0.148796,0.539747,0.623334,28980.668595,2,3,1
736116,774294,94.3147,202307,30,0,4,2023,37.200075,126.820401,0,8.553991,0.148796,0.539747,0.623334,28980.668595,2,3,1
736117,774295,94.3147,202308,12,0,1,2023,37.200075,126.820401,0,8.553991,0.148796,0.539747,0.623334,28980.668595,2,3,1


# 모델 훈련

## 피처 선택

In [17]:
# Columns fed to the models; every feature frame is narrowed to this list
train_columns = [
    'area_m2', 'contract_year_month', 'floor', 'built_year', 'latitude', 'longitude',
    'nearest_subway_distance', 'nearest_elementary_distance', 'nearest_middle_distance',
    'nearest_high_distance', 'park_density', 'elementary', 'middle', 'high',
]

# Apply the same selection (and column order) to all four feature frames
X_train, X_holdout, X_all, X_test = (
    frame[train_columns] for frame in (X_train, X_holdout, X_all, X_test)
)

# LightGBM 모델 훈련

In [None]:
# Start the W&B run (change name / entity to match the experiment being logged)
wandb.init(project="house_price_prediction", name="v6 + seasonal month", entity="recsys008-naver-boostcamp")

# LightGBM hyperparameters
lgb_params = {
    'objective': 'regression',
    'metric': ['mae', 'rmse'],
    'boosting_type': 'gbdt',
    'num_leaves': 1200,  # maximum number of leaves per tree
    # This key used to be 'min_samples', which LightGBM does not recognise and
    # therefore ignored; 'min_child_samples' is the accepted alias of
    # min_data_in_leaf and matches the name used by the Optuna search cells.
    'min_child_samples': 20,  # minimum number of samples per leaf
    'learning_rate': 0.035,
    'n_estimators': 2000,  # number of boosting rounds
    'feature_fraction': 0.65,  # fraction of columns sampled for each tree
    # 'bagging_fraction': 0.65,  # fraction of rows sampled for each tree
    # 'bagging_freq': 0,  # apply bagging every k-th tree
    'lambda_l1': 1.1939606848809192,
    'lambda_l2': 4.389852271719141,
    'verbose': -1,
    'random_state': RANDOM_SEED
}

# Record the hyperparameters on the W&B run
wandb.config.update(lgb_params)

# Build the LightGBM datasets under their own names so the `train_data`
# DataFrame created earlier in the notebook is not replaced by an lgb.Dataset
lgb_train_set = lgb.Dataset(X_train, label=y_train)
lgb_valid_set = lgb.Dataset(X_holdout, label=y_holdout, reference=lgb_train_set)

# Train with early stopping on the hold-out set, logging every 100 rounds
lgb_model = lgb.train(
    lgb_params,
    lgb_train_set,
    valid_sets=[lgb_valid_set],
    valid_names='validation',
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
        lgb_wandb_callback()
    ]
)

# Log the model summary (including feature importance) to W&B
log_summary(lgb_model)

# Predict the hold-out set with the best iteration found during training
lgb_holdout_pred = lgb_model.predict(X_holdout, num_iteration=lgb_model.best_iteration)

# Hold-out metrics
lgb_holdout_mae = mean_absolute_error(y_holdout, lgb_holdout_pred)
lgb_holdout_rmse = np.sqrt(mean_squared_error(y_holdout, lgb_holdout_pred))
lgb_holdout_r2 = r2_score(y_holdout, lgb_holdout_pred)

# Send the hold-out metrics to W&B
wandb.log({
    "holdout_mae": lgb_holdout_mae,
    "holdout_rmse": lgb_holdout_rmse,
    "holdout_r2": lgb_holdout_r2
})

# Print the results
print("Holdout 데이터셋 LGBM 성능:")
print(f"MAE: {lgb_holdout_mae:.2f}")
print(f"RMSE: {lgb_holdout_rmse:.2f}")
print(f"R²: {lgb_holdout_r2:.2f}")

# Close the W&B run
wandb.finish()

## 교차검증 학습

In [15]:
# LightGBM hyperparameters
lgb_params = {
    'objective': 'regression',
    'metric': ['mae', 'rmse'],
    'boosting_type': 'gbdt',
    'num_leaves': 1200,
    # This key used to be 'min_samples', which LightGBM does not recognise and
    # therefore ignored; 'min_child_samples' is the accepted alias of
    # min_data_in_leaf and matches the name used by the Optuna search cells.
    'min_child_samples': 20,
    'learning_rate': 0.035,
    'n_estimators': 2000,
    'feature_fraction': 0.65,
    'lambda_l1': 1.1939606848809192,
    'lambda_l2': 4.389852271719141,
    'verbose': -1,
    'random_state': RANDOM_SEED
}

# Shuffled 5-fold split over the full training set
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Out-of-fold predictions and the matching targets, filled fold by fold
oof_predictions = np.zeros(len(y_all))
oof_targets = np.zeros(len(y_all))

# One trained booster per fold (used later for averaged test predictions)
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_all), 1):
    # Each fold is logged as its own W&B run
    run = wandb.init(project="lgbm CV", name=f"lgb_cv_fold_{fold}", entity="recsys008-naver-boostcamp", reinit=True)

    # Record the hyperparameters on the run
    wandb.config.update(lgb_params)

    print(f"Fold {fold}")

    # Fold-local names on purpose: reusing X_train / y_train / train_data here
    # would overwrite the hold-out split defined earlier, and the hold-out
    # Optuna objective would then train on rows from the hold-out period.
    X_fold_train, X_fold_val = X_all.iloc[train_idx], X_all.iloc[val_idx]
    y_fold_train, y_fold_val = y_all.iloc[train_idx], y_all.iloc[val_idx]

    fold_train_set = lgb.Dataset(X_fold_train, label=y_fold_train)
    fold_val_set = lgb.Dataset(X_fold_val, label=y_fold_val, reference=fold_train_set)

    model = lgb.train(
        lgb_params,
        fold_train_set,
        valid_sets=[fold_val_set],
        valid_names='validation',
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
            lgb_wandb_callback()
        ]
    )

    # Out-of-fold predictions for this fold, using the best iteration found
    oof_predictions[val_idx] = model.predict(X_fold_val, num_iteration=model.best_iteration)
    oof_targets[val_idx] = y_fold_val

    # Keep the booster for later ensembling
    models.append(model)

    # Per-fold metrics
    fold_mae = mean_absolute_error(y_fold_val, oof_predictions[val_idx])
    fold_rmse = np.sqrt(mean_squared_error(y_fold_val, oof_predictions[val_idx]))
    wandb.log({"validation-MAE": fold_mae, "validation-RMSE": fold_rmse})

    # Close this fold's run
    run.finish()

# Separate W&B run for the aggregated results
final_run = wandb.init(project="Feature Transform", name="lgb_cv_final_results", entity="recsys008-naver-boostcamp", reinit=True)

# Metrics over all out-of-fold predictions
oof_mae = mean_absolute_error(oof_targets, oof_predictions)
oof_rmse = np.sqrt(mean_squared_error(oof_targets, oof_predictions))
oof_r2 = r2_score(oof_targets, oof_predictions)

# Send the out-of-fold metrics to W&B
wandb.log({
    "oof_mae": oof_mae,
    "oof_rmse": oof_rmse,
    "oof_r2": oof_r2
})

# Print the results
print("5-fold 교차 검증 LGBM 성능 (OOF):")
print(f"MAE: {oof_mae:.2f}")
print(f"RMSE: {oof_rmse:.2f}")
print(f"R²: {oof_r2:.2f}")

# Gain-based feature importance, averaged over the fold models
feature_importance = np.mean([model.feature_importance(importance_type='gain') for model in models], axis=0)
feature_names = X_all.columns.tolist()

# Log the averaged importance to W&B as a bar chart
feature_importance_data = [
    [feature, importance] for feature, importance in zip(feature_names, feature_importance)
]
feature_importance_table = wandb.Table(data=feature_importance_data, columns=["feature", "importance"])
wandb.log({"feature_importance": wandb.plot.bar(feature_importance_table, "feature", "importance", title="Feature Importance (Gain)")})

# Close the summary run
final_run.finish()

Fold 1
Training until validation scores don't improve for 100 rounds
[100]	validation's l1: 5090.64	validation's rmse: 8101.45
[200]	validation's l1: 4688.14	validation's rmse: 7631.77
[300]	validation's l1: 4564.1	validation's rmse: 7514.94
[400]	validation's l1: 4499.91	validation's rmse: 7464.28
[500]	validation's l1: 4465.13	validation's rmse: 7442.11
[600]	validation's l1: 4440.06	validation's rmse: 7426.4
[700]	validation's l1: 4416.38	validation's rmse: 7412.14
[800]	validation's l1: 4399.82	validation's rmse: 7403
[900]	validation's l1: 4385.92	validation's rmse: 7394.97
[1000]	validation's l1: 4375.18	validation's rmse: 7390.74
[1100]	validation's l1: 4365.68	validation's rmse: 7386.98
[1200]	validation's l1: 4357.92	validation's rmse: 7385.07
[1300]	validation's l1: 4351.76	validation's rmse: 7383.27
[1400]	validation's l1: 4345.06	validation's rmse: 7382.63
[1500]	validation's l1: 4339.62	validation's rmse: 7381.36
[1600]	validation's l1: 4334.89	validation's rmse: 7380.5
Ea

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
fold_1_mae,▁
fold_1_rmse,▁
validation-MAE,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation-RMSE,█▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
fold_1_mae,4335.53792
fold_1_rmse,7380.23481
validation-MAE,4332.02881
validation-RMSE,7380.48292


Fold 2
Training until validation scores don't improve for 100 rounds
[100]	validation's l1: 5096.13	validation's rmse: 8086.55
[200]	validation's l1: 4685.44	validation's rmse: 7599.04
[300]	validation's l1: 4558.4	validation's rmse: 7479.58
[400]	validation's l1: 4492.32	validation's rmse: 7424.89
[500]	validation's l1: 4454.67	validation's rmse: 7399.01
[600]	validation's l1: 4427.81	validation's rmse: 7379.97
[700]	validation's l1: 4407.46	validation's rmse: 7367.61
[800]	validation's l1: 4390.05	validation's rmse: 7356.5
[900]	validation's l1: 4376	validation's rmse: 7348.62
[1000]	validation's l1: 4366.17	validation's rmse: 7345.05
[1100]	validation's l1: 4356.18	validation's rmse: 7340.16
[1200]	validation's l1: 4347.09	validation's rmse: 7335.77
[1300]	validation's l1: 4340.37	validation's rmse: 7333.22
[1400]	validation's l1: 4334.51	validation's rmse: 7331.45
[1500]	validation's l1: 4329.81	validation's rmse: 7329.76
[1600]	validation's l1: 4325.66	validation's rmse: 7329.36
[

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
fold_2_mae,▁
fold_2_rmse,▁
validation-MAE,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation-RMSE,█▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
fold_2_mae,4318.89777
fold_2_rmse,7328.14576
validation-MAE,4315.86594
validation-RMSE,7328.79922


Fold 3
Training until validation scores don't improve for 100 rounds
[100]	validation's l1: 5079.39	validation's rmse: 8058.73
[200]	validation's l1: 4676.65	validation's rmse: 7582.49
[300]	validation's l1: 4549.24	validation's rmse: 7466.85
[400]	validation's l1: 4488.76	validation's rmse: 7417.86
[500]	validation's l1: 4452.27	validation's rmse: 7392.37
[600]	validation's l1: 4424.33	validation's rmse: 7373.4
[700]	validation's l1: 4403.54	validation's rmse: 7360.74
[800]	validation's l1: 4386.39	validation's rmse: 7350.93
[900]	validation's l1: 4372.58	validation's rmse: 7343.91
[1000]	validation's l1: 4362.6	validation's rmse: 7339.37
[1100]	validation's l1: 4352.98	validation's rmse: 7334.81
[1200]	validation's l1: 4345.33	validation's rmse: 7332.58
[1300]	validation's l1: 4338.67	validation's rmse: 7330.99
[1400]	validation's l1: 4332.98	validation's rmse: 7330.34
[1500]	validation's l1: 4327.78	validation's rmse: 7329.12
[1600]	validation's l1: 4322.48	validation's rmse: 7327.6

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
fold_3_mae,▁
fold_3_rmse,▁
validation-MAE,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation-RMSE,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
fold_3_mae,4314.81393
fold_3_rmse,7326.42256
validation-MAE,4311.73178
validation-RMSE,7327.48352


Fold 4
Training until validation scores don't improve for 100 rounds
[100]	validation's l1: 5089.7	validation's rmse: 8085.1
[200]	validation's l1: 4685.35	validation's rmse: 7608.81
[300]	validation's l1: 4562.11	validation's rmse: 7493.89
[400]	validation's l1: 4499.05	validation's rmse: 7440.98
[500]	validation's l1: 4462.95	validation's rmse: 7414.82
[600]	validation's l1: 4436.19	validation's rmse: 7396.25
[700]	validation's l1: 4415.17	validation's rmse: 7382.45
[800]	validation's l1: 4398.01	validation's rmse: 7372.41
[900]	validation's l1: 4385.04	validation's rmse: 7365.22
[1000]	validation's l1: 4374.84	validation's rmse: 7361.05
[1100]	validation's l1: 4365.3	validation's rmse: 7355.87
[1200]	validation's l1: 4356.49	validation's rmse: 7351.53
[1300]	validation's l1: 4349.16	validation's rmse: 7347.38
[1400]	validation's l1: 4343.43	validation's rmse: 7346.82
[1500]	validation's l1: 4338.12	validation's rmse: 7344.11
[1600]	validation's l1: 4332.61	validation's rmse: 7342.14

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
fold_4_mae,▁
fold_4_rmse,▁
validation-MAE,█▅▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
validation-RMSE,█▇▆▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
fold_4_mae,4324.43763
fold_4_rmse,7340.39989
validation-MAE,4321.36484
validation-RMSE,7341.1167


Fold 5
Training until validation scores don't improve for 100 rounds
[100]	validation's l1: 5088.04	validation's rmse: 8158.68
[200]	validation's l1: 4693.33	validation's rmse: 7686.33
[300]	validation's l1: 4572.79	validation's rmse: 7567.74
[400]	validation's l1: 4510.05	validation's rmse: 7515.94
[500]	validation's l1: 4474.08	validation's rmse: 7489.51
[600]	validation's l1: 4447.73	validation's rmse: 7469.6
[700]	validation's l1: 4425.41	validation's rmse: 7455.31
[800]	validation's l1: 4408.83	validation's rmse: 7444.77
[900]	validation's l1: 4396.09	validation's rmse: 7436.81
[1000]	validation's l1: 4384.82	validation's rmse: 7430.52
[1100]	validation's l1: 4375.56	validation's rmse: 7424.18
[1200]	validation's l1: 4366.32	validation's rmse: 7419.49
[1300]	validation's l1: 4359.54	validation's rmse: 7417.03
[1400]	validation's l1: 4353.99	validation's rmse: 7415.35
[1500]	validation's l1: 4349.02	validation's rmse: 7413.18
[1600]	validation's l1: 4344.63	validation's rmse: 7411.

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
fold_5_mae,▁
fold_5_rmse,▁
validation-MAE,█▅▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation-RMSE,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
fold_5_mae,4335.63193
fold_5_rmse,7409.40481
validation-MAE,4332.60132
validation-RMSE,7409.52689


5-fold 교차 검증 LGBM 성능 (OOF):
MAE: 4325.86
RMSE: 7356.99
R²: 0.92


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
oof_mae,▁
oof_r2,▁
oof_rmse,▁

0,1
oof_mae,4325.86384
oof_r2,0.921
oof_rmse,7356.99402


### 교차검증 소프트 보팅 제출

In [16]:
# Ensemble prediction on the test data (average of the fold models)
def soft_voting_predict(models, X_test):
    """Average the predictions of several fitted models.

    Args:
        models: Iterable of objects exposing ``predict(X)``.
        X_test: Feature matrix forwarded unchanged to every model.

    Returns:
        Array holding, for each row, the mean of the models' predictions.
    """
    per_model = [estimator.predict(X_test) for estimator in models]
    stacked = np.column_stack(per_model)  # one column per model
    return stacked.mean(axis=1)

# Average the fold models' predictions on the test features, store them in the
# 'deposit' column of the submission frame and write it to output.csv
test_predictions = soft_voting_predict(models, X_test)
sample_submission['deposit'] = test_predictions
sample_submission.to_csv('output.csv', index=False, encoding='utf-8-sig')

## Optuna 파라미터 서칭

### 단일 학습/검증

In [None]:
def objective(trial):
    """Optuna objective: fit LightGBM on the train split, return hold-out MAE.

    Reads the notebook-level ``X_train`` / ``y_train`` / ``X_holdout`` /
    ``y_holdout`` frames.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean absolute error of the trained booster on the hold-out set.
    """
    # Search space; n_estimators is fixed and early stopping bounds the rounds
    search_space = {
        'metric': ['mae', 'rmse'],
        'num_leaves': trial.suggest_int('num_leaves', 1100, 1500, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.04, step=0.005),
        'n_estimators': 2000,
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30, step=5),
        # bagging_fraction / bagging_freq are deliberately left out of the search
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0, step=0.05),  # fraction of columns per tree
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        'num_threads': 4,
        'random_state': RANDOM_SEED,
        'verbose': 0,
    }

    # Datasets for this trial
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_holdout, label=y_holdout, reference=dtrain)

    # Train with early stopping against the hold-out set
    booster = lgb.train(
        search_space,
        dtrain,
        valid_sets=[dvalid],
        valid_names='validation',
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    # Score the hold-out predictions; MAE is the value Optuna minimises
    return mean_absolute_error(y_holdout, booster.predict(X_holdout))


# Create the Optuna study; the objective value (hold-out MAE) is minimised
study = optuna.create_study(direction='minimize')

# Run the search
study.optimize(objective,
               n_trials=100,  # number of trials to run
               n_jobs=2,  # number of parallel jobs (per the original note, -1 uses all cores)
)

# Print the best parameters and the best objective value
best_params = study.best_params
print("Best parameters:", best_params)
print("Best MAE:", study.best_value)

### kfold CV

In [None]:
def objective(trial):
    """Optuna objective: mean validation MAE of LightGBM over a 5-fold CV.

    Reads the notebook-level ``X_all`` / ``y_all`` frames.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Average of the five per-fold mean absolute errors.
    """
    # Search space
    search_space = {
        'objective': 'regression',
        'metric': ['mae', 'rmse'],
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 1100, 1500, step=100),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 30, step=5),
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.05, step=0.005),
        'n_estimators': 2000,
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 0.8, step=0.05),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'random_state': RANDOM_SEED,
        'num_threads': 4,
        'verbose': 0
    }

    # Same shuffled 5-fold split for every trial (fixed seed)
    splitter = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

    fold_maes = []
    for fit_idx, eval_idx in splitter.split(X_all):
        X_fit, y_fit = X_all.iloc[fit_idx], y_all.iloc[fit_idx]
        X_eval, y_eval = X_all.iloc[eval_idx], y_all.iloc[eval_idx]

        dfit = lgb.Dataset(X_fit, label=y_fit)
        deval = lgb.Dataset(X_eval, label=y_eval, reference=dfit)

        # Train with early stopping against this fold's validation part
        booster = lgb.train(
            search_space,
            dfit,
            valid_sets=[deval],
            valid_names='validation',
            callbacks=[lgb.early_stopping(stopping_rounds=100)],
        )

        # MAE on the validation part of this fold
        fold_maes.append(mean_absolute_error(y_eval, booster.predict(X_eval)))

    # Optuna minimises the mean MAE across folds
    return np.mean(fold_maes)

# Create the Optuna study (minimising the mean CV MAE) and run 20 trials, 2 in parallel
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20, n_jobs=2)

# Print the best hyperparameters and the best objective value
print("Best parameters:", study.best_params)
print("Best MAE:", study.best_value)

[I 2024-10-21 16:12:52,312] A new study created in memory with name: no-name-b1302280-56b9-4759-b9a9-afecfc972b74


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1211]	validation's l1: 4343.8	validation's rmse: 7384.13
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1806]	validation's l1: 4330.82	validation's rmse: 7387.05
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1604]	validation's l1: 4314.88	validation's rmse: 7332.04
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1867]	validation's l1: 4317.05	validation's rmse: 7344.47
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1289]	validation's l1: 4323.11	validation's rmse: 7333.82
Training until validation scores don't improve for 100 rounds


# Catboost 모델 훈련

In [31]:
# # wandb 초기화
# wandb.init(project="house_price_prediction", name="catboost_gpu")  # 실험명에 따라 name 변경해주기!!

# # 카테고리형 변수 정의
# cat_features = ['floor', 'built_year', 'elementary', 'middle', 'high', 'cluster']

# # CatBoost 파라미터 설정
# cat_params = {
#     'iterations': 10000,
#     'loss_function': 'RMSE',
#     'eval_metric': 'MAE',
#     'early_stopping_rounds': 100,
#     'verbose': 100,
#     'random_seed': RANDOM_SEED,
#     'task_type': 'GPU',
#     'devices': '0'
# }

# # wandb에 파라미터 로깅
# wandb.config.update(cat_params)

# # CatBoost 데이터셋 생성
# train_pool = Pool(X_train, y_train)
# valid_pool = Pool(X_holdout, y_holdout)

# # CatBoost 모델 생성
# cat_model = CatBoostRegressor(**cat_params)

# # 모델 학습
# cat_model.fit(
#     train_pool,
#     eval_set=valid_pool,
#     use_best_model=True,
#     # callbacks=[WandbCallback()]
# )

# # Holdout 데이터 예측
# cat_holdout_pred = cat_model.predict(X_holdout)

# # 성능 메트릭 계산
# cat_holdout_mae = mean_absolute_error(y_holdout, cat_holdout_pred)
# cat_holdout_rmse = np.sqrt(mean_squared_error(y_holdout, cat_holdout_pred))
# cat_holdout_r2 = r2_score(y_holdout, cat_holdout_pred)

# # wandb에 성능 지표 로깅
# wandb.log({
#     "holdout_mae": cat_holdout_mae,
#     "holdout_rmse": cat_holdout_rmse,
#     "holdout_r2": cat_holdout_r2
# })

# # 결과 출력
# print("Holdout 데이터셋 CatBoost 성능:")
# print(f"MAE: {cat_holdout_mae:.2f}")
# print(f"RMSE: {cat_holdout_rmse:.2f}")
# print(f"R²: {cat_holdout_r2:.2f}")

# # 특성 중요도 로깅
# feature_importance = cat_model.get_feature_importance()
# feature_importance_dict = dict(zip(X_train.columns, feature_importance))
# wandb.log({"feature_importance": wandb.plot.bar(wandb.Table(data=[[k, v] for k, v in feature_importance_dict.items()], columns=["feature", "importance"]), "feature", "importance", title="Feature Importance")})

# # wandb 실험 종료
# wandb.finish()

# Sample Submission 제출하기

## 전체 데이터로 클러스터링 다시하기

### DBSCAN

In [17]:
# # DBSCAN 클러스터 정보를 포함한 데이터셋 생성
# X_all['cluster'], X_test['cluster'] = ft.apply_dbscan_clustering(X_all, X_test)
# # 피처 선택
# X_all = X_all[train_columns]
# X_test = X_test[train_columns]

## 전체 데이터로 LGBM 재학습

In [88]:
# Restrict both frames to the model's feature columns
X_all = X_all[train_columns]
X_test = X_test[train_columns]

# LightGBM hyperparameters
lgb_params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 1200,  # maximum number of leaves per tree
    # This key used to be 'min_samples', which LightGBM does not recognise and
    # therefore ignored; 'min_child_samples' is the accepted alias of
    # min_data_in_leaf and matches the name used by the Optuna search cells.
    'min_child_samples': 20,  # minimum number of samples per leaf
    'learning_rate': 0.035,
    'n_estimators': 2000,  # number of boosting rounds
    'feature_fraction': 0.65,  # fraction of columns sampled for each tree
    # 'bagging_fraction': 0.65,  # fraction of rows sampled for each tree
    # 'bagging_freq': 0,  # apply bagging every k-th tree
    'lambda_l1': 1.1939606848809192,
    'lambda_l2': 4.389852271719141,
    'verbose': -1,
    'random_state': RANDOM_SEED
}

# Build the dataset under its own name so the `all_data` DataFrame created
# earlier in the notebook is not replaced by an lgb.Dataset
lgb_all_set = lgb.Dataset(X_all, label=y_all)

# Refit on the full training set; no validation set or early stopping is passed
lgb_model = lgb.train(
    lgb_params,
    lgb_all_set,
)

# Predict the test set and write the submission file
lgb_test_pred = lgb_model.predict(X_test)
sample_submission['deposit'] = lgb_test_pred
sample_submission.to_csv('output.csv', index=False, encoding='utf-8-sig')

## 전체 데이터로 Catboost 재학습

In [35]:
# # 피처 선택
# X_all = X_all[train_columns]
# X_test = X_test[train_columns]

# # CatBoost 파라미터 설정
# cat_params = {
#     'iterations': 1000,
#     'loss_function': 'RMSE',
#     'eval_metric': 'MAE',
#     'verbose': 100,
#     'random_seed': RANDOM_SEED
# }

# # CatBoost 데이터셋 생성
# all_pool = Pool(X_all, y_all)
# test_pool = Pool(X_test)

# # CatBoost 모델 생성
# cat_model = CatBoostRegressor(**cat_params)

# # 모델 학습
# cat_model.fit(all_pool)

# # 추론
# cat_test_pred = cat_model.predict(test_pool)
# sample_submission['deposit'] = cat_test_pred
# sample_submission.to_csv('output.csv', index=False, encoding='utf-8-sig')