# 라이브러리 불러오기

In [14]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

import wandb
from wandb.integration.lightgbm import wandb_callback, log_summary
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool


# WandB 설정

In [15]:
# Authenticate this session with Weights & Biases before any run is started.
wandb.login()

True

# 랜덤 시드 설정

In [16]:
# Single seed for the notebook: it seeds NumPy's global RNG here and is also
# passed to LightGBM as `random_state` in the parameter dictionaries.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 데이터 불러오기

In [17]:
# Directory that holds the CSV files; adjust it to your local layout.
path = "../../../data/"

# Core tables: training rows, test rows, and the submission template.
train_data, test_data, sample_submission = (
    pd.read_csv(path + filename)
    for filename in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Additional tables (interest rates, parks, schools, subway stations).
interest, park, school, subway = (
    pd.read_csv(path + filename)
    for filename in ('interestRate.csv', 'parkInfo.csv', 'schoolinfo.csv', 'subwayInfo.csv')
)


## 중복 값 확인 및 처리

In [18]:
# Flag every row that has an exact twin when the 'index' column is ignored.
# keep=False marks all members of a duplicate group, not only the repeats.
is_duplicated = train_data.drop(columns=['index']).duplicated(keep=False)
duplicates = train_data.loc[is_duplicated]
duplicates

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit
15,15,84.9342,201907,31,2,7,2016,36.965423,127.048779,3,18000.0
16,16,84.9342,201907,31,2,7,2016,36.965423,127.048779,3,18000.0
28,28,146.4005,201911,21,2,5,2016,36.965423,127.048779,3,37000.0
29,29,146.4005,201911,21,2,5,2016,36.965423,127.048779,3,37000.0
33,33,84.9342,201912,14,2,14,2016,36.965423,127.048779,3,19000.0
...,...,...,...,...,...,...,...,...,...,...,...
1801197,1801197,101.9088,202308,22,2,11,2010,37.528394,126.659398,13,33000.0
1801198,1801198,114.9285,202308,28,1,18,2010,37.528394,126.659398,13,30000.0
1801199,1801199,114.9285,202308,28,1,18,2010,37.528394,126.659398,13,30000.0
1801210,1801210,114.9285,202310,26,2,9,2010,37.528394,126.659398,13,39000.0


In [19]:
# Remove duplicate rows, keeping the first occurrence of each group.
# Rows are compared on every column except 'index'.
repeated_rows = train_data.drop(columns=['index']).duplicated(keep='first')
train_data = train_data[~repeated_rows]
train_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0
3,3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,5000.0
4,4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,1800.0
...,...,...,...,...,...,...,...,...,...,...,...
1801223,1801223,114.8126,202311,25,0,5,2010,37.528394,126.659398,13,39000.0
1801224,1801224,101.9088,202311,28,0,6,2010,37.528394,126.659398,13,38000.0
1801225,1801225,114.7900,202312,3,0,19,2010,37.528394,126.659398,13,37000.0
1801226,1801226,101.9088,202312,4,1,15,2010,37.528394,126.659398,13,34400.0


# Holdout 데이터셋 설정 (예: 2023년 7월부터 12월까지의 데이터)

In [20]:
# Keep a copy of the full de-duplicated data for the final refit on all rows.
all_data = train_data.copy()

# Inclusive bounds of the holdout window, as year-month integers (YYYYMM).
holdout_start = 202307
holdout_end = 202312

# Build the window mask once: rows inside it form the holdout set, the rest
# remain as training data.
in_holdout = train_data['contract_year_month'].between(holdout_start, holdout_end)
holdout_data = train_data[in_holdout]
train_data = train_data[~in_holdout]

# 학습 데이터와 정답 데이터 분리

In [21]:
# Split each frame into features (all columns except 'deposit') and the
# 'deposit' target.
X_train, y_train = train_data.drop(columns=['deposit']), train_data['deposit']
X_holdout, y_holdout = holdout_data.drop(columns=['deposit']), holdout_data['deposit']

# Same split on the full data, used for the final refit.
X_all, y_all = all_data.drop(columns=['deposit']), all_data['deposit']

# Work on a copy so columns added later do not modify test_data itself.
X_test = test_data.copy()

# DBSCAN 클러스터링으로 'cluster' 피처 추가

### 경도, 위도 스케일링 후 DBSCAN으로 train 기반하여 클러스터 생성, holdout(=validation)은 경도,위도 기준 가장 가까운 train 샘플의 라벨을 할당

In [22]:
# 1. Standardize the coordinates. The scaler is fit on the train rows only and
#    then reused for the holdout rows.
X_train_location = X_train[['latitude', 'longitude']].copy()
X_holdout_location = X_holdout[['latitude', 'longitude']].copy()

scaler = StandardScaler()
X_train_location_scaled = scaler.fit(X_train_location).transform(X_train_location)
X_holdout_location_scaled = scaler.transform(X_holdout_location)

# 2. Cluster the scaled train coordinates with DBSCAN.
dbscan = DBSCAN(eps=0.02, min_samples=15)  # tune eps / min_samples for the data
train_clusters = dbscan.fit(X_train_location_scaled).labels_

# 3. Label the holdout rows: each one inherits the cluster label of its single
#    nearest train sample (Euclidean distance in the scaled coordinate space).
nn = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(X_train_location_scaled)
distances, indices = nn.kneighbors(X_holdout_location_scaled)
holdout_clusters = train_clusters[indices[:, 0]]

# 4. Attach the labels as a new 'cluster' feature.
X_train['cluster'] = train_clusters
X_holdout['cluster'] = holdout_clusters

In [23]:
# Preview the first rows of the train features, including the new 'cluster' column.
X_train.head()

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,cluster
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,0
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,0
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,0
3,3,59.34,201907,15,2,1,1986,36.964647,127.055847,33,1
4,4,59.81,201904,12,2,6,1995,36.97239,127.084514,24,-1


In [24]:
# Preview the first rows of the holdout features, including the inherited 'cluster' column.
X_holdout.head()

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,cluster
774291,774291,102.6654,202307,8,0,7,2023,37.200075,126.820401,0,158
774292,774292,102.9485,202307,19,0,8,2023,37.200075,126.820401,0,158
774293,774293,102.9485,202307,27,0,12,2023,37.200075,126.820401,0,158
774294,774294,94.3147,202307,30,0,4,2023,37.200075,126.820401,0,158
774295,774295,94.3147,202308,12,0,1,2023,37.200075,126.820401,0,158


# 모델 훈련

## 피처 선택

In [25]:
# Columns fed to the models. 'index', 'contract_type' and 'built_year' are
# present in the frames but left out of the feature set.
train_columns = [
    'area_m2',
    'contract_year_month',
    'contract_day',
    'floor',
    'latitude',
    'longitude',
    'age',
    'cluster',
]
X_train = X_train.loc[:, train_columns]
X_holdout = X_holdout.loc[:, train_columns]

# LightGBM 모델 훈련

In [13]:
# Baseline LightGBM hyper-parameters.
lgb_params = {
    'objective': 'regression',
    'metric': ['rmse', 'mae'],
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 1.0,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'verbose': -1,
    'random_state': RANDOM_SEED
}

# The wandb run is used as a context manager so that it is closed even when
# training or evaluation raises, instead of being left open in the session.
# Change `name` for each experiment.
with wandb.init(project="house_price_prediction", name="lgbm_base") as run:
    # Record the hyper-parameters with the run.
    run.config.update(lgb_params)

    # Build the LightGBM datasets. The training set gets its own name so that
    # the `train_data` DataFrame is not overwritten; earlier cells that read
    # `train_data` can still be re-run afterwards.
    lgb_train_set = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_holdout, label=y_holdout, reference=lgb_train_set)

    # Train with early stopping on the holdout set, printing metrics every
    # 100 rounds and streaming them to wandb.
    lgb_model = lgb.train(
        lgb_params,
        lgb_train_set,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
            wandb_callback()
        ]
    )

    # Log the trained model's summary (feature importance plot) to wandb.
    log_summary(lgb_model)

    # Predict the holdout set with the iteration that scored best on it.
    lgb_holdout_pred = lgb_model.predict(X_holdout, num_iteration=lgb_model.best_iteration)

    # Holdout metrics.
    lgb_holdout_mae = mean_absolute_error(y_holdout, lgb_holdout_pred)
    lgb_holdout_rmse = np.sqrt(mean_squared_error(y_holdout, lgb_holdout_pred))
    lgb_holdout_r2 = r2_score(y_holdout, lgb_holdout_pred)

    # Send the holdout metrics to wandb.
    run.log({
        "holdout_mae": lgb_holdout_mae,
        "holdout_rmse": lgb_holdout_rmse,
        "holdout_r2": lgb_holdout_r2
    })

    # Print the results.
    print("Holdout 데이터셋 LGBM 성능:")
    print(f"MAE: {lgb_holdout_mae:.2f}")
    print(f"RMSE: {lgb_holdout_rmse:.2f}")
    print(f"R²: {lgb_holdout_r2:.2f}")

Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 10480.1	valid_0's l1: 6494.46
[200]	valid_0's rmse: 9578.42	valid_0's l1: 5901.93
[300]	valid_0's rmse: 9118.7	valid_0's l1: 5643.51
[400]	valid_0's rmse: 8900.45	valid_0's l1: 5481.44
[500]	valid_0's rmse: 8679.83	valid_0's l1: 5341.92
[600]	valid_0's rmse: 8529.5	valid_0's l1: 5248.95
[700]	valid_0's rmse: 8428.49	valid_0's l1: 5166.18
[800]	valid_0's rmse: 8325.86	valid_0's l1: 5088.8
[900]	valid_0's rmse: 8255.8	valid_0's l1: 5032.1
[1000]	valid_0's rmse: 8201.88	valid_0's l1: 4987.76
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 8200.44	valid_0's l1: 4987.23
Holdout 데이터셋 LGBM 성능:
MAE: 4987.23
RMSE: 8200.44
R²: 0.92


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
holdout_mae,▁
holdout_r2,▁
holdout_rmse,▁
iteration,▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
valid_0_l1,██▇▇▆▅▅▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
valid_0_rmse,█▇▇▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_iteration,999.0
holdout_mae,4987.22758
holdout_r2,0.91762
holdout_rmse,8200.43729
iteration,999.0


# Catboost 모델 훈련

In [27]:
# NOTE(review): this whole CatBoost cell is commented out, yet CatBoost output
# is shown below it -- presumably kept from an earlier run; confirm before relying on it.
# # Initialize the wandb run
# wandb.init(project="house_price_prediction", name="catboost_base")  # change `name` for each experiment

# # CatBoost parameters
# cat_params = {
#     'iterations': 1000,
#     'loss_function': 'RMSE',
#     'eval_metric': 'MAE',
#     'early_stopping_rounds': 100,
#     'verbose': 100,
#     'random_seed': RANDOM_SEED
# }

# # Log the parameters to wandb
# wandb.config.update(cat_params)

# # Build the CatBoost datasets
# train_pool = Pool(X_train, y_train)
# valid_pool = Pool(X_holdout, y_holdout)

# # Create the CatBoost model
# cat_model = CatBoostRegressor(**cat_params)

# # Train the model
# cat_model.fit(
#     train_pool,
#     eval_set=valid_pool,
#     use_best_model=True,
#     callbacks=[wandb.catboost.WandbCallback()]
# )

# # Predict the holdout data using the best iteration
# cat_holdout_pred = cat_model.predict(X_holdout)

# # Compute the performance metrics
# cat_holdout_mae = mean_absolute_error(y_holdout, cat_holdout_pred)
# cat_holdout_rmse = np.sqrt(mean_squared_error(y_holdout, cat_holdout_pred))
# cat_holdout_r2 = r2_score(y_holdout, cat_holdout_pred)

# # Log the performance metrics to wandb
# wandb.log({
#     "holdout_mae": cat_holdout_mae,
#     "holdout_rmse": cat_holdout_rmse,
#     "holdout_r2": cat_holdout_r2
# })

# # Print the results
# print("Holdout 데이터셋 CatBoost 성능:")
# print(f"MAE: {cat_holdout_mae:.2f}")
# print(f"RMSE: {cat_holdout_rmse:.2f}")
# print(f"R²: {cat_holdout_r2:.2f}")

# # Log the feature importances
# feature_importance = cat_model.get_feature_importance()
# feature_importance_dict = dict(zip(X_train.columns, feature_importance))
# wandb.log({"feature_importance": wandb.plot.bar(wandb.Table(data=[[k, v] for k, v in feature_importance_dict.items()], columns=["feature", "importance"]), "feature", "importance", title="Feature Importance")})

# # Finish the wandb run
# wandb.finish()

Learning rate set to 0.160842
0:	learn: 16299.4441947	test: 17132.0434156	best: 17132.0434156 (0)	total: 55.8ms	remaining: 55.7s
100:	learn: 6851.8061565	test: 6871.8222894	best: 6859.8635464 (99)	total: 4.96s	remaining: 44.2s
200:	learn: 6394.3908480	test: 6289.0475517	best: 6286.7079695 (197)	total: 10s	remaining: 39.8s
300:	learn: 6167.6357506	test: 5979.5901259	best: 5979.5901259 (300)	total: 15.2s	remaining: 35.3s
400:	learn: 6014.6146587	test: 5800.9865376	best: 5800.9865376 (400)	total: 20.3s	remaining: 30.3s
500:	learn: 5895.3839725	test: 5660.4571884	best: 5660.4571884 (500)	total: 25.3s	remaining: 25.2s
600:	learn: 5806.8521713	test: 5559.9746673	best: 5559.9746673 (600)	total: 30.3s	remaining: 20.1s
700:	learn: 5731.3889401	test: 5484.7380679	best: 5484.7380679 (700)	total: 35.3s	remaining: 15s
800:	learn: 5661.7667809	test: 5418.5377767	best: 5418.5377767 (800)	total: 40.4s	remaining: 10s
900:	learn: 5604.3578442	test: 5345.8625785	best: 5345.8625785 (900)	total: 45.3s	rema

VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
holdout_mae,▁
holdout_r2,▁
holdout_rmse,▁
iteration@metric-period-1,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇█████
learn-MAE,█▆▆▆▅▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
learn-RMSE,█▅▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation-MAE,█▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation-RMSE,█▆▆▅▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
holdout_mae,5293.868
holdout_r2,0.90679
holdout_rmse,8723.13559
iteration@metric-period-1,1000.0
learn-MAE,5554.20712
learn-RMSE,8692.03465
validation-MAE,5293.868
validation-RMSE,8723.13559


# Sample Submission 제출하기

### 전체 데이터로 클러스터링 다시하기

In [14]:
# 1. Standardize the coordinates. The scaler is fit on the full labelled data
#    and then reused for the test rows.
X_all_location = X_all[['latitude', 'longitude']].copy()
X_test_location = X_test[['latitude', 'longitude']].copy()

scaler = StandardScaler()
X_all_location_scaled = scaler.fit(X_all_location).transform(X_all_location)
X_test_location_scaled = scaler.transform(X_test_location)

# 2. Cluster the scaled coordinates of the full labelled data with DBSCAN.
#    The labels are stored in `train_clusters`, replacing the train-only labels.
dbscan = DBSCAN(eps=0.02, min_samples=15)  # tune eps / min_samples for the data
train_clusters = dbscan.fit(X_all_location_scaled).labels_

# 3. Label the test rows: each one inherits the cluster label of its single
#    nearest labelled sample (Euclidean distance in the scaled coordinate space).
nn = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(X_all_location_scaled)
distances, indices = nn.kneighbors(X_test_location_scaled)
test_clusters = train_clusters[indices[:, 0]]

# 4. Attach the labels as a new 'cluster' feature.
X_all['cluster'] = train_clusters
X_test['cluster'] = test_clusters

## 전체 데이터로 LGBM 재학습

In [16]:
# Restrict both frames to the model's feature columns.
X_all = X_all[train_columns]
X_test = X_test[train_columns]

# LightGBM hyper-parameters (same values as the holdout experiment).
lgb_params = {
    'objective': 'regression',
    'metric': ['rmse', 'mae'],
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 1.0,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'verbose': -1,
    'random_state': RANDOM_SEED
}

# Build the LightGBM dataset under its own name so that the `all_data`
# DataFrame is not overwritten; the cell that derives X_all / y_all from
# `all_data` can still be re-run afterwards.
lgb_all_set = lgb.Dataset(X_all, label=y_all)

# Refit on all labelled data. No validation set is passed, so training runs
# for the full fixed number of boosting rounds.
lgb_model = lgb.train(
    lgb_params,
    lgb_all_set,
    num_boost_round=1000,
)

# Predict the test set and write the submission file.
lgb_test_pred = lgb_model.predict(X_test)
sample_submission['deposit'] = lgb_test_pred
sample_submission.to_csv('output.csv', index=False, encoding='utf-8-sig')

## 전체 데이터로 Catboost 재학습

In [19]:
# NOTE(review): this whole CatBoost refit cell is commented out, yet CatBoost
# output is shown below it -- presumably kept from an earlier run; confirm.
# # Feature selection
# X_all = X_all[train_columns]
# X_test = X_test[train_columns]

# # CatBoost parameters
# cat_params = {
#     'iterations': 1000,
#     'loss_function': 'RMSE',
#     'eval_metric': 'MAE',
#     'verbose': 100,
#     'random_seed': RANDOM_SEED
# }

# # Build the CatBoost datasets
# all_pool = Pool(X_all, y_all)
# test_pool = Pool(X_test)

# # Create the CatBoost model
# cat_model = CatBoostRegressor(**cat_params)

# # Train the model
# cat_model.fit(all_pool)

# # Inference
# cat_test_pred = cat_model.predict(test_pool)
# sample_submission['deposit'] = cat_test_pred
# sample_submission.to_csv('output.csv', index=False, encoding='utf-8-sig')

0:	learn: 17673.5525613	total: 57.6ms	remaining: 57.6s
100:	learn: 8995.5864430	total: 5.41s	remaining: 48.2s
200:	learn: 7878.9127595	total: 10.7s	remaining: 42.6s
300:	learn: 7389.9889004	total: 16s	remaining: 37.2s
400:	learn: 7097.0909837	total: 21.4s	remaining: 32s
500:	learn: 6892.9699931	total: 26.8s	remaining: 26.7s
600:	learn: 6746.8562676	total: 32.1s	remaining: 21.3s
700:	learn: 6633.3392882	total: 37.7s	remaining: 16.1s
800:	learn: 6534.1110931	total: 43.1s	remaining: 10.7s
900:	learn: 6446.6317386	total: 48.6s	remaining: 5.34s
999:	learn: 6371.3089101	total: 54s	remaining: 0us
