# 라이브러리 불러오기

In [72]:
import os
import pandas as pd
import numpy as np
import wandb

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

import lightgbm as lgb
from catboost import CatBoostRegressor


# WandB 설정

In [73]:
# wandb.login()

# 랜덤 시드 설정

In [74]:
# Single seed value shared by everything in this notebook that needs one.
RANDOM_SEED = 42
# Seed NumPy's global RNG so NumPy-driven randomness repeats across runs.
np.random.seed(seed=RANDOM_SEED)

# 데이터 불러오기

In [75]:
# Directory that holds the competition CSV files; adjust to your local layout.
path = "../../../data/"

# Build file paths with os.path.join instead of string concatenation so that
# loading still works if `path` is edited to a value without a trailing slash.
train_data = pd.read_csv(os.path.join(path, 'train.csv'))
test_data = pd.read_csv(os.path.join(path, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(path, 'sample_submission.csv'))

# Additional tables (file names suggest interest rates, parks, schools and
# subway information).
interest = pd.read_csv(os.path.join(path, 'interestRate.csv'))
park = pd.read_csv(os.path.join(path, 'parkInfo.csv'))
school = pd.read_csv(os.path.join(path, 'schoolinfo.csv'))
subway = pd.read_csv(os.path.join(path, 'subwayInfo.csv'))


## 중복 값 확인 및 처리

In [76]:
# keep=False flags every member of a duplicate group (not only the repeats),
# so the first occurrence and its copies are all listed. The 'index' column is
# left out of the comparison.
is_duplicated = train_data.drop(columns=['index']).duplicated(keep=False)
duplicates = train_data.loc[is_duplicated]
duplicates

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit
15,15,84.9342,201907,31,2,7,2016,36.965423,127.048779,3,18000.0
16,16,84.9342,201907,31,2,7,2016,36.965423,127.048779,3,18000.0
28,28,146.4005,201911,21,2,5,2016,36.965423,127.048779,3,37000.0
29,29,146.4005,201911,21,2,5,2016,36.965423,127.048779,3,37000.0
33,33,84.9342,201912,14,2,14,2016,36.965423,127.048779,3,19000.0
...,...,...,...,...,...,...,...,...,...,...,...
1801197,1801197,101.9088,202308,22,2,11,2010,37.528394,126.659398,13,33000.0
1801198,1801198,114.9285,202308,28,1,18,2010,37.528394,126.659398,13,30000.0
1801199,1801199,114.9285,202308,28,1,18,2010,37.528394,126.659398,13,30000.0
1801210,1801210,114.9285,202310,26,2,9,2010,37.528394,126.659398,13,39000.0


In [77]:
# Drop exact repeats: rows are compared on every column except 'index', and
# the first occurrence of each duplicate group is the one that is kept.
dedup_subset = train_data.columns.drop('index')
train_data = train_data.drop_duplicates(subset=dedup_subset, keep='first')
train_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0
3,3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,5000.0
4,4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,1800.0
...,...,...,...,...,...,...,...,...,...,...,...
1801223,1801223,114.8126,202311,25,0,5,2010,37.528394,126.659398,13,39000.0
1801224,1801224,101.9088,202311,28,0,6,2010,37.528394,126.659398,13,38000.0
1801225,1801225,114.7900,202312,3,0,19,2010,37.528394,126.659398,13,37000.0
1801226,1801226,101.9088,202312,4,1,15,2010,37.528394,126.659398,13,34400.0


# Holdout 데이터셋 설정 (예: 2023년 7월부터 12월까지의 데이터)

In [78]:
# Snapshot the full (deduplicated) data before splitting so it can be used
# for a refit on all rows.
all_data = train_data.copy()

# Holdout window as inclusive YYYYMM bounds.
holdout_start, holdout_end = 202307, 202312

# Build the membership mask once and use it for both sides of the split.
in_holdout = train_data['contract_year_month'].between(holdout_start, holdout_end)
holdout_data = train_data[in_holdout]
train_data = train_data[~in_holdout]

# 학습 데이터와 정답 데이터 분리

In [79]:
def _split_features_target(frame, target='deposit'):
    """Return ``(features, target)``: *frame* without the target column, and that column."""
    return frame.drop(columns=[target]), frame[target]


X_train, y_train = _split_features_target(train_data)
X_holdout, y_holdout = _split_features_target(holdout_data)
# Full dataset, used for the refit on all rows.
X_all, y_all = _split_features_target(all_data)
# The test set is only copied here; no column is removed from it.
X_test = test_data.copy()

# DBSCAN 클러스터링으로 'cluster' 피처 추가

### 위도·경도를 스케일링한 뒤 train 데이터에 DBSCAN을 적용해 클러스터를 생성하고, holdout(=validation) 샘플에는 스케일링된 위도·경도 기준으로 가장 가까운 train 샘플의 클러스터 라벨을 할당

In [80]:
# Only the coordinates take part in the clustering.
location_cols = ['latitude', 'longitude']
X_train_location = X_train[location_cols].copy()
X_holdout_location = X_holdout[location_cols].copy()

# 1. Standardize the coordinates with statistics learned from train only,
#    then apply that same transform to holdout.
scaler = StandardScaler().fit(X_train_location)
X_train_location_scaled = scaler.transform(X_train_location)
X_holdout_location_scaled = scaler.transform(X_holdout_location)

# 2. Run DBSCAN on the scaled train coordinates. eps / min_samples are
#    hand-picked and may need tuning for the data. DBSCAN labels noise as -1.
dbscan = DBSCAN(eps=0.02, min_samples=15)
dbscan.fit(X_train_location_scaled)
train_clusters = dbscan.labels_

# 3. DBSCAN offers no predict step for new points, so each holdout row takes
#    the label of its single nearest train row in the scaled space.
nn = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(X_train_location_scaled)
distances, indices = nn.kneighbors(X_holdout_location_scaled)
holdout_clusters = train_clusters[indices.ravel()]

# 4. Attach the labels as a new 'cluster' feature on both frames.
X_train['cluster'] = train_clusters
X_holdout['cluster'] = holdout_clusters

In [81]:
# Preview the first rows of the train features.
X_train.head()

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,cluster
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,0
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,0
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,0
3,3,59.34,201907,15,2,1,1986,36.964647,127.055847,33,1
4,4,59.81,201904,12,2,6,1995,36.97239,127.084514,24,-1


In [82]:
# Preview the first rows of the holdout features.
X_holdout.head()

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,cluster
774291,774291,102.6654,202307,8,0,7,2023,37.200075,126.820401,0,158
774292,774292,102.9485,202307,19,0,8,2023,37.200075,126.820401,0,158
774293,774293,102.9485,202307,27,0,12,2023,37.200075,126.820401,0,158
774294,774294,94.3147,202307,30,0,4,2023,37.200075,126.820401,0,158
774295,774295,94.3147,202308,12,0,1,2023,37.200075,126.820401,0,158


# LightGBM 모델 훈련

In [83]:
# Features fed to the model; any column not listed here is dropped below.
train_columns = [
    'area_m2',
    'contract_year_month',
    'contract_day',
    'floor',
    'latitude',
    'longitude',
    'age',
    'cluster',
]
X_train = X_train.loc[:, train_columns]
X_holdout = X_holdout.loc[:, train_columns]

# LightGBM regressor with default hyperparameters; only the seed is fixed.
lgb_model = lgb.LGBMRegressor(random_state=RANDOM_SEED)
lgb_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008609 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1223
[LightGBM] [Info] Number of data points in the train set: 1518975, number of used features: 8
[LightGBM] [Info] Start training from score 37881.878384


## Holdout 데이터셋에 대한 성능 확인

In [84]:
# Holdout 데이터 예측값
lgb_holdout_pred = lgb_model.predict(X_holdout)
lgb_holdout_mae = mean_absolute_error(y_holdout, lgb_holdout_pred) # MAE
lgb_holdout_mse = mean_squared_error(y_holdout, lgb_holdout_pred) # MSE
lgb_holdout_rmse = np.sqrt(lgb_holdout_mse) # RMSE (MSE의 제곱근)
lgb_holdout_r2 = r2_score(y_holdout, lgb_holdout_pred) # R²

# 결과 출력
print("Holdout 데이터셋 LGBM 성능:")
print(f"MAE: {lgb_holdout_mae:.2f}")
print(f"MSE: {lgb_holdout_mse:.2f}")
print(f"RMSE: {lgb_holdout_rmse:.2f}")
print(f"R²: {lgb_holdout_r2:.2f}")

Holdout 데이터셋 LGBM 성능:
MAE: 6494.46
MSE: 109833473.77
RMSE: 10480.15
R²: 0.87


# Catboost 모델 훈련

In [85]:
# cat_model = CatBoostRegressor(
#     iterations=10000,
#     learning_rate=0.05,
#     depth=8,
#     loss_function='MAE',
#     eval_metric='MAE',
#     early_stopping_rounds=200,
#     verbose=500,
#     random_seed=42
# )

# # 모델 학습
# cat_model.fit(X_train, y_train, eval_set=(X_holdout, y_holdout), use_best_model=True)

# # Holdout 데이터 예측
# cat_holdout_pred = cat_model.predict(X_holdout)

# # Holdout 데이터셋 성능 평가 (MAE)
# cat_holdout_mae = mean_absolute_error(y_holdout, cat_holdout_pred)
# print("Holdout 데이터셋 성능:")
# print(f"CatBoost MAE: {cat_holdout_mae:.2f}")

# Sample Submission 제출하기

## 전체 데이터로 LGBM 재학습

### 전체 데이터로 클러스터링 다시하기

In [86]:
# Only the coordinates take part in the clustering.
location_cols = ['latitude', 'longitude']
X_all_location = X_all[location_cols].copy()
X_test_location = X_test[location_cols].copy()

# 1. Standardize the coordinates with statistics learned from the full
#    labelled data, then apply that same transform to test.
scaler = StandardScaler().fit(X_all_location)
X_all_location_scaled = scaler.transform(X_all_location)
X_test_location_scaled = scaler.transform(X_test_location)

# 2. Run DBSCAN on the scaled coordinates of the full labelled data.
#    eps / min_samples are hand-picked and may need tuning for the data.
#    Note: `train_clusters` is re-bound here to the labels of ALL rows.
dbscan = DBSCAN(eps=0.02, min_samples=15)
dbscan.fit(X_all_location_scaled)
train_clusters = dbscan.labels_

# 3. DBSCAN offers no predict step for new points, so each test row takes
#    the label of its single nearest labelled row in the scaled space.
nn = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(X_all_location_scaled)
distances, indices = nn.kneighbors(X_test_location_scaled)
test_clusters = train_clusters[indices.ravel()]

# 4. Attach the labels as a new 'cluster' feature on both frames.
X_all['cluster'] = train_clusters
X_test['cluster'] = test_clusters

### 학습, 추론, output

In [87]:
# Restrict both frames to the columns listed in train_columns.
X_all = X_all.loc[:, train_columns]
X_test = X_test.loc[:, train_columns]

# Refit a fresh default-parameter regressor on every labelled row.
lgb_model = lgb.LGBMRegressor(random_state=RANDOM_SEED).fit(X_all, y_all)

# Predict the test set and write the result into the submission template.
lgb_test_pred = lgb_model.predict(X_test)
sample_submission['deposit'] = lgb_test_pred
sample_submission.to_csv('output.csv', index=False, encoding='utf-8-sig')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010433 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1232
[LightGBM] [Info] Number of data points in the train set: 1717611, number of used features: 8
[LightGBM] [Info] Start training from score 38231.334035


## 전체 데이터로 Catboost 재학습