In [2]:
# 1. 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
# 2. Load data: training set, test set, and the sample-submission template.
# The files are read in this order and bound to the three names below.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [4]:
# Drop the 'ID' column and separate the target from the features.
X = train.drop(['가격(백만원)', 'ID'], axis=1)
y = train['가격(백만원)']
X_test = test.drop(['ID'], axis=1)

# Missing-value handling: fill numeric columns with the per-column median.
# - numeric_only=True is explicit because the frames may still contain
#   non-numeric columns at this point (they are one-hot encoded below);
#   without it, pandas >= 2.0 raises TypeError and older versions only warn.
# - The medians are computed on the training features and reused for the
#   test features so both sets are imputed with the same statistics.
# - fillna returns new frames (no inplace mutation), so re-running the cell
#   always starts from the freshly derived X / X_test above.
train_medians = X.median(numeric_only=True)
X = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# One-hot encode the categorical columns.
X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)

# Align the test columns with the training columns: dummy columns that are
# absent from the test set are added as 0, and columns that exist only in
# the test set are dropped.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Hold out 20% of the training data for validation (fixed seed for
# reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

  X.fillna(X.median(), inplace=True)
  X_test.fillna(X_test.median(), inplace=True)


In [5]:
# 3. Model definition and hyperparameter tuning
# Base LightGBM regressor; the tuned parameters are supplied by the grid below.
lgbm_model = LGBMRegressor(force_row_wise=True, device='cpu', random_state=42)

# Hyperparameter grid: 2 x 2 x 2 x 2 = 16 candidate combinations.
param_grid = dict(
    n_estimators=[500, 1000],    # reduced range
    learning_rate=[0.01, 0.05],  # reduced set of options
    num_leaves=[31, 50],
    max_depth=[7, 10],
)

# Exhaustive 3-fold cross-validated search scored by negative MSE
# (16 candidates x 3 folds = 48 fits). n_jobs=1 disables parallelism.
grid_search = GridSearchCV(
    lgbm_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=1,
    verbose=3,
).fit(X_train, y_train)

# Keep the winning estimator and its parameters, then report the parameters.
best_lgbm_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 3998, number of used features: 40
[LightGBM] [Info] Start training from score 62.411276
[CV 1/3] END learning_rate=0.01, max_depth=7, n_estimators=500, num_leaves=31;, score=-3.223 total time=   1.7s
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 3998, number of used features: 40
[LightGBM] [Info] Start training from score 62.047251
[CV 2/3] END learning_rate=0.01, max_depth=7, n_estimators=500, num_leaves=31;, score=-2.973 total time=   1.7s
[LightGBM] [Info] Total Bins 434
[LightGBM] [Info] Number of data points in the train set: 3998, number of used features: 40
[LightGBM] [Info] Start training from score 62.205935
[CV 3/3] END learning_rate=0.01, max_depth=7, n_estimators=500, num_leaves=31;, score=-2.605 total time=   1.7s
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data 

In [6]:
# 4. Evaluate on the held-out validation split and report RMSE.
y_val_pred = best_lgbm_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {rmse:.4f}")

# 5. Predict on the test set, clipping the predictions to the range of
# target values observed in the training data.
y_test_pred = np.clip(best_lgbm_model.predict(X_test), y.min(), y.max())

Validation RMSE: 1.3828


In [7]:
# Write the submission file: store the test predictions in the target column
# of the submission template and save it as CSV without the index.
submission_path = "predicted_submission_13.csv"
submission['가격(백만원)'] = y_test_pred
submission.to_csv(submission_path, index=False)
print(f"Predicted results saved to: {submission_path}")

Predicted results saved to: predicted_submission_13.csv
