In [1]:
# 1. 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from scipy.optimize import minimize
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2

In [3]:
# 1. 데이터 로드
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

In [4]:
# 3. 데이터 전처리
X = train.drop(['가격(백만원)', 'ID'], axis=1)
y = train['가격(백만원)']

# 결측치 처리
X['배터리용량'] = X['배터리용량'].fillna(X['배터리용량'].median())
test['배터리용량'] = test['배터리용량'].fillna(test['배터리용량'].median())

# 파생 변수 추가
X['에너지효율'] = X['배터리용량'] / (X['주행거리(km)'] + 1e-5)
X['차량나이'] = 2025 - X['연식(년)']
test['에너지효율'] = test['배터리용량'] / (test['주행거리(km)'] + 1e-5)
test['차량나이'] = 2025 - test['연식(년)']

# 원-핫 인코딩
X = pd.get_dummies(X, columns=['제조사', '모델', '차량상태', '구동방식', '사고이력'])
X_test = pd.get_dummies(test.drop(['ID'], axis=1), columns=['제조사', '모델', '차량상태', '구동방식', '사고이력'])
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [5]:
# 4. 머신러닝 모델 정의
models = {
    "XGBoost": XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=8, random_state=42),
    "CatBoost": CatBoostRegressor(iterations=1000, learning_rate=0.01, depth=8, loss_function='RMSE', verbose=0, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=1000, learning_rate=0.01, max_depth=8, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=500, random_state=42),
    "Extra Trees": ExtraTreesRegressor(n_estimators=500, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=500, learning_rate=0.05, random_state=42),
}

In [6]:
# 5. 모델 학습 및 성능 평가
predictions = {}
rmses = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    predictions[name] = pred
    rmses[name] = rmse
    print(f"{name} RMSE: {rmse:.4f}")

Training XGBoost...
XGBoost RMSE: 1.6297
Training CatBoost...
CatBoost RMSE: 1.6484
Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001947 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 737
[LightGBM] [Info] Number of data points in the train set: 5997, number of used features: 42
[LightGBM] [Info] Start training from score 62.221487
LightGBM RMSE: 1.3639
Training Random Forest...
Random Forest RMSE: 1.5423
Training Extra Trees...
Extra Trees RMSE: 1.6601
Training Gradient Boosting...
Gradient Boosting RMSE: 1.6498


In [7]:
# 6. 최적 가중치 탐색
def objective(weights):
    weighted_preds = sum(weights[i] * list(predictions.values())[i] for i in range(len(weights)))
    return np.sqrt(mean_squared_error(y_val, weighted_preds))

initial_weights = [1/len(models)] * len(models)
constraints = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
bounds = [(0, 1)] * len(models)

result = minimize(objective, initial_weights, method='SLSQP', bounds=bounds, constraints=constraints)
optimal_weights = result.x
print(f"Optimal Weights: {optimal_weights}")

Optimal Weights: [1.69115714e-01 1.54240478e-17 8.30884286e-01 2.30881074e-21
 4.08639070e-23 2.29845191e-17]


In [8]:
# 7. 앙상블 예측
ensemble_pred = sum(optimal_weights[i] * list(predictions.values())[i] for i in range(len(models)))
ensemble_rmse = np.sqrt(mean_squared_error(y_val, ensemble_pred))
print(f"Validation RMSE (Optimal Weighted Ensemble): {ensemble_rmse:.4f}")

Validation RMSE (Optimal Weighted Ensemble): 1.3910


In [9]:
# 8. 테스트 데이터 예측
final_test_pred = sum(optimal_weights[i] * models[name].predict(X_test) for i, name in enumerate(models.keys()))

# 결과 값 클리핑
lower_bound = y.min()
upper_bound = y.max()
final_test_pred = np.clip(final_test_pred, lower_bound, upper_bound)

# 제출 파일 생성
submission['가격(백만원)'] = final_test_pred
submission.to_csv("predicted_submission_10.csv", index=False)
print("Predicted results saved to: predicted_submission_10.csv")

Predicted results saved to: predicted_submission_10.csv
