In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
import joblib
import os

# データの読み込み
data_path = '/Users/hayakawakazue/Downloads/house_price/train/scaled_data.csv'
data = pd.read_csv(data_path)

# 以前に保存したランダムフォレストとXGBoostのモデルをロード
# rf_model_path = '/Users/hayakawakazue/Downloads/house_price/model/random_forest_0720.joblib'
xgb_model_path = '/Users/hayakawakazue/Downloads/house_price/model/xgboost_0721_1.joblib'
gbr_model_path = '/Users/hayakawakazue/Downloads/house_price/model/gradient_boosting_0721_1.joblib'

# 選択された特徴量
important_features = [
    'OverallQual_Capped',  'OverallHomeQuality', 'FunctionalSpace', 
    'GarageQuality', 'capped_log_GrLivArea', 'TotalBath_Capped',
    'OverallCond', 'GarageCars_Capped', 'BsmtExposureQual', 'YearBuilt_Capped',
    'BasementQualityInteraction', 'log_TotalBsmtSF_Capped',
    'HouseAge', 'OutdoorSpaceQuality', 'PorchAndDeckArea', 'LotArea' 
]

# 特徴量と目的変数を分ける
X = data[important_features]
y = data['SalePrice']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# rf_best = joblib.load(rf_model_path)
xgb_best = joblib.load(xgb_model_path)
gbr_best = joblib.load(gbr_model_path)

# アンサンブルモデルの定義
ensemble_model = VotingRegressor(estimators=[
    ('gbr', gbr_best), ('xgb', xgb_best)
])

# アンサンブルモデルのトレーニング
ensemble_model.fit(X_train, y_train)

# テストデータを使った予測
y_pred = ensemble_model.predict(X_test)

# 評価指標の計算
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# 結果の表示
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# アンサンブルモデルの保存
ensemble_model_path = os.path.join('/Users/hayakawakazue/Downloads/house_price/model', 'ensemble_model_0721_1.joblib')
joblib.dump(ensemble_model, ensemble_model_path)
print(f"最適なアンサンブルモデルが {ensemble_model_path} に保存されました。")

# 実際の値の平均を計算
mean_actual_value = np.mean(y_test)
print(f"Mean Actual Value: {mean_actual_value}")

# 誤差のパーセンテージを計算
error_percentage = (mae / mean_actual_value) * 100
print(f"Mean Absolute Percentage Error (MAPE): {error_percentage:.2f}%")


Mean Absolute Error (MAE): 15751.9482
Mean Squared Error (MSE): 513099241.6649
R-squared (R2): 0.9265
Root Mean Squared Error (RMSE): 22651.6940
最適なアンサンブルモデルが /Users/hayakawakazue/Downloads/house_price/model/ensemble_model_0721_1.joblib に保存されました。
Mean Actual Value: 180007.70319634702
Mean Absolute Percentage Error (MAPE): 8.75%
