# XGBoost

In [2]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.0-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata (2.1 kB)
Using cached xgboost-2.1.0-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.1 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.0


In [2]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

# データの読み込み
data_path = '/Users/hayakawakazue/Downloads/house_price/train/scaled_data.csv'
data = pd.read_csv(data_path)

# 特徴量の選択
important_features = [
    'OverallQual_Capped',  'OverallHomeQuality', 'FunctionalSpace', 
    'GarageQuality', 'capped_log_GrLivArea', 'TotalBath_Capped',
    'OverallCond', 'GarageCars_Capped', 'BsmtExposureQual', 'YearBuilt_Capped',
    'BasementQualityInteraction', 'log_TotalBsmtSF_Capped',
    'HouseAge', 'OutdoorSpaceQuality', 'PorchAndDeckArea', 'LotArea' 
]

# 特徴量と目的変数を分ける
X = data[important_features]
y = data['SalePrice']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# XGBoostモデルの定義
xgboost_model = xgb.XGBRegressor(random_state=42)

# ハイパーパラメータのグリッド
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# グリッドサーチの設定
grid_search = GridSearchCV(estimator=xgboost_model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=0)

# グリッドサーチの実行
grid_search.fit(X_train, y_train)

# 最適なハイパーパラメータの表示
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation MAE: {-grid_search.best_score_:.4f}")

# 最適なモデルを取得
best_model = grid_search.best_estimator_

# テストデータを使った予測
y_pred = best_model.predict(X_test)

# 評価指標の計算
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# 結果の表示
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# 最適なモデルの保存
model_path = os.path.join('/Users/hayakawakazue/Downloads/house_price/model', 'xgboost_0721_1.joblib')
joblib.dump(best_model, model_path)
print(f"最適なXGBoostモデルが {model_path} に保存されました。")

# 実際の値の平均を計算
mean_actual_value = np.mean(y_test)
print(f"Mean Actual Value: {mean_actual_value}")

# 誤差のパーセンテージを計算
error_percentage = (mae / mean_actual_value) * 100
print(f"Mean Absolute Percentage Error (MAPE): {error_percentage:.2f}%")


Best parameters found: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.6}
Best cross-validation MAE: 18121.2184
Mean Absolute Error (MAE): 16146.8340
Mean Squared Error (MSE): 599679629.3676
R-squared (R2): 0.9141
Root Mean Squared Error (RMSE): 24488.3570
最適なXGBoostモデルが /Users/hayakawakazue/Downloads/house_price/model/xgboost_0721_1.joblib に保存されました。
Mean Actual Value: 180007.70319634702
Mean Absolute Percentage Error (MAPE): 8.97%
