# ランダムフォレストモデルでトレーニングする

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os
import joblib

# データの読み込み
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train_preprocessed.csv'
data = pd.read_csv(data_path)

# 重要な特徴量のリスト
important_features = [
    'TotalArea', 'QualityScore', 'OverallQual', 'GrLivArea', 
    'GarageScore', 'ExterQual', 'KitchenQual', 
    'GarageCars', 'BsmtQual', 'GarageArea', 'TotalBsmtSF', 
    'BsmtQualityIndex', '1stFlrSF', 'AvgQualityCondition', 'QualityCondition',
    'FullBath', 'YearBuilt', 'YearRemodAdd', 'TotalRooms', 'TotRmsAbvGrd'
]

# 特徴量と目的変数を分ける
X = data[important_features]
y = data['SalePrice']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ランダムフォレストモデルの作成と訓練
# ハイパーパラメータのチューニングは以下のコメントアウトを参考に
# model = RandomForestRegressor(random_state=42, n_estimators=200, max_depth=20, min_samples_split=5, min_samples_leaf=2, max_features='sqrt')
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# クロスバリデーションでの評価
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
cv_mae_scores = -cv_scores

# クロスバリデーションの結果を表示
print(f"Cross-Validation MAE Scores: {cv_mae_scores}")
print(f"Mean MAE: {cv_mae_scores.mean():.4f}")
print(f"Standard Deviation of MAE: {cv_mae_scores.std():.4f}")

# テストデータを使った予測
y_pred = model.predict(X_test)

# 評価指標の計算
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# 結果の表示
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# モデルの保存
model_path = os.path.join('/Users/hayakawakazue/Downloads/house_price/model', 'random_forest.joblib')
joblib.dump(model, model_path)
print(f"Random Forest モデルが {model_path} に保存されました。")


Cross-Validation MAE Scores: [14988.29027493 19807.53286528 16042.27404253 16887.15461373
 14643.93482117]
Mean MAE: 16473.8373
Standard Deviation of MAE: 1845.3182
Mean Absolute Error (MAE): 15439.5958
Mean Squared Error (MSE): 485173952.1026
R-squared (R2): 0.9008
Root Mean Squared Error (RMSE): 22026.6646
Random Forest モデルが /Users/hayakawakazue/Downloads/house_price/model/random_forest.joblib に保存されました。


# ハイパーパラメータをチューニング

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os
import joblib

# データの読み込み
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train_preprocessed.csv'
data = pd.read_csv(data_path)

# 重要な特徴量のリスト
important_features = [
    'TotalArea', 'QualityScore', 'OverallQual', 'GrLivArea', 
    'GarageScore', 'ExterQual', 'KitchenQual', 
    'GarageCars', 'BsmtQual', 'GarageArea', 'TotalBsmtSF', 
    'BsmtQualityIndex', '1stFlrSF', 'AvgQualityCondition', 'QualityCondition',
    'FullBath', 'YearBuilt', 'YearRemodAdd', 'TotalRooms', 'TotRmsAbvGrd'
]

# 特徴量と目的変数を分ける
X = data[important_features]
y = data['SalePrice']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# データの前処理：nanを含む行を削除
X_train = X_train.dropna()
y_train = y_train[X_train.index]
X_test = X_test.dropna()
y_test = y_test[X_test.index]

# ハイパーパラメータの候補を設定
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# ランダムフォレストモデルの定義
model = RandomForestRegressor(random_state=42)

# グリッドサーチの設定
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=2)

# グリッドサーチの実行
grid_search.fit(X_train, y_train)

# 最適なハイパーパラメータの表示
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation MAE: {-grid_search.best_score_:.4f}")

# 最適なモデルを取得
best_model = grid_search.best_estimator_

# テストデータを使った予測
y_pred = best_model.predict(X_test)

# 評価指標の計算
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# 結果の表示
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# 最適なモデルの保存
model_path = os.path.join('/Users/hayakawakazue/Downloads/house_price/model', 'random_forest_best.joblib')
joblib.dump(best_model, model_path)
print(f"最適なランダムフォレストモデルが {model_path} に保存されました。")


Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best parameters found: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best cross-validation MAE: 16535.2363
Mean Absolute Error (MAE): 15436.2778
Mean Squared Error (MSE): 480358578.3816
R-squared (R2): 0.9018
Root Mean Squared Error (RMSE): 21917.0842
最適なランダムフォレストモデルが /Users/hayakawakazue/Downloads/house_price/model/random_forest_best.joblib に保存されました。


# XGBoost

In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

# データの読み込み
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train_preprocessed.csv'
data = pd.read_csv(data_path)

# 重要な特徴量のリスト
important_features = [
    'TotalArea', 'QualityScore', 'OverallQual', 'GrLivArea', 
    'GarageScore', 'ExterQual', 'KitchenQual', 
    'GarageCars', 'BsmtQual', 'GarageArea', 'TotalBsmtSF', 
    'BsmtQualityIndex', '1stFlrSF', 'AvgQualityCondition', 'QualityCondition',
    'FullBath', 'YearBuilt', 'YearRemodAdd', 'TotalRooms', 'TotRmsAbvGrd'
]

# 特徴量と目的変数を分ける
X = data[important_features]
y = data['SalePrice']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoostモデルの定義
xgboost_model = xgb.XGBRegressor(random_state=42)

# ハイパーパラメータの候補を設定
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# ランダムサーチの設定
random_search = RandomizedSearchCV(estimator=xgboost_model, param_distributions=param_dist, n_iter=100, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=2, random_state=42)

# ランダムサーチの実行
random_search.fit(X_train, y_train)

# 最適なハイパーパラメータの表示
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation MAE: {-random_search.best_score_:.4f}")

# 最適なモデルを取得
best_model = random_search.best_estimator_

# テストデータを使った予測
y_pred = best_model.predict(X_test)

# 評価指標の計算
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# 結果の表示
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# 最適なモデルの保存
model_path = os.path.join('/Users/hayakawakazue/Downloads/house_price/model', 'xgboost_best.joblib')
joblib.dump(best_model, model_path)
print(f"最適なXGBoostモデルが {model_path} に保存されました。")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Best cross-validation MAE: 16127.3175
Mean Absolute Error (MAE): 15801.3609
Mean Squared Error (MSE): 489814505.8734
R-squared (R2): 0.8998
Root Mean Squared Error (RMSE): 22131.7533
最適なXGBoostモデルが /Users/hayakawakazue/Downloads/house_price/model/xgboost_best.joblib に保存されました。
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=9, n_estimators=300, subsample=0.7; total time=   1.5s
[CV] END colsample_bytree=0.9, learning_rate=0.3, max_depth=6, n_estimators=300, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=6, n_estimators=200, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=9, n_estimators=300, subsample=0.9; total time=   1.5s
[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=9, n_estimat

# 多重共線性の確認: Variance Inflation Factor (VIF) 

In [3]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIFの計算
X = data[important_features]
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

print(vif_data)


                Feature        VIF
0          TotRmsAbvGrd        inf
1            TotalRooms        inf
2          YearRemodAdd   2.529970
3             YearBuilt   3.692147
4              FullBath   2.227004
5      QualityCondition  37.729182
6   AvgQualityCondition  42.283498
7              1stFlrSF   3.511701
8      BsmtQualityIndex   6.788725
9           TotalBsmtSF        inf
10           GarageArea        inf
11             BsmtQual   9.042833
12           GarageCars   7.190065
13          KitchenQual   3.679341
14            ExterQual   3.969228
15          GarageScore  11.752605
16            GrLivArea        inf
17          OverallQual  10.062625
18         QualityScore  21.534537
19            TotalArea        inf


  vif = 1. / (1. - r_squared_i)


# 特徴量の相互作用と非線形関係の確認

In [4]:
from sklearn.inspection import plot_partial_dependence
import matplotlib.pyplot as plt

# 部分依存プロットの描画
features = ['TotalArea', 'QualityScore', 'OverallQual', 'GrLivArea']
fig, ax = plt.subplots(figsize=(12, 8))
plot_partial_dependence(model, X, features, ax=ax)
plt.show()


ImportError: cannot import name 'plot_partial_dependence' from 'sklearn.inspection' (/opt/anaconda3/lib/python3.11/site-packages/sklearn/inspection/__init__.py)