# ランダムフォレストモデルでトレーニングする

In [None]:
!pip install numpy scikit-learn joblib
!pip show scikit-learn

In [1]:
import pandas as pd

# データの読み込み
data_path = '/Users/hayakawakazue/Downloads/house_price/train/cluster_0_important_features.csv'
data = pd.read_csv(data_path)

# 特徴量のリスト
features = data.columns.tolist()
print("\nFeatures in cluster_0_important_features.csv:")
print(features)



Features in cluster_0_important_features.csv:
['SalePrice', 'OverallQual_TotalArea', 'OverallConditionArea', 'Age', 'BsmtUnfSF', 'YearBuilt', 'LotArea', 'log_LotArea', 'QualityScore', 'TotalArea', 'GrLivArea_OverallQual', 'RemodelAge', 'OverallQual_GrLivArea', 'GarageYrBlt', 'YearRemodAdd', 'TotalRmsAbvGrd_OverallCond', 'TotalBsmtSF_OverallQual']


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os
import joblib

# データの読み込み
data_path = '/Users/hayakawakazue/Downloads/house_price/train/cluster_0_important_features.csv'
data = pd.read_csv(data_path)

# 選択する特徴量と目的変数
selected_features = [
    'OverallQual_TotalArea', 'OverallConditionArea', 'Age', 'BsmtUnfSF', 'YearBuilt', 
    'LotArea', 'log_LotArea', 'QualityScore', 'TotalArea', 'GrLivArea_OverallQual', 
    'RemodelAge', 'OverallQual_GrLivArea', 'GarageYrBlt', 'YearRemodAdd', 
    'TotalRmsAbvGrd_OverallCond', 'TotalBsmtSF_OverallQual'
]

# 特徴量と目的変数を分ける
X = data[selected_features]
y = data['SalePrice']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

# データの前処理：nanを含む行を削除し、対応する y も削除
X_train = X_train.dropna()
y_train = y_train[X_train.index]

X_test = X_test.dropna()
y_test = y_test[X_test.index]

# ハイパーパラメータの候補を設定
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 7, 10],
    'min_samples_split': [10, 15, 20],
    'min_samples_leaf': [4, 6, 8],
    'max_features': ['sqrt']
}

# ランダムフォレストモデルの定義
model = RandomForestRegressor(random_state=24)

# グリッドサーチの設定
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=0)

# グリッドサーチの実行
grid_search.fit(X_train, y_train)

# 最適なハイパーパラメータの表示
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation MAE: {-grid_search.best_score_:.4f}")

# 最適なモデルを取得
best_model = grid_search.best_estimator_

# テストデータを使った予測
y_pred = best_model.predict(X_test)

# 評価指標の計算
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# 結果の表示
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# 最適なモデルの保存
model_path = os.path.join('/Users/hayakawakazue/Downloads/house_price/model', 'random_forest_Cluster_0711_0.joblib')
joblib.dump(best_model, model_path)
print(f"最適なランダムフォレストモデルが {model_path} に保存されました。")

# 実際の値の平均を計算
mean_actual_value = np.mean(y_test)
print(f"Mean Actual Value: {mean_actual_value}")

# 誤差のパーセンテージを計算
error_percentage = (mae / mean_actual_value) * 100
print(f"Mean Absolute Percentage Error (MAPE): {error_percentage:.2f}%")


Best parameters found: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best cross-validation MAE: 12412.6574
Mean Absolute Error (MAE): 13603.2021
Mean Squared Error (MSE): 372299543.7194
R-squared (R2): 0.6713
Root Mean Squared Error (RMSE): 19295.0653
最適なランダムフォレストモデルが /Users/hayakawakazue/Downloads/house_price/model/random_forest_Cluster_0711_0.joblib に保存されました。
Mean Actual Value: 138287.8870056497
Mean Absolute Percentage Error (MAPE): 9.84%
