# ランダムフォレストモデルでトレーニングする

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os
import joblib

# Load the preprocessed training data.
data_path = '/Users/hayakawakazue/Downloads/data/train/train_cleaned5/combined_data_21_24_25_29_30_33_cleaned_preprocessed.csv'
data = pd.read_csv(data_path)

# Feature columns fed to the model.
# NOTE(review): '価格増加率' and '価格/面積比' look like they may be derived from the
# transaction price itself; if so they would leak the target — TODO confirm.
important_features = ['面積/築年数比', '価格増加率', '価格/面積比', '面積（㎡）', 'エリア人気度スコア', '面積×築年数']

# Separate the feature matrix from the target column.
X = data[important_features]
y = data['取引価格（総額）_log']

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=59)

# Build and fit the random forest.
# Alternative hyperparameter configuration kept for reference:
# model = RandomForestRegressor(random_state=42, n_estimators=200, max_depth=20, min_samples_split=5, min_samples_leaf=2, max_features='sqrt')
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# 5-fold cross-validation on the training split only. The scorer returns
# negated MAE (higher is better), so flip the sign to get plain MAE values.
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
cv_mae_scores = -cv_scores

# Report the cross-validation results.
print(f"Cross-Validation MAE Scores: {cv_mae_scores}")
print(f"Mean MAE: {cv_mae_scores.mean():.4f}")
print(f"Standard Deviation of MAE: {cv_mae_scores.std():.4f}")

# Predict on the held-out test split.
y_pred = model.predict(X_test)

# Compute the evaluation metrics on the test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the test-set metrics.
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

# Persist the fitted model. Create the target directory first so that a
# missing folder does not raise FileNotFoundError after the training work
# has already been done.
model_dir = '/Users/hayakawakazue/Downloads/data/models'
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, '21_24_25_29_30_33_random_forest.joblib')
joblib.dump(model, model_path)
print(f"Random Forest モデルが {model_path} に保存されました。")


Cross-Validation MAE Scores: [0.02560564 0.02842268 0.02528091 0.02668583 0.0265663 ]
Mean MAE: 0.0265
Standard Deviation of MAE: 0.0011
Mean Absolute Error (MAE): 0.0237
Mean Squared Error (MSE): 0.0026
R-squared (R2): 0.9633
Random Forest モデルが /Users/hayakawakazue/Downloads/data/models/21_24_25_29_30_33_random_forest.joblib に保存されました。


# 線形回帰モデルでトレーニングする

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


# Read the preprocessed training set used for the linear baseline.
data_path = '/Users/hayakawakazue/Downloads/data/train/train_cleaned5/train_08_cleaned_preprocessed.csv'
data = pd.read_csv(data_path)

# Predictor columns used by this model.
important_features = ['面積/築年数比', '価格増加率', '価格/面積比']

# Predictors (X) and target (y).
X = data[important_features]
y = data['取引価格（総額）_log']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Plain ordinary-least-squares regressor with default settings.
lr_model = LinearRegression()

# 5-fold cross-validation on the training split; the scorer yields negated
# MAE, so negate it back before summarising.
cv_scores = cross_val_score(
    lr_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error'
)
cv_mae_scores = -cv_scores
mean_cv_mae, std_cv_mae = cv_mae_scores.mean(), cv_mae_scores.std()

# Fit on the full training split, then predict the held-out rows.
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# Test-set metrics, computed in the order MAE, MSE, R2.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Print the report: a title line followed by one "label value" line per entry.
print("Linear Regression")
report = [
    ("Cross-Validation MAE Scores:", cv_mae_scores),
    ("Mean MAE:", mean_cv_mae),
    ("Standard Deviation of MAE:", std_cv_mae),
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("R-squared (R2):", r2),
]
for label, value in report:
    print(label, value)



Linear Regression
Cross-Validation MAE Scores: [0.1508964  0.14403353 0.14865423 0.15357879 0.14384981]
Mean MAE: 0.14820255310072025
Standard Deviation of MAE: 0.0038129234375653296
Mean Absolute Error (MAE): 0.15040183226065723
Mean Squared Error (MSE): 0.03896115993462449
R-squared (R2): 0.6797941023911749
