In [6]:
# GPT 도움
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# -----------------------------
# 1) 데이터 준비
# -----------------------------
import pandas as pd
from sklearn.model_selection import train_test_split

# 1) Read the CSV from Google Drive, then discard every row with a missing value.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/boston.csv")
df = df.dropna()

# 2) "PRICE" is the regression target; all remaining columns are the features.
y = df["PRICE"]
X = df.drop(columns=["PRICE"])

# 3) Hold out 20% of the rows for testing (the split shuffles the rows);
#    the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# -----------------------------
# 2-3) Build and train the models
# -----------------------------
# Same workflow as for classification; the difference is that the regression
# estimators are used. fit() returns the estimator itself, so each model is
# constructed and trained in a single expression.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
rf = RandomForestRegressor(n_estimators=300, random_state=42).fit(X_train, y_train)
lr = LinearRegression().fit(X_train, y_train)

# -----------------------------
# 4) 모델 평가
# -----------------------------
# Why the evaluation code for regression is longer than for classification:
# a regression model is judged by how close its predictions are to the true values.
# MAE  (mean absolute error): how far off are the predictions on average?
# RMSE (root mean squared error): an average error that penalizes large mistakes more heavily.
# R2   (coefficient of determination): how much better is the model than always predicting the mean?

def evaluate(model, X_test, y_test):
    """Score a fitted regressor on held-out data.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Feature rows handed to ``model.predict``.
        y_test: True target values the predictions are compared against.

    Returns:
        A ``(mae, rmse, r2)`` tuple.
    """
    y_hat = model.predict(X_test)
    abs_err, sq_err, determination = (
        metric(y_test, y_hat)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    # RMSE is the square root of the mean squared error.
    return abs_err, np.sqrt(sq_err), determination

# Score the three fitted models in a fixed order: tree, forest, linear regression.
(dt_mae, dt_rmse, dt_r2), (rf_mae, rf_rmse, rf_r2), (lr_mae, lr_rmse, lr_r2) = (
    evaluate(fitted, X_test, y_test) for fitted in (dt, rf, lr)
)

# A single print call; sep="\n" puts each argument on its own output line.
print(
    "=== Regression Test Metrics ===",
    f"Decision Tree  | MAE: {dt_mae:.4f} | RMSE: {dt_rmse:.4f} | R2: {dt_r2:.4f}",
    f"Random Forest  | MAE: {rf_mae:.4f} | RMSE: {rf_rmse:.4f} | R2: {rf_r2:.4f}",
    f"Linear Reg.    | MAE: {lr_mae:.4f} | RMSE: {lr_rmse:.4f} | R2: {lr_r2:.4f}",
    sep="\n",
)

=== Regression Test Metrics ===
Decision Tree  | MAE: 0.4438 | RMSE: 0.6941 | R2: 0.6323
Random Forest  | MAE: 0.3144 | RMSE: 0.4886 | R2: 0.8178
Linear Reg.    | MAE: 0.5323 | RMSE: 0.7448 | R2: 0.5767


In [None]:
# 교수님 버전
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# -----------------------------
# 1) 데이터 준비 (회귀용)
# -----------------------------

# Read the CSV from Google Drive and keep only the rows without missing values.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/boston.csv")
df = df.dropna()

# "PRICE" is the value to predict; every other column is an input feature.
y = df["PRICE"]
X = df.drop(columns=["PRICE"])

# 80/20 train/test split with a fixed seed so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# -----------------------------
# 2-3) Build and train the regression models
# -----------------------------
# fit() returns the estimator itself, so construction and training are chained.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# n_jobs=-1 lets the forest be trained on all available CPU cores.
rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# For linear regression, combining it with feature scaling in one pipeline is
# the recommended setup.
lr = make_pipeline(
    StandardScaler(with_mean=True, with_std=True),
    LinearRegression(),
).fit(X_train, y_train)


# -----------------------------
# 4) 모델 평가 함수
# -----------------------------

def eval_reg(y_true, y_pred):
    """Compute MAE, RMSE and R^2 for a set of regression predictions.

    Args:
        y_true: True target values.
        y_pred: Predicted target values to compare against ``y_true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple.
    """
    return (
        mean_absolute_error(y_true, y_pred),
        # RMSE is obtained by taking the square root of the MSE explicitly,
        # without passing a `squared` argument to mean_squared_error.
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )



# Predict on the test set with each fitted model and score the predictions,
# in the order: decision tree, random forest, linear regression.
(dt_mae, dt_rmse, dt_r2), (rf_mae, rf_rmse, rf_r2), (lr_mae, lr_rmse, lr_r2) = (
    eval_reg(y_test, fitted.predict(X_test)) for fitted in (dt, rf, lr)
)

# A single print call; sep="\n" puts each argument on its own output line.
print(
    "=== Test Metrics (Regression) ===",
    "[Decision Tree]",
    f"MAE: {dt_mae:.3f} | RMSE: {dt_rmse:.3f} | R^2: {dt_r2:.3f}",
    "[Random Forest]",
    f"MAE: {rf_mae:.3f} | RMSE: {rf_rmse:.3f} | R^2: {rf_r2:.3f}",
    "[Linear Regression]",
    f"MAE: {lr_mae:.3f} | RMSE: {lr_rmse:.3f} | R^2: {lr_r2:.3f}",
    sep="\n",
)

=== Test Metrics (Regression) ===
[Decision Tree]
MAE: 0.444 | RMSE: 0.694 | R^2: 0.632
[Random Forest]
MAE: 0.314 | RMSE: 0.489 | R^2: 0.818
[Linear Regression]
MAE: 0.532 | RMSE: 0.745 | R^2: 0.577
