In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
plt.style.use('ggplot')

In [None]:
# Load the apartment dataset from CSV and preview the first rows
apart_csv_path = '../data/new_apart_data_preprocessing_outlier.csv'
apart_df = pd.read_csv(apart_csv_path)
apart_df.head()

In [None]:
# Preprocessing: remove columns that are not used as model inputs.
# The result is reassigned (no inplace=True) and errors='ignore' is set so this cell
# is idempotent: re-running it after the columns are already gone no longer raises
# KeyError.
# Caveat: errors='ignore' also silently skips a misspelled column name, so keep this
# list in sync with the CSV header.
columns_to_drop = [
    'Unnamed: 0', 'building_usage', 'deal_type', 'contract_month', 'contract_year',
    'arch_decade', 'subway_line', 'floor', 'sub_lot_num', 'main_lot_num',
    'price_per_pyeong',
]
apart_df = apart_df.drop(columns=columns_to_drop, errors='ignore')

In [3]:
# Hold out rows whose receipt_year is 2022-2024 as the test set; every other year
# goes to the training set.
is_test_year = apart_df['receipt_year'].isin([2022, 2023, 2024])
train_df = apart_df[~is_test_year]
test_df = apart_df[is_test_year]

# Target is price_euk; features are the remaining numeric-dtype columns only
y_train = train_df['price_euk']
X_train = train_df.drop(columns='price_euk').select_dtypes(include=['number'])

y_test = test_df['price_euk']
X_test = test_df.drop(columns='price_euk').select_dtypes(include=['number'])

# Because the split is by year, randomly shuffling rows for the split would be meaningless

In [14]:
# 파이프라인 정의
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold


# Columns that are standardized; every other column of X_train is one-hot encoded.
# NOTE(review): 'subway_name' is scaled as a numeric feature here — presumably it is
# an integer code for a station; confirm that treating it as ordinal is intended.
numerical_features = [
    'arch_area',
    'arch_year',
    'pyeong',
    'subway_name',
    'landing_rate',
    'deposit_rate',
]
categorical_features = X_train.drop(columns=numerical_features).columns

# Preprocessing: one-hot encode the categorical columns, standardize the numeric ones.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
# NOTE(review): if 'receipt_year' ends up in categorical_features, the held-out years
# never occur in the training data, so that feature is all-zero at test time — verify.
column_transformers = [
    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ("num", StandardScaler(), numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers,
                                 remainder="passthrough")


# NOTE: earlier grid-search experiment, kept disabled for reference only.
# The next cell defines the pipeline and the search that are actually run.
# pipe_random = Pipeline(steps=[('preprocessor', preprocessor),
#                               ('regressor', RandomForestRegressor(n_jobs=-1))])


# # Hyperparameter grid to search
# param_grid = {
#     'regressor__n_estimators': [100],
#     'regressor__max_depth': [100, 200, 300],
#     'regressor__min_samples_split': [2, 3, 4],
#     'regressor__min_samples_leaf': [1, 2, 3],
#     'regressor__max_features': [1.0, 'sqrt']
# }

# # Search for the best hyperparameters with GridSearchCV
# grid_search = GridSearchCV(
#     estimator=pipe_random,
#     param_grid=param_grid,
#     cv=5,  # 5-fold cross-validation
#     scoring='neg_mean_squared_error',  # negated MSE, so a higher (closer to 0) score is better
#     n_jobs=-1,  # use all CPUs
#     verbose=2
# )

# # Fit (may take a long time)
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters and score
# print("최적 하이퍼파라미터:", grid_search.best_params_)
# print("최적 성능(MSE):", grid_search.best_score_)

# # Best model object
# best_params = grid_search.best_estimator_

# # Experiment with the best model

In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score
import mlflow
import mlflow.sklearn

# Random-forest pipeline: the shared preprocessor followed by the regressor.
# random_state is pinned so that re-running this cell rebuilds the same forests
# (the KFold splitter below is already seeded).
pipe_random = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', RandomForestRegressor(random_state=42))])

# Hyperparameter grid to search (2 * 2 * 2 * 2 * 1 = 16 candidates)
param_grid = {
    'regressor__n_estimators': [150, 200],
    'regressor__max_depth': [None, 50],
    'regressor__min_samples_split': [3, 5],
    'regressor__min_samples_leaf': [2, 4],
    'regressor__max_features': [1.0]
}

# 5-fold cross-validation with shuffling and a fixed seed.
# NOTE(review): the hold-out set is split by year, while these folds are shuffled
# across all training years, so CV scores may be optimistic compared with the
# test-set scores — consider a time-aware splitter.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Point MLflow at the tracking server and echo the URI as a sanity check
mlflow.set_tracking_uri("http://127.0.0.1:5000")
print("Tracking URL : ", mlflow.get_tracking_uri())

# Select the experiment and print its details
exp = mlflow.set_experiment(experiment_name='부동산 실거래가 예측 모델 실험')
print(f"Name: {exp.name}")
print(f"ID: {exp.experiment_id}")
print(f"Location: {exp.artifact_location}")
print(f"Tags: {exp.tags}")
print(f"Lifecycle: {exp.lifecycle_stage}")
print(f"Create Timestamp: {exp.creation_time}")

# Enable MLflow autologging for the fit below
mlflow.autolog()

# The context manager ends the run even when fitting raises, so a failed search
# does not leave an active run behind that would break the next start_run().
with mlflow.start_run():
    # Model training =================================================

    # Search for the best hyperparameters with GridSearchCV
    grid_search = GridSearchCV(
        estimator=pipe_random,
        param_grid=param_grid,
        cv=kf,  # the seeded 5-fold splitter defined above
        scoring=['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error'],  # metrics recorded per candidate
        refit='r2',  # the best candidate is chosen by R2 and refit on all of X_train
        n_jobs=-1,  # use all CPUs
        verbose=2
    )
    model = grid_search.fit(X_train, y_train)
    pred = model.predict(X_test)

    # Hold-out (2022-2024) metrics
    mse = mean_squared_error(y_test, pred)
    mae = mean_absolute_error(y_test, pred)
    rmse = root_mean_squared_error(y_test, pred)
    r2 = r2_score(y_test, pred)

    print(f'R2:{r2:.4f}')
    print(f'RMSE:{rmse:.4f}')
    print(f'MAE:{mae:.4f}')
    print(f'MSE:{mse:.4f}')
    # Score on the training data, computed with the refit metric (R2)
    print(f'Score:{model.score(X_train, y_train)}')
    print('==================================')
    print('')
    # Best hyperparameters and their mean cross-validated score.
    # best_score_ belongs to the refit metric, i.e. R2 — it is not an MSE.
    print("최적 하이퍼파라미터:", grid_search.best_params_)
    print("최적 성능(R2):", grid_search.best_score_)
    # ================================================================

# Best fitted pipeline; kept as the last expression so the notebook displays it
grid_search.best_estimator_

                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['receipt_year', 'gu_name', 'dong_name', 'pyeong_group', 'school_type'], dtype='object')),
                                                 ('num', StandardScaler(),
                                   ...`


Tracking URL :  http://127.0.0.1:5000
Name: 부동산 실거래가 예측 모델 실험
ID: 1
Location: ./mlruns/1
Tags: {}
Lifecycle: active
Create Timestamp: 1749518305498
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END regressor__max_depth=None, regressor__max_features=1.0, regressor__min_samples_leaf=2, regressor__min_samples_split=5, regressor__n_estimators=150; total time=  56.0s
[CV] END regressor__max_depth=None, regressor__max_features=1.0, regressor__min_samples_leaf=2, regressor__min_samples_split=5, regressor__n_estimators=150; total time=  56.6s
[CV] END regressor__max_depth=None, regressor__max_features=1.0, regressor__min_samples_leaf=2, regressor__min_samples_split=3, regressor__n_estimators=150; total time= 1.0min
[CV] END regressor__max_depth=None, regressor__max_features=1.0, regressor__min_samples_leaf=2, regressor__min_samples_split=3, regressor__n_estimators=150; total time= 1.0min
[CV] END regressor__max_depth=None, regressor__max_features=1.0, regressor__min_samples

2025/06/10 12:00:33 INFO mlflow.sklearn.utils: Logging the 5 best runs, 11 runs will be omitted.
                  transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                                 Index(['receipt_year', 'gu_name', 'dong_name', 'pyeong_group', 'school_type'], dtype='object')),
                                ('num', StandardScaler(),
                                 ['arch_area', 'arch_year', 'pyeong',
                                  'subway_name', 'landing_rate',
                                 ...`
                  transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                                 Index(['receipt_year', 'gu_name', 'dong_name', 'pyeong_group', 'school_type'], dtype='object')),
                                ('num', StandardScaler(),
                                 ['arch_area', 'arch_year', 'pyeong',
                                  'subway_name', 'landing_rate',
                                 ...`
                  tra

R2:0.8867
RMSE:2.8974
MAE:1.4913
MSE:8.3949
Score:0.9857173117407737

최적 하이퍼파라미터: {'regressor__max_depth': 50, 'regressor__max_features': 1.0, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 3, 'regressor__n_estimators': 200}
최적 성능(MSE): 0.9555051266970616
