In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
# Apply matplotlib's 'ggplot' style sheet to figures drawn after this point.
plt.style.use('ggplot')

In [79]:
# Load the apartment dataset from CSV and preview its first rows.
APART_CSV_PATH = '../data/new_apart_data_preprocessing_outlier.csv'
apart_df = pd.read_csv(APART_CSV_PATH)
apart_df.head()

Unnamed: 0.1,Unnamed: 0,receipt_year,gu_name,dong_name,main_lot_num,sub_lot_num,arch_area,floor,arch_year,building_usage,...,pyeong_group,price_euk,price_per_pyeong,contract_year,contract_month,subway_line,subway_name,school_type,landing_rate,deposit_rate
0,0,2025,11,253,365.0,4.0,51.07,7,2013.0,0,...,1,3.88,2511.542252,2025,6,1.0,17,5,4.0,2.97
1,1,2025,1,191,447.0,0.0,84.98,9,1987.0,0,...,2,52.0,20228.386261,2025,6,2.0,13,5,4.0,2.97
2,2,2025,1,8,1282.0,0.0,84.9,28,2020.0,0,...,2,32.0,12459.967487,2025,6,2.0,13,5,4.0,2.97
3,3,2025,16,23,1500.0,0.0,84.45,12,2001.0,0,...,2,13.75,5382.421013,2025,6,2.0,14,5,4.0,2.97
4,4,2025,4,315,1165.0,0.0,84.99,13,2015.0,0,...,2,9.7,3772.928074,2025,6,5.0,9,4,4.0,2.97


In [80]:
# Columns that are not used as model inputs.
# NOTE(review): 'Unnamed: 0.1' is listed because the preview of the loaded frame shows it
# next to 'Unnamed: 0'; both look like row indexes written out by earlier to_csv calls --
# confirm. Left in the frame it would end up among the model features.
DROP_COLUMNS = [
    'Unnamed: 0.1', 'Unnamed: 0',
    'building_usage', 'deal_type',
    'contract_month', 'contract_year',
    'arch_decade', 'subway_line', 'floor',
    'sub_lot_num', 'main_lot_num',
    'price_per_pyeong',
]

# Reassign instead of dropping in place, and ignore labels that are already gone, so that
# re-running this cell on the same kernel does not raise KeyError.
apart_df = apart_df.drop(columns=DROP_COLUMNS, errors='ignore')

In [81]:
# Show the columns that remain in apart_df.
apart_df.columns

Index(['receipt_year', 'gu_name', 'dong_name', 'arch_area', 'arch_year',
       'pyeong', 'pyeong_group', 'price_euk', 'subway_name', 'school_type',
       'landing_rate', 'deposit_rate'],
      dtype='object')

In [82]:
# Temporal hold-out: rows whose receipt_year is 2022, 2023 or 2024 form the test set,
# every other row goes to the training set.
# Random shuffling is pointless when the split is made by year.
TEST_YEARS = [2022, 2023, 2024]
TARGET_COLUMN = 'price_euk'

in_test_years = apart_df['receipt_year'].isin(TEST_YEARS)
train_df = apart_df[~in_test_years]
test_df = apart_df[in_test_years]

# Features are the numeric columns other than the target; the target is price_euk.
X_train = train_df.drop(columns=TARGET_COLUMN).select_dtypes(include='number')
X_test = test_df.drop(columns=TARGET_COLUMN).select_dtypes(include='number')

y_train = train_df[TARGET_COLUMN]
y_test = test_df[TARGET_COLUMN]

In [83]:
# Pipeline definitions
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Fixed seed for the tree-based regressors so that repeated runs give the same metrics.
RANDOM_STATE = 42

# Columns that are standardised; every other column of X_train is one-hot encoded.
# NOTE(review): subway_name is scaled as a number here, but the data preview shows small
# integer values for it -- if it is an encoded station label it may belong with the
# categorical features. Confirm before changing.
numerical_features = ['arch_area', 'arch_year', 'pyeong', 'subway_name','landing_rate', 'deposit_rate']
categorical_features = X_train.drop(numerical_features, axis=1).columns

# Preprocessing: one-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and standardise the numerical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ("num", StandardScaler(), numerical_features),
    ],
    remainder="passthrough",
)

# Pipeline does not copy its steps, so handing the same transformer object to several
# pipelines would make them share (and overwrite) one fitted state. The random-forest
# pipeline keeps the original `preprocessor` object, so that name is still fitted once
# that pipeline has been fitted; the other pipelines get independent unfitted copies.

# Random forest regression pipeline
pipe_random = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=RANDOM_STATE)),
])

# k-nearest-neighbours regression pipeline
pipe_kneighbors = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', KNeighborsRegressor()),
])

# Decision tree regression pipeline (depth limited to 10)
pipe_decision = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', DecisionTreeRegressor(max_depth=10, random_state=RANDOM_STATE)),
])

pipes = [pipe_random, pipe_kneighbors, pipe_decision]

In [84]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score

# Report header for each known pipeline; a pipeline that is not listed gets no header.
report_headers = (
    (pipe_random, '===랜덤 포레스트 모델의 평가결과==='),
    (pipe_kneighbors, '===kNeighbors 모델의 평가결과==='),
    (pipe_decision, '===DecisionTree 모델의 평가결과==='),
)

for pipe in pipes:
    # Fit on the training rows, then predict the held-out rows.
    model = pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)

    # Metrics computed on the test set.
    r2 = r2_score(y_test, pred)
    rmse = root_mean_squared_error(y_test, pred)
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)

    # Print the header of the first matching pipeline, if any.
    for known_pipe, header in report_headers:
        if pipe == known_pipe:
            print(header)
            break

    # 'Score' is pipe.score evaluated on the TRAINING data, unlike the metrics above,
    # so comparing it with R2 shows the gap between train and test fit.
    print(
        f'R2:{r2:.4f}',
        f'RMSE:{rmse:.4f}',
        f'MAE:{mae:.4f}',
        f'MSE:{mse:.4f}',
        f'Score:{pipe.score(X_train, y_train)}',
        '===============================',
        '',
        sep='\n',
    )

===랜덤 포레스트 모델의 평가결과===
R2:0.8868
RMSE:2.8960
MAE:1.4660
MSE:8.3869
Score:0.992815114079242

===kNeighbors 모델의 평가결과===
R2:0.5841
RMSE:5.5514
MAE:3.4277
MSE:30.8176
Score:0.9313158626747525

===DecisionTree 모델의 평가결과===
R2:0.8016
RMSE:3.8339
MAE:2.4449
MSE:14.6988
Score:0.9010156560514444

