In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
# IterativeImputer 사용을 명시적으로 활성화
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime
import zipfile

In [28]:
# Read the CSV straight out of the zip archive so the file never has to be
# extracted to disk.
ZIP_PATH = '20241112.zip'
CSV_MEMBER = '20241112.csv'

# Code-like columns are read as strings so values such as '01' or '03112'
# keep their leading zeros; the two duration columns are read as integers.
# Keys are the raw CSV headers (spaces included) because the header
# normalisation below happens only after the file has been parsed.
SOURCE_DTYPES = {
    '사업자등록번호': 'string', '등록국세청코드': 'string', '납세자유형코드': 'string',
    '사업자유형코드': 'string', '산업분류코드': 'string', '시도': 'string',
    '국세청상호명': 'string', '국세청상호명존재여부': 'string', '영업일수': 'int',
    '영업일수(100일단위)': 'int', '개업일': 'string', '폐업일': 'string', '통신판매사업자여부': 'string',
    '통신판매사업자전화번호': 'string', '통신판매사업자전자우편': 'string',
    '나라장터조달업체제조구분코드': 'string', '고용보험 업종코드': 'string', '나라장터조달업체업무구분코드': 'string',
    '사업장 우편번호': 'string',
}

# Context managers close both the archive and the member stream once the frame
# has been loaded (the previous version left both handles open).
with zipfile.ZipFile(ZIP_PATH) as zf, zf.open(CSV_MEMBER) as csv_file:
    df_source = pd.read_csv(csv_file, dtype=SOURCE_DTYPES)

# Normalise headers: trim surrounding whitespace, replace inner spaces with '_'.
df_source.columns = df_source.columns.str.strip().str.replace(' ', '_')
display(df_source)

Unnamed: 0,사업자등록번호,등록국세청코드,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명,국세청상호명존재여부,영업일수,영업일수(100일단위),...,사업장_우편번호,사업장_주소,고용보험_업종코드,고용보험_업종명,산재보험_성립일자,고용보험_성립일자,산재보험_상시근로자수,고용보험_상시근로자수,산재보험_사업구분,고용보험_사업구분
0,1010109091,101,01,01,56114,서울,김밥천국삼청점,Y,4766,48,...,,,,,,,,,,
1,1010109107,101,01,01,56122,서울,명송 하나,Y,5791,58,...,,,,,,,,,,
2,1010112688,101,01,01,47312,서울,가인전자,Y,2586,26,...,,,,,,,,,,
3,1010112733,101,01,01,20400,서울,켐스펙교역,Y,2371,24,...,,,,,,,,,,
4,1010112806,101,01,01,46596,서울,동광전업사,Y,2477,25,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595328,8999100276,899,91,04,03112,경북,제2007대성호,Y,2146,21,...,,,,,,,,,,
1595329,8999300310,899,93,04,46312,부산,경북농산,Y,2023,20,...,,,,,,,,,,
1595330,8999601213,899,96,04,90212,대구,글나루독서실,Y,868,9,...,,,,,,,,,,
1595331,8999700981,899,97,04,96921,전북,길묘,Y,363,4,...,,,,,,,,,,


In [30]:
# Keep individual (sole-proprietor) businesses only: every row whose
# business-type code lies in the 81-88 band is excluded.
# The codes are strings, so the comparison is lexicographic.
business_type = df_source['사업자유형코드']
keep_mask = business_type.le('80') | business_type.ge('89')
df_stage1 = df_source[keep_mask]
display(df_stage1)

Unnamed: 0,사업자등록번호,등록국세청코드,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명,국세청상호명존재여부,영업일수,영업일수(100일단위),...,사업장_우편번호,사업장_주소,고용보험_업종코드,고용보험_업종명,산재보험_성립일자,고용보험_성립일자,산재보험_상시근로자수,고용보험_상시근로자수,산재보험_사업구분,고용보험_사업구분
0,1010109091,101,01,01,56114,서울,김밥천국삼청점,Y,4766,48,...,,,,,,,,,,
1,1010109107,101,01,01,56122,서울,명송 하나,Y,5791,58,...,,,,,,,,,,
2,1010112688,101,01,01,47312,서울,가인전자,Y,2586,26,...,,,,,,,,,,
3,1010112733,101,01,01,20400,서울,켐스펙교역,Y,2371,24,...,,,,,,,,,,
4,1010112806,101,01,01,46596,서울,동광전업사,Y,2477,25,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595327,8999001055,899,90,04,85709,경기,아트앤하트 고양탄현에듀포레푸르지오,Y,690,7,...,,,,,,,,,,
1595328,8999100276,899,91,04,03112,경북,제2007대성호,Y,2146,21,...,,,,,,,,,,
1595329,8999300310,899,93,04,46312,부산,경북농산,Y,2023,20,...,,,,,,,,,,
1595330,8999601213,899,96,04,90212,대구,글나루독서실,Y,868,9,...,,,,,,,,,,


In [31]:
# Drop the registration number, the raw day count and the trade name, then
# collapse rows that are identical on the remaining columns.
df_stage2 = (
    df_stage1
    .drop(columns=['사업자등록번호', '영업일수', '국세청상호명'])
    .drop_duplicates()
)

# Head-count columns: values that cannot be parsed as numbers become missing,
# and the result is stored as nullable Int64 so missing entries stay <NA>.
for headcount_col in ('나라장터조달업체종업원수', '산재보험_상시근로자수', '고용보험_상시근로자수'):
    df_stage2[headcount_col] = pd.to_numeric(
        df_stage2[headcount_col], errors='coerce', downcast=None
    ).astype('Int64')

display(df_stage2)

Unnamed: 0,등록국세청코드,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명존재여부,영업일수(100일단위),개업일,폐업일,통신판매사업자여부,...,사업장_우편번호,사업장_주소,고용보험_업종코드,고용보험_업종명,산재보험_성립일자,고용보험_성립일자,산재보험_상시근로자수,고용보험_상시근로자수,산재보험_사업구분,고용보험_사업구분
0,101,01,01,56114,서울,Y,48,19941001,20071019,N,...,,,,,,,,,,
1,101,01,01,56122,서울,Y,58,19940523,20100331,N,...,,,,,,,,,,
2,101,01,01,47312,서울,Y,26,19970101,20040131,N,...,,,,,,,,,,
3,101,01,01,20400,서울,Y,24,19970101,20030630,N,...,,,,,,,,,,
4,101,01,01,46596,서울,Y,25,19970101,20031014,N,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595327,899,90,04,85709,경기,Y,7,20200106,20211126,N,...,,,,,,,,,,
1595328,899,91,04,03112,경북,Y,21,20160811,20220627,N,...,,,,,,,,,,
1595329,899,93,04,46312,부산,Y,20,20160915,20220331,N,...,,,,,,,,,,
1595330,899,96,04,90212,대구,Y,9,20200316,20220801,N,...,,,,,,,,,,


In [32]:
# Define the frame used for data exploration as an independent copy of df_stage2.
df_exploration = df_stage2.copy()
df_exploration.head()

Unnamed: 0,등록국세청코드,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명존재여부,영업일수(100일단위),개업일,폐업일,통신판매사업자여부,...,사업장_우편번호,사업장_주소,고용보험_업종코드,고용보험_업종명,산재보험_성립일자,고용보험_성립일자,산재보험_상시근로자수,고용보험_상시근로자수,산재보험_사업구분,고용보험_사업구분
0,101,1,1,56114,서울,Y,48,19941001,20071019,N,...,,,,,,,,,,
1,101,1,1,56122,서울,Y,58,19940523,20100331,N,...,,,,,,,,,,
2,101,1,1,47312,서울,Y,26,19970101,20040131,N,...,,,,,,,,,,
3,101,1,1,20400,서울,Y,24,19970101,20030630,N,...,,,,,,,,,,
4,101,1,1,46596,서울,Y,25,19970101,20031014,N,...,,,,,,,,,,


In [33]:
# Column dtypes and non-null counts of the exploration frame.
df_exploration.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1113890 entries, 0 to 1595331
Data columns (total 41 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   등록국세청코드           1113890 non-null  string
 1   사업자유형코드           1113890 non-null  string
 2   납세자유형코드           1113890 non-null  string
 3   산업분류코드            1113890 non-null  string
 4   시도                1113890 non-null  string
 5   국세청상호명존재여부        1113890 non-null  string
 6   영업일수(100일단위)      1113890 non-null  int64 
 7   개업일               1113890 non-null  string
 8   폐업일               1113890 non-null  string
 9   통신판매사업자여부         1113890 non-null  string
 10  통신판매사업자전화번호       10922 non-null    string
 11  통신판매사업자전자우편       10650 non-null    string
 12  통신판매사업자사업장소재지     10931 non-null    object
 13  통신판매사업자도로명사업장소재지  10617 non-null    object
 14  통신판매사업자판매방식       39717 non-null    object
 15  통신판매사업자취급품목       39717 non-null    object
 16  통신판매사업자인터넷도메인     38386

In [34]:
# Summary statistics of the numeric columns.
# NOTE(review): the recorded output shows min = -6569 and max = 7377 for
# 영업일수(100일단위); a negative duration looks like a data error (possibly a
# closing date earlier than the opening date) — TODO confirm and handle
# before modelling.
df_exploration.describe()

Unnamed: 0,영업일수(100일단위),나라장터조달업체종업원수,산재보험_상시근로자수,고용보험_상시근로자수
count,1113890.0,47889.0,21228.0,21228.0
mean,29.45813,4.693729,3.146976,2.905031
std,34.03668,10.250413,8.089938,7.399242
min,-6569.0,0.0,0.0,0.0
25%,11.0,1.0,1.0,1.0
50%,22.0,3.0,1.0,1.0
75%,41.0,5.0,3.0,3.0
max,7377.0,1014.0,410.0,412.0


In [35]:
# Number of distinct values per column.
df_exploration.nunique()

등록국세청코드               792
사업자유형코드                91
납세자유형코드                 6
산업분류코드               1604
시도                     34
국세청상호명존재여부              2
영업일수(100일단위)          406
개업일                 19118
폐업일                 12329
통신판매사업자여부               2
통신판매사업자전화번호          8807
통신판매사업자전자우편          9258
통신판매사업자사업장소재지        5544
통신판매사업자도로명사업장소재지     8814
통신판매사업자판매방식            22
통신판매사업자취급품목           118
통신판매사업자인터넷도메인       26288
통신판매사업자호스트서버소재지      6980
나라장터조달업체여부              2
나라장터조달업체주소          29687
나라장터조달업체상세주소        33100
나라장터조달업체전화번호        43598
나라장터조달업체팩스번호        38374
나라장터조달업체홈페이지         8583
나라장터조달업체제조구분코드          2
나라장터조달업체제조구분코드명         2
나라장터조달업체종업원수          118
나라장터조달업체업무구분코드         22
나라장터조달업체업무구분코드명        22
보험구분                    3
사업장명                20218
사업장_우편번호            11395
사업장_주소              19717
고용보험_업종코드             892
고용보험_업종명              892
산재보험_성립일자            4368
고용보험_성립일자            4212
산재보험_상시근로자수           104
고용보험_상시근로자수 

# 여기 아래는 quiz1의 코드를 가져온 것으로 수정이 필요하다. 

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, KNeighborsRegressor
# IterativeImputer 사용을 명시적으로 활성화
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures 
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA  # PCA 추가
################################################################################
# 전처리
################################################################################

# Load the wine data, normalise the headers (trimmed, spaces -> underscores,
# upper case) and discard exact duplicate rows.
# NOTE(review): this reassigns df_source, which earlier in the notebook holds
# the business-registry data; that frame is no longer reachable afterwards.
df_source = (
    pd.read_csv("../quiz/wine.csv")
    .rename(columns=lambda name: name.strip().replace(' ', '_').upper())
    .drop_duplicates()
)

# Column roles
target = ['QUALITY']
numeric_features = [
    'FIXED_ACIDITY',
    'VOLATILE_ACIDITY',
    'CITRIC_ACID',
    'RESIDUAL_SUGAR',
    'CHLORIDES',
    'FREE_SULFUR_DIOXIDE',
    'TOTAL_SULFUR_DIOXIDE',
    'DENSITY',
    'PH',
    'SULPHATES',
    'ALCOHOL',
]
ordinal_features = []
nominal_features = ['TYPE']
drop_na_features = []

# Split into target frame and feature frame
feature_columns = numeric_features + ordinal_features + nominal_features
y = df_source[target]
X = df_source[feature_columns]

# 이상치 제거 함수 (도메인 지식 활용 ) 정의
def remove_domain_outliers(X, y=None):
    """Drop rows whose measurements exceed fixed domain-knowledge limits.

    A row is kept only when every one of these holds:
    CHLORIDES <= 0.4, CITRIC_ACID <= 1.00, RESIDUAL_SUGAR <= 30,
    DENSITY <= 1.01, FREE_SULFUR_DIOXIDE <= 150 and SULPHATES <= 1.5.
    A missing value fails its comparison, so rows with NaN in any of these
    columns are dropped as well (same as before).

    The mask is built from ``X`` itself. The previous version read the
    thresholded columns from the global ``df_source`` and relied on index
    alignment, which tied the function to notebook state and broke for any
    frame that was not a row subset of that global.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the six columns listed above.
    y : pandas.DataFrame or pandas.Series, optional
        Target aligned with ``X`` by index label.

    Returns
    -------
    X_cleaned, or the tuple (X_cleaned, y_cleaned) when ``y`` is given.
    ``X`` and ``y`` themselves are not modified.
    """
    upper_limits = {
        'CHLORIDES': 0.4,
        'CITRIC_ACID': 1.00,
        'RESIDUAL_SUGAR': 30,
        'DENSITY': 1.01,
        'FREE_SULFUR_DIOXIDE': 150,
        'SULPHATES': 1.5,
    }

    within_limits = pd.Series(True, index=X.index)
    for column, limit in upper_limits.items():
        within_limits &= X[column] <= limit

    X_cleaned = X[within_limits].copy()
    if y is not None:
        # Keep the target in step with the surviving feature rows.
        y_cleaned = y.loc[X_cleaned.index]
        return X_cleaned, y_cleaned
    return X_cleaned

# Automatic outlier-remover configuration
# auto_outlier_remover = None  # do not configure an outlier remover
# auto_outlier_remover = LocalOutlierFactor(n_neighbors=5, contamination=0.1) 
# Single default remover.
# NOTE(review): the training cell iterates with a loop variable of the same
# name, so this value is overwritten once that loop runs.
auto_outlier_remover = IsolationForest(random_state=42, contamination=0.05)
# Candidate removers tried one after another by the training cell;
# None stands for "no automatic outlier removal".
auto_outlier_removers = [
     None,
     LocalOutlierFactor(n_neighbors=10, contamination=0.1),
     IsolationForest(random_state=42, contamination=0.05)
]

# 사용자 정의 Transformer
class FeatureCombiner(BaseEstimator, TransformerMixin):
    """Append engineered wine-chemistry columns to the numeric feature matrix.

    Parameters
    ----------
    combined_features : sequence of str, default=()
        Names of the derived columns to add. Recognised names:

        * ``'TOTAL_ACIDITY'``  = FIXED_ACIDITY + VOLATILE_ACIDITY
        * ``'ACIDITY_RATIO'``  = (FIXED_ACIDITY + VOLATILE_ACIDITY) / ALCOHOL
        * ``'SUGAR_RATIO'``    = RESIDUAL_SUGAR / TOTAL_SULFUR_DIOXIDE
        * ``'FIXED_VOLATILE_INTERACTION'`` = FIXED_ACIDITY * VOLATILE_ACIDITY

        Unrecognised names are ignored.

    Notes
    -----
    * Incoming data is labelled with the module-level ``numeric_features``
      list, so the input must carry exactly those columns in that order.
    * ``transform`` returns the original numeric columns followed by the
      requested derived columns. It used to return only the derived columns,
      which silently discarded every original feature and produced an empty
      0x0 frame when no derived feature was requested.
    * ``ACIDITY_RATIO`` no longer requires ``TOTAL_ACIDITY`` to be requested
      as well (previously a KeyError).
    * Divisions are not guarded: a zero ALCOHOL or TOTAL_SULFUR_DIOXIDE value
      yields inf in the corresponding ratio.
    """

    # Derived columns are always emitted in this order, whatever the order of
    # ``combined_features`` (this matches the original assignment order).
    _DERIVED_ORDER = (
        'TOTAL_ACIDITY',
        'ACIDITY_RATIO',
        'SUGAR_RATIO',
        'FIXED_VOLATILE_INTERACTION',
    )

    def __init__(self, combined_features=()):
        # Immutable default instead of a shared mutable list; the value is
        # stored untouched, as scikit-learn's clone()/get_params() expect.
        self.combined_features = combined_features

    def _requested(self):
        """Return the requested derived-feature names (None counts as none)."""
        return self.combined_features if self.combined_features is not None else ()

    def fit(self, X, y=None):
        """No-op: this transformer learns nothing from the data."""
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame with the requested derived columns appended."""
        X = pd.DataFrame(X, columns=numeric_features)
        requested = self._requested()
        derived = pd.DataFrame(index=X.index)

        # Computed once; shared by TOTAL_ACIDITY and ACIDITY_RATIO.
        total_acidity = X['FIXED_ACIDITY'] + X['VOLATILE_ACIDITY']

        if 'TOTAL_ACIDITY' in requested:
            derived['TOTAL_ACIDITY'] = total_acidity
        if 'ACIDITY_RATIO' in requested:
            derived['ACIDITY_RATIO'] = total_acidity / X['ALCOHOL']
        if 'SUGAR_RATIO' in requested:
            derived['SUGAR_RATIO'] = X['RESIDUAL_SUGAR'] / X['TOTAL_SULFUR_DIOXIDE']
        if 'FIXED_VOLATILE_INTERACTION' in requested:
            derived['FIXED_VOLATILE_INTERACTION'] = X['FIXED_ACIDITY'] * X['VOLATILE_ACIDITY']

        # Kept for backward compatibility with code that inspected this attribute.
        self.X_combined = derived
        return pd.concat([X, derived], axis=1)

    def get_feature_names_out(self, input_features=None):
        """Return output column names: numeric features, then derived ones.

        Computed from the configuration, so it no longer depends on
        ``transform`` having been called first.
        """
        requested = self._requested()
        derived_names = [name for name in self._DERIVED_ORDER if name in requested]
        return list(numeric_features) + derived_names
        
# Candidate regressors. Ridge, Lasso and ElasticNet share the same seed and
# the same (very high) iteration cap.
_solver_options = {'random_state': 42, 'max_iter': 10000000}
algorithms = [
    LinearRegression(),
    Ridge(**_solver_options),
    Lasso(**_solver_options),
    ElasticNet(**_solver_options),
    KNeighborsRegressor(),
]

# Search space shared by every algorithm (numeric preprocessing only).
# The combined-feature candidates are the successive prefixes of the full
# list, from all four names down to the empty list.
_all_combined_features = ['TOTAL_ACIDITY', 'ACIDITY_RATIO', 'SUGAR_RATIO', 'FIXED_VOLATILE_INTERACTION']
params_common = {
    'preprocessor__numeric_transformer__numeric_imputer__strategy': [
        'median', 'mean', 'most_frequent',
    ],
    'preprocessor__numeric_transformer__feature_combiner__combined_features': [
        _all_combined_features[:count]
        for count in range(len(_all_combined_features), -1, -1)
    ],
    'preprocessor__numeric_transformer__polynomial_features__degree': [1, 2, 3],
}

# Algorithm-specific search spaces, keyed by estimator class name.
_alpha_grid = [0.01, 0.1, 1.0, 10.0, 100.0]  # regularisation strengths to try
params_model = {
    'LinearRegression': {},
    'Ridge': {'algorithm__alpha': list(_alpha_grid)},
    'Lasso': {'algorithm__alpha': list(_alpha_grid)},
    'ElasticNet': {
        'algorithm__alpha': list(_alpha_grid),
        'algorithm__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
    },
    'KNeighborsRegressor': {
        'algorithm__n_neighbors': range(3, 51),         # k (number of neighbours)
        'algorithm__weights': ['uniform', 'distance'],  # neighbour weighting
        'algorithm__p': [1, 2],                         # 1: Manhattan, 2: Euclidean distance
    },
}

# Maps (algorithm class name, 'SimpleImputer') -> unfitted RandomizedSearchCV.
searches = {}

# Build one preprocessing + regressor pipeline per candidate algorithm and wrap
# it in a RandomizedSearchCV. Nothing is fitted in this loop.
for algorithm in algorithms:
    algorithm_name = algorithm.__class__.__name__
    print(f"Processing {algorithm_name}...")
    # Numeric preprocessing: impute -> add derived features -> polynomial
    # expansion -> standardise.
    numeric_transformer = Pipeline(
        steps=[
            ('numeric_imputer'      , SimpleImputer()),
            ('feature_combiner'     , FeatureCombiner()),
            ('polynomial_features'  , PolynomialFeatures(include_bias=False)),
            ('numeric_scaler'       , StandardScaler()),
        ]
    )
    # Ordinal preprocessing: fill missing values with the literal "None", then
    # encode with an explicit category order.
    # NOTE(review): ordinal_features is defined as an empty list in this
    # notebook, so this transformer currently receives no columns; the
    # hard-coded categories look carried over from another exercise — TODO confirm.
    ordinal_transformer = Pipeline(
        steps=[
            ('ordinal_imputer'      , SimpleImputer(strategy='constant', fill_value="None")),
            ('ordinal_encoder'      , OrdinalEncoder(categories=[['None', 'Old', 'Recent']])),
        ]
    )
    # Nominal preprocessing: impute with the most frequent value, then one-hot
    # encode while dropping the first category.
    nominal_transformer = Pipeline(
        steps=[
            ('nominal_imputer'      , SimpleImputer(strategy='most_frequent')),
            ('nominal_encoder'      , OneHotEncoder(drop='first')),
        ]
    )
    # Full preprocessor: route each column group to its transformer and
    # concatenate the results.
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric_transformer'  , numeric_transformer, numeric_features),
            ('ordinal_transformer'  , ordinal_transformer, ordinal_features),
            ('nominal_transformer'  , nominal_transformer, nominal_features),
        ]
    )
    # Complete pipeline: preprocessing followed by the regressor.
    model = Pipeline(
        steps=[
            ('preprocessor'         , preprocessor),
            ('algorithm'            , algorithm),
        ]
    )
    # Search space for RandomizedSearchCV: the shared preprocessing options
    # plus the options specific to this algorithm (shallow copy, so
    # params_common itself is not modified).
    params = params_common.copy()
    params.update(params_model[algorithm_name])
    display(f"{algorithm_name}'s params:", params)

    # Configure the randomized search.
    search = RandomizedSearchCV(
        estimator=model, 
        param_distributions=params, # param_distributions here, where GridSearchCV would take param_grid
        cv=5, 
        scoring='r2',
        n_jobs=-1,
        n_iter=100,                 # sample 100 parameter combinations; GridSearchCV would try all of them
        random_state=42,            # fixed seed for reproducibility
    )
    # Key is (algorithm class name, imputer class name); the imputer part is
    # always 'SimpleImputer' here.
    search_key = (
        algorithm.__class__.__name__,
        SimpleImputer().__class__.__name__
    )
    searches[search_key] = search  # store the search in the dictionary

# Render the structure of every configured search (pipeline) visually.
for key, search in searches.items():
    display(f"Search: {key}", search)

# Hold out 20% of the rows as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show every training row whose feature values occur more than once, then keep
# only the first occurrence of each and realign the target by index.
mask_dups = X_train.duplicated(keep=False)
display("Duplicated Samples:", X_train[mask_dups])
X_train = X_train[~X_train.duplicated()]
y_train = y_train.loc[X_train.index]
display(X_train, y_train)

In [36]:
import warnings
# Ignore all warnings. Silencing warnings can hide real problems, so where
# possible it is better to analyse the cause of a warning and fix it instead.
warnings.filterwarnings("ignore")

In [None]:
################################################################################
# Model training
################################################################################

# 1.  Missing values
# 1.1 Drop training rows that are missing a value in any drop_na_features
#     column. `subset` must receive the flat list itself; wrapping it in
#     another list (as before) hands pandas a nested list instead of labels.
if drop_na_features:
    X_train = X_train.dropna(subset=drop_na_features)
    y_train = y_train.loc[X_train.index]

# 2.  Outliers
# 2.1 Domain-knowledge thresholds (applied once, before the automatic removers)
X_train, y_train = remove_domain_outliers(X_train, y_train)

# Containers for the results of every (outlier remover, search) combination
models = {}                         # fitted best pipelines
performances = {}                   # best cross-validated R² per combination
best_train_score = float('-inf')    # highest cross-validated R² seen so far
best_model = None                   # pipeline that achieved it
best_key = None                     # its key in `models`

for auto_outlier_remover in auto_outlier_removers:
    outlier_remover_name = auto_outlier_remover.__class__.__name__
    print(f"\nApplying Auto Outlier Remover: {outlier_remover_name}")

    # 2.2 Automatic outlier detection on the numeric columns
    if auto_outlier_remover is not None:
        # The detector is fed median-imputed values; the imputed array is used
        # only to build the mask, not for training.
        imputer = SimpleImputer(strategy='median')
        X_train_imputed = imputer.fit_transform(X_train[numeric_features])
        y_train_inlier = auto_outlier_remover.fit_predict(X_train_imputed)
        inlier_mask_train = y_train_inlier != -1
    else:
        # No remover: every row counts as an inlier.
        inlier_mask_train = np.ones(X_train.shape[0], dtype=bool)

    # Filter into per-iteration variables. Overwriting X_train / y_train here
    # (as before) made the removers cumulative: each one only saw the rows
    # that survived the previous one, so the comparison between removers was
    # not like-for-like and re-running the cell kept shrinking the data.
    X_train_inliers = X_train[inlier_mask_train]
    y_train_inliers = y_train[inlier_mask_train]

    for key, search in searches.items():
        print(f"Training {key}...")

        # 5. Model fitting
        # 5.1 Fit the search. With the default refit=True the best pipeline is
        #     refitted on all rows passed in, so best_estimator_ is ready to
        #     use and no extra model.fit() call is needed.
        search.fit(X_train_inliers, y_train_inliers.values.ravel())
        model = search.best_estimator_

        full_key = (outlier_remover_name, *key)  # prefix the key with the remover name
        models[full_key] = model

        # Cross-validated score of the best parameter setting
        train_score = search.best_score_
        print(f"Best Score (R²): {train_score:.4f}")
        performances[full_key] = train_score

        # 5.4 Best hyperparameters
        df_search_results = pd.DataFrame(search.cv_results_)
        display("Best Hyperparameters:", search.best_params_)
        # display("Search Results:", pd.DataFrame(search.cv_results_))    # full CV results

        # Track the combination with the highest cross-validated R²
        if train_score > best_train_score:
            best_train_score = train_score
            best_model = model
            best_key = full_key

# Best model and its score
print(f"Best Model R²: {best_train_score:.4f}")
print(f"Best Model: {best_key}")
display(best_model)

# Scores of every combination
print("All Model Performances:")
for key, score in performances.items():
    print(f"{key}: R² = {score:.4f}")

In [None]:
################################################################################
# Model evaluation
################################################################################

# Column labels for the prediction frame: target name plus a "_PREDICTED" suffix.
predicted_columns = [column + "_PREDICTED" for column in y_test.columns]

for model_name, model in models.items():
    print(f"Evaluating model: {model_name}")

    # Predictions on the held-out set, indexed like y_test.
    y_test_pred = pd.DataFrame(
        model.predict(X_test),
        columns=predicted_columns,
        index=y_test.index,
    )

    # Test-set score of the fitted pipeline
    r2_test = model.score(X_test, y_test)
    print(f"Test R²  : {r2_test:.4f}")