In [1]:
import pandas as pd
import numpy as np
import zipfile
from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.experimental import enable_iterative_imputer  # IterativeImputer 사용을 명시적으로 활성화
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin


# Data Description
### 등록국세청코드
- 101 ~ 999의 값으로 신규개업자에게 사용 가능한 번호 101-999를 순차적으로 부여합니다.<br>
- 사업자등록번호를 최초부여한 관할 세무서의 코드

### 납세자유형코드
- 개인구분 코드<br>
     ① 개인과세사업자는 특정 동 구별없이 01부터 79까지를 순차적으로 부여<br>
     ② 개인면세사업자는 산업 구분없이 90부터 99까지를 순차적으로 부여<br>
     ③ 소득세법 제2조 제3항에 해당하는 법인이 아닌 종교 단체 : 89<br>
     ④ 소득세법 제2조 제3항에 해당하는 자로서 "(3)"이외의자(아파트관리사무소 등) 및 다단계판매원 : 80<br>
<br><br>
- 법인성격코드：법인에 대하여는 성격별 코드를 구분하여 사용한다.<br>
     ① 영리법인의 본점 81，86，87, 88<br>
     ② 비영리법인의 본점 및 지점(법인격 없는 사단，재단，기타 단체 중 법인으로 보는 단체를 포함) : 82<br>
     ③ 국가，지방자치단체，지방자치단체조합 : 83<br>
     ④ 외국법인의 본・지점 및 연락사무소 : 84<br>
     ⑤ 영리법인의 지점 : 85<br>

In [2]:
# 데이터 읽기, 용량 절약을 위해 zip 파일의 압축을 해제하지 않고 바로 읽도록 한다.
with zipfile.ZipFile('20241112.zip') as zf:
    with zf.open('20241112.csv') as file:
        df_source = pd.read_csv(file, dtype={
            '사업자등록번호': 'string', 
            '등록국세청코드': 'string', 
            '납세자유형코드': 'string',
            '사업자유형코드': 'string', 
            '산업분류코드': 'string', 
            '시도': 'string',
            '국세청상호명': 'string', 
            '국세청상호명존재여부': 'string', 
            '영업일수': 'int',
            '영업일수(100일단위)': 'int', 
            '개업일': 'string', 
            '폐업일': 'string', 
            '통신판매사업자여부': 'string',
            '통신판매사업자전화번호': 'string', 
            '통신판매사업자전자우편': 'string',
            '나라장터조달업체제조구분코드': 'string', 
            '고용보험 업종코드': 'string', 
            '나라장터조달업체업무구분코드': 'string',
            '사업장 우편번호': 'string'
        })
df_source.columns = df_source.columns.str.strip().str.replace(' ', '_')
display(df_source)


Unnamed: 0,사업자등록번호,등록국세청코드,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명,국세청상호명존재여부,영업일수,영업일수(100일단위),...,사업장_우편번호,사업장_주소,고용보험_업종코드,고용보험_업종명,산재보험_성립일자,고용보험_성립일자,산재보험_상시근로자수,고용보험_상시근로자수,산재보험_사업구분,고용보험_사업구분
0,1010109091,101,01,01,56114,서울,김밥천국삼청점,Y,4766,48,...,,,,,,,,,,
1,1010109107,101,01,01,56122,서울,명송 하나,Y,5791,58,...,,,,,,,,,,
2,1010112688,101,01,01,47312,서울,가인전자,Y,2586,26,...,,,,,,,,,,
3,1010112733,101,01,01,20400,서울,켐스펙교역,Y,2371,24,...,,,,,,,,,,
4,1010112806,101,01,01,46596,서울,동광전업사,Y,2477,25,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595328,8999100276,899,91,04,03112,경북,제2007대성호,Y,2146,21,...,,,,,,,,,,
1595329,8999300310,899,93,04,46312,부산,경북농산,Y,2023,20,...,,,,,,,,,,
1595330,8999601213,899,96,04,90212,대구,글나루독서실,Y,868,9,...,,,,,,,,,,
1595331,8999700981,899,97,04,96921,전북,길묘,Y,363,4,...,,,,,,,,,,


In [3]:
# 개인사업자의 데이터만 남긴다. (81 ~ 88 제외)
df_stage1 = df_source[(df_source['사업자유형코드'] <= '80') | (df_source['사업자유형코드'] >= '89')]
display(df_stage1)

Unnamed: 0,사업자등록번호,등록국세청코드,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명,국세청상호명존재여부,영업일수,영업일수(100일단위),...,사업장_우편번호,사업장_주소,고용보험_업종코드,고용보험_업종명,산재보험_성립일자,고용보험_성립일자,산재보험_상시근로자수,고용보험_상시근로자수,산재보험_사업구분,고용보험_사업구분
0,1010109091,101,01,01,56114,서울,김밥천국삼청점,Y,4766,48,...,,,,,,,,,,
1,1010109107,101,01,01,56122,서울,명송 하나,Y,5791,58,...,,,,,,,,,,
2,1010112688,101,01,01,47312,서울,가인전자,Y,2586,26,...,,,,,,,,,,
3,1010112733,101,01,01,20400,서울,켐스펙교역,Y,2371,24,...,,,,,,,,,,
4,1010112806,101,01,01,46596,서울,동광전업사,Y,2477,25,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595327,8999001055,899,90,04,85709,경기,아트앤하트 고양탄현에듀포레푸르지오,Y,690,7,...,,,,,,,,,,
1595328,8999100276,899,91,04,03112,경북,제2007대성호,Y,2146,21,...,,,,,,,,,,
1595329,8999300310,899,93,04,46312,부산,경북농산,Y,2023,20,...,,,,,,,,,,
1595330,8999601213,899,96,04,90212,대구,글나루독서실,Y,868,9,...,,,,,,,,,,


In [None]:
# 필요한 컬럼들로 DataFrame 재구성
df_stage2 = df_stage1[['사업자유형코드','납세자유형코드','산업분류코드','시도','국세청상호명존재여부','영업일수(100일단위)','개업일','통신판매사업자여부',
                       '나라장터조달업체여부','나라장터조달업체종업원수','산재보험_상시근로자수','고용보험_상시근로자수']].assign(개업일=lambda x: x['개업일'].str[:4])

# Float --> Int로 변환
df_stage2['나라장터조달업체종업원수'] = pd.to_numeric(df_stage2['나라장터조달업체종업원수'],errors='coerce', downcast=None).astype('Int64')
df_stage2['산재보험_상시근로자수'] = pd.to_numeric(df_stage2['산재보험_상시근로자수'],errors='coerce', downcast=None).astype('Int64')
df_stage2['고용보험_상시근로자수'] = pd.to_numeric(df_stage2['고용보험_상시근로자수'],errors='coerce', downcast=None).astype('Int64')

# Drop duplicates
df_stage2 = df_stage2.drop_duplicates()

display(df_stage2)

Unnamed: 0,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명존재여부,영업일수(100일단위),개업일,통신판매사업자여부,나라장터조달업체여부,나라장터조달업체종업원수,산재보험_상시근로자수,고용보험_상시근로자수
0,01,01,56114,서울,Y,48,1994,N,N,,,
1,01,01,56122,서울,Y,58,1994,N,N,,,
2,01,01,47312,서울,Y,26,1997,N,N,,,
3,01,01,20400,서울,Y,24,1997,N,N,,,
4,01,01,46596,서울,Y,25,1997,N,N,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1595327,90,04,85709,경기,Y,7,2020,N,N,,,
1595328,91,04,03112,경북,Y,21,2016,N,N,,,
1595329,93,04,46312,부산,Y,20,2016,N,N,,,
1595330,96,04,90212,대구,Y,9,2020,N,N,,,


In [16]:
df_stage2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1051967 entries, 0 to 1595331
Data columns (total 12 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   사업자유형코드       1051967 non-null  string
 1   납세자유형코드       1051967 non-null  string
 2   산업분류코드        1051967 non-null  string
 3   시도            1051967 non-null  string
 4   국세청상호명존재여부    1051967 non-null  string
 5   영업일수(100일단위)  1051967 non-null  int64 
 6   개업일           1051967 non-null  string
 7   통신판매사업자여부     1051967 non-null  string
 8   나라장터조달업체여부    1051967 non-null  object
 9   나라장터조달업체종업원수  47590 non-null    Int64 
 10  산재보험_상시근로자수   21051 non-null    Int64 
 11  고용보험_상시근로자수   21051 non-null    Int64 
dtypes: Int64(3), int64(1), object(1), string(7)
memory usage: 107.3+ MB


In [None]:
df_stage2.describe()

In [None]:
df_stage2.nunique()

# 여기 아래는 quiz1의 코드를 가져온 것으로 수정이 필요하다. 

In [19]:
df_source = df_stage2.copy()

# 피처 타입 정의
target           = ['영업일수(100일단위)']
numeric_features = ['나라장터조달업체종업원수', '산재보험_상시근로자수', '고용보험_상시근로자수']
ordinal_features = []
nominal_features = ['사업자유형코드','납세자유형코드','산업분류코드','시도','국세청상호명존재여부','개업일','통신판매사업자여부','나라장터조달업체여부']
drop_na_features = []

# 특성과 타겟 변수 분리
y = df_source[target]
X = df_source[numeric_features + ordinal_features + nominal_features]

# 이상치 제거 함수 (도메인 지식 활용 ) 정의
def remove_domain_outliers(X, y=None):
        X_cleaned = X.copy()     
        X_cleaned = \
        X_cleaned[(df_source.나라장터조달업체종업원수 <= 1000) &
                (df_source.산재보험_상시근로자수 <= 1000) & 
                (df_source.고용보험_상시근로자수 <= 1000)]
        if y is not None:
            y_cleaned = y.loc[X_cleaned.index]  # 타겟 데이터프레임의 인덱스 동기화
            return X_cleaned, y_cleaned
        return X_cleaned

# 이상치 제거기 (자동 ) 설정
# auto_outlier_remover = None  # 이상치 제거기 설정하지 않음
# auto_outlier_remover = LocalOutlierFactor(n_neighbors=5, contamination=0.1) 
auto_outlier_remover = IsolationForest(random_state=42, contamination=0.05)
auto_outlier_removers = [
     None,
     LocalOutlierFactor(n_neighbors=10, contamination=0.1),
     IsolationForest(random_state=42, contamination=0.05)
]

# 사용자 정의 Transformer
class FeatureCombiner(BaseEstimator, TransformerMixin):
        def __init__(self, combined_features=[]):
            self.combined_features = combined_features
            pass

        def fit(self, X, y=None):
            return self  # 이 Transformer는 학습이 필요하지 않음
        
        def transform(self, X):
            X = pd.DataFrame(X, columns=numeric_features)
            self.X_combined = pd.DataFrame()
            # 새로운 특성 생성
            if 'AVG_WORKFORCE' in self.combined_features:
                self.X_combined['AVG_WORKFORCE'] = (X['나라장터조달업체종업원수'] + X['산재보험_상시근로자수'] + X['고용보험_상시근로자수']) / 3
            if 'TOTAL_WORKFORCE' in self.combined_features:
                self.X_combined['TOTAL_WORKFORCE'] = (X['나라장터조달업체종업원수'] + X['산재보험_상시근로자수'] + X['고용보험_상시근로자수'])
            return pd.concat([self.X_combined], axis=1)
        
        def get_feature_names_out(self, input_features=None):
             # 새로 생성된 특성과 기존 numeric features 이름을 합쳐 반환
             return self.X_combined.columns.tolist()
        
algorithms = [
     LinearRegression(),
     Ridge(random_state=42, max_iter=10000000),
     Lasso(random_state=42, max_iter=10000000),
     ElasticNet(random_state=42, max_iter=10000000),
     KNeighborsRegressor(),
]

params_common = {
     'preprocessor__numeric_transformer__numeric_imputer__strategy' : ['median', 'mean', 'most_frequent'],
     'preprocessor__numeric_transformer__feature_combiner__combined_features' : [
          ['TOTAL_WORKFORCE', 'AVG_WORKFORCE'],
          ['TOTAL_WORKFORCE'],
          []
     ],
     'preprocessor__numeric_transformer__polynomial_features__degree' : [1, 2, 3],
}

params_model = {}
params_model['LinearRegression'] = {}
params_model['Ridge'] = {'algorithm__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],}  # 다양한 alpha 값
params_model['Lasso'] = {'algorithm__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],}  # 다양한 alpha 값
params_model['ElasticNet'] = {
    'algorithm__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
    'algorithm__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
}
params_model['KNeighborsRegressor'] = {
    'algorithm__n_neighbors': range(3, 51),         # k-NN의 k 값 (이웃 개수)
    'algorithm__weights': ['uniform', 'distance'],  # 가중치 옵션
    'algorithm__p': [1, 2],                         # 거리 측정 방식 (1: 맨해튼 거리, 2: 유클리드 거리)
}

searches = {}

for algorithm in algorithms:
    algorithm_name = algorithm.__class__.__name__
    print(f"Processing {algorithm_name}...")
    # 수치형 데이터 전처리 파이프라인
    numeric_transformer = Pipeline(
        steps=[
            ('numeric_imputer'      , SimpleImputer()),
            ('feature_combiner'     , FeatureCombiner()),
            ('polynomial_features'  , PolynomialFeatures(include_bias=False)),
            ('numeric_scaler'       , StandardScaler()),
        ]
    )
    # 서열형 데이터 전처리 파이프라인
    ordinal_transformer = Pipeline(
        steps=[
            ('ordinal_imputer'      , SimpleImputer(strategy='constant', fill_value="None")),
            ('ordinal_encoder'      , OrdinalEncoder(categories=[['None', 'Old', 'Recent']])),
        ]
    )
    # 명목형 데이터 전처리 파이프라인
    nominal_transformer = Pipeline(
        steps=[
            ('nominal_imputer'      , SimpleImputer(strategy='most_frequent')),
            ('nominal_encoder'      , OneHotEncoder(drop='first')),
        ]
    )
    # 전체 전처리기 - 각 타입별로 변수 처리 및 결합
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric_transformer'  , numeric_transformer, numeric_features),
            ('ordinal_transformer'  , ordinal_transformer, ordinal_features),
            ('nominal_transformer'  , nominal_transformer, nominal_features),
        ]
    )
    # 전체 파이프라인
    model = Pipeline(
        steps=[
            ('preprocessor'         , preprocessor),
            ('algorithm'            , algorithm),
        ]
    )
    # [10] RandomizedSearchCV를 위한 파라미터 정의
    params = params_common.copy()
    params.update(params_model[algorithm_name])
    display(f"{algorithm_name}'s params:", params)

    # [10] RandomizedSearchCV 초기화
    search = RandomizedSearchCV(
        estimator=model, 
        param_distributions=params, # param_grid(GridSearchCV) 대신 param_distributions 사용
        cv=5, 
        scoring='r2',
        n_jobs=-1,
        n_iter=100,                 # 파라미터 조합 중 100개 샘플링. GridSearchCV는 모든 파라미터 사용
        random_state=42,            # 재현성을 위해 random_state 설정
    )
    search_key = (
        algorithm.__class__.__name__,
        SimpleImputer().__class__.__name__
    )
    searches[search_key] = search  # [10] search를 딕셔너리에 저장

# [10] 모형(파이프라인)의 구조를 시각적으로 표현
for key, search in searches.items():
    display(f"Search: {key}", search)

# 훈련 세트와 테스트 세트 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 중복 데이터 제거
mask_dups = X_train.duplicated(keep=False)
display("Duplicated Samples:", X_train[mask_dups])
X_train = X_train.drop_duplicates()
y_train = y_train.loc[X_train.index]
display(X_train)
display(y_train)

Processing LinearRegression...


"LinearRegression's params:"

{'preprocessor__numeric_transformer__numeric_imputer__strategy': ['median',
  'mean',
  'most_frequent'],
 'preprocessor__numeric_transformer__feature_combiner__combined_features': [['TOTAL_WORKFORCE',
   'AVG_WORKFORCE'],
  ['TOTAL_WORKFORCE'],
  []],
 'preprocessor__numeric_transformer__polynomial_features__degree': [1, 2, 3]}

Processing Ridge...


"Ridge's params:"

{'preprocessor__numeric_transformer__numeric_imputer__strategy': ['median',
  'mean',
  'most_frequent'],
 'preprocessor__numeric_transformer__feature_combiner__combined_features': [['TOTAL_WORKFORCE',
   'AVG_WORKFORCE'],
  ['TOTAL_WORKFORCE'],
  []],
 'preprocessor__numeric_transformer__polynomial_features__degree': [1, 2, 3],
 'algorithm__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}

Processing Lasso...


"Lasso's params:"

{'preprocessor__numeric_transformer__numeric_imputer__strategy': ['median',
  'mean',
  'most_frequent'],
 'preprocessor__numeric_transformer__feature_combiner__combined_features': [['TOTAL_WORKFORCE',
   'AVG_WORKFORCE'],
  ['TOTAL_WORKFORCE'],
  []],
 'preprocessor__numeric_transformer__polynomial_features__degree': [1, 2, 3],
 'algorithm__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}

Processing ElasticNet...


"ElasticNet's params:"

{'preprocessor__numeric_transformer__numeric_imputer__strategy': ['median',
  'mean',
  'most_frequent'],
 'preprocessor__numeric_transformer__feature_combiner__combined_features': [['TOTAL_WORKFORCE',
   'AVG_WORKFORCE'],
  ['TOTAL_WORKFORCE'],
  []],
 'preprocessor__numeric_transformer__polynomial_features__degree': [1, 2, 3],
 'algorithm__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
 'algorithm__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}

Processing KNeighborsRegressor...


"KNeighborsRegressor's params:"

{'preprocessor__numeric_transformer__numeric_imputer__strategy': ['median',
  'mean',
  'most_frequent'],
 'preprocessor__numeric_transformer__feature_combiner__combined_features': [['TOTAL_WORKFORCE',
   'AVG_WORKFORCE'],
  ['TOTAL_WORKFORCE'],
  []],
 'preprocessor__numeric_transformer__polynomial_features__degree': [1, 2, 3],
 'algorithm__n_neighbors': range(3, 51),
 'algorithm__weights': ['uniform', 'distance'],
 'algorithm__p': [1, 2]}

"Search: ('LinearRegression', 'SimpleImputer')"

"Search: ('Ridge', 'SimpleImputer')"

"Search: ('Lasso', 'SimpleImputer')"

"Search: ('ElasticNet', 'SimpleImputer')"

"Search: ('KNeighborsRegressor', 'SimpleImputer')"

'Duplicated Samples:'

Unnamed: 0,나라장터조달업체종업원수,산재보험_상시근로자수,고용보험_상시근로자수,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명존재여부,개업일,통신판매사업자여부,나라장터조달업체여부
834544,,,,06,01,47320,서울,Y,2008,N,N
988423,,,,29,01,68112,충남,Y,2011,N,N
140840,,,,91,04,47210,서울,Y,2008,N,N
489428,,,,01,01,47414,경기,Y,2000,N,N
1500493,,,,32,01,56213,부산,Y,2011,N,N
...,...,...,...,...,...,...,...,...,...,...,...
246892,,,,04,02,56199,인천,Y,2000,N,N
411256,,,,27,01,46596,경기,Y,2005,N,N
228428,,,,07,01,46699,서울,Y,2004,N,N
1053857,,,,04,01,56213,전북,Y,1999,N,N


Unnamed: 0,나라장터조달업체종업원수,산재보험_상시근로자수,고용보험_상시근로자수,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명존재여부,개업일,통신판매사업자여부,나라장터조달업체여부
308872,,,,32,01,96112,경기,N,2003,N,N
1402905,,,,06,01,27196,경남,Y,1985,N,N
834544,,,,06,01,47320,서울,Y,2008,N,N
251479,,,,14,01,62010,인천,Y,2005,N,N
369319,,,,40,01,76310,충북,Y,2009,N,N
...,...,...,...,...,...,...,...,...,...,...,...
1225324,,,,02,01,47223,대구,Y,1983,N,N
503329,,,,14,01,47413,경기,Y,1993,N,N
1391481,,4,4,11,01,46109,부산,Y,2010,N,N
1142283,,,,08,01,47813,전남,Y,1989,N,N


Unnamed: 0,영업일수(100일단위)
308872,49
1402905,33
834544,12
251479,1
369319,17
...,...
1225324,142
503329,30
1391481,48
1142283,8


In [20]:
import warnings
# 모든 경고 무시: 경고를 무시하면 중요한 문제를 놓칠 수 있으므로, 가능하면 경고의 원인을 분석하여 문제를 해결하는 것이 바람직
warnings.filterwarnings("ignore")

In [21]:
################################################################################
# 모형 훈련
################################################################################

# 1.  결측치 처리
# 1.1 특정 열에 결측치 포함 시 제거
X_train = X_train.dropna(subset=[drop_na_features])
y_train = y_train.loc[X_train.index]

# 2.  이상치 제거
# 2.1 이상치 제거 - 도메인 지식 활용 및 자동 제거기 적용
X_train, y_train = remove_domain_outliers(X_train, y_train)

# [10] 여러 모형을 저장하기 위해 dictionary 사용
models = {}                         # 모형을 저장할 딕셔너리
performances = {}                   # 각 모델의 성능을 저장할 딕셔너리
best_train_score = float('-inf')    # 가장 높은 R² 값을 기록하기 위한 변수
best_model = None                   # 최적 모델을 저장하기 위한 변수
best_key = None                     # 최적 모델의 키를 저장하기 위한 변수

for auto_outlier_remover in auto_outlier_removers:
    outlier_remover_name = auto_outlier_remover.__class__.__name__
    print(f"\nApplying Auto Outlier Remover: {outlier_remover_name}")

    # 2.2 자동 이상치 제거기 적용
    if auto_outlier_remover is not None:
        # y_train_inlier = auto_outlier_remover.fit_predict(X_train[numeric_features])
        imputer = SimpleImputer(strategy='median')
        X_train_imputed = imputer.fit_transform(X_train[numeric_features])
        y_train_inlier = auto_outlier_remover.fit_predict(X_train_imputed)
        inlier_mask_train = y_train_inlier != -1
    else:
        # auto_outlier_remover가 None인 경우, 모든 데이터가 inlier로 간주
        inlier_mask_train = np.ones(X_train.shape[0], dtype=bool)  # 모든 값이 True

    # inlier 마스크에 따라 데이터 필터링
    X_train, y_train = X_train[inlier_mask_train], y_train[inlier_mask_train]

    for key, search in searches.items():    # [10] 반복문을 통해 여러 search를 fitting
        print(f"Training {key}...")

        # 5. 모형 훈련
        # 5.1  모형 적합 (모형에는 아래의 기능이 파이프라인에 포함됨)
        search.fit(X_train, y_train.values.ravel())    # model 대신 search를 적합(fitting)
        model = search.best_estimator_
        model.fit(X_train, y_train.values.ravel())
        
        full_key = (outlier_remover_name, *key) # models 딕셔너리의 키에 outlier_remover_name 추가
        models[full_key] = model  # 모델을 models 딕셔너리에 저장

        # 교차검증 성능 출력
        train_score = search.best_score_
        print(f"Best Score (R²): {train_score:.4f}")
        performances[full_key] = train_score

        # 5.4 최적 모델의 하이퍼파라미터 출력
        df_search_results = pd.DataFrame(search.cv_results_)
        display("Best Hyperparameters:", search.best_params_)  # 튜닝된 하이퍼파라미터 출력
        # display("Search Results:", pd.DataFrame(search.cv_results_))    # 교차검증결과 출력

        # 가장 높은 R² 값을 가진 모델을 best_model로 설정
        if train_score > best_train_score:
            best_train_score = train_score
            best_model = model
            best_key = full_key

# 최적 모델과 그 성능 정보 출력
print(f"Best Model R²: {best_train_score:.4f}")
print(f"Best Model: {best_key}")
display(best_model)

# 모든 모델의 성능 출력
print("All Model Performances:")
for key, score in performances.items():
    print(f"{key}: R² = {score:.4f}")


Applying Auto Outlier Remover: NoneType
Training ('LinearRegression', 'SimpleImputer')...


Traceback (most recent call last):
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/

Best Score (R²): nan


one
                        ^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/pipeline.py", line 600, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/compose/_column_transformer.py", line 1076, in transform
    Xs = self._call_func_on_transformers(
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/compose/_column_transformer.py", line 885, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/utils/

'Best Hyperparameters:'

{'preprocessor__numeric_transformer__polynomial_features__degree': 1,
 'preprocessor__numeric_transformer__numeric_imputer__strategy': 'median',
 'preprocessor__numeric_transformer__feature_combiner__combined_features': ['TOTAL_WORKFORCE',
  'AVG_WORKFORCE']}

Traceback (most recent call last):
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/

Training ('Ridge', 'SimpleImputer')...


Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/utils/_response.py", line 239, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/pipeline.py", line 600, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/compose/_column_transformer.py", line 1076, in transform
    Xs = self._call_func_on_transformers

Best Score (R²): nan


Traceback (most recent call last):
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/

'Best Hyperparameters:'

{'preprocessor__numeric_transformer__polynomial_features__degree': 3,
 'preprocessor__numeric_transformer__numeric_imputer__strategy': 'most_frequent',
 'preprocessor__numeric_transformer__feature_combiner__combined_features': ['TOTAL_WORKFORCE'],
 'algorithm__alpha': 10.0}

Traceback (most recent call last):
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/

Training ('Lasso', 'SimpleImputer')...


Traceback (most recent call last):
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/

Best Score (R²): nan


Traceback (most recent call last):
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/

'Best Hyperparameters:'

{'preprocessor__numeric_transformer__polynomial_features__degree': 3,
 'preprocessor__numeric_transformer__numeric_imputer__strategy': 'most_frequent',
 'preprocessor__numeric_transformer__feature_combiner__combined_features': ['TOTAL_WORKFORCE'],
 'algorithm__alpha': 10.0}

Traceback (most recent call last):
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/

Training ('ElasticNet', 'SimpleImputer')...


 self._transform(
                    ^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/preprocessing/_encoders.py", line 214, in _transform
    raise ValueError(msg)
ValueError: Found unknown categories ['76'] in column 0 during transform

Traceback (most recent call last):
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bayleys/miniforge3/envs/FDA/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    

ValueError: at least one array or dtype is required

In [None]:
################################################################################
# 모형 평가
################################################################################

for model_name, model in models.items():
    print(f"Evaluating model: {model_name}")

    y_test_pred = model.predict(X_test)
    y_test_pred = pd.DataFrame(
        y_test_pred, 
        columns=[col + "_PREDICTED" for col in y_test.columns], 
        index=y_test.index
    )

    # 테스트 세트 성능 측정
    r2_test = model.score(X_test, y_test)
    print(f"Test R²  : {r2_test:.4f}")