In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('movies_train.csv')
test = pd.read_csv('movies_test.csv')
sub = pd.read_csv('submission.csv')

title : 영화의 제목

distributor : 배급사

genre : 장르

release_time : 개봉일

time : 상영시간(분)

screening_rat : 상영등급

director : 감독이름

dir_prev_bfnum : 해당 감독이 이 영화를 만들기 전 제작에 참여한 영화에서의 평균 관객수(단 관객수가 알려지지 않은 영화 제외)

dir_prev_num : 해당 감독이 이 영화를 만들기 전 제작에 참여한 영화의 개수(단 관객수가 알려지지 않은 영화 제외)

num_staff : 스텝수

num_actor : 주연배우수

box_off_num : 관객수

In [3]:
train.head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501
2,은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083
3,나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,23894.0,2,20,6,217866
4,불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,1.0,1,251,2,483387


In [4]:
test.head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
0,용서는 없다,시네마서비스,느와르,2010-01-07,125,청소년 관람불가,김형준,300529.0,2,304,3
1,아빠가 여자를 좋아해,(주)쇼박스,멜로/로맨스,2010-01-14,113,12세 관람가,이광재,342700.2,4,275,3
2,하모니,CJ 엔터테인먼트,드라마,2010-01-28,115,12세 관람가,강대규,4206611.0,3,419,7
3,의형제,(주)쇼박스,액션,2010-02-04,116,15세 관람가,장훈,691342.0,2,408,2
4,평행 이론,CJ 엔터테인먼트,공포,2010-02-18,110,15세 관람가,권호영,31738.0,1,380,1


In [5]:
sub.head()

Unnamed: 0,title,box_off_num
0,용서는 없다,0
1,아빠가 여자를 좋아해,0
2,하모니,0
3,의형제,0
4,평행 이론,0


# **전처리**
1. 결측치 처리 (dir_prev_bfnum)
2. 날짜형 변수 처리 (release_time)
3. 배급사명 통일 (distributor)
4. label encoding, one-hot encoding

**1. 결측치 처리 (dir_prev_bfnum)**
- 감독의 이전 작품 평균 관객 수 결측치를 0으로 변경

In [6]:
train['dir_prev_bfnum'] = train['dir_prev_bfnum'].fillna(0)
test['dir_prev_bfnum'] = test['dir_prev_bfnum'].fillna(0)

**2. 날짜형 변수 처리 (release_time)**
- 개봉일을 연도(release_year), 월(release_month)로 분리

In [7]:
# 문자열 → datetime 변환
train['release_time'] = pd.to_datetime(train['release_time'])
test['release_time'] = pd.to_datetime(test['release_time'])

In [8]:
# 'release_year', 'release_month' 생성
train['release_year'] = train['release_time'].dt.year
train['release_month'] = train['release_time'].dt.month
test['release_year'] = test['release_time'].dt.year
test['release_month'] = test['release_time'].dt.month

In [9]:
# release_time 변수 제거
train = train.drop(['release_time'], axis=1)
test = test.drop(['release_time'], axis=1)

In [10]:
train.head()

Unnamed: 0,title,distributor,genre,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num,release_year,release_month
0,개들의 전쟁,롯데엔터테인먼트,액션,96,청소년 관람불가,조병옥,0.0,0,91,2,23398,2012,11
1,내부자들,(주)쇼박스,느와르,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501,2015,11
2,은밀하게 위대하게,(주)쇼박스,액션,123,15세 관람가,장철수,220775.25,4,343,4,6959083,2013,6
3,나는 공무원이다,(주)NEW,코미디,101,전체 관람가,구자홍,23894.0,2,20,6,217866,2012,7
4,불량남녀,쇼박스(주)미디어플렉스,코미디,108,15세 관람가,신근호,1.0,1,251,2,483387,2010,11


- 'release_year', 'release_month' 변수 생성됨

**3. 배급사명 통일 (distributor)**

1) 수동 매핑

In [11]:
#배급사 개수 파악
train['distributor'].value_counts()

Unnamed: 0_level_0,count
distributor,Unnamed: 1_level_1
CJ 엔터테인먼트,54
롯데엔터테인먼트,52
(주)NEW,30
(주)마운틴픽쳐스,29
(주)쇼박스,26
...,...
영화사 廊,1
크리에이티브컴즈(주),1
ysfilm,1
이달투,1


In [12]:
train['distributor'].unique()

array(['롯데엔터테인먼트', '(주)쇼박스', '(주)NEW', '쇼박스(주)미디어플렉스', '백두대간',
       '유니버설픽쳐스인터내셔널코리아', '(주)두타연', '(주) 케이알씨지', '(주)콘텐츠 윙', '(주)키노아이',
       '(주)팝 파트너스', 'CJ E&M 영화부문', '(주) 영화제작전원사', 'CJ E&M Pictures',
       'CGV 무비꼴라쥬', '리틀빅픽처스', '스폰지', 'CJ 엔터테인먼트', 'CGV아트하우스', '조이앤시네마',
       '인디플러그', '콘텐츠판다', '인디스토리', '(주)팝엔터테인먼트', '시네마서비스', '웃기씨네',
       '영화사 진진', '(주)레인보우 팩토리', '김기덕 필름', 'NEW', 'CJ CGV',
       '동국대학교 충무로영상제작센터', 'BoXoo 엔터테인먼트', '(주)마운틴픽쳐스', 'CGV 아트하우스',
       '메가박스(주)플러스엠', '골든타이드픽처스', '파이오니아21', '디 씨드', '드림팩트 엔터테인먼트', '시너지',
       '디마엔터테인먼트', '판다미디어', '(주)스톰픽쳐스코리아', '(주)예지림 엔터테인먼트', '(주) 영화사조제',
       '보람엔터테인먼트', '(주)시네마달', '노바엔터테인먼트', '(주)패스파인더씨앤씨', '(주)대명문화공장',
       '(주)온비즈넷', 'KT&G 상상마당', '무비꼴라쥬', '인벤트 디', '씨네그루(주)키다리이엔티',
       '스튜디오후크', '시네마 달', '나이너스엔터테인먼트(주)', 'THE 픽쳐스', '영구아트무비', '리틀빅픽쳐스',
       '어뮤즈', '이모션 픽처스', '(주)이스트스카이필름', '필라멘트 픽쳐스', '조이앤컨텐츠그룹', '타임스토리그룹',
       '마운틴 픽처스', '(주)휘엔터테인먼트', '이십세기폭스코리아(주)', '(주)피터팬픽쳐스', '에스와이코마드',
       '(주)더픽쳐스', '오퍼스픽쳐스'

In [13]:
distributor_map = {
    'CJ 엔터테인먼트': 'CJE&M Movie',
    'CJ E&M 영화부문': 'CJE&M Movie',
    'CJ E&M Pictures': 'CJE&M Movie',

    '무비꼴라쥬': 'CGV무비꼴라쥬',

    '(주)마운틴픽쳐스': '마운틴픽처스',

    '(주)더픽쳐스': 'THE픽쳐스',

    '더픽쳐스/(주)마운틴픽쳐스': 'THE픽쳐스, 마운틴픽처스',

    '메가박스(주)플러스엠': '메가박스',
    '씨너스엔터테인먼트(주)': '메가박스', #메가박스와 합병

    '조이앤컨텐츠그룹': '조이앤시네마',
    '드림팩트 엔터테인먼트': '조이앤시네마',
    '(주)드림팩트엔터테인먼트': '조이앤시네마',
    '(주) 케이알씨지': '조이앤시네마',
    '스크린조이': '조이앤시네마', #다 조이앤시네마라고 보면 됨

    '콘텐츠판다': 'NEW', #계열사

    '사람과 사람들': '키노아이DMC',
    '(주)키노아이': '키노아이DMC',

    '(주)JK필름': '롯데',
    '(주)이스트스카이필름': '롯데' #해당 영화 검색 결과 배급사는 롯데임을 확인
}

#전처리 함수
train['distributor'] = train['distributor'].map(distributor_map).fillna(train['distributor'])
test['distributor'] = test['distributor'].map(distributor_map).fillna(test['distributor'])

- CJ CGV라는 배급사가 존재하므로 함수 적용 이전에 CJE&M Movie 수동 매핑 필요
- 무비꼴라쥬는 CGV무비꼴라쥬로 매핑 후 함수 적용 시 CGV로 처리

2) 함수 적용하여 동일 배급사 정리

In [14]:
def simplify_distributor(name):
    name = name.strip()

    if "CGV" in name:
        return "CGV"
    elif "쇼박스" in name:
        return "쇼박스"
    elif "싸이더스" in name:
        return "싸이더스"
    elif "NEW" in name:
        return "NEW"
    elif "리틀빅" in name:
        return "리틀빅픽처스"
    elif "시네마달" in name:
        return "시네마달"
    elif "스폰지" in name:
        return "스폰지"
    elif "롯데" in name:
        return "롯데엔터테인먼트"
    elif "SK" in name:
        return "SK"
    else:
        return name  # 변경하지 않음

train["distributor"] = train["distributor"].apply(simplify_distributor)
test["distributor"] = test["distributor"].apply(simplify_distributor)

3) 문자 간 공백 제거 및 공동 배급 나누기

In [15]:
#문자 간 공백 제거
train['distributor'] = train['distributor'].str.replace(r'\s+', '', regex=True)
test['distributor'] = test['distributor'].str.replace(r'\s+', '', regex=True)

In [16]:
#쉼표 기준으로 데이터 분리
train['distributor'] = train['distributor'].str.split(r',')
test['distributor'] = test['distributor'].str.split(r',')

#분리된 데이터를 기존 행과 합치기 위한 코드
train = train.explode('distributor').reset_index(drop=True)
test = test.explode('distributor').reset_index(drop=True)

4) (주) 제거

In [17]:
train['distributor'] = train['distributor'].str.replace(r'\(주\)', '', regex=True)
test['distributor'] = test['distributor'].str.replace(r'\(주\)', '', regex=True)

In [18]:
train['distributor'].value_counts()

Unnamed: 0_level_0,count
distributor,Unnamed: 1_level_1
CJE&MMovie,59
롯데엔터테인먼트,54
NEW,39
마운틴픽처스,33
쇼박스,28
...,...
영화사廊,1
크리에이티브컴즈,1
ysfilm,1
이달투,1


In [19]:
train['distributor'].unique()

array(['롯데엔터테인먼트', '쇼박스', 'NEW', '백두대간', '유니버설픽쳐스인터내셔널코리아', '두타연',
       '조이앤시네마', '콘텐츠윙', '키노아이DMC', '팝파트너스', 'CJE&MMovie', '영화제작전원사',
       'CGV', '리틀빅픽처스', '스폰지', '인디플러그', '인디스토리', '팝엔터테인먼트', '시네마서비스',
       '웃기씨네', '영화사진진', '레인보우팩토리', '김기덕필름', '동국대학교충무로영상제작센터',
       'BoXoo엔터테인먼트', '마운틴픽처스', '메가박스', '골든타이드픽처스', '파이오니아21', '디씨드',
       '시너지', '디마엔터테인먼트', '판다미디어', '스톰픽쳐스코리아', '예지림엔터테인먼트', '영화사조제',
       '보람엔터테인먼트', '시네마달', '노바엔터테인먼트', '패스파인더씨앤씨', '대명문화공장', '온비즈넷',
       'KT&G상상마당', '인벤트디', '씨네그루키다리이엔티', '스튜디오후크', '나이너스엔터테인먼트', 'THE픽쳐스',
       '영구아트무비', '어뮤즈', '이모션픽처스', '필라멘트픽쳐스', '타임스토리그룹', '휘엔터테인먼트',
       '이십세기폭스코리아', '피터팬픽쳐스', '에스와이코마드', '오퍼스픽쳐스', '고앤고필름', 'KT', '싸이더스',
       '프레인글로벌', '나우콘텐츠', '홀리가든', '브릿지웍스', '엣나인필름', '위더스필름', '에이원엔터테인먼트',
       'OAL(올)', '전망좋은영화사', '스토리셋', '이상우필름', '씨네굿필름', '영희야놀자', '찬란',
       '어썸피플', '아방가르드필름', '와이드릴리즈', 'tvN', '액티버스엔터테인먼트', '제나두엔터테인먼트',
       '아이필름코퍼레이션', '쟈비스미디어', '트리필름', '에스피엠', '건시네마', '키노엔터테인먼트',
       '아우라픽처스', '에이블엔터테인먼트'

-  배급사 개수가 169개 -> 137개로 줄어들음

**4.label encoding, one-hot encoding**
- distributor, genre, screening_rat, director

In [20]:
# X, y
X = train.drop(['box_off_num', 'title'], axis=1)
y = train['box_off_num']


# '기타'를 NaN으로 대체
X = X.replace('기타', np.nan)
test = test.replace('기타', np.nan)

#원핫인코딩 (drop_first로 다중공선성 줄이기)
X_encoded = pd.get_dummies(X, drop_first=True)
test = pd.get_dummies(test, drop_first=True)

In [21]:
train.head()

Unnamed: 0,title,distributor,genre,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num,release_year,release_month
0,개들의 전쟁,롯데엔터테인먼트,액션,96,청소년 관람불가,조병옥,0.0,0,91,2,23398,2012,11
1,내부자들,쇼박스,느와르,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501,2015,11
2,은밀하게 위대하게,쇼박스,액션,123,15세 관람가,장철수,220775.25,4,343,4,6959083,2013,6
3,나는 공무원이다,NEW,코미디,101,전체 관람가,구자홍,23894.0,2,20,6,217866,2012,7
4,불량남녀,쇼박스,코미디,108,15세 관람가,신근호,1.0,1,251,2,483387,2010,11


# **회귀 모델링**

**1. Linear, Ridge, Lasso, RandomForest, ElasticNet, GradientBoosting, DecisionTree, LightGBM, XGBoost을 적용하여 성능 비교**

원본과 로그 변환 **y_log = np.log1p(y)** 비교

In [22]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# === 데이터 준비 ===
X = train.drop(columns=['box_off_num', 'title'])
y = train['box_off_num']
y_log = np.log1p(y)

# 기타 → NaN 처리
X = X.replace('기타', np.nan)

# One-Hot Encoding
X = pd.get_dummies(X, drop_first=True)

# 결측값 채우기
X = X.fillna(0)

# === Train/Test 분리 (동일한 기준 유지)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
_, _, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

# === 스케일링 (선형 계열 모델용)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === 모델 정의 (추가 모델 포함) ===
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    'LightGBM': LGBMRegressor(n_estimators=100, random_state=42)
}

# === 성능 비교 ===
results = []

for name, model in models.items():
    # 원본 타겟 학습
    if name in ['Linear', 'Ridge', 'Lasso', 'ElasticNet']:
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    results.append({'Model': name + '_original', 'RMSE': rmse, 'R2': r2})

    # 로그 변환 타겟 학습
    if name in ['Linear', 'Ridge', 'Lasso', 'ElasticNet']:
        model.fit(X_train_scaled, y_train_log)
        preds_log = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train_log)
        preds_log = model.predict(X_test)

    # 너무 큰 예측값 방어 (overflow 방지)
    preds_log = np.clip(preds_log, a_min=None, a_max=20)
    preds_log_exp = np.expm1(preds_log)
    rmse_log = np.sqrt(mean_squared_error(np.expm1(y_test_log), preds_log_exp))
    r2_log = r2_score(np.expm1(y_test_log), preds_log_exp)

    results.append({'Model': name + '_log', 'RMSE': rmse_log, 'R2': r2_log})

# === 결과 정리 ===
results_df = pd.DataFrame(results).sort_values(by='RMSE')
print(results_df)

  model = cd_fast.enet_coordinate_descent(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186
                        Model          RMSE          R2
12  GradientBoosting_original  1.057452e+06    0.487627
15                XGBoost_log  1.073468e+06    0.471989
13       GradientBoos

- 리더보드 점수: 1304067.2754845512

**2. 단일 모델 모델링**

In [23]:
from sklearn.model_selection import GridSearchCV

# 공통 옵션
cv_folds = 5
scoring_metric = 'neg_root_mean_squared_error'

1) XGBoost

In [24]:
# XGBoost_log에 대한 GridSearchCV
xgb_params = {
    'n_estimators': [40,80],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}

xgb_model = XGBRegressor(random_state=42, verbose=0)
xgb_grid = GridSearchCV(
    xgb_model,
    xgb_params,
    cv=cv_folds,
    scoring=scoring_metric
)

xgb_grid.fit(X_train_scaled, y_train_log)

print("Best XGBoost Params:", xgb_grid.best_params_)
print("Best XGBoost RMSE (log target):", -xgb_grid.best_score_)

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "v

Best XGBoost Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 80}
Best XGBoost RMSE (log target): 1.907130191636605


Parameters: { "verbose" } are not used.



In [25]:
# xgboost_log 리더보드 점수
model_scores = np.sqrt(abs(xgb_grid.best_score_))

# DataFrame으로 정리
df = pd.DataFrame({
    'Model' : ['XGBoost_log'],
    'RMSE' : model_scores
})

print(df)

         Model      RMSE
0  XGBoost_log  1.380989


- 최종 RMSE: 1.8782018885471665
- 리더보드 점수: 1.370475

2) GradientBoosting

In [26]:
# GradientBoosting_original에 대한 GridSearchCV
gb_params = {
    'n_estimators': [40,80],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}

gb_model = GradientBoostingRegressor(random_state=42)
gb_grid = GridSearchCV(
    gb_model,
    gb_params,
    cv=cv_folds,
    scoring=scoring_metric
)

gb_grid.fit(X_train_scaled, y_train)

print("Best GradientBoosting Params:", gb_grid.best_params_)
print("Best GradientBoosting RMSE:", -gb_grid.best_score_)

Best GradientBoosting Params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 40}
Best GradientBoosting RMSE: 1547434.2917777342


In [27]:
# GradientBoosting_original 리더보드 점수
model_scores = np.sqrt(abs(gb_grid.best_score_))

# DataFrame으로 정리
df = pd.DataFrame({
    'Model' : ['GradientBoosting_original'],
    'RMSE' : model_scores
})

print(df)

                       Model        RMSE
0  GradientBoosting_original  1243.95912


- 최종 RMSE: 1549668.5012341165
- 리더보드 점수: 1244.85682

**3. 보팅**

1) 최적의 조합 찾기

In [28]:
import itertools
from sklearn.ensemble import VotingRegressor

voting_results = []
model_combinations = list(itertools.combinations(models.items(), 3))  # 9C3

for combo in model_combinations:
    name = "+".join([m[0] for m in combo])
    estimators = [(m[0], m[1]) for m in combo]

    # -------- 원본 타겟 --------
    vr_ori = VotingRegressor(estimators=estimators)
    vr_ori.fit(X_train, y_train)
    pred_ori = vr_ori.predict(X_test)
    rmse_ori = np.sqrt(mean_squared_error(y_test, pred_ori))
    voting_results.append({
        'Combination': name,
        'Target': 'original',
        'RMSE': rmse_ori
    })

    # -------- 로그 변환 타겟 --------
    vr_log = VotingRegressor(estimators=estimators)
    vr_log.fit(X_train, y_train_log)
    pred_log = vr_log.predict(X_test)
    pred_log_exp = np.expm1(pred_log)  # 로그 역변환
    rmse_log = np.sqrt(mean_squared_error(np.expm1(y_test_log), pred_log_exp))
    voting_results.append({
        'Combination': name,
        'Target': 'log',
        'RMSE': rmse_log
    })

# --------------------------
# 결과 확인
# --------------------------
voting_df = pd.DataFrame(voting_results)
voting_df_sorted = voting_df.sort_values(by="RMSE")
display(voting_df_sorted.head(10))  # RMSE 가장 낮은 10개 조합 보기

  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  model = cd_fast.enet_coordinate_descent(
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bin

  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  model = cd_fast.enet_coordinate_descent(
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  model = cd_fast.enet_coordinate_descent(
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  model = cd_fast.enet_coordinate_descent(
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  model = cd_fast.enet_coordinate_descent(
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  model = cd_fast.enet_coordinate_descent(
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  model = cd_fast.enet_coordinate_des

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000086 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186


  model = cd_fast.enet_coordinate_descent(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 718742.302083
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bin

Unnamed: 0,Combination,Target,RMSE
167,GradientBoosting+XGBoost+LightGBM,log,1050016.0
94,Ridge+GradientBoosting+LightGBM,original,1066453.0
86,Ridge+RandomForest+GradientBoosting,original,1068770.0
136,ElasticNet+RandomForest+GradientBoosting,original,1086299.0
144,ElasticNet+GradientBoosting+LightGBM,original,1087845.0
84,Ridge+DecisionTree+LightGBM,original,1088029.0
80,Ridge+DecisionTree+GradientBoosting,original,1088625.0
72,Ridge+ElasticNet+GradientBoosting,original,1088863.0
156,DecisionTree+GradientBoosting+LightGBM,original,1092693.0
130,ElasticNet+DecisionTree+GradientBoosting,original,1100832.0


- 로그 변환된 GradientBoosting + XGBoost + LightGBM 조합의 성능이 가장 좋음

In [29]:
from sklearn.ensemble import VotingRegressor

# Voting Regressor 구성
voting_reg = VotingRegressor(estimators=[
    ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('lgbm', LGBMRegressor(n_estimators=100, random_state=42))
])

# 학습 및 예측
voting_reg.fit(X_train, y_train_log)
y_pred = voting_reg.predict(X_test)
pred_log_exp = np.expm1(pred_log)

# RMSE 계산
rmse_log = np.sqrt(mean_squared_error(np.expm1(y_test_log), pred_log_exp))
print(f"Voting Regressor RMSE: {rmse_log:.2f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186
Voting Regressor RMSE: 1050016.36


2) 가중치 조정

In [30]:
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
from itertools import product
import numpy as np
import pandas as pd

# 모델 구성
estimators = [
    ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('lgbm', LGBMRegressor(n_estimators=100, random_state=42))
]

# 가중치 조합 (1~3까지 가능한 가중치 조합 생성)
weight_candidates = list(product([1, 2, 3], repeat=3))

results = []

for weights in weight_candidates:
    # VotingRegressor 구성
    voting = VotingRegressor(estimators=estimators, weights=weights)
    voting.fit(X_train, y_train_log)

    # 예측 및 역변환
    y_pred_log = voting.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    # RMSE 계산
    rmse = np.sqrt(mean_squared_error(np.expm1(y_test_log), y_pred))

    results.append({
        'weights': weights,
        'RMSE': rmse
    })

# 결과 정리
df_results = pd.DataFrame(results)
df_results_sorted = df_results.sort_values(by='RMSE')
print(df_results_sorted)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you

In [31]:
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# 최종 VotingRegressor 정의 (최적 가중치 사용)
final_model = VotingRegressor(
    estimators=[
        ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
        ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
        ('lgbm', LGBMRegressor(n_estimators=100, random_state=42))
    ],
    weights=[2, 3, 1]
)

# 학습
final_model.fit(X_train, y_train_log)

# 예측 (로그 스케일)
y_pred_log = final_model.predict(X_test)

# 로그 역변환
y_pred = np.expm1(y_pred_log)
y_test_true = np.expm1(y_test_log)

# RMSE 계산
final_rmse = np.sqrt(mean_squared_error(y_test_true, y_pred))
print(f"최종 Voting Regressor RMSE (log target, weights=[2,3,1]): {final_rmse:.2f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186
최종 Voting Regressor RMSE (log target, weights=[2,3,1]): 1048475.96


3) RandomizedSearchCV로 선택된 모델 튜닝 시도

In [32]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

# GradientBoostingRegressor
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 3],
    'subsample': [0.8, 1.0]
}
gb_random = RandomizedSearchCV(
    GradientBoostingRegressor(random_state=42),
    gb_params,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42
)
gb_random.fit(X_train, y_train_log)
best_gb = gb_random.best_estimator_

# XGBRegressor
xgb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 10]
}
xgb_random = RandomizedSearchCV(
    XGBRegressor(random_state=42, verbosity=0),
    xgb_params,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42
)
xgb_random.fit(X_train, y_train_log)
best_xgb = xgb_random.best_estimator_

# LGBMRegressor
lgbm_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [-1, 5, 7],
    'num_leaves': [31, 50, 70],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0, 1]
}
lgbm_random = RandomizedSearchCV(
    LGBMRegressor(random_state=42),
    lgbm_params,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42
)
lgbm_random.fit(X_train, y_train_log)
best_lgbm = lgbm_random.best_estimator_

# Voting Regressor
voting_reg = VotingRegressor(
    estimators=[
        ('gb', best_gb),
        ('xgb', best_xgb),
        ('lgbm', best_lgbm)
    ]
)
voting_reg.fit(X_train, y_train_log)

# 예측 및 평가
y_pred_log = voting_reg.predict(X_test)
y_pred = np.expm1(y_pred_log)
rmse = np.sqrt(mean_squared_error(np.expm1(y_test_log), y_pred))

print(f"\nRandomizedSearchCV로 튜닝한 Voting Regressor RMSE: {rmse:.2f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 23
[LightGBM] [Info] Start training from score 9.853186

RandomizedSearchCV로 튜닝한 Voting Regressor RMSE: 1001213.18


- 최종 RMSE: 1001213.18
- 리더보드 점수: 1396254.5633061845

**4. 스태킹(meta model=LinearRegression)**

In [33]:
#스태킹 베이스 모델 조합
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

stacking_results = []
model_combinations = list(itertools.combinations(models.items(), 3))  # 9C3

# 스케일링 (선형 모델 포함되므로 전체 스케일 적용)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for combo in model_combinations:
    name = "+".join([m[0] for m in combo])
    estimators = [(m[0], m[1]) for m in combo]

    # -------- 스태킹 회귀 (log target) --------
    stacking_model = StackingRegressor(
        estimators=estimators,
        final_estimator=LinearRegression(),
        passthrough=False,
        cv=5,
        n_jobs=-1
    )

    try:
        stacking_model.fit(X_train_scaled, y_train_log)
        pred_log = stacking_model.predict(X_test_scaled)
        pred_exp = np.expm1(pred_log)
        rmse = np.sqrt(mean_squared_error(np.expm1(y_test_log), pred_exp))

        stacking_results.append({
            'Combination': name,
            'Target': 'log',
            'RMSE': rmse
        })
    except Exception as e:
        print(f"Error with combination {name}: {e}")
        continue

# 결과 확인
stacking_df = pd.DataFrame(stacking_results).sort_values(by='RMSE')
display(stacking_df.head(10))  # RMSE 낮은 상위 10개 조합




Unnamed: 0,Combination,Target,RMSE
47,Ridge+GradientBoosting+LightGBM,log,1030327.0
19,Linear+DecisionTree+GradientBoosting,log,1056580.0
40,Ridge+DecisionTree+GradientBoosting,log,1057501.0
77,DecisionTree+GradientBoosting+XGBoost,log,1060642.0
25,Linear+GradientBoosting+XGBoost,log,1061055.0
26,Linear+GradientBoosting+LightGBM,log,1062485.0
46,Ridge+GradientBoosting+XGBoost,log,1062732.0
4,Linear+Ridge+GradientBoosting,log,1065295.0
78,DecisionTree+GradientBoosting+LightGBM,log,1067365.0
48,Ridge+XGBoost+LightGBM,log,1077375.0


In [34]:
#스태킹 모델(base_model: GradientBoosting + XGB + LGBM)_보팅 best 결과 이용
# --- 필요한 라이브러리 임포트 ---
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# === 베이스 모델 정의 ===
base_learners = [
    ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('lgbm', LGBMRegressor(n_estimators=100, random_state=42))
]

# === 메타 모델 (Linear Regression) 정의 ===
meta_learner = LinearRegression()

# === Stacking Regressor 구성 ===
stacking_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    passthrough=False,  # 원래 특성 사용 여부
    cv=5,
    n_jobs=-1
)

# === 로그 타겟으로 학습 ===
stacking_model.fit(X_train, y_train_log)

# === 예측 및 역변환 ===
y_pred_log = stacking_model.predict(X_test)
y_pred = np.expm1(y_pred_log)  # 로그 역변환
y_test_true = np.expm1(y_test_log)

# === 성능 평가 ===
rmse_stacking = np.sqrt(mean_squared_error(y_test_true, y_pred))
print(f"Stacking Regressor RMSE (log target): {rmse_stacking:.2f}")

Stacking Regressor RMSE (log target): 1051485.61


In [35]:
#스태킹 모델(base_model: Ridge + GradientBoosting + LightGBM)_스태킹 1번째 최적 조합
# --- 필요한 라이브러리 임포트 ---
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# === 베이스 모델 정의 ===
base_learners = [
    ('ridge', Ridge(alpha=1.0)),
    ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('lgbm', LGBMRegressor(n_estimators=100, random_state=42))
]

# === 메타 모델 (Linear Regression) 정의 ===
meta_learner = LinearRegression()

# === Stacking Regressor 구성 ===
stacking_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    passthrough=False,
    cv=5,
    n_jobs=-1
)

# === 학습 (로그 변환 타겟 사용) ===
# Ridge는 scaled 데이터를, 다른 모델은 비스케일 데이터를 사용했으므로 스케일 맞추기
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ridge 모델은 스케일된 데이터를 쓰기 위해 따로 wrapper 사용
# -> 여기서는 전체 모델에 스케일된 X를 넣는 방식으로 통일

stacking_model.fit(X_train_scaled, y_train_log)

# === 예측 및 평가 ===
y_pred_log = stacking_model.predict(X_test_scaled)
y_pred = np.expm1(y_pred_log)           # 로그 역변환
y_test_true = np.expm1(y_test_log)

rmse = np.sqrt(mean_squared_error(y_test_true, y_pred))
print(f"Stacking Regressor RMSE (log target, Ridge+GB+LGBM): {rmse:.2f}")

Stacking Regressor RMSE (log target, Ridge+GB+LGBM): 1030326.59




In [36]:
#스태킹 모델(base_model: Ridge + DecisionTree + GradientBoosting )_스태킹 2번째 최적 조합
# --- 라이브러리 임포트 ---
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# === 스케일링 (Ridge 포함 시 필수) ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === 베이스 모델 정의 ===
base_learners = [
    ('ridge', Ridge(alpha=1.0)),
    ('dt', DecisionTreeRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42))
]

# === 메타 모델 정의 ===
meta_learner = LinearRegression()

# === 스태킹 모델 구성 ===
stacking_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    passthrough=False,  # 원본 피처는 사용하지 않음
    cv=5,
    n_jobs=-1
)

# === 학습 ===
stacking_model.fit(X_train_scaled, y_train_log)

# === 예측 및 평가 ===
y_pred_log = stacking_model.predict(X_test_scaled)
y_pred = np.expm1(y_pred_log)  # 로그 역변환
y_test_true = np.expm1(y_test_log)

# === RMSE 계산 ===
rmse = np.sqrt(mean_squared_error(y_test_true, y_pred))
print(f"Stacking Regressor RMSE (log target, Ridge+DT+GB → Linear): {rmse:.2f}")

Stacking Regressor RMSE (log target, Ridge+DT+GB → Linear): 1057501.08


- Ridge + GradientBoosting + LightGBM(스태킹 모델 최적 조합)을 base_model로 했을 경우의 RMSE값이 가장 작으므로 성능이 제일 좋다.

- 최종 RMSE : 1029683.81
- 리더보드 점수: 5.720429651386372e+23(=28.54973997)

## **5. 스태킹(meta model=Ridge)**

1. GradientBoosting + XGB + LGBM
2. Ridge + GradientBoosting + LightGBM
3. Ridge + DecisionTree + GradientBoosting

In [37]:
# Stacking model (base_model: GradientBoosting + XGB + LGBM)
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# ------base models 설정-------------------------------------------------------
base_models = [
    ('gbr', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, verbosity=0)),
    ('lgbm', LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42,n_jobs=-1, verbosity=-1))
]

# ------meta model 설정 (Ridge)------------------------------------------------
meta_model = Ridge(alpha=1.0)

# ------stacking regressor 정의------------------------------------------------
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    passthrough=True,
    cv = 5,
    n_jobs=-1
)

# -----학습 및 예측--------------------------------------------------------------
stacking_model.fit(X_train, y_train_log)
y_pred_log = stacking_model.predict(X_test)

y_pred_log = np.maximum(y_pred_log, 0)
y_pred = np.expm1(y_pred_log)
y_test_true = np.expm1(y_test_log)

# 평가
stacking_rmse = np.sqrt(mean_squared_error(y_test_true, y_pred))
stacking_r2 = r2_score(y_test_true, y_pred)

print(f"✅ StackingRegressor RMSE: {stacking_rmse:.4f}")
print(f"✅ StackingRegressor R2 Score: {stacking_r2:.4f}")

  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


✅ StackingRegressor RMSE: 4448745.9892
✅ StackingRegressor R2 Score: -8.0686


In [38]:
# Stacking model (base_model:  Ridge + GradientBoosting + LightGBM)
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np

# ------base models 설정-------------------------------------------------------
base_models = [
    ('gbr', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('ridge', Ridge(alpha=1.0)),
    ('lgbm', LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1, verbosity=-1))
]

# ------meta model 설정 (Ridge)------------------------------------------------
meta_model = Ridge(alpha=1.0)

# ------stacking regressor 정의------------------------------------------------
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    passthrough=True,
    cv = 5,
    n_jobs=-1
)

# -----학습 및 예측--------------------------------------------------------------
stacking_model.fit(X_train, y_train_log)
y_pred_log = stacking_model.predict(X_test)
y_pred_log = np.maximum(y_pred_log, 0)  # 음수 제거
y_pred = np.expm1(y_pred_log)
y_test_true = np.expm1(y_test_log)

# 평가
stacking_rmse = np.sqrt(mean_squared_error(y_test_true, y_pred))
stacking_r2 = r2_score(y_test_true, y_pred)

print(f"✅ StackingRegressor RMSE: {stacking_rmse:.4f}")
print(f"✅ StackingRegressor R2 Score: {stacking_r2:.4f}")

✅ StackingRegressor RMSE: 3332467.0588
✅ StackingRegressor R2 Score: -4.0886


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


In [39]:
# Stacking model (base_model: Ridge + DecisionTree + GradientBoosting)
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# ------base models 설정-------------------------------------------------------
base_models = [
    ('gbr', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('ridge', Ridge(alpha=1.0)),
    ('dt', DecisionTreeRegressor(random_state=42))
]

# ------meta model 설정 (Ridge)------------------------------------------------
meta_model = Ridge(alpha=1.0)

# ------stacking regressor 정의------------------------------------------------
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    passthrough=True,
    cv = 5,
    n_jobs=-1
)

# -----학습 및 예측--------------------------------------------------------------
stacking_model.fit(X_train, y_train_log)
y_pred_log = stacking_model.predict(X_test)

# 음수 방지 후 역변환
y_pred_log = np.maximum(y_pred_log, 0)
y_pred = np.expm1(y_pred_log)
y_test_true = np.expm1(y_test_log)

# 평가
stacking_rmse = np.sqrt(mean_squared_error(y_test_true, y_pred))
stacking_r2 = r2_score(y_test_true, y_pred)

print(f"✅ StackingRegressor RMSE: {stacking_rmse:.4f}")
print(f"✅ StackingRegressor R2 Score: {stacking_r2:.4f}")

✅ StackingRegressor RMSE: 4035513.6011
✅ StackingRegressor R2 Score: -6.4621


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


### 스태킹(meta model=Ridge) 결론

- 1번 조합: GradientBoosting + XGB + LGBM

✅ StackingRegressor RMSE: 4448745.9892

✅ StackingRegressor R2 Score: -8.0686

✅ StackingRegressor MAE: 970979.0792

- 2번 조합: Ridge + GradientBoosting + LightGBM

✅ StackingRegressor RMSE: 3332467.0588

✅ StackingRegressor R2 Score: -4.0886

✅ StackingRegressor MAE: 800726.7568

- 3번 조합: Ridge + DecisionTree + GradientBoosting

✅ StackingRegressor RMSE: 4035513.6011

✅ StackingRegressor R2 Score: -6.4621

✅ StackingRegressor MAE: 844762.2163


최종 결론

- 2번 조합(Ridge + GradientBoosting + LightGBM)이 모든 지표에서 가장 우수한 성능을 보임
- 이는 Ridge가 선형적인 패턴을, GradientBoosting과 LGBM이 비선형적인 복잡한 패턴을 잡아내며 균형 잡힌 학습이 이뤄졌기 때문임
- 1번 조합은 모두 ensemble tree기반으로, feature 간 중복 학습이나 과적합 가능성이 매우 높아 예측 성능이 매우 떨어짐.
- **리더보드 점수**: 	2086069.7326255299