In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from catboost import CatBoostRegressor
from sklearn.feature_selection import SelectKBest, f_regression

# ✅ 데이터 로드
df = pd.read_csv("winequality-red.csv")

# ✅ 컬럼명 정리 (공백 제거)
df.columns = df.columns.str.replace(' ', '_')

# ✅ 로그 변환 (NaN 방지)
log_features = ['alcohol', 'volatile_acidity', 'sulphates', 'chlorides', 'residual_sugar']
for feature in log_features:
    df[feature] = np.where(df[feature] > 0, np.log1p(df[feature]), 0)

# ✅ 새로운 Feature 생성 (변수 조합)
df['alcohol_sulphates'] = df['alcohol'] * df['sulphates']
df['volatile_citric'] = df['volatile_acidity'] * df['citric_acid']

# ✅ 나누기 연산 방지 (0으로 나누는 것 방지)
df['fixed_density'] = np.where(df['density'] > 0, df['fixed_acidity'] / df['density'], 0)
df['chloride_sulfur_ratio'] = np.where(df['total_sulfur_dioxide'] > 0, df['chlorides'] / df['total_sulfur_dioxide'], 0)

# ✅ 제곱 및 제곱근 변환 (NaN 방지)
df['sulphates_sq'] = df['sulphates'] ** 2
df['sqrt_total_sulfur'] = np.sqrt(np.abs(df['total_sulfur_dioxide']))

# ✅ Inf 및 NaN 처리
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)  # NaN이 포함된 행 제거

In [2]:
# ✅ Feature Selection (상위 10개 변수 선택)
k_best = SelectKBest(score_func=f_regression, k=10)
X_selected = k_best.fit_transform(df.drop(columns=['quality']), df['quality'])

# ✅ 선택된 Feature 출력
selected_features = df.drop(columns=['quality']).columns[k_best.get_support()]
print(f"📌 선택된 Feature: {selected_features}")

# ✅ Feature Scaling (표준화)
scaler = StandardScaler()
df[selected_features] = scaler.fit_transform(df[selected_features])

📌 선택된 Feature: Index(['volatile_acidity', 'citric_acid', 'chlorides', 'total_sulfur_dioxide',
       'density', 'sulphates', 'alcohol', 'alcohol_sulphates', 'sulphates_sq',
       'sqrt_total_sulfur'],
      dtype='object')


In [3]:
X = df[selected_features]
y = df["quality"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# ✅ 기본 CatBoost 모델 학습
catboost_model = CatBoostRegressor(
    iterations=500, 
    learning_rate=0.05, 
    depth=10, 
    random_state=42, 
    verbose=0
)
catboost_model.fit(X_train, y_train)

# ✅ 모델 평가
y_pred_cat = catboost_model.predict(X_test)
cat_mse = mean_squared_error(y_test, y_pred_cat)
cat_r2 = r2_score(y_test, y_pred_cat)

print(f"🔹 기본 CatBoost 모델 MSE: {cat_mse:.4f}, R²: {cat_r2:.4f}")

🔹 기본 CatBoost 모델 MSE: 0.3181, R²: 0.5133


In [5]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

# ✅ Base 모델 조합
base_models = [
    ('ridge', Ridge(alpha=1.0)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=42)),
    ('lgbm', LGBMRegressor(n_estimators=100, learning_rate=0.05, random_state=42))
]

# ✅ Meta-learner 적용
stacking_model = StackingRegressor(estimators=base_models, final_estimator=Ridge(alpha=1.0))

# ✅ 학습
stacking_model.fit(X_train, y_train)

# ✅ 예측 및 평가
y_pred_stacking = stacking_model.predict(X_test)
stacking_mse = mean_squared_error(y_test, y_pred_stacking)
stacking_r2 = r2_score(y_test, y_pred_stacking)

print(f"🔹 개선된 Stacking 적용 후 MSE: {stacking_mse:.4f}, R²: {stacking_r2:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1210
[LightGBM] [Info] Number of data points in the train set: 1279, number of used features: 10
[LightGBM] [Info] Start training from score 5.623925
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029690 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1157
[LightGBM] [Info] Number of data points in the train set: 1023, number of used features: 10
[LightGBM] [Info] Start training from score 5.627566
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1159
[LightGBM] [Info] Number of data points in the train set: 1023, number of used features: 10
[LightGBM] [Info] Start traini

In [6]:
print("\n📌 최종 모델 성능 비교:")
print(f"🔹 기본 CatBoost - MSE: {cat_mse:.4f}, R²: {cat_r2:.4f}")
print(f"🔹 개선된 Stacking - MSE: {stacking_mse:.4f}, R²: {stacking_r2:.4f}")


📌 최종 모델 성능 비교:
🔹 기본 CatBoost - MSE: 0.3181, R²: 0.5133
🔹 개선된 Stacking - MSE: 0.3167, R²: 0.5154
