In [2]:
# 라이브러리 임포트
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import koreanize_matplotlib
koreanize_matplotlib.koreanize()

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the competition files (train, test and the sample submission).
# DATA_DIR is the single place to edit when the files live somewhere else;
# the three resulting paths are the same as the previously hard-coded ones.
from pathlib import Path

DATA_DIR = Path(r'E:\2026_1\캐글 공모전\playground-series-s6e1')

train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')

print(f"Train: {train.shape}, Test: {test.shape}")

Train: (630000, 13), Test: (270000, 12)


## 전략: six_try 기반 추가 개선

six_try 최고 성능: CV RMSE 8.73352 (LGB, num_leaves=60, min_child_samples=35, Top 15 피처)

이번에 시도할 것들:
1. 전체 피처 사용 + 최적 파라미터
2. XGBoost 추가
3. LightGBM + XGBoost 스태킹/블렌딩
4. 잔차 분석 후 타깃별 보정

In [4]:
# 파생변수 생성 (six_try와 동일)
def create_features(df):
    df = df.copy()
    
    # 범주형 인코딩
    sleep_map = {'poor': 1, 'average': 2, 'good': 3}
    facility_map = {'low': 1, 'medium': 2, 'high': 3}
    difficulty_map = {'easy': 1, 'moderate': 2, 'hard': 3}
    
    df['sleep_quality_num'] = df['sleep_quality'].map(sleep_map)
    df['facility_num'] = df['facility_rating'].map(facility_map)
    df['difficulty_num'] = df['exam_difficulty'].map(difficulty_map)
    
    # 조건부 상호작용
    df['study_quality_adj'] = df['study_hours'] * (df['sleep_quality_num'] / 3)
    df['study_facility_adj'] = df['study_hours'] * (df['facility_num'] / 3)
    df['attendance_sleep_synergy'] = (df['class_attendance'] / 100) * df['sleep_quality_num']
    
    # 효율성 지표
    df['waking_study_ratio'] = df['study_hours'] / (24 - df['sleep_hours'])
    df['study_sleep_ratio'] = df['study_hours'] / (df['sleep_hours'] + 0.1)
    df['total_investment'] = df['study_hours'] + df['sleep_hours']
    df['prep_vs_difficulty'] = df['study_hours'] / (df['difficulty_num'] + 0.5)
    
    # 비선형 변환
    df['study_sqrt'] = np.sqrt(df['study_hours'])
    df['study_log'] = np.log1p(df['study_hours'])
    df['study_sq'] = df['study_hours'] ** 2
    df['sleep_sq'] = df['sleep_hours'] ** 2
    
    # 최적 구간 거리
    df['study_optimal_dist'] = abs(df['study_hours'] - 5)
    df['sleep_deficit'] = np.maximum(0, 7 - df['sleep_hours'])
    df['sleep_excess'] = np.maximum(0, df['sleep_hours'] - 8)
    
    # 학생 프로파일
    df['hardworking_type'] = ((df['study_hours'] > 5) & (df['class_attendance'] > 85)).astype(int)
    df['cramming_type'] = ((df['study_hours'] > 6) & (df['sleep_hours'] < 6)).astype(int)
    
    # 출석 임계점
    df['attendance_low'] = (df['class_attendance'] < 70).astype(int)
    df['attendance_high'] = (df['class_attendance'] >= 90).astype(int)
    
    return df

# Run the same feature engineering on both splits, then report the column count.
train, test = create_features(train), create_features(test)
n_columns = train.shape[1]
print(f"파생변수 생성 완료: {n_columns}개 컬럼")

파생변수 생성 완료: 34개 컬럼


In [5]:
# Label-encode the raw categorical columns.
# Each encoder is fitted on the training column only and then applied to both
# train and test, so the two frames share one label -> integer code mapping.
from sklearn.preprocessing import LabelEncoder

cat_cols = [
    'gender', 'course', 'sleep_quality', 'study_method',
    'facility_rating', 'exam_difficulty', 'internet_access',
]

# Fit every encoder first (kept in a dict so the mapping can be inspected later) ...
label_encoders = {name: LabelEncoder().fit(train[name]) for name in cat_cols}

# ... then overwrite the string columns with their integer codes.
for col, le in label_encoders.items():
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

print("라벨 인코딩 완료")

라벨 인코딩 완료


In [6]:
# Separate the model inputs from the target.
# 'id' and 'exam_score' are excluded; every other training column is a feature,
# kept in its original column order.
drop_cols = ['id', 'exam_score']
feature_cols = train.columns.drop(drop_cols).tolist()

y = train['exam_score']
X = train[feature_cols]
X_test = test[feature_cols]

# Top 15 features chosen in the earlier six_try notebook (listed by hand).
selected_features_15 = [
    'waking_study_ratio',
    'study_facility_adj',
    'class_attendance',
    'attendance_sleep_synergy',
    'study_hours',
    'study_quality_adj',
    'study_method',
    'study_log',
    'total_investment',
    'attendance_low',
    'facility_num',
    'facility_rating',
    'study_sqrt',
    'hardworking_type',
    'study_optimal_dist',
]

n_all, n_selected = len(feature_cols), len(selected_features_15)
print(f"전체 피처: {n_all}개")
print(f"선택 피처: {n_selected}개")

전체 피처: 32개
선택 피처: 15개


## 1. LightGBM Baseline (six_try 최적 파라미터)

In [7]:
# Train LightGBM on the Top 15 features with the best parameters from six_try.
# Produces lgb_model, lgb_pred_valid, lgb_pred_test and lgb_rmse, which the
# blending, comparison, submission and plotting cells below rely on.
import lightgbm as lgb
from sklearn.model_selection import train_test_split
# NOTE(review): mean_absolute_error and r2_score are not used in the visible cells.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 80/20 hold-out split with a fixed seed. X_train / X_valid / y_train / y_valid
# are reused by the XGBoost cell so both models see the same rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X[selected_features_15], y, test_size=0.2, random_state=42
)

# Best parameters found in six_try (num_leaves=60, min_child_samples=35).
# The same dict is reused for the full-feature comparison model.
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 60,
    'max_depth': -1,
    'min_child_samples': 35,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42
}

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

# Up to 2000 boosting rounds; stop once the validation RMSE has not improved
# for 100 rounds, and log the metric every 100 rounds.
lgb_model = lgb.train(
    lgb_params,
    train_data,
    num_boost_round=2000,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)]
)

# In the recorded run the RMSE computed from these predictions equals the
# best-iteration validation score, i.e. predict() scored with the best iteration.
lgb_pred_valid = lgb_model.predict(X_valid)
lgb_pred_test = lgb_model.predict(X_test[selected_features_15])

# NOTE(review): the same hold-out set drives early stopping and this score,
# so the reported RMSE may be slightly optimistic.
lgb_rmse = np.sqrt(mean_squared_error(y_valid, lgb_pred_valid))
print(f"\nLightGBM Valid RMSE: {lgb_rmse:.5f}")

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 8.82564
[200]	valid_0's rmse: 8.77343
[300]	valid_0's rmse: 8.7541
[400]	valid_0's rmse: 8.7448
[500]	valid_0's rmse: 8.73769
[600]	valid_0's rmse: 8.7319
[700]	valid_0's rmse: 8.72693
[800]	valid_0's rmse: 8.7241
[900]	valid_0's rmse: 8.72189
[1000]	valid_0's rmse: 8.72023
[1100]	valid_0's rmse: 8.71967
[1200]	valid_0's rmse: 8.71917
[1300]	valid_0's rmse: 8.71815
[1400]	valid_0's rmse: 8.71808
Early stopping, best iteration is:
[1388]	valid_0's rmse: 8.71784

LightGBM Valid RMSE: 8.71784


## 2. XGBoost 추가

In [8]:
# Train XGBoost on the same Top 15 split that the LightGBM cell created.
# Produces xgb_model, xgb_pred_valid, xgb_pred_test and xgb_rmse for the
# blending, comparison, submission and plotting cells below.
# NOTE: the recorded run of this cell failed with ModuleNotFoundError, so the
# `xgboost` package must be installed in the kernel environment before this
# cell (and everything after it) can run.
import xgboost as xgb

xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.05,
    'max_depth': 8,
    'min_child_weight': 35,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1
}

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest = xgb.DMatrix(X_test[selected_features_15])

# Up to 2000 rounds; stop when the validation RMSE has not improved for 100 rounds.
xgb_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=2000,
    evals=[(dvalid, 'valid')],
    early_stopping_rounds=100,
    verbose_eval=100
)

# The native Booster.predict is documented to use the full model unless
# iteration_range is given, which would include the rounds trained after the
# best one. Restrict both predictions to the best iteration so the score and
# the test predictions are comparable with the LightGBM model.
xgb_best_range = (0, xgb_model.best_iteration + 1)
xgb_pred_valid = xgb_model.predict(dvalid, iteration_range=xgb_best_range)
xgb_pred_test = xgb_model.predict(dtest, iteration_range=xgb_best_range)

xgb_rmse = np.sqrt(mean_squared_error(y_valid, xgb_pred_valid))
print(f"\nXGBoost Valid RMSE: {xgb_rmse:.5f}")

ModuleNotFoundError: No module named 'xgboost'

## 3. LightGBM + XGBoost 블렌딩

In [None]:
# Grid-search the LightGBM weight of the blend (0.0 to 1.0 in steps of 0.1);
# XGBoost always receives the complementary weight.
print("블렌딩 비율 탐색 (LightGBM : XGBoost)")
print("="*50)

best_ratio, best_blend_rmse = 0, float('inf')

for lgb_ratio in np.arange(0, 1.05, 0.1):
    xgb_ratio = 1 - lgb_ratio
    blend_pred = lgb_ratio * lgb_pred_valid + xgb_ratio * xgb_pred_valid
    blend_rmse = np.sqrt(mean_squared_error(y_valid, blend_pred))
    print(f"  LGB {lgb_ratio:.1f} : XGB {xgb_ratio:.1f} -> RMSE: {blend_rmse:.5f}")

    # Strict comparison: on a tie the earlier (smaller) LightGBM weight is kept.
    if blend_rmse < best_blend_rmse:
        best_ratio, best_blend_rmse = lgb_ratio, blend_rmse

print(f"\n최적 비율: LGB {best_ratio:.1f} : XGB {1-best_ratio:.1f}")
print(f"최적 RMSE: {best_blend_rmse:.5f}")

## 4. 전체 피처 사용 비교

In [None]:
# Train LightGBM on every feature column for comparison with the Top 15 model.
# Same test_size and random_state as the Top 15 split, so both models should be
# validated on the same rows (TODO confirm).
X_train_full, X_valid_full, y_train_full, y_valid_full = train_test_split(
    X, y, test_size=0.2, random_state=42
)

train_data_full = lgb.Dataset(X_train_full, label=y_train_full)
valid_data_full = lgb.Dataset(X_valid_full, label=y_valid_full, reference=train_data_full)

# Reuses lgb_params unchanged: up to 2000 rounds, early stopping after 100
# rounds without validation improvement, logging every 100 rounds.
lgb_model_full = lgb.train(
    lgb_params,
    train_data_full,
    num_boost_round=2000,
    valid_sets=[valid_data_full],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)]
)

lgb_pred_valid_full = lgb_model_full.predict(X_valid_full)
# X_test was built from the same feature_cols list as X, so the columns line up.
lgb_pred_test_full = lgb_model_full.predict(X_test)

# Print both scores side by side; lgb_rmse comes from the Top 15 training cell.
lgb_rmse_full = np.sqrt(mean_squared_error(y_valid_full, lgb_pred_valid_full))
print(f"\nLightGBM (전체 피처) Valid RMSE: {lgb_rmse_full:.5f}")
print(f"LightGBM (Top 15) Valid RMSE: {lgb_rmse:.5f}")

## 5. 결과 비교 및 제출 파일 생성

In [None]:
# Compare the validation RMSE of every candidate model, best (lowest) first.
print("="*60)
print("모델 성능 비교")
print("="*60)

model_labels = [
    "LightGBM (Top 15)",
    "XGBoost (Top 15)",
    f"블렌딩 (LGB {best_ratio:.1f}:XGB {1-best_ratio:.1f})",
    "LightGBM (전체 피처)",
]
model_scores = [lgb_rmse, xgb_rmse, best_blend_rmse, lgb_rmse_full]

results = [
    {"모델": label, "Valid RMSE": score}
    for label, score in zip(model_labels, model_scores)
]

results_df = pd.DataFrame(results).sort_values("Valid RMSE")
print(results_df.to_string(index=False))

# The first row after sorting is the model with the lowest validation RMSE.
best_row = results_df.iloc[0]
best_model_name = best_row['모델']
best_model_rmse = best_row['Valid RMSE']
print(f"\n최적 모델: {best_model_name} (RMSE: {best_model_rmse:.5f})")

In [None]:
# Build the submission from the best LightGBM/XGBoost blend.
# NOTE(review): the blend is always submitted, even when the comparison table
# ranks the full-feature LightGBM model first; confirm this is intended.
xgb_weight = 1 - best_ratio
final_pred = best_ratio * lgb_pred_test + xgb_weight * xgb_pred_test

# Assumes the rows of `submission` are in the same order as `test` (TODO confirm).
submission['exam_score'] = final_pred

out_path = r'E:\2026_1\캐글 공모전\영현\sbmission_result\submission_seven_lgb_xgb_blend.csv'
submission.to_csv(out_path, index=False, encoding='utf-8-sig')
print(f"제출 파일 저장: {out_path}")

# Quick sanity check on the distribution of the submitted predictions.
print(f"\n예측값 통계:")
print(submission['exam_score'].describe())

In [None]:
# Visualization: validation RMSE per model (left) and the distribution of the
# test predictions of each model (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: horizontal RMSE bars, in the sorted order of results_df.
model_names = results_df['모델'].tolist()
rmse_values = results_df['Valid RMSE'].tolist()
colors = ['#2ecc71', '#3498db', '#e74c3c', '#9b59b6'][:len(model_names)]

bars = ax1.barh(model_names, rmse_values, color=colors)
ax1.set_xlabel('Valid RMSE')
ax1.set_title('모델별 Valid RMSE 비교')

# Zoom the x axis to +/-0.5% around the observed range so the small
# differences between the models stay visible.
lowest_rmse, highest_rmse = min(rmse_values), max(rmse_values)
ax1.set_xlim(lowest_rmse * 0.995, highest_rmse * 1.005)

# Write the exact RMSE next to the end of every bar.
for bar, val in zip(bars, rmse_values):
    bar_center = bar.get_y() + bar.get_height()/2
    ax1.text(val + 0.001, bar_center, f'{val:.5f}', va='center', fontsize=10)

# Right panel: overlaid histograms of the test predictions.
prediction_sets = [
    (lgb_pred_test, 'LightGBM', '#3498db'),
    (xgb_pred_test, 'XGBoost', '#e74c3c'),
    (final_pred, 'Blend', '#2ecc71'),
]
for preds, series_label, series_color in prediction_sets:
    ax2.hist(preds, bins=50, alpha=0.5, label=series_label, color=series_color)

ax2.set_xlabel('Predicted exam_score')
ax2.set_ylabel('Count')
ax2.set_title('Test 예측값 분포')
ax2.legend()

plt.tight_layout()
plt.show()