In [1]:
# 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 데이터 로드
train = pd.read_csv(r'E:\2026_1\캐글 공모전\playground-series-s6e1\train.csv')
test = pd.read_csv(r'E:\2026_1\캐글 공모전\playground-series-s6e1\test.csv')
submission = pd.read_csv(r'E:\2026_1\캐글 공모전\playground-series-s6e1\sample_submission.csv')

print(f"Train: {train.shape}, Test: {test.shape}")

Train: (630000, 13), Test: (270000, 12)


In [3]:
# 파생변수 생성 함수
def create_features(df):
    df = df.copy()
    
    # 범주형 인코딩
    sleep_map = {'poor': 1, 'average': 2, 'good': 3}
    facility_map = {'low': 1, 'medium': 2, 'high': 3}
    difficulty_map = {'easy': 1, 'moderate': 2, 'hard': 3}
    
    df['sleep_quality_num'] = df['sleep_quality'].map(sleep_map)
    df['facility_num'] = df['facility_rating'].map(facility_map)
    df['difficulty_num'] = df['exam_difficulty'].map(difficulty_map)
    
    # 조건부 상호작용 (핵심)
    df['study_quality_adj'] = df['study_hours'] * (df['sleep_quality_num'] / 3)
    df['study_facility_adj'] = df['study_hours'] * (df['facility_num'] / 3)
    df['attendance_sleep_synergy'] = (df['class_attendance'] / 100) * df['sleep_quality_num']
    
    # 효율성 지표
    df['waking_study_ratio'] = df['study_hours'] / (24 - df['sleep_hours'])
    df['study_sleep_ratio'] = df['study_hours'] / (df['sleep_hours'] + 0.1)
    df['total_investment'] = df['study_hours'] + df['sleep_hours']
    df['prep_vs_difficulty'] = df['study_hours'] / (df['difficulty_num'] + 0.5)
    
    # 비선형 변환
    df['study_sqrt'] = np.sqrt(df['study_hours'])
    df['study_log'] = np.log1p(df['study_hours'])
    df['study_sq'] = df['study_hours'] ** 2
    df['sleep_sq'] = df['sleep_hours'] ** 2
    
    # 최적 구간 거리
    df['study_optimal_dist'] = abs(df['study_hours'] - 5)
    df['sleep_deficit'] = np.maximum(0, 7 - df['sleep_hours'])
    df['sleep_excess'] = np.maximum(0, df['sleep_hours'] - 8)
    
    # 학생 프로파일
    df['hardworking_type'] = ((df['study_hours'] > 5) & (df['class_attendance'] > 85)).astype(int)
    df['cramming_type'] = ((df['study_hours'] > 6) & (df['sleep_hours'] < 6)).astype(int)
    
    # 출석 임계점
    df['attendance_low'] = (df['class_attendance'] < 70).astype(int)
    df['attendance_high'] = (df['class_attendance'] >= 90).astype(int)
    
    return df

In [4]:
# 파생변수 적용
train = create_features(train)
test = create_features(test)

print(f"파생변수 생성 후: {train.shape[1]}개 컬럼")

파생변수 생성 후: 34개 컬럼


In [5]:
# 범주형 변수 라벨 인코딩
cat_cols = ['gender', 'course', 'sleep_quality', 'study_method', 
            'facility_rating', 'exam_difficulty', 'internet_access']

label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

print("라벨 인코딩 완료")

라벨 인코딩 완료


In [6]:
# 피처/타겟 분리
drop_cols = ['id', 'exam_score']
feature_cols = [col for col in train.columns if col not in drop_cols]

X = train[feature_cols]
y = train['exam_score']
X_test = test[feature_cols]

print(f"학습 피처: {X.shape[1]}개")

학습 피처: 32개


In [7]:
# LightGBM 모델 정의
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'min_child_samples': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbose': -1,
    'n_jobs': -1,
    'random_state': 42
}

In [8]:
# K-Fold 교차검증 + 예측
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
scores = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)
    
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    oof_preds[valid_idx] = model.predict(X_valid)
    test_preds += model.predict(X_test) / n_splits
    
    fold_rmse = np.sqrt(np.mean((y_valid - oof_preds[valid_idx])**2))
    scores.append(fold_rmse)
    print(f"Fold {fold+1}: RMSE = {fold_rmse:.4f}")

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[996]	valid_0's rmse: 8.74094
Fold 1: RMSE = 8.7409
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 8.74415
Fold 2: RMSE = 8.7442
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 8.73242
Fold 3: RMSE = 8.7324
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's rmse: 8.75579
Fold 4: RMSE = 8.7558
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 8.7764
Fold 5: RMSE = 8.7764


In [12]:
# 성능 평가
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

cv_rmse = np.sqrt(mean_squared_error(y, oof_preds))
cv_mae = mean_absolute_error(y, oof_preds)
cv_r2 = r2_score(y, oof_preds)

print(f"CV RMSE: {cv_rmse:.4f}")
print(f"CV MAE:  {cv_mae:.4f}")
print(f"CV R2:   {cv_r2:.4f}")
print(f"\n평균 Fold RMSE: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

CV RMSE: 8.7500
CV MAE:  6.9768
CV R2:   0.7860

평균 Fold RMSE: 8.7499 (+/- 0.0152)


In [10]:
# Feature Importance 확인
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print("Top 15 Features:")
print(importance_df.head(15).to_string(index=False))

Top 15 Features:
                 feature   importance
      waking_study_ratio 6.540295e+08
      study_facility_adj 1.042889e+08
        class_attendance 8.069513e+07
attendance_sleep_synergy 7.980128e+07
             study_hours 7.221888e+07
       study_quality_adj 4.847250e+07
            study_method 4.810909e+07
               study_log 3.251376e+07
        total_investment 2.375912e+07
          attendance_low 8.241738e+06
            facility_num 3.658959e+06
         facility_rating 3.189797e+06
              study_sqrt 2.871040e+06
        hardworking_type 2.786668e+06
      study_optimal_dist 2.602120e+06


In [11]:
# 제출 파일 생성
submission['exam_score'] = test_preds
submission.to_csv('submission.csv', index=False)

print("submission.csv 저장 완료")
submission.head()

submission.csv 저장 완료


Unnamed: 0,id,exam_score
0,630000,71.4501
1,630001,69.510845
2,630002,88.534412
3,630003,55.957182
4,630004,47.17569
