In [15]:
import pandas as pd
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.cluster import KMeans
import warnings

# Silence every warning category for the remainder of the run.
warnings.filterwarnings('ignore')

# 1. Load the data: training set, test set, and the submission template,
#    read in that order from the current working directory.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# --- [Preprocessing and feature engineering] ---

# 1. Missing values and outliers
# Every missing cell is replaced by the literal string 'NaN'; training rows
# whose family_size exceeds 7 are discarded and the index is rebuilt.
train = train.fillna('NaN')
test = test.fillna('NaN')
train = train.loc[train['family_size'] <= 7].reset_index(drop=True)

# 2. Drop columns that are not used as features
unused_cols = ['index', 'FLAG_MOBIL']
train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)

# 3. DAYS_EMPLOYED handling: positive values are replaced by 0, after which
#    the day/month count columns are converted to non-negative magnitudes.
feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
for frame in (train, test):
    # clip(upper=0) keeps values <= 0 and maps anything above 0 to 0.
    frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'].clip(upper=0)
    for feat in feats:
        frame[feat] = frame[feat].abs()

# 4. Derived features
# Columns whose string forms are joined with '_' to build the per-row ID key.
id_parts = [
    'gender', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'work_phone',
    'phone', 'email', 'family_size', 'income_type', 'edu_type',
    'family_type', 'house_type', 'occyp_type',
]
for df in (train, test):
    birth = df['DAYS_BIRTH']
    employed = df['DAYS_EMPLOYED']
    income = df['income_total']

    # Days lived before the current employment started, and income per such day.
    days_before_job = birth - employed
    df['before_EMPLOYED'] = days_before_job
    df['income_total_befofeEMP_ratio'] = income / days_before_job

    # Whole years plus month-of-year / week-of-month style remainders of
    # the day counts (30-day months, 7-day weeks).
    df['Age'] = birth // 365
    df['DAYS_BIRTH_m'] = np.floor(birth / 30) % 12
    df['DAYS_BIRTH_w'] = np.floor(birth / 7) % 4
    df['EMPLOYED'] = employed // 365
    df['DAYS_EMPLOYED_m'] = np.floor(employed / 30) % 12
    df['DAYS_EMPLOYED_w'] = np.floor(employed / 7) % 4

    # Income relative to (age + employment length) and per family member.
    df['ability'] = income / (birth + employed)
    df['income_mean'] = income / df['family_size']

    # [Key step] Build the ID: rows with identical values in all id_parts
    # columns receive the same key.
    as_text = [df[col].astype(str) for col in id_parts]
    df['ID'] = as_text[0].str.cat(as_text[1:], sep='_')

# 5. Multicollinearity removal and encoding
# child_num is dropped (author's note: highly correlated with family_size).
train = train.drop(columns=['child_num'])
test = test.drop(columns=['child_num'])

categorical_feats = [
    'gender', 'car', 'reality', 'income_type', 'edu_type',
    'family_type', 'house_type', 'occyp_type', 'ID',
]
# The encoder learns its categories from train only; any value that appears
# only in test is encoded as -1.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(train[categorical_feats])
for frame in (train, test):
    frame[categorical_feats] = encoder.transform(frame[categorical_feats])
    # The encoder emits floats; ID is stored as an integer code.
    frame['ID'] = frame['ID'].astype('int64')

# 6. Log scaling and clustering
# np.log1p(x) already evaluates log(1 + x), so the raw column is passed
# directly; wrapping it as log1p(1 + x) would add the offset twice and
# yield log(2 + x) instead.
train['income_total'] = np.log1p(train['income_total'])
test['income_total'] = np.log1p(test['income_total'])

# Fit 36 clusters on every training column except the target, then attach the
# assigned cluster index to both frames as a new feature.
# NOTE(review): clustering runs before the standardization step, so columns
# with large magnitudes dominate the distances — confirm this is intended.
kmeans_train = train.drop(['credit'], axis=1)
kmeans = KMeans(n_clusters=36, random_state=42).fit(kmeans_train)
train['cluster'] = kmeans.predict(kmeans_train)
# Select the test columns by the training labels so the feature set and order
# match what the model was fitted on, instead of relying on the two frames
# happening to share the same column layout.
test['cluster'] = kmeans.predict(test[kmeans_train.columns])

# 7. Standardization
# Every column that is not categorical, not the target, and not
# income_total / cluster is standardized using statistics fitted on train only.
not_scaled = set(categorical_feats) | {'credit', 'income_total', 'cluster'}
numerical_feats = [col for col in train.columns if col not in not_scaled]
scaler = StandardScaler().fit(train[numerical_feats])
train[numerical_feats] = scaler.transform(train[numerical_feats])
test[numerical_feats] = scaler.transform(test[numerical_feats])

# --- [CatBoost model training] ---
# Grouped, stratified K-fold training: each fold fits a fresh CatBoost model,
# stores its out-of-fold probabilities, and contributes an equal share to the
# averaged test prediction that is written to the submission file.

X = train.drop(['credit'], axis=1)
y = train['credit']
groups = train['ID']  # rows sharing an ID are kept in the same fold
X_test = test.copy()

n_splits = 10
# Derive the probability width from the target instead of hard-coding it.
n_classes = y.nunique()
sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)

cat_oof = np.zeros((X.shape[0], n_classes))
cat_test = np.zeros((X_test.shape[0], n_classes))

# The fold count in the messages follows n_splits so they cannot drift apart.
print(f"CatBoost {n_splits}-Fold 학습을 시작합니다...")

for fold, (train_idx, val_idx) in enumerate(sgkf.split(X, y, groups=groups)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Conservative, near-default parameters.
    model = CatBoostClassifier(
        iterations=3000,  # generous cap; early stopping picks the real length
        learning_rate=0.01,
        loss_function='MultiClass',
        random_seed=42,
        early_stopping_rounds=200,
        verbose=500,  # log every 500 iterations
    )

    # NOTE(review): the validation fold drives early stopping and is also the
    # fold scored below, so the reported CV loss may be slightly optimistic.
    model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

    cat_oof[val_idx] = model.predict_proba(X_val)
    # Equal-weight average of the per-fold test predictions.
    cat_test += model.predict_proba(X_test) / n_splits
    print(f"Fold {fold+1} 완료")

# Report the out-of-fold multi-class log loss.
print(f"\n[CatBoost {n_splits}-Fold CV Score]: {log_loss(y, cat_oof):.4f}")

# Write the submission file.
# Assign by column label so the probability columns are replaced outright;
# positional `.iloc` assignment writes into the existing columns and depends
# on in-place upcasting if the template columns are integer-typed.
submission[submission.columns[1:]] = cat_test
submission.to_csv('catboost_10fold_final.csv', index=False)
print("제출 파일이 생성되었습니다: catboost_10fold_final.csv")

CatBoost 10-Fold 학습을 시작합니다...
0:	learn: 1.0926989	test: 1.0930168	best: 1.0930168 (0)	total: 23.1ms	remaining: 1m 9s
500:	learn: 0.7823925	test: 0.8322591	best: 0.8321889 (372)	total: 6.78s	remaining: 33.8s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8321888639
bestIteration = 372

Shrink model to first 373 iterations.
Fold 1 완료
0:	learn: 1.0928205	test: 1.0927078	best: 1.0927078 (0)	total: 22.8ms	remaining: 1m 8s
500:	learn: 0.7869116	test: 0.7957377	best: 0.7956993 (493)	total: 6.69s	remaining: 33.4s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7953621402
bestIteration = 597

Shrink model to first 598 iterations.
Fold 2 완료
0:	learn: 1.0927811	test: 1.0928872	best: 1.0928872 (0)	total: 14.5ms	remaining: 43.4s
500:	learn: 0.7848243	test: 0.8206176	best: 0.8196094 (324)	total: 8.13s	remaining: 40.6s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8196093565
bestIteration = 324

Shrink model to first 325 iterations.
Fo

KeyboardInterrupt: 