In [2]:
# ▶ 1. 필수 라이브러리
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

In [3]:
# ▶ 2. Load data
# Read the source CSV into a DataFrame; the 'utf-8-sig' codec drops a leading
# UTF-8 byte-order mark if the file starts with one.
df = pd.read_csv('병합/New_unique_ID_Seg.csv', encoding='utf-8-sig')

In [9]:
# Summarise the loaded frame: index range, column names and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400000 entries, 0 to 2399999
Data columns (total 45 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   ID                  object
 1   _2순위카드이용금액          int64 
 2   이용금액_오프라인_B0M       int64 
 3   정상입금원금_B5M          int64 
 4   이용금액대               int64 
 5   이용금액_오프라인_R3M       int64 
 6   이용금액_오프라인_R6M       int64 
 7   정상입금원금_B2M          int64 
 8   _3순위업종_이용금액         int64 
 9   _2순위업종_이용금액         int64 
 10  _2순위쇼핑업종_이용금액       int64 
 11  최대이용금액_일시불_R12M     int64 
 12  _1순위업종_이용금액         int64 
 13  _3순위쇼핑업종_이용금액       int64 
 14  쇼핑_도소매_이용금액         int64 
 15  이용건수_오프라인_R6M       int64 
 16  _1순위교통업종_이용금액       int64 
 17  연체입금원금_B0M          int64 
 18  청구금액_R6M            int64 
 19  청구금액_B0             int64 
 20  평잔_일시불_6M           int64 
 21  월중평잔_일시불            int64 
 22  잔액_일시불_B0M          int64 
 23  입회일자_신용             int64 
 24  최종카드발급경과월           int64 
 25  _1순위카드이용건수        

In [10]:
# Count how many rows fall into each category of the '_2순위신용체크구분' column
df['_2순위신용체크구분'].value_counts()

_2순위신용체크구분
기타    958115
신용    873447
체크    568438
Name: count, dtype: int64

In [4]:
# ▶ 3. Build y_vif (Segment → integer labels)
# The fitted encoder `le` is kept (and pickled further down) so the integer
# codes can be mapped back to the original Segment labels.
le = LabelEncoder()
y_vif = le.fit_transform(df['Segment'])

In [5]:
# ▶ 4. Build X_selected (drop the columns that are not features)
# 'Segment' is the target and 'ID' is a row identifier.
X_selected = df.drop(['Segment', 'ID'], axis=1)
# Independent deep copy, so edits to X_vif never write through to X_selected
X_vif = X_selected.copy(deep=True)

In [14]:
# ▶ cat_features: names of the object-dtype (string) columns
cat_features = list(X_vif.select_dtypes(include='object').columns)

In [17]:
# ▶ 5. Manual cross-validation loop (kept explicit for CatBoost stability)
def evaluate_cv_score_manual(X_input, y_input, cat_features, cv=3, model_params=None):
    """Estimate micro-averaged F1 of a CatBoost multiclass model with stratified K-fold CV.

    A fresh ``CatBoostClassifier`` is trained on every fold. The fold's
    validation split is passed as ``eval_set`` so that early stopping and
    ``use_best_model`` pick the iteration count, and the same split is then
    scored. Because the split that selects the iteration is also the one
    being scored, the reported number is mildly optimistic.

    Args:
        X_input: Feature table; rows are selected positionally via ``.iloc``.
        y_input: Target labels aligned row-for-row with ``X_input``. It is
            converted with ``np.asarray`` so fold indices are always applied
            positionally, whether it is an array, a list or a pandas Series.
        cat_features: Categorical feature names or indices, forwarded to
            ``CatBoostClassifier.fit``.
        cv: Number of stratified folds (shuffled with ``random_state=42``).
        model_params: Optional dict of ``CatBoostClassifier`` keyword
            arguments that override the defaults below, e.g.
            ``{"task_type": "CPU"}`` on a machine without a GPU.

    Returns:
        Mean of the per-fold micro-F1 scores (``numpy.float64``).
    """
    params = dict(
        loss_function="MultiClass",
        eval_metric="MultiClass",
        task_type="GPU",
        learning_rate=0.01,
        iterations=70000,
        early_stopping_rounds=3000,
        l2_leaf_reg=50,
        random_seed=42,
        od_type="Iter",
        depth=5,
        border_count=64,
        verbose=1000,
    )
    if model_params:
        params.update(model_params)

    # Indexing a pandas Series with the integer arrays produced by `split`
    # is label-based and breaks (or silently misaligns) on a non-default
    # index, so work on a plain array instead.
    y_array = np.asarray(y_input)

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_input, y_array), 1):
        X_train, X_val = X_input.iloc[train_idx], X_input.iloc[valid_idx]
        y_train, y_val = y_array[train_idx], y_array[valid_idx]

        model = CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=(X_val, y_val), cat_features=cat_features, use_best_model=True)

        # CatBoost returns MultiClass class predictions as a column vector;
        # flatten it so the metric receives a 1-D array like y_val.
        preds = np.asarray(model.predict(X_val)).ravel()
        score = f1_score(y_val, preds, average='micro')
        scores.append(score)

        print(f"• Fold {fold}: {score:.4f}")

    mean_score = np.mean(scores)
    print(f"\n✅ 평균 F1_micro 스코어: {mean_score:.4f}")
    return mean_score


In [18]:
# Run the cross-validation with the default cv=3; the returned mean micro-F1
# is displayed as the cell output.
evaluate_cv_score_manual(X_vif, y_vif, cat_features)

0:	learn: 1.5800731	test: 1.5800939	best: 1.5800939 (0)	total: 90.2ms	remaining: 1h 45m 14s
1000:	learn: 0.2913911	test: 0.2926447	best: 0.2926447 (1000)	total: 33.2s	remaining: 38m 7s
2000:	learn: 0.2756604	test: 0.2772502	best: 0.2772502 (2000)	total: 1m 7s	remaining: 38m 17s
3000:	learn: 0.2677993	test: 0.2696370	best: 0.2696370 (3000)	total: 1m 41s	remaining: 37m 47s
4000:	learn: 0.2626479	test: 0.2647543	best: 0.2647543 (4000)	total: 2m 15s	remaining: 37m 22s
5000:	learn: 0.2588649	test: 0.2612537	best: 0.2612537 (5000)	total: 2m 50s	remaining: 36m 52s
6000:	learn: 0.2557832	test: 0.2584523	best: 0.2584523 (6000)	total: 3m 24s	remaining: 36m 17s
7000:	learn: 0.2532433	test: 0.2562046	best: 0.2562046 (7000)	total: 3m 58s	remaining: 35m 44s
8000:	learn: 0.2510970	test: 0.2543523	best: 0.2543523 (8000)	total: 4m 32s	remaining: 35m 14s
9000:	learn: 0.2492037	test: 0.2527477	best: 0.2527477 (9000)	total: 5m 6s	remaining: 34m 40s
10000:	learn: 0.2474979	test: 0.2513334	best: 0.2513334 (

np.float64(0.91073375)

In [20]:
# Persist the feature matrix and the encoded target as CSV files.
# The output directory is created here because this cell runs before the
# pickle-saving cell that otherwise creates it; without it, to_csv fails on a
# fresh checkout.
os.makedirs('New unique', exist_ok=True)
X_vif.to_csv('New unique/X_vif.csv', index=False)
pd.DataFrame({'Segment_encoded': y_vif}).to_csv('New unique/y_vif_encoded.csv', index=False)

In [24]:
# Reload the artifacts written by the pickle-saving cell further down.
# That cell has to run first: a missing file raises FileNotFoundError and a
# zero-byte file raises EOFError.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook.
def _load_pickle(path):
    """Return the object pickled at ``path``; fail with a clear message if the file is empty."""
    # getsize raises FileNotFoundError for a missing file, just as open() would
    if os.path.getsize(path) == 0:
        raise EOFError(
            f"{path!r} is empty; run the cell that saves the pickle artifacts before loading them"
        )
    with open(path, 'rb') as f:
        return pickle.load(f)


# ▶ cat_features: categorical feature list as saved by this notebook
cat_features = _load_pickle('New unique/cat_features.pkl')

# ▶ label_encoder: maps numeric codes back to the original string labels
le = _load_pickle('New unique/label_encoder.pkl')

# ▶ feature_columns: column order used to align test data with training data
feature_cols = _load_pickle('New unique/feature_columns.pkl')

EOFError: Ran out of input

In [26]:
# Directory that holds the persisted artifacts
save_path = 'New unique'
os.makedirs(save_path, exist_ok=True)

# Resolve and serialise every artifact BEFORE opening any file. Opening with
# 'wb' truncates the target immediately, so a failure while evaluating or
# pickling an object would otherwise leave a zero-byte .pkl that later fails
# to load with "EOFError: Ran out of input".
artifacts = {
    'cat_features.pkl': cat_features,               # 1. categorical feature columns
    'label_encoder.pkl': le,                        # 2. LabelEncoder for Segment
    'feature_columns.pkl': X_vif.columns.tolist(),  # 3. feature column order
}
payloads = {filename: pickle.dumps(obj) for filename, obj in artifacts.items()}

for filename, payload in payloads.items():
    with open(os.path.join(save_path, filename), 'wb') as f:
        f.write(payload)

print("✅ 모든 pickle 파일 저장 완료!")

✅ 모든 pickle 파일 저장 완료!
