# 앞서 중요한 feature만 뽑은 데이터 셋으로 학습 진행

## 데이터셋 불러오기, 인코딩, 스케일링 진행

In [1]:
import gc
import time
import joblib
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

# # 1. 중요 피처 리스트 불러오기
# important_features = joblib.load("important_features.pkl")
# print(f"✅ 중요 피처 {len(important_features)}개 로드 완료")

# 2. 새로운 학습 데이터셋 불러오기
train_df = pd.read_csv("../data/important_train.csv")
train_df.head()

Unnamed: 0.1,Unnamed: 0,정상청구원금_B5M,이용금액_오프라인_B0M,연속유실적개월수_기본_24M_카드,최대이용금액_체크_R12M,이용금액_R3M_신용체크,이용금액_체크_R12M,이용금액_체크_B0M,이용건수_신용_R12M,이용금액_일시불_R12M,...,승인거절건수_입력오류_R3M,IB문의건수_결제_R6M,여유_항공이용금액,컨택건수_보험_TM_B0M,잔액_리볼빙CA이월_B0M,상담건수_B0M,증감_RP건수_보험_전월,증감_RP건수_렌탈_전월,승인거절건수_한도초과_B0M,Segment
0,0,14958,4043,13,1020,196,6474,0,165,20667,...,0,0,0,0,0,0,0,0,0,D
1,1,3367,3980,12,0,13475,-414,0,204,54341,...,0,0,0,0,0,0,0,0,0,E
2,2,23963,4524,8,0,23988,-414,0,148,55656,...,0,0,0,0,0,2,0,0,0,C
3,3,19614,3975,5,0,3904,871,0,105,10753,...,0,0,0,0,0,0,0,0,0,D
4,4,0,0,0,3324,1190,10754,0,-1,-2129,...,0,0,0,0,0,0,0,0,0,E


In [None]:
# Feature / Label 분리
y = train_df["Segment"]
X = train_df.drop(columns=["Segment"])

# Label 인코딩
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 👉 인코더와 스케일러 저장
joblib.dump(label_encoder, "xgboost_encoder.joblib")
joblib.dump(scaler, "xgboost_scaler.joblib")
print("✅ label_encoder 및 scaler 저장 완료")

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, stratify=y_encoded, random_state=8432)

## XGBoost Depth 6 ~ 13

In [None]:

depth_values = [6, 7, 8, 9, 10, 11, 12, 13]

for depth in depth_values:
    print(f"\n🌲 XGBoost max_depth={depth}") 
    model = XGBClassifier(max_depth=depth, use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    
    start_time = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    elapsed_time = time.time() - start_time

    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"🔍 F1_macro score: {f1:.4f}")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    print(f"⏱ 학습 및 예측 시간: {elapsed_time:.2f}초")

    # 모델 저장 (파일명에 max_depth와 f1 score 포함)
    model_filename = f"xgboost2_depth{depth}_f1{f1:.4f}.joblib"
    joblib.dump(model, model_filename)
    print(f"💾 모델 저장 완료: {model_filename}")

    # 메모리 해제를 위해 모델 변수 삭제 및 가비지 컬렉션 수행
    del model
    gc.collect()

# 메모리 문제로 LightGBM 방식으로 도전. depth 6 ~ 15

In [2]:
%pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 13.2 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\SSAFY\Desktop\개인폴더\DACON\01_신용카드_고객_세그먼트_분류_AI_경진대회\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [3]:
import gc
import time
import joblib
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# # 1. 데이터 로딩
# train_df = pd.read_csv("../data/important_train.csv")

# # Feature / Label 분리
# y = train_df["Segment"]
# X = train_df.drop(columns=["Segment"])

# # Label 인코딩
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# # 스케일링
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # 👉 인코더와 스케일러 저장
# joblib.dump(label_encoder, "label_encoder2.joblib")
# joblib.dump(scaler, "scaler2.joblib")
# print("✅ label_encoder 및 scaler 저장 완료")

# # 학습/테스트 분리
# X_train, X_test, y_train, y_test = train_test_split(
#     X_scaled, y_encoded, test_size=0.2, stratify=y_encoded, random_state=8432
# )

depth_values = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

for depth in depth_values:
    print(f"\n🌲 LightGBM max_depth={depth}")

    params = {
        'objective': 'multiclass',
        'num_class': len(label_encoder.classes_),
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'max_depth': depth,
        'learning_rate': 0.1,
        'num_leaves': 2 ** depth - 1,  # 일반적으로 max_depth와 연동
        'is_unbalance': True  # 클래스 불균형 처리
    }

    train_data = lgb.Dataset(X_train, label=y_train)

    start_time = time.time()
    model = lgb.train(params, train_data, num_boost_round=100)
    elapsed_time = time.time() - start_time

    y_pred = model.predict(X_test)
    y_pred_labels = y_pred.argmax(axis=1)

    f1 = f1_score(y_test, y_pred_labels, average='macro')
    print(f"🔍 F1_macro score: {f1:.4f}")
    print(classification_report(y_test, y_pred_labels, target_names=label_encoder.classes_))
    print(f"⏱ 학습 및 예측 시간: {elapsed_time:.2f}초")

    # 모델 저장
    model_filename = f"lightgbm_depth{depth}_f1{f1:.4f}.txt"
    model.save_model(model_filename)
    print(f"💾 모델 저장 완료: {model_filename}")

    del model
    gc.collect()



🌲 LightGBM max_depth=6
🔍 F1_macro score: 0.4958
              precision    recall  f1-score   support

           A       0.10      0.25      0.15       194
           B       0.00      0.00      0.00        29
           C       0.74      0.62      0.68     25518
           D       0.72      0.68      0.70     69848
           E       0.95      0.97      0.96    384411

    accuracy                           0.91    480000
   macro avg       0.50      0.50      0.50    480000
weighted avg       0.90      0.91      0.90    480000

⏱ 학습 및 예측 시간: 82.14초
💾 모델 저장 완료: lightgbm_depth6_f10.4958.txt

🌲 LightGBM max_depth=7
🔍 F1_macro score: 0.5085
              precision    recall  f1-score   support

           A       0.13      0.25      0.17       194
           B       0.00      0.03      0.00        29
           C       0.74      0.64      0.69     25518
           D       0.74      0.70      0.72     69848
           E       0.95      0.97      0.96    384411

    accuracy             

KeyboardInterrupt: 