## Import

In [2]:
import pandas as pd
import numpy as np
import gc
import joblib

import xgboost as xgb
import optuna
# from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


## Data Load

In [3]:
X = pd.read_csv("../data/X.csv")

In [4]:
y_df  = pd.read_csv("../data/y_df.csv")

In [5]:
X.head()

Unnamed: 0,기준년월,남녀구분코드,연령,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,소지카드수_이용가능_신용,입회일자_신용,...,변동률_RV일시불평잔,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M
0,201807,2,40.0,1,1,0,1,1,1,20130101,...,0.999998,1.042805,0.9997,0.999998,0.999998,0.261886,0.270752,0.0,1.044401,1.280543
1,201807,1,30.0,1,1,1,1,1,1,20170801,...,1.092698,0.905663,0.999998,0.999998,0.999998,-0.563388,-0.670348,0.0,0.0,0.0
2,201807,1,30.0,1,1,0,1,1,1,20080401,...,1.006124,1.99359,0.852567,0.999998,0.999998,-0.046516,0.058114,-0.014191,0.524159,1.20842
3,201807,2,40.0,1,1,0,1,2,2,20160501,...,0.999998,1.050646,0.999877,0.999998,0.999998,0.023821,0.258943,0.0,0.880925,1.657124
4,201807,2,40.0,1,1,1,1,1,1,20180601,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.0,0.0,0.0,0.0,0.0


In [6]:
y_encoded = y_df["label"]

In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

### 여기 부터

In [11]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

# 스케일링 포함 파이프라인
model = make_pipeline(
    StandardScaler(),
    SGDClassifier(
        loss='log_loss',              # 로지스틱 회귀
        max_iter=1000,
        tol=1e-3,
        class_weight='balanced',     # 불균형 대응
        random_state=42,
        n_jobs=-1
    )
)

# 학습
model.fit(X_train, y_train)

# 예측 및 평가
y_pred = model.predict(X_val)
print("Macro F1:", f1_score(y_val, y_pred, average='macro'))


Macro F1: 0.5055194699683725


In [None]:
import optuna
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

def objective(trial):
    params = {
        'loss': trial.suggest_categorical('loss', ['log_loss', 'modified_huber']),
        'penalty': trial.suggest_categorical('penalty', ['l2', 'l1', 'elasticnet']),
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-2, log=True),
        'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'optimal', 'invscaling', 'adaptive']),
        'eta0': trial.suggest_float('eta0', 0.001, 0.5, log=True),  # initial learning rate
        'max_iter': 1000,
        'tol': 1e-3,
        'class_weight': 'balanced',
        'random_state': 42
    }

    # 스케일러 + 모델 파이프라인
    model = make_pipeline(
        StandardScaler(),
        SGDClassifier(**params)
    )
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred, average='macro')

# Optuna 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# 결과 출력
print("✅ Best Params:", study.best_params)
print("✅ Best Macro F1-score:", study.best_value)


[I 2025-04-10 17:05:10,054] A new study created in memory with name: no-name-d43105d9-bf6c-4090-9560-049ad25ff05d
[I 2025-04-10 17:10:04,880] Trial 0 finished with value: 0.5046941537251258 and parameters: {'loss': 'modified_huber', 'penalty': 'l2', 'alpha': 1.9817873225559373e-05, 'learning_rate': 'invscaling', 'eta0': 0.005532664165077781}. Best is trial 0 with value: 0.5046941537251258.
[I 2025-04-10 17:20:35,008] Trial 1 finished with value: 0.5073375397669017 and parameters: {'loss': 'log_loss', 'penalty': 'l2', 'alpha': 0.0031208363509499133, 'learning_rate': 'optimal', 'eta0': 0.01794478621230259}. Best is trial 1 with value: 0.5073375397669017.
[I 2025-04-10 17:49:10,230] Trial 2 finished with value: 0.49939455176501646 and parameters: {'loss': 'modified_huber', 'penalty': 'l2', 'alpha': 0.0018088183085055436, 'learning_rate': 'optimal', 'eta0': 0.004392603856042477}. Best is trial 1 with value: 0.5073375397669017.


In [None]:
# 성능 출력
le_target = joblib.load('le_target.pkl')
macro_f1 = f1_score(y_val, y_pred, average='macro')
print(f"\n✅ Best Macro F1-score: {macro_f1:.4f}")
print(f"✅ Best Parameters: {study.best_params}")
print("Macro F1:", f1_score(y_val, y_pred, average='macro'))
print(classification_report(y_val, y_pred, target_names=le_target.classes_))

### Train

In [9]:
from sklearn.model_selection import StratifiedKFold

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_encoded),
    y=y_encoded
)
class_weight_dict = {cls: w for cls, w in zip(np.unique(y_encoded), class_weights)}

#3. 각 샘플에 weight 부여
sample_weights = np.array([class_weight_dict[label] for label in y_train])
best_params = {
    'n_estimators': 300,
    'learning_rate': 0.1911192423062586,
    'max_depth': 8,
    'subsample': 0.7088976909107676,
    'colsample_bytree': 0.7711289150731236,
    'min_child_weight': 10,
    'gamma': 0.42772311341079505,
    'random_state': 42,
    'eval_metric': 'mlogloss',
    'tree_method': "hist",
    'device': "cuda"
}

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=9)
f1_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded.iloc[train_idx], y_encoded.iloc[val_idx]

    model = xgb.XGBClassifier(**best_params)
    model.fit(X_train, y_train, sample_weight = sample_weights)

    y_pred = model.predict(X_val)
    score = f1_score(y_val, y_pred, average='macro')
    f1_scores.append(score)

    print(f"[Fold {fold}] Macro F1-score: {score:.4f}")

print(f"평균 Macro F1-score (3-Fold): {np.mean(f1_scores):.4f}")


[Fold 1] Macro F1-score: 0.7496
[Fold 2] Macro F1-score: 0.7005
[Fold 3] Macro F1-score: 0.8687
평균 Macro F1-score (3-Fold): 0.7730


In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=7) #stratify=y_encoded

In [8]:

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_encoded),
    y=y_encoded
)
class_weight_dict = {cls: w for cls, w in zip(np.unique(y_encoded), class_weights)}

#3. 각 샘플에 weight 부여
sample_weights = np.array([class_weight_dict[label] for label in y_train])

In [9]:
print(sample_weights)

[0.2497331  0.2497331  1.37440514 ... 0.2497331  3.76205032 0.2497331 ]


In [10]:
y_encoded.head()

0    3
1    4
2    2
3    3
4    4
Name: label, dtype: int64

###  smote

In [20]:
# smote = SMOTE(random_state=42)
# X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
# print("SMOTE 완료:", np.bincount(y_train_res))

In [21]:
# # XGBoost 모델 학습
# model = xgb.XGBClassifier(
#     n_estimators=100,
#     learning_rate=0.1,
#     max_depth=4,
#     random_state=42,
#     use_label_encoder=False,
#     eval_metric='mlogloss'
# )

# model.fit(X_train, y_train)

# # 예측 및 평가
# y_pred = model.predict(X_val)
# print("Validation Accuracy:", accuracy_score(y_val, y_pred))
# print(classification_report(y_val, y_pred, target_names=le_target.classes_))

# # 평가: micro F1-score
# micro_f1 = f1_score(y_val, y_pred, average='micro')
# print("Micro F1-score:", micro_f1)

In [11]:
for col in X.select_dtypes(include='int64'):
    X_train[col] = pd.to_numeric(X_train[col], downcast='integer')

for col in X.select_dtypes(include='float64'):
    X_train[col] = pd.to_numeric(X_train[col], downcast='float')

In [12]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3),
        'max_depth': trial.suggest_int('max_depth', 5, 14),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 7,
        'gamma': trial.suggest_float('gamma', 0, 5),
        'eval_metric': 'mlogloss',
        'tree_method': "hist"
    }
    
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred, average='macro')

# 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=6)  # n_trials=3은 실험 횟수 (더 늘리면 성능 향상 가능)

# 결과 출력
print("Best Parameters:", study.best_params)
print("Best Micro F1-score:", study.best_value)

[I 2025-04-09 11:33:31,807] A new study created in memory with name: no-name-d9292344-61cd-40d6-9538-7a5c533bf93d
[I 2025-04-09 11:49:55,401] Trial 0 finished with value: 0.8511451468512234 and parameters: {'n_estimators': 300, 'learning_rate': 0.1911192423062586, 'max_depth': 8, 'subsample': 0.7088976909107676, 'colsample_bytree': 0.7711289150731236, 'min_child_weight': 10, 'gamma': 0.42772311341079505}. Best is trial 0 with value: 0.8511451468512234.
[I 2025-04-09 12:15:19,959] Trial 1 finished with value: 0.7604401481666405 and parameters: {'n_estimators': 411, 'learning_rate': 0.07350639774257146, 'max_depth': 11, 'subsample': 0.7758553062783222, 'colsample_bytree': 0.9309725297994952, 'min_child_weight': 10, 'gamma': 4.788937177371047}. Best is trial 0 with value: 0.8511451468512234.
[I 2025-04-09 12:35:32,021] Trial 2 finished with value: 0.764778896696509 and parameters: {'n_estimators': 454, 'learning_rate': 0.11913804577137886, 'max_depth': 6, 'subsample': 0.840693703848217, '

Best Parameters: {'n_estimators': 300, 'learning_rate': 0.1911192423062586, 'max_depth': 8, 'subsample': 0.7088976909107676, 'colsample_bytree': 0.7711289150731236, 'min_child_weight': 10, 'gamma': 0.42772311341079505}
Best Micro F1-score: 0.8511451468512234


In [11]:
best_params = {
    'n_estimators': 300,
    'learning_rate': 0.1911192423062586,
    'max_depth': 8,
    'subsample': 0.7088976909107676,
    'colsample_bytree': 0.7711289150731236,
    'min_child_weight': 10,
    'gamma': 0.42772311341079505,
    'random_state': 42,
    'eval_metric': 'mlogloss',
    'tree_method': 'gpu_hist'
}

best_model = xgb.XGBClassifier(
    **best_params
)

best_model.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"



In [None]:
# best_model.fit(X_train, y_train,sample_weight=sample_weights)


In [12]:
le_target = joblib.load('le_target.pkl')

In [13]:
y_pred = best_model.predict(X_val)
print("Macro F1:", f1_score(y_val, y_pred, average='macro'))
print(classification_report(y_val, y_pred, target_names=le_target.classes_))


    E.g. tree_method = "hist", device = "cuda"



Macro F1: 0.7800072209095033
              precision    recall  f1-score   support

           A       0.93      0.60      0.73       324
           B       1.00      0.38      0.55        48
           C       0.86      0.79      0.82     42530
           D       0.84      0.81      0.83    116414
           E       0.97      0.98      0.98    640684

    accuracy                           0.95    800000
   macro avg       0.92      0.71      0.78    800000
weighted avg       0.94      0.95      0.95    800000



### 모델 저장

In [14]:
import joblib
joblib.dump(best_model, 'pkl/XGBOOST_078.pkl')  # 모델 저장

['pkl/XGBOOST_078.pkl']

### Predict

In [None]:
# X_test.drop(columns=['ID'],inplace=True)

In [None]:
# # row-level 예측 수행
# y_test_pred = model.predict(X_test)
# # 예측 결과를 변환
# y_test_pred_labels = le_target.inverse_transform(y_test_pred)

# # row 단위 예측 결과를 test_data에 추가
# test_data = test_df.copy()  # 원본 유지
# test_data["pred_label"] = y_test_pred_labels

### Submission

In [None]:
# submission = test_data.groupby("ID")["pred_label"] \
#     .agg(lambda x: x.value_counts().idxmax()) \
#     .reset_index()

# submission.columns = ["ID", "Segment"]
# submission.to_csv('../submit/0327.csv',index=False)