In [1]:
# %pip install optuna numpy pandas scikit-learn matplotlib seaborn catboost

In [1]:
import pandas as pd

train_df = pd.read_csv('../data/all_train.csv').iloc[:, 1:]
test_df = pd.read_csv('../data/all_test.csv').iloc[:, 1:]

  train_df = pd.read_csv('../data/all_train.csv').iloc[:, 1:]
  test_df = pd.read_csv('../data/all_test.csv').iloc[:, 1:]


In [2]:
import pandas as pd
import numpy as np
import optuna
import gc
import joblib
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold

# ---------------------------
# 🧹 데이터 준비 및 전처리
# ---------------------------

# feature 및 target 분리
feature_cols = [col for col in train_df.columns if col not in ["ID", "Segment"]]
X = train_df[feature_cols].copy()
y = train_df["Segment"].copy()
X_test = test_df.copy()

# 🔠 타겟 라벨 인코딩
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)
joblib.dump(le_target, "label_encoder.pkl")
print("✅ 라벨 인코더 저장 완료: label_encoder.pkl")

# 🧠 범주형 변수 지정: object type 컬럼 전부
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

# 🚫 NaN 처리 (범주형만): CatBoost는 cat_features에 대해 NaN 허용하지 않음
for col in categorical_features:
    X[col] = X[col].astype(str).fillna("nan")
    X_test[col] = X_test[col].astype(str).fillna("nan")

# ❌ 수치형 결측치는 CatBoost가 자동 처리하므로 그대로 둠

# 테스트 데이터 전처리
X_test.drop(columns=['ID'], inplace=True)
test_pool = Pool(X_test, cat_features=categorical_features)

# ---------------------------
# 🧪 Optuna 최적화
# ---------------------------

def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 300, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "loss_function": "MultiClass",
        "task_type": "GPU",
        "random_seed": 42,
        "verbose": 0
    }

    model = CatBoostClassifier(**params)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    score = cross_val_score(model, X, y_encoded, cv=cv, scoring="accuracy")

    # 모델 저장
    model.fit(X, y_encoded, cat_features=categorical_features)
    model_path = f"catboost_trial_{trial.number}.cbm"
    model.save_model(model_path)
    print(f"📦 모델 저장됨: {model_path} | score: {score.mean():.4f}")

    return score.mean()

# 최적화 실행
gc.collect()
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# ---------------------------
# 🧠 최적 모델 학습 및 저장
# ---------------------------

best_params = study.best_params
joblib.dump(best_params, "best_params.pkl")
print("🧠 최적 파라미터 저장 완료: best_params.pkl")

best_model = CatBoostClassifier(
    **best_params,
    loss_function="MultiClass",
    task_type="GPU",
    random_seed=42,
    verbose=0
)
best_model.fit(X, y_encoded, cat_features=categorical_features)
best_model.save_model("catboost_best_model.cbm")
print("🎯 최적 모델 저장 완료: catboost_best_model.cbm")

# ---------------------------
# 🔮 예측 및 제출 생성
# ---------------------------

# 예측
y_test_pred = best_model.predict(test_pool)
y_test_pred = y_test_pred.astype(int).flatten()
y_test_pred_labels = le_target.inverse_transform(y_test_pred)

# 결과 결합 및 다수결 처리
test_data = test_df.copy()
test_data["pred_label"] = y_test_pred_labels

submission = test_data.groupby("ID")["pred_label"] \
    .agg(lambda x: x.value_counts().idxmax()) \
    .reset_index()

submission.columns = ["ID", "Segment"]
submission.to_csv("submission.csv", index=False)
print("📄 submission.csv 저장 완료!")


  from .autonotebook import tqdm as notebook_tqdm


✅ 라벨 인코더 저장 완료: label_encoder.pkl


[I 2025-03-29 21:28:19,685] A new study created in memory with name: no-name-1b734f61-6ae8-4f57-8d39-7a222c2c4fa9
[W 2025-03-29 21:28:40,568] Trial 0 failed with parameters: {'iterations': 629, 'depth': 6, 'learning_rate': 0.09155164881067279} because of the following error: ValueError('\nAll the 3 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting error_score=\'raise\'.\n\nBelow are more details about the failures:\n--------------------------------------------------------------------------------\n2 fits failed with the following error:\nTraceback (most recent call last):\n  File "_catboost.pyx", line 2547, in _catboost.get_float_feature\n  File "_catboost.pyx", line 1226, in _catboost._FloatOrNan\n  File "_catboost.pyx", line 1021, in _catboost._FloatOrNanFromString\nTypeError: Cannot convert \'b\'40\\xeb\\x8c\\x80\'\' to float\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call

ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "_catboost.pyx", line 2547, in _catboost.get_float_feature
  File "_catboost.pyx", line 1226, in _catboost._FloatOrNan
  File "_catboost.pyx", line 1021, in _catboost._FloatOrNanFromString
TypeError: Cannot convert 'b'40\xeb\x8c\x80'' to float

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 2275, in _prepare_train_params
    train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs, graph,
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 1513, in _build_train_pool
    train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, graph=graph, weight=sample_weight, group_id=group_id,
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 855, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 1491, in _init
    self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "_catboost.pyx", line 4339, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4391, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4200, in _catboost._PoolBase._init_features_order_layout_pool
  File "_catboost.pyx", line 3127, in _catboost._set_features_order_data_pd_data_frame
  File "_catboost.pyx", line 2591, in _catboost.create_num_factor_data
  File "_catboost.pyx", line 2549, in _catboost.get_float_feature
_catboost.CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=2]="40대": Cannot convert 'b'40\xeb\x8c\x80'' to float

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "_catboost.pyx", line 2547, in _catboost.get_float_feature
  File "_catboost.pyx", line 1226, in _catboost._FloatOrNan
  File "_catboost.pyx", line 1021, in _catboost._FloatOrNanFromString
TypeError: Cannot convert 'b'30\xeb\x8c\x80'' to float

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 2275, in _prepare_train_params
    train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs, graph,
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 1513, in _build_train_pool
    train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, graph=graph, weight=sample_weight, group_id=group_id,
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 855, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "c:\Users\shyo2\OneDrive\바탕 화면\ssafy\DACON\.venv\lib\site-packages\catboost\core.py", line 1491, in _init
    self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "_catboost.pyx", line 4339, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4391, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4200, in _catboost._PoolBase._init_features_order_layout_pool
  File "_catboost.pyx", line 3127, in _catboost._set_features_order_data_pd_data_frame
  File "_catboost.pyx", line 2591, in _catboost.create_num_factor_data
  File "_catboost.pyx", line 2549, in _catboost.get_float_feature
_catboost.CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=2]="30대": Cannot convert 'b'30\xeb\x8c\x80'' to float
