## Import

In [1]:
# %pip install pyarrow
# %pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import gc
import joblib

import xgboost as xgb
import optuna
# from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


## Data Load

In [3]:
# 데이터 분할(폴더) 구분
data_splits = ["train", "test"]

# 각 데이터 유형별 폴더명, 파일 접미사, 변수 접두어 설정
data_categories = {
    # "회원정보": {"folder": "1.회원정보", "suffix": "회원정보", "var_prefix": "customer"},
    "신용정보": {"folder": "2.신용정보", "suffix": "신용정보", "var_prefix": "credit"},
    "승인매출정보": {"folder": "3.승인매출정보", "suffix": "승인매출정보", "var_prefix": "sales"},
    "청구정보": {"folder": "4.청구입금정보", "suffix": "청구정보", "var_prefix": "billing"},
    "잔액정보": {"folder": "5.잔액정보", "suffix": "잔액정보", "var_prefix": "balance"},
    "채널정보": {"folder": "6.채널정보", "suffix": "채널정보", "var_prefix": "channel"},
    "마케팅정보": {"folder": "7.마케팅정보", "suffix": "마케팅정보", "var_prefix": "marketing"},
    "성과정보": {"folder": "8.성과정보", "suffix": "성과정보", "var_prefix": "performance"}
}

# 2018년 7월부터 12월까지의 월 리스트
months = ['07', '08', '09', '10', '11', '12']

for split in data_splits:
    for category, info in data_categories.items():
        folder = info["folder"]
        suffix = info["suffix"]
        var_prefix = info["var_prefix"]
        
        for month in months:
            # 파일명 형식: 2018{month}_{split}_{suffix}.parquet
            file_path = f"../data/{split}/{folder}/2018{month}_{split}_{suffix}.parquet"
            print(file_path)
            # 변수명 형식: {var_prefix}_{split}_{month}
            variable_name = f"{var_prefix}_{split}_{month}"
            globals()[variable_name] = pd.read_parquet(file_path)
            print(f"{variable_name} is loaded from {file_path}")

gc.collect()

../data/train/2.신용정보/201807_train_신용정보.parquet
credit_train_07 is loaded from ../data/train/2.신용정보/201807_train_신용정보.parquet
../data/train/2.신용정보/201808_train_신용정보.parquet
credit_train_08 is loaded from ../data/train/2.신용정보/201808_train_신용정보.parquet
../data/train/2.신용정보/201809_train_신용정보.parquet
credit_train_09 is loaded from ../data/train/2.신용정보/201809_train_신용정보.parquet
../data/train/2.신용정보/201810_train_신용정보.parquet
credit_train_10 is loaded from ../data/train/2.신용정보/201810_train_신용정보.parquet
../data/train/2.신용정보/201811_train_신용정보.parquet
credit_train_11 is loaded from ../data/train/2.신용정보/201811_train_신용정보.parquet
../data/train/2.신용정보/201812_train_신용정보.parquet
credit_train_12 is loaded from ../data/train/2.신용정보/201812_train_신용정보.parquet
../data/train/3.승인매출정보/201807_train_승인매출정보.parquet
sales_train_07 is loaded from ../data/train/3.승인매출정보/201807_train_승인매출정보.parquet
../data/train/3.승인매출정보/201808_train_승인매출정보.parquet
sales_train_08 is loaded from ../data/train/3.승인매출정보/201808_train_승

0

## Data Preprocessing

### 1. Concat & Merge

In [4]:
# 데이터 유형별 설정 
info_categories = ["credit", "sales", "billing", "balance", "channel", "marketing", "performance"] # "customer", 

# 월 설정
months = ['07', '08', '09', '10', '11', '12']

In [5]:
#### Train ####

# 각 유형별로 월별 데이터를 합쳐서 새로운 변수에 저장
train_dfs = {}

for prefix in info_categories:
    # globals()에서 동적 변수명으로 데이터프레임들을 가져와 리스트에 저장
    df_list = [globals()[f"{prefix}_train_{month}"] for month in months]
    train_dfs[f"{prefix}_train_df"] = pd.concat(df_list, axis=0)
    gc.collect()
    print(f"{prefix}_train_df is created with shape: {train_dfs[f'{prefix}_train_df'].shape}")


# customer_train_df = train_dfs["customer_train_df"]
credit_train_df   = train_dfs["credit_train_df"]
sales_train_df    = train_dfs["sales_train_df"]
billing_train_df  = train_dfs["billing_train_df"]
balance_train_df  = train_dfs["balance_train_df"]
channel_train_df  = train_dfs["channel_train_df"]
marketing_train_df= train_dfs["marketing_train_df"]
performance_train_df = train_dfs["performance_train_df"]

gc.collect()

credit_train_df is created with shape: (2400000, 42)
sales_train_df is created with shape: (2400000, 406)
billing_train_df is created with shape: (2400000, 46)
balance_train_df is created with shape: (2400000, 82)
channel_train_df is created with shape: (2400000, 105)
marketing_train_df is created with shape: (2400000, 64)
performance_train_df is created with shape: (2400000, 49)


0

In [6]:
# #### Test ####

# # test 데이터에 대해 train과 동일한 방법 적용
# test_dfs = {}

# for prefix in info_categories:
#     df_list = [globals()[f"{prefix}_test_{month}"] for month in months]
#     test_dfs[f"{prefix}_test_df"] = pd.concat(df_list, axis=0)
#     gc.collect()
#     print(f"{prefix}_test_df is created with shape: {test_dfs[f'{prefix}_test_df'].shape}")


# # customer_test_df = test_dfs["customer_test_df"]
# credit_test_df   = test_dfs["credit_test_df"]
# sales_test_df    = test_dfs["sales_test_df"]
# billing_test_df  = test_dfs["billing_test_df"]
# balance_test_df  = test_dfs["balance_test_df"]
# channel_test_df  = test_dfs["channel_test_df"]
# marketing_test_df= test_dfs["marketing_test_df"]
# performance_test_df = test_dfs["performance_test_df"]

# gc.collect()

In [7]:
#### Train ####

customer_train_df = pd.read_csv('../clean_data/userinfo_train_preprocessed.csv')
customer_train_df['기준년월'] = pd.to_datetime(customer_train_df['기준년월'])
customer_train_df['기준년월'] = customer_train_df['기준년월'].dt.strftime('%Y%m').astype(int)

train_df = customer_train_df.merge(credit_train_df, on=['기준년월', 'ID'], how='left')
print("Step1 저장 완료: train_step1, shape:", train_df.shape)
del customer_train_df, credit_train_df
gc.collect()

# 이후 merge할 데이터프레임 이름과 단계 정보를 리스트에 저장
merge_list = [
    ("sales_train_df",    "Step2"),
    ("billing_train_df",  "Step3"),
    ("balance_train_df",  "Step4"),
    ("channel_train_df",  "Step5"),
    ("marketing_train_df","Step6"),
    ("performance_train_df", "최종")
]

# 나머지 단계 merge
for df_name, step in merge_list:
    # globals()로 동적 변수 접근하여 merge 수행
    train_df = train_df.merge(globals()[df_name], on=['기준년월', 'ID'], how='left')
    print(f"{step} 저장 완료: train_{step}, shape:", train_df.shape)
    # 사용한 변수는 메모리 해제를 위해 삭제
    del globals()[df_name]
    gc.collect()

Step1 저장 완료: train_step1, shape: (2400000, 107)
Step2 저장 완료: train_Step2, shape: (2400000, 511)
Step3 저장 완료: train_Step3, shape: (2400000, 555)
Step4 저장 완료: train_Step4, shape: (2400000, 635)
Step5 저장 완료: train_Step5, shape: (2400000, 738)
Step6 저장 완료: train_Step6, shape: (2400000, 800)
최종 저장 완료: train_최종, shape: (2400000, 847)


In [8]:
# #### Test ####

# customer_test_df = pd.read_csv('../clean_data/userinfo_test_preprocessed.csv')
# customer_test_df['기준년월'] = pd.to_datetime(customer_test_df['기준년월'])
# customer_test_df['기준년월'] = customer_test_df['기준년월'].dt.strftime('%Y%m').astype(int)

# test_df = customer_test_df.merge(credit_test_df, on=['기준년월', 'ID'], how='left')
# print("Step1 저장 완료: test_step1, shape:", test_df.shape)
# del customer_test_df, credit_test_df
# gc.collect()

# # 이후 merge할 데이터프레임 이름과 단계 정보를 리스트에 저장
# merge_list = [
#     ("sales_test_df",    "Step2"),
#     ("billing_test_df",  "Step3"),
#     ("balance_test_df",  "Step4"),
#     ("channel_test_df",  "Step5"),
#     ("marketing_test_df","Step6"),
#     ("performance_test_df", "최종")
# ]

# # 나머지 단계 merge
# for df_name, step in merge_list:
#     # globals()로 동적 변수 접근하여 merge 수행
#     test_df = test_df.merge(globals()[df_name], on=['기준년월', 'ID'], how='left')
#     print(f"{step} 저장 완료: test_{step}, shape:", test_df.shape)
#     # 사용한 변수는 메모리 해제를 위해 삭제
#     del globals()[df_name]
#     gc.collect()

### 데이터 확인

In [9]:
# print(train_df.groupby("기준연월").value_counts())

In [10]:
# print(train_df.Segment.value_counts(normalize=True))

In [11]:
# print(train_df["한도증액횟수_R12M"].value_counts(normalize=True))

In [12]:
# # 상위 7개 샘플 저장
# top10 = train_df.head(10)
# top10.to_csv("train_sample_top10.csv", index=False)  # index=False: 인덱스 컬럼 저장 X

# display(top10.transpose())

In [13]:
# 결측치 수 기준 정렬해서 DataFrame으로 보기
na_df = train_df.isnull().sum().reset_index()
na_df.columns = ['column', 'na_count']
na_df = na_df[na_df['na_count'] > 0].sort_values(by='na_count', ascending=False)
na_df['na_ratio'] = na_df['na_count'] / len(train_df)

# 전체 출력
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
display(na_df)

Unnamed: 0,column,na_count,na_ratio
565,연체일자_B0M,2394336,0.99764
288,_3순위여유업종,2377725,0.990719
294,_3순위납부업종,2310187,0.962578
286,_2순위여유업종,2302286,0.959286
282,_3순위교통업종,2045455,0.852273
292,_2순위납부업종,2033640,0.84735
375,최종카드론_대출일자,1988330,0.828471
284,_1순위여유업종,1987260,0.828025
373,최종카드론_신청경로코드,1958226,0.815928
372,최종카드론_금융상환방식코드,1958126,0.815886


### 2. Encoding

In [14]:
feature_cols = [col for col in train_df.columns if col not in ["ID", "Segment"]]

X = train_df[feature_cols].copy()
y = train_df["Segment"].copy()

# 타깃 라벨 인코딩
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)


categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# X_test = test_df.copy()

encoders = {}  # 각 컬럼별 encoder 저장

for col in categorical_features:
    le_train = LabelEncoder()
    X[col] = le_train.fit_transform(X[col])
    encoders[col] = le_train
    # unseen_labels_val = set(X_test[col]) - set(le_train.classes_)
    # if unseen_labels_val:
    #     le_train.classes_ = np.append(le_train.classes_, list(unseen_labels_val))
    # X_test[col] = le_train.transform(X_test[col])

In [15]:
joblib.dump(le_target, 'le_target.pkl')

['le_target.pkl']

In [16]:
joblib.dump(encoders, 'encoders.pkl')

['encoders.pkl']

### Train

In [17]:
print(X.isnull().sum())
# print(y_encoded.isnull().sum())
print(y_encoded)
# print(X)
for col in na_df["column"]:
    X[col] = X[col].fillna(0)

기준년월                           0
남녀구분코드                         0
연령                             0
회원여부_이용가능                      0
회원여부_이용가능_CA                   0
회원여부_이용가능_카드론                  0
소지여부_신용                        0
소지카드수_유효_신용                    0
소지카드수_이용가능_신용                  0
입회일자_신용                        0
입회경과개월수_신용                     0
회원여부_연체                        0
이용거절여부_카드론                     0
동의여부_한도증액안내                    0
수신거부여부_TM                      0
수신거부여부_DM                      0
수신거부여부_메일                      0
수신거부여부_SMS                     0
가입통신회사코드                       0
탈회횟수_누적                        0
최종탈회후경과월                       0
탈회횟수_발급6개월이내                   0
탈회횟수_발급1년이내                    0
거주시도명                          0
직장시도명                          0
마케팅동의여부                        0
유효카드수_신용체크                     0
유효카드수_신용                       0
유효카드수_신용_가족                    0
유효카드수_체크                       0
유효카드수_체크_가

In [25]:
X.to_csv("X.csv", index = False)

In [26]:
y_df = pd.DataFrame(y_encoded, columns=['label'])
y_df.to_csv("y_df.csv", index = False)

### 여기 부터

In [3]:
X = pd.read_csv("../data/X.csv")

In [4]:
y_df  = pd.read_csv("../data/y_df.csv")

In [5]:
X.head()

Unnamed: 0,기준년월,남녀구분코드,연령,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,소지카드수_이용가능_신용,입회일자_신용,...,변동률_RV일시불평잔,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M
0,201807,2,40.0,1,1,0,1,1,1,20130101,...,0.999998,1.042805,0.9997,0.999998,0.999998,0.261886,0.270752,0.0,1.044401,1.280543
1,201807,1,30.0,1,1,1,1,1,1,20170801,...,1.092698,0.905663,0.999998,0.999998,0.999998,-0.563388,-0.670348,0.0,0.0,0.0
2,201807,1,30.0,1,1,0,1,1,1,20080401,...,1.006124,1.99359,0.852567,0.999998,0.999998,-0.046516,0.058114,-0.014191,0.524159,1.20842
3,201807,2,40.0,1,1,0,1,2,2,20160501,...,0.999998,1.050646,0.999877,0.999998,0.999998,0.023821,0.258943,0.0,0.880925,1.657124
4,201807,2,40.0,1,1,1,1,1,1,20180601,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.0,0.0,0.0,0.0,0.0


In [5]:
y_encoded = y_df["label"]

## K-fold

In [7]:
from sklearn.model_selection import StratifiedKFold

best_params = {
    'n_estimators': 300,
    'learning_rate': 0.1911192423062586,
    'max_depth': 8,
    'subsample': 0.7088976909107676,
    'colsample_bytree': 0.7711289150731236,
    'min_child_weight': 10,
    'gamma': 0.42772311341079505,
    'random_state': 42,
    'eval_metric': 'mlogloss',
    'tree_method': 'gpu_hist'
}

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=9)
f1_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded.iloc[train_idx], y_encoded.iloc[val_idx]

    model = xgb.XGBClassifier(**best_params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    score = f1_score(y_val, y_pred, average='macro')
    f1_scores.append(score)

    print(f"[Fold {fold}] Macro F1-score: {score:.4f}")

print(f"평균 Macro F1-score (3-Fold): {np.mean(f1_scores):.4f}")



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[Fold 1] Macro F1-score: 0.7855



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[Fold 2] Macro F1-score: 0.8081



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[Fold 3] Macro F1-score: 0.7800
평균 Macro F1-score (3-Fold): 0.7912


In [9]:
from sklearn.model_selection import StratifiedKFold

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_encoded),
    y=y_encoded
)
class_weight_dict = {cls: w for cls, w in zip(np.unique(y_encoded), class_weights)}

#3. 각 샘플에 weight 부여
sample_weights = np.array([class_weight_dict[label] for label in y_train])
best_params = {
    'n_estimators': 300,
    'learning_rate': 0.1911192423062586,
    'max_depth': 8,
    'subsample': 0.7088976909107676,
    'colsample_bytree': 0.7711289150731236,
    'min_child_weight': 10,
    'gamma': 0.42772311341079505,
    'random_state': 42,
    'eval_metric': 'mlogloss',
    'tree_method': "hist",
    'device': "cuda"
}

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=9)
f1_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded.iloc[train_idx], y_encoded.iloc[val_idx]

    model = xgb.XGBClassifier(**best_params)
    model.fit(X_train, y_train, sample_weight = sample_weights)

    y_pred = model.predict(X_val)
    score = f1_score(y_val, y_pred, average='macro')
    f1_scores.append(score)

    print(f"[Fold {fold}] Macro F1-score: {score:.4f}")

print(f"평균 Macro F1-score (3-Fold): {np.mean(f1_scores):.4f}")


[Fold 1] Macro F1-score: 0.7496
[Fold 2] Macro F1-score: 0.7005
[Fold 3] Macro F1-score: 0.8687
평균 Macro F1-score (3-Fold): 0.7730


In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=7) #stratify=y_encoded

In [8]:

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_encoded),
    y=y_encoded
)
class_weight_dict = {cls: w for cls, w in zip(np.unique(y_encoded), class_weights)}

#3. 각 샘플에 weight 부여
sample_weights = np.array([class_weight_dict[label] for label in y_train])

In [9]:
print(sample_weights)

[0.2497331  0.2497331  1.37440514 ... 0.2497331  3.76205032 0.2497331 ]


In [10]:
y_encoded.head()

0    3
1    4
2    2
3    3
4    4
Name: label, dtype: int64

###  smote

In [20]:
# smote = SMOTE(random_state=42)
# X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
# print("SMOTE 완료:", np.bincount(y_train_res))

In [21]:
# # XGBoost 모델 학습
# model = xgb.XGBClassifier(
#     n_estimators=100,
#     learning_rate=0.1,
#     max_depth=4,
#     random_state=42,
#     use_label_encoder=False,
#     eval_metric='mlogloss'
# )

# model.fit(X_train, y_train)

# # 예측 및 평가
# y_pred = model.predict(X_val)
# print("Validation Accuracy:", accuracy_score(y_val, y_pred))
# print(classification_report(y_val, y_pred, target_names=le_target.classes_))

# # 평가: micro F1-score
# micro_f1 = f1_score(y_val, y_pred, average='micro')
# print("Micro F1-score:", micro_f1)

In [11]:
for col in X.select_dtypes(include='int64'):
    X_train[col] = pd.to_numeric(X_train[col], downcast='integer')

for col in X.select_dtypes(include='float64'):
    X_train[col] = pd.to_numeric(X_train[col], downcast='float')

In [12]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3),
        'max_depth': trial.suggest_int('max_depth', 5, 14),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 7,
        'gamma': trial.suggest_float('gamma', 0, 5),
        'eval_metric': 'mlogloss',
        'tree_method': "hist"
    }
    
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred, average='macro')

# 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=6)  # n_trials=3은 실험 횟수 (더 늘리면 성능 향상 가능)

# 결과 출력
print("Best Parameters:", study.best_params)
print("Best Micro F1-score:", study.best_value)

[I 2025-04-09 11:33:31,807] A new study created in memory with name: no-name-d9292344-61cd-40d6-9538-7a5c533bf93d
[I 2025-04-09 11:49:55,401] Trial 0 finished with value: 0.8511451468512234 and parameters: {'n_estimators': 300, 'learning_rate': 0.1911192423062586, 'max_depth': 8, 'subsample': 0.7088976909107676, 'colsample_bytree': 0.7711289150731236, 'min_child_weight': 10, 'gamma': 0.42772311341079505}. Best is trial 0 with value: 0.8511451468512234.
[I 2025-04-09 12:15:19,959] Trial 1 finished with value: 0.7604401481666405 and parameters: {'n_estimators': 411, 'learning_rate': 0.07350639774257146, 'max_depth': 11, 'subsample': 0.7758553062783222, 'colsample_bytree': 0.9309725297994952, 'min_child_weight': 10, 'gamma': 4.788937177371047}. Best is trial 0 with value: 0.8511451468512234.
[I 2025-04-09 12:35:32,021] Trial 2 finished with value: 0.764778896696509 and parameters: {'n_estimators': 454, 'learning_rate': 0.11913804577137886, 'max_depth': 6, 'subsample': 0.840693703848217, '

Best Parameters: {'n_estimators': 300, 'learning_rate': 0.1911192423062586, 'max_depth': 8, 'subsample': 0.7088976909107676, 'colsample_bytree': 0.7711289150731236, 'min_child_weight': 10, 'gamma': 0.42772311341079505}
Best Micro F1-score: 0.8511451468512234


In [11]:
best_params = {
    'n_estimators': 300,
    'learning_rate': 0.1911192423062586,
    'max_depth': 8,
    'subsample': 0.7088976909107676,
    'colsample_bytree': 0.7711289150731236,
    'min_child_weight': 10,
    'gamma': 0.42772311341079505,
    'random_state': 42,
    'eval_metric': 'mlogloss',
    'tree_method': 'gpu_hist'
}

best_model = xgb.XGBClassifier(
    **best_params
)

best_model.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"



In [None]:
# best_model.fit(X_train, y_train,sample_weight=sample_weights)


In [12]:
le_target = joblib.load('le_target.pkl')

In [13]:
y_pred = best_model.predict(X_val)
print("Macro F1:", f1_score(y_val, y_pred, average='macro'))
print(classification_report(y_val, y_pred, target_names=le_target.classes_))


    E.g. tree_method = "hist", device = "cuda"



Macro F1: 0.7800072209095033
              precision    recall  f1-score   support

           A       0.93      0.60      0.73       324
           B       1.00      0.38      0.55        48
           C       0.86      0.79      0.82     42530
           D       0.84      0.81      0.83    116414
           E       0.97      0.98      0.98    640684

    accuracy                           0.95    800000
   macro avg       0.92      0.71      0.78    800000
weighted avg       0.94      0.95      0.95    800000



### 모델 저장

In [14]:
import joblib
joblib.dump(best_model, 'pkl/XGBOOST_078.pkl')  # 모델 저장

['pkl/XGBOOST_078.pkl']

### Predict

In [None]:
# X_test.drop(columns=['ID'],inplace=True)

In [None]:
# # row-level 예측 수행
# y_test_pred = model.predict(X_test)
# # 예측 결과를 변환
# y_test_pred_labels = le_target.inverse_transform(y_test_pred)

# # row 단위 예측 결과를 test_data에 추가
# test_data = test_df.copy()  # 원본 유지
# test_data["pred_label"] = y_test_pred_labels

### Submission

In [None]:
# submission = test_data.groupby("ID")["pred_label"] \
#     .agg(lambda x: x.value_counts().idxmax()) \
#     .reset_index()

# submission.columns = ["ID", "Segment"]
# submission.to_csv('../submit/0327.csv',index=False)