In [1]:
# ▶ CatBoost 설치 (최신 버전 추천)
!pip install -q catboost

In [2]:
import catboost
print(catboost.__version__)

# GPU 인식 확인
!nvidia-smi

1.2.8
Sat Jul 12 01:15:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   50C    P8             N/A /  115W |     922MiB /   8188MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                          

In [3]:
# ▶ 1. 필수 라이브러리
import pandas as pd
import numpy as np
import pickle
from catboost import CatBoostClassifier, Pool

# ▶ 2. 저장된 항목 로드
base_path = 'New unique'

X_vif = pd.read_csv(f'{base_path}/X_vif.csv')
y_vif_encoded = pd.read_csv(f'{base_path}/y_vif_encoded.csv')['Segment_encoded']

with open(f'{base_path}/cat_features.pkl', 'rb') as f:
    cat_features = pickle.load(f)

with open(f'{base_path}/label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

# ▶ 3. 최종 모델 학습
final_model = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="MultiClass",
    task_type="GPU",
    learning_rate=0.01,
    iterations=70000,
    l2_leaf_reg=50,
    random_seed=42,
    od_type="Iter",
    depth=5,
    early_stopping_rounds=15000,
    border_count=64,
    verbose=100
)

final_pool = Pool(X_vif, y_vif_encoded, cat_features=cat_features)
final_model.fit(final_pool)

# ▶ (선택) 학습된 모델 저장
final_model.save_model(f'{base_path}/final_catboost_model.cbm')

0:	learn: 1.5800769	total: 51.6ms	remaining: 1h 10s
100:	learn: 0.6176383	total: 4.77s	remaining: 54m 58s
200:	learn: 0.4219914	total: 9.48s	remaining: 54m 52s
300:	learn: 0.3547965	total: 14.2s	remaining: 54m 50s
400:	learn: 0.3275203	total: 18.9s	remaining: 54m 38s
500:	learn: 0.3144163	total: 23.5s	remaining: 54m 26s
600:	learn: 0.3067587	total: 28.2s	remaining: 54m 18s
700:	learn: 0.3014077	total: 32.9s	remaining: 54m 16s
800:	learn: 0.2974878	total: 37.6s	remaining: 54m 6s
900:	learn: 0.2943808	total: 42.2s	remaining: 53m 54s
1000:	learn: 0.2916911	total: 46.8s	remaining: 53m 45s
1100:	learn: 0.2894749	total: 51.4s	remaining: 53m 37s
1200:	learn: 0.2873693	total: 56.1s	remaining: 53m 35s
1300:	learn: 0.2855496	total: 1m	remaining: 53m 27s
1400:	learn: 0.2838527	total: 1m 5s	remaining: 53m 35s
1500:	learn: 0.2822571	total: 1m 10s	remaining: 53m 28s
1600:	learn: 0.2808254	total: 1m 14s	remaining: 53m 18s
1700:	learn: 0.2794807	total: 1m 19s	remaining: 53m 5s
1800:	learn: 0.2782659	t

In [4]:
# ▶ 테스트 데이터 불러오기
test = pd.read_csv('병합/merged_final_test_셀_추가병합.csv')

# ▶ ID 컬럼 저장 후 피처 정리
test_id = test['ID']
X_test = test[X_vif.columns]  # 컬럼 순서 맞추기
X_test = X_test.drop(columns=['ID'], errors='ignore')

# ▶ Pool 생성
test_pool = Pool(data=X_test, cat_features=cat_features)

# ▶ 예측 (class index → 문자 라벨로 복원)
preds_class = final_model.predict(test_pool)
preds_labels = le.inverse_transform(preds_class.astype(int).flatten())

# ▶ 제출 파일 생성
submission = pd.DataFrame({
    'ID': test_id,
    'Segment': preds_labels
})

# ▶ 저장
submission_path = '병합/submission_catboost.csv'
submission.to_csv(submission_path, index=False)

In [6]:
sub = pd.read_csv('병합/submission_new_unique.csv')
sub

Unnamed: 0,ID,Segment
0,TEST_00000,E
1,TEST_00001,E
2,TEST_00002,D
3,TEST_00003,E
4,TEST_00004,E
...,...,...
599995,TEST_99995,E
599996,TEST_99996,E
599997,TEST_99997,E
599998,TEST_99998,C


In [None]:
# 마지막 100,000개만 추출
submission_tail = submission.tail(100000)

# 제출 파일 저장
submission_tail.to_csv(
    '/content/drive/MyDrive/멋쟁이사자처럼_파이널프로젝트/catboost/submission_tail.csv',
    index=False,
    encoding='utf-8-sig'
)

print("✅ 마지막 100,000개로 제출 파일 저장 완료!")

✅ 마지막 100,000개로 제출 파일 저장 완료!


In [7]:
sub_tail = pd.read_csv('/content/drive/MyDrive/멋쟁이사자처럼_파이널프로젝트/catboost/submission_tail.csv')
sub_tail

NameError: name 'pd' is not defined

In [8]:
import pandas as pd

In [9]:
test1 = pd.read_csv('/content/drive/MyDrive/멋쟁이사자처럼_파이널프로젝트/catboost/submission_catboost.csv')
test1

Unnamed: 0,ID,Segment
0,TEST_00000,E
1,TEST_00001,E
2,TEST_00002,D
3,TEST_00003,E
4,TEST_00004,E
...,...,...
599995,TEST_99995,E
599996,TEST_99996,E
599997,TEST_99997,E
599998,TEST_99998,C


In [7]:
# ID별 Segment 다수결
submit_df = (
    sub
    .groupby('ID')['Segment']
    .agg(lambda x: x.value_counts().idxmax())
    .reset_index()
)

# 저장
submit_df.to_csv('병합/sub_new_unique_winner.csv', index=False, encoding='utf-8-sig')

In [8]:
test_win = pd.read_csv('병합/sub_new_unique_winner.csv')
test_win

Unnamed: 0,ID,Segment
0,TEST_00000,E
1,TEST_00001,E
2,TEST_00002,E
3,TEST_00003,E
4,TEST_00004,E
...,...,...
99995,TEST_99995,E
99996,TEST_99996,E
99997,TEST_99997,E
99998,TEST_99998,C
