# SPA + CatBoost + Class Weights

In [2]:
%pip install pandas numpy scikit-learn catboost

Note: you may need to restart the kernel to use updated packages.


## 0. 데이터셋 생성

In [3]:
import pandas as pd

# CSV 파일 불러오기 (첫 열이 인덱스로 저장되어 있을 경우)
df1 = pd.read_csv('../data/all_train.csv', index_col=0)
# df2 = pd.read_csv('../data/all_test.csv', index_col=0)

# 인덱스 열 제거된 상태로 다시 저장
# df1.to_csv('../data/all_train.csv', index=False)
# df2.to_csv('../data/all_test.csv', index=False)

  df1 = pd.read_csv('../data/all_train.csv', index_col=0)


In [4]:
df1.head()

Unnamed: 0,기준년월,ID,남녀구분코드,연령,Segment,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,...,변동률_RV일시불평잔,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M
0,201807,TRAIN_000000,2,40대,D,1,1,0,1,1,...,0.999998,1.042805,0.9997,0.999998,0.999998,0.261886,0.270752,0.0,1.044401,1.280543
1,201807,TRAIN_000001,1,30대,E,1,1,1,1,1,...,1.092698,0.905663,0.999998,0.999998,0.999998,-0.563388,-0.670348,0.0,0.0,0.0
2,201807,TRAIN_000002,1,30대,C,1,1,0,1,1,...,1.006124,1.99359,0.852567,0.999998,0.999998,-0.046516,0.058114,-0.014191,0.524159,1.20842
3,201807,TRAIN_000003,2,40대,D,1,1,0,1,2,...,0.999998,1.050646,0.999877,0.999998,0.999998,0.023821,0.258943,0.0,0.880925,1.657124
4,201807,TRAIN_000004,2,40대,E,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.0,0.0,0.0,,


## 1. 수치형 데이터 분리 및 차원 축소 (SPA 이용)

In [6]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import pandas as pd

# 숫자형 컬럼만 선택
numeric_cols = df1.select_dtypes(include='number').columns
numeric_data = df1[numeric_cols]

# 결측치를 0으로 대체
imputer = SimpleImputer(strategy='constant', fill_value=0)
numeric_data_imputed = imputer.fit_transform(numeric_data)

# PCA로 95% 설명하는 주성분 수 자동 설정
pca = PCA(n_components=0.95, random_state=42)
reduced_numeric = pca.fit_transform(numeric_data_imputed)

# 결과를 DataFrame으로 변환
reduced_numeric_df = pd.DataFrame(reduced_numeric, columns=[f'pca_{i}' for i in range(reduced_numeric.shape[1])])

In [7]:
reduced_numeric_df.shape

(2400000, 4)

In [8]:
reduced_numeric_df.head()

Unnamed: 0,pca_0,pca_1,pca_2,pca_3
0,18422790.0,10582600.0,14218320.0,3070852.0
1,-84673800.0,18683020.0,-13627040.0,-3517066.0
2,-84859280.0,22090880.0,4979183.0,-10146770.0
3,18423130.0,10576250.0,14182250.0,3083927.0
4,19866190.0,-8914273.0,11211960.0,-12442550.0


In [9]:
%pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [10]:
import joblib

# imputer와 pca 객체 저장
joblib.dump(imputer, 'imputer.pkl')
joblib.dump(pca, 'pca.pkl')

# reduced_numeric_df도 필요시 저장
reduced_numeric_df.to_csv('train_pca_transformed.csv', index=False)

In [12]:
# #  추후 테스트에 사용할 코드
# import pandas as pd
# import joblib

# # test 데이터 로드
# test_df = pd.read_csv('test.csv')

# # 숫자형 컬럼만 추출
# numeric_cols = test_df.select_dtypes(include='number').columns
# numeric_test_data = test_df[numeric_cols]

# # 저장한 imputer 및 pca 불러오기
# imputer = joblib.load('imputer.pkl')
# pca = joblib.load('pca.pkl')

# # 동일하게 결측치 처리 후 PCA 적용
# numeric_test_imputed = imputer.transform(numeric_test_data)
# reduced_test = pca.transform(numeric_test_imputed)

# # DataFrame으로 변환
# reduced_test_df = pd.DataFrame(reduced_test, columns=[f'pca_{i}' for i in range(reduced_test.shape[1])])
# reduced_test_df.to_csv('test_pca_transformed.csv', index=False)


## 2. 범주형 데이터 처리

In [None]:
# object 타입 컬럼만 선택
categorical_cols = df1.select_dtypes(include='object').columns.tolist()

# 결측치를 문자열 'nan'으로 대체
df1[categorical_cols] = df1[categorical_cols].fillna('nan').astype(str)


In [21]:
categorical_data = df1[categorical_cols]


## 3. 축소된 수치형 + 범주형 데이터 합치기

In [None]:
X = pd.concat([reduced_numeric_df, categorical_data], axis=1)
y = df1['Segment']

In [27]:
# 최종 데이터프레임 생성
final_df = pd.concat([X, y], axis=1)

# CSV로 저장
final_df.to_csv('PCA_Data.csv', index=False)

## 4. 불균형 대응 (가중치 적용)

In [23]:
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(y)
weights = compute_class_weight('balanced', classes=classes, y=y)
class_weights = dict(zip(classes, weights))


In [24]:
X.shape

(2400000, 54)

## 5. 모델 학습 및 평가

In [26]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        depth=6,
        eval_metric='MultiClass',  # or 'Accuracy'
        custom_metric='F1',
        cat_features=categorical_cols,
        class_weights=class_weights,
        verbose=0,
        random_seed=42,
        early_stopping_rounds=50
    )

    model.fit(X_train, y_train, eval_set=(X_val, y_val))
    preds = model.predict(X_val)
    score = f1_score(y_val, preds, average='macro')
    f1_scores.append(score)

print("평균 Macro F1 Score:", np.mean(f1_scores))


KeyboardInterrupt: 

In [None]:
# 집가서

import pandas as pd
import optuna
import joblib
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 데이터 불러오기
df = pd.read_csv('PCA_Data.csv')
X = df.drop(columns=['Segment'])
y = df['Segment']

# 범주형 컬럼 자동 추출
categorical_cols = X.select_dtypes(include='object').columns.tolist()

# 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna 목적 함수 정의
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 300, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'random_seed': 42,
        'task_type': 'GPU',        # GPU 사용
        'devices': '0',
        'loss_function': 'MultiClass',
        'eval_metric': 'Accuracy',
        'early_stopping_rounds': 50,
        'verbose': 0
    }

    # Pool 객체로 학습
    train_pool = Pool(X_train, y_train, cat_features=categorical_cols)
    val_pool = Pool(X_val, y_val, cat_features=categorical_cols)

    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)

    return acc

# Optuna 튜닝
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# 최적 하이퍼파라미터로 모델 재학습
best_params = study.best_params
best_params.update({
    'task_type': 'GPU',
    'devices': '0',
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'early_stopping_rounds': 50,
    'random_seed': 42,
    'verbose': 100
})

final_pool = Pool(X, y, cat_features=categorical_cols)
final_model = CatBoostClassifier(**best_params)
final_model.fit(final_pool)

# 모델 및 관련 정보 저장
joblib.dump(final_model, 'best_catboost_model.pkl')
joblib.dump(categorical_cols, 'cat_features.pkl')
joblib.dump(best_params, 'best_params.pkl')

print("최적 하이퍼파라미터:", study.best_params)
print("모델, 범주형 컬럼, 파라미터 저장 완료.")
