# SPA + CatBoost + Class Weights

In [None]:
# %pip install pandas numpy scikit-learn catboost

## 0. 데이터셋 생성

In [None]:
import pandas as pd

# Load the training CSV, treating the first column as the saved row index
# (this assumes the file was written with its index included).
df1 = pd.read_csv(
    '../data/numeric_train.csv',
    index_col=0,
)
# df2 = pd.read_csv('../data/all_test.csv', index_col=0)

# Write the frames back out without the index column
# df1.to_csv('../data/all_train.csv', index=False)
# df2.to_csv('../data/all_test.csv', index=False)

## 1. 수치형 데이터 분리 및 차원 축소 (SPA 단계 — 현재 코드는 PCA 사용)

In [None]:
from sklearn.decomposition import PCA

# Numeric feature columns only. The label column 'target' is dropped here:
# when it has a numeric dtype, select_dtypes would otherwise hand it to PCA and
# the label would leak into the components that later become model features.
numeric_cols = df1.select_dtypes(include='number').columns.drop('target', errors='ignore')
numeric_data = df1[numeric_cols]

# Example: let PCA choose the number of components that explains 95% of the variance.
# NOTE(review): PCA is fit on every row before cross-validation, so validation
# rows influence the projection; fit it inside each fold for a strict estimate.
# NOTE(review): inputs are not standardized and PCA is scale-sensitive — confirm
# the columns are already on comparable scales.
pca = PCA(n_components=0.95, random_state=42)
reduced_numeric = pca.fit_transform(numeric_data)

# Carry df1's index over so this frame lines up row-for-row with any other
# df1-derived frame when they are combined on index labels.
reduced_numeric_df = pd.DataFrame(
    reduced_numeric,
    index=df1.index,
    columns=[f'pca_{i}' for i in range(reduced_numeric.shape[1])],
)

## 2. 범주형 데이터 처리

In [None]:
# Object-dtype columns are treated as categorical features. The label column
# 'target' is excluded so a string-typed label can never end up among the
# features (or in the cat_features list built from this variable).
categorical_cols = [
    col
    for col in df1.select_dtypes(include='object').columns
    if col != 'target'
]

# Replace missing values with an explicit string category: CatBoost expects
# categorical values to be strings or integers and rejects float NaN.
categorical_data = df1[categorical_cols].fillna('missing')


## 3. 축소된 수치형 + 범주형 데이터 합치기

In [None]:
# pd.concat(axis=1) aligns rows by index label, not by position. The PCA output
# is built from a NumPy array and may carry a default 0..n-1 index, while
# categorical_data and y carry df1's index (read from the CSV's first column).
# Put df1's index on the PCA frame first so rows are matched one-to-one instead
# of producing NaN-padded or mismatched rows.
X = pd.concat(
    [reduced_numeric_df.set_axis(df1.index, axis=0), categorical_data],
    axis=1,
)
y = df1['target']


## 4. 불균형 대응 (가중치 적용)

In [None]:
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Per-class weights using scikit-learn's 'balanced' heuristic, keyed by class
# label so they can be passed to the classifier as a mapping.
classes = np.unique(y)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
class_weights = {label: weight for label, weight in zip(classes, weights)}


## 5. 모델 학습 및 평가

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Stratified 5-fold cross-validation reporting the mean macro F1.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Second splitter used only to carve an early-stopping subset out of each
# training fold. Early stopping (and best-iteration selection) must not see the
# fold that is scored afterwards, otherwise the reported F1 is optimistic.
early_stop_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Take the first inner split: ~80% of the training fold to fit on,
    # ~20% to monitor for early stopping.
    fit_idx, stop_idx = next(early_stop_splitter.split(X_train, y_train))
    X_fit, X_stop = X_train.iloc[fit_idx], X_train.iloc[stop_idx]
    y_fit, y_stop = y_train.iloc[fit_idx], y_train.iloc[stop_idx]

    # NOTE(review): eval_metric='F1' looks like CatBoost's binary F1 while the
    # score below is macro-averaged — confirm the target is binary, or switch
    # to a multiclass metric if it is not.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        depth=6,
        eval_metric='F1',
        cat_features=categorical_cols,
        class_weights=class_weights,
        verbose=0,
        random_seed=42,
        early_stopping_rounds=50
    )

    model.fit(X_fit, y_fit, eval_set=(X_stop, y_stop))

    # Score on the outer validation fold, which played no part in training
    # or in choosing the stopping iteration.
    preds = model.predict(X_val)
    score = f1_score(y_val, preds, average='macro')
    f1_scores.append(score)

print("평균 Macro F1 Score:", np.mean(f1_scores))
