# 제품 이상여부 판별 프로젝트


In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. 데이터 불러오기


### 필수 라이브러리


In [13]:
!pip install catBoost
!pip install optuna



In [14]:
import os
import optuna
import sklearn
from pprint import pprint
from catboost import CatBoostClassifier, Pool

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import(
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### 데이터 읽어오기


In [15]:
ROOT_DIR = "/content/drive/MyDrive/Colab Notebooks/LG AIMERS/예선(수정)/0820"
RANDOM_STATE = 110

# Load data
train_data= pd.read_csv(os.path.join(ROOT_DIR, "train_data_cleaned(스케일링후).csv"))
train_data

Unnamed: 0,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE STANDBY POSITION X Collect Result_Dam,CURE STANDBY POSITION Z Collect Result_Dam,...,Head Clean Position X Collect Result_Fill2,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,Production Qty Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,0.0,0.0,0.0,0.857143,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.364023,0.014374,0.058824,Normal
1,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.007505,0.379877,0.000000,Normal
2,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1.0,1.0,1.0,0.428571,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.364023,0.020534,0.058824,Normal
3,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1.0,1.0,1.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.008630,0.550308,0.000000,Normal
4,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.007880,0.248460,0.000000,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40094,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.006004,0.652977,0.000000,Normal
40095,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1.0,1.0,1.0,0.857143,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.364023,0.028747,0.058824,Normal
40096,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,0.0,0.0,0.0,0.857143,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.252908,0.002053,0.058824,Normal
40097,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1.0,1.0,1.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.009381,0.240246,0.000000,Normal


### 언더 샘플링


데이타 불균형을 해결하기 위해 언더 샘플링을 진행합니다.


In [16]:
# 언더 샘플링 비율 조정
normal_ratio = 1.0
additional_ratio = 0.5  # 추가로 유지할 Normal 데이터의 비율

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# 언더 샘플링: Normal 데이터를 AbNormal 데이터 수와 맞춤
df_normal_sampled = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)

# 추가로 일부 Normal 데이터를 유지
df_additional_normal = df_normal.sample(n=int(num_abnormal * additional_ratio), replace=False, random_state=RANDOM_STATE)

# 최종 데이터셋 결합
df_concat = pd.concat([df_normal_sampled, df_additional_normal, df_abnormal], axis=0).reset_index(drop=True)

# 결과 확인
print(df_concat.value_counts("target"))

  Total: Normal: 37774, AbNormal: 2325
target
Normal      3487
AbNormal    2325
Name: count, dtype: int64


### 데이터 분할


In [17]:
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.2,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)


def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "Abnormal"])

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {num_abnormal/num_normal}")


# Print statistics
print(f"  \tAbnormal\tNormal")
print_stats(df_train)
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 2789, AbNormal: 0 ratio: 0.0
  Total: Normal: 698, AbNormal: 0 ratio: 0.0


## 3. 모델 학습


In [18]:
# 학습 데이터에서 target을 제외한 features 리스트 정의
features = [col for col in df_train.columns if col != 'target']
# 학습 데이터에서 cat_features 인덱스 리스트 정의
cat_features = [i for i, col in enumerate(features) if df_train[col].dtype == 'object']

# train_x와 train_y 설정
train_x = df_train[features]
train_y = df_train["target"]

In [19]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_y_encoded = label_encoder.fit_transform(train_y)

### 모델 정의


In [32]:
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import make_scorer, accuracy_score
clf = CatBoostClassifier()
scorer = make_scorer(accuracy_score)

train_x_train, train_x_val, train_y, train_y_val = sklearn.model_selection.train_test_split(train_x, train_y_encoded, random_state=110)

def objective(trial):
   model_pool = Pool(data = train_x_train, label = train_y, cat_features = cat_features)
   eval_pool = Pool(data = train_x_val, label = train_y_val, cat_features = cat_features)
   params = {
        "learning_rate" : trial.suggest_float("learning_rate", 0.001, 0.1),
        "colsample_bylevel" : trial.suggest_float("colsample_bylevel", 0.1,1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "iterations": trial.suggest_int("iterations", 500, 1000),
        "l2_leaf_reg": trial.suggest_discrete_uniform("l2_leaf_reg", 0.001, 10, 0.1),
        "min_child_samples": trial.suggest_categorical("min_child_samples", [1, 4, 8, 16, 32]),
        "border_count": trial.suggest_int("border_count", 30, 250),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
            }

   model = CatBoostClassifier(**params, objective='Logloss',eval_metric='F1', grow_policy='SymmetricTree')
   model.fit(model_pool, eval_set=eval_pool, early_stopping_rounds=20, verbose=0)
   y_pred = model.predict(train_x_val)
   error = sklearn.metrics.mean_squared_error(train_y_val, y_pred)
   return error

In [33]:
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(objective, n_trials = 10)
hyperparams = study.best_params

[I 2024-08-24 02:53:04,335] A new study created in memory with name: no-name-7da6f04c-e277-4fdb-80c7-ff8614b226bd
  "l2_leaf_reg": trial.suggest_discrete_uniform("l2_leaf_reg", 0.001, 10, 0.1),
[I 2024-08-24 02:53:05,260] Trial 0 finished with value: 0.352536543422184 and parameters: {'learning_rate': 0.030930574782374702, 'colsample_bylevel': 0.103919214803733, 'depth': 4, 'boosting_type': 'Ordered', 'iterations': 998, 'l2_leaf_reg': 1.201, 'min_child_samples': 8, 'border_count': 116, 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.352536543422184.
  "l2_leaf_reg": trial.suggest_discrete_uniform("l2_leaf_reg", 0.001, 10, 0.1),
[I 2024-08-24 02:53:09,638] Trial 1 finished with value: 0.351676698194325 and parameters: {'learning_rate': 0.038314458784279984, 'colsample_bylevel': 0.2374034191252368, 'depth': 10, 'boosting_type': 'Ordered', 'iterations': 934, 'l2_leaf_reg': 1.701, 'min_child_samples': 16, 'border_count': 166, 'bootstrap_type': 'Bayesian'}. Best is trial 1 with valu

In [None]:
'''
{'learning_rate': 0.030930574782374702, 'colsample_bylevel': 0.103919214803733, 'depth': 4, 'boosting_type': 'Ordered', 'iterations': 998, 'l2_leaf_reg': 1.201, 'min_child_samples': 8, 'border_count': 116, 'bootstrap_type': 'MVS'}.

{'learning_rate': 0.038314458784279984, 'colsample_bylevel': 0.2374034191252368, 'depth': 10, 'boosting_type': 'Ordered', 'iterations': 934, 'l2_leaf_reg': 1.701, 'min_child_samples': 16, 'border_count': 166, 'bootstrap_type': 'Bayesian'}

{'learning_rate': 0.09373862539744054, 'colsample_bylevel': 0.9222253783391615, 'depth': 8, 'boosting_type': 'Plain', 'iterations': 922, 'l2_leaf_reg': 6.801000000000001, 'min_child_samples': 16, 'border_count': 204, 'bootstrap_type': 'Bernoulli'}

{'learning_rate': 0.036591497324882365, 'colsample_bylevel': 0.14571590169210963, 'depth': 4, 'boosting_type': 'Ordered', 'iterations': 573, 'l2_leaf_reg': 5.401000000000001, 'min_child_samples': 16, 'border_count': 108, 'bootstrap_type': 'Bayesian'}

{'learning_rate': 0.09429663085721737, 'colsample_bylevel': 0.6801106325179391, 'depth': 10, 'boosting_type': 'Ordered', 'iterations': 557, 'l2_leaf_reg': 8.201, 'min_child_samples': 1, 'border_count': 118, 'bootstrap_type': 'Bernoulli'}

{'learning_rate': 0.09906318708829609, 'colsample_bylevel': 0.7793346248230172, 'depth': 2, 'boosting_type': 'Ordered', 'iterations': 589, 'l2_leaf_reg': 0.001, 'min_child_samples': 32, 'border_count': 217, 'bootstrap_type': 'MVS'}

{'learning_rate': 0.010321851600690938, 'colsample_bylevel': 0.7761766634024944, 'depth': 4, 'boosting_type': 'Ordered', 'iterations': 825, 'l2_leaf_reg': 1.701, 'min_child_samples': 16, 'border_count': 132, 'bootstrap_type': 'Bernoulli'}

{'learning_rate': 0.08064580790835855, 'colsample_bylevel': 0.3985339981292758, 'depth': 5, 'boosting_type': 'Ordered', 'iterations': 907, 'l2_leaf_reg': 6.401000000000001, 'min_child_samples': 8, 'border_count': 38, 'bootstrap_type': 'MVS'}

{'learning_rate': 0.026726488495421405, 'colsample_bylevel': 0.6890711015515465, 'depth': 10, 'boosting_type': 'Plain', 'iterations': 692, 'l2_leaf_reg': 8.600999999999999, 'min_child_samples': 8, 'border_count': 147, 'bootstrap_type': 'Bayesian'}

{'learning_rate': 0.04103926981099322, 'colsample_bylevel': 0.5112422296737442, 'depth': 2, 'boosting_type': 'Plain', 'iterations': 599, 'l2_leaf_reg': 1.901, 'min_child_samples': 32, 'border_count': 87, 'bootstrap_type': 'Bayesian'}
'''

### 모델학습



In [35]:
# 학습 데이터에서 target을 제외한 features 리스트 정의
features = [col for col in df_train.columns if col != 'target']
# 학습 데이터에서 cat_features 인덱스 리스트 정의
cat_features = [i for i, col in enumerate(features) if df_train[col].dtype == 'object']

# train_x와 train_y 설정
train_x = df_train[features]
train_y = df_train["target"]

In [36]:
# 최적 하이퍼파라미터로 모델 설정 및 재학습
best_model = CatBoostClassifier(**hyperparams, class_weights=[1.2, 1.0])

# 필터링된 학습 데이터로 모델 재학습
best_model.fit(train_x, train_y, cat_features=cat_features)

# F1-Score: 0.5083

0:	learn: 0.6864903	total: 97.8ms	remaining: 1m 30s
1:	learn: 0.6815451	total: 179ms	remaining: 1m 22s
2:	learn: 0.6765254	total: 270ms	remaining: 1m 22s
3:	learn: 0.6728142	total: 392ms	remaining: 1m 30s
4:	learn: 0.6696190	total: 491ms	remaining: 1m 30s
5:	learn: 0.6640876	total: 605ms	remaining: 1m 32s
6:	learn: 0.6591729	total: 717ms	remaining: 1m 33s
7:	learn: 0.6555234	total: 827ms	remaining: 1m 34s
8:	learn: 0.6521025	total: 928ms	remaining: 1m 34s
9:	learn: 0.6488401	total: 1.06s	remaining: 1m 36s
10:	learn: 0.6474447	total: 1.14s	remaining: 1m 34s
11:	learn: 0.6451307	total: 1.27s	remaining: 1m 36s
12:	learn: 0.6425310	total: 1.39s	remaining: 1m 36s
13:	learn: 0.6386751	total: 1.5s	remaining: 1m 37s
14:	learn: 0.6364791	total: 1.6s	remaining: 1m 36s
15:	learn: 0.6340530	total: 1.7s	remaining: 1m 36s
16:	learn: 0.6323353	total: 1.8s	remaining: 1m 35s
17:	learn: 0.6299276	total: 1.88s	remaining: 1m 34s
18:	learn: 0.6280441	total: 1.96s	remaining: 1m 33s
19:	learn: 0.6270017	tota

<catboost.core.CatBoostClassifier at 0x7fc29b192f50>

In [37]:
# 피처 중요도 계산
feature_importances = best_model.get_feature_importance()

# 중요도와 피처 이름을 함께 정리
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# 중요도가 0.5% 이하인 피처 필터링
low_importance_threshold = 0.5  # 임계값을 0.5%로 설정
low_importance_features = importance_df[importance_df['Importance'] < low_importance_threshold]['Feature']

# 중요도가 0인 피처를 학습 데이터와 테스트 데이터에서 제거
train_x_filtered = train_x.drop(columns = low_importance_features)
train_x_filtered

Unnamed: 0,Model.Suffix_Dam,Workorder_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage2) Collect Result_Dam,Dispense Volume(Stage3) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam,...,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1,HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1,HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1,HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1,HEAD Standby Position X Collect Result_Fill1,Machine Tact time Collect Result_Fill1,Production Qty Collect Result_Fill1,Machine Tact time Collect Result_Fill2,Production Qty Collect Result_Fill2
205,AJX75334501,4B1XC554-1,0.341880,0.661765,0.338983,0.470588,0.294737,1.000000,0.003070,0.779363,...,0.442178,0.000600,0.997656,0.577652,0.577652,0.005143,0.724195,0.028747,0.252908,0.028747
3257,AJX75334501,4B1X9671-1,0.307692,0.632353,0.305085,0.441176,0.263158,0.000000,0.991814,0.771210,...,0.442325,0.000343,0.001116,0.791667,0.791667,0.007669,0.724195,0.004107,0.252908,0.004107
4290,AJX75334501,3H1XE642-1,0.008547,0.161765,0.008475,0.117647,0.000000,0.997938,0.772064,0.001274,...,0.000000,0.235113,0.001451,0.000000,0.000000,1.000000,0.409579,0.032854,0.006379,0.032854
427,AJX75334501,4A1XA642-1,0.307692,0.558824,0.305085,0.397059,0.263158,1.000000,0.003837,0.780127,...,0.441444,0.000171,0.997656,0.142045,0.142045,0.000000,0.724195,0.028747,0.252908,0.028747
0,AJX75334501,4B1XE007-1,0.299145,0.661765,0.296610,0.470588,0.252632,1.000000,0.003070,0.779363,...,0.442178,0.000600,0.997656,0.577652,0.577652,0.005143,0.724195,0.030801,0.252908,0.030801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,AJX75334502,3K1X7331-1,1.000000,0.676471,0.991525,0.470588,0.863158,0.000000,0.778460,1.000000,...,0.000000,1.000000,0.996539,0.189394,0.189394,1.000000,0.146160,0.049281,0.009381,0.049281
3541,AJX75334501,3F1X9644-1,0.008547,0.014706,0.008475,0.014706,0.000000,0.996649,0.772064,0.002548,...,0.000734,0.234856,0.000781,0.189394,0.189394,1.000000,0.417836,0.205339,0.004878,0.205339
1450,AJX75334501,3G1X8646-1,0.000000,0.014706,0.008475,0.014706,0.000000,0.996649,0.772064,0.002548,...,0.000734,0.234856,0.000781,0.189394,0.189394,1.000000,0.422791,0.804928,0.004128,0.804928
3483,AJX75334501,3M1XB515-1,0.307692,0.411765,0.296610,0.676471,0.810526,0.996649,0.007675,0.783694,...,0.442912,0.000771,0.997656,0.189394,0.189394,0.000559,0.864575,0.022587,0.252908,0.022587


## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [38]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_data_cleaned(스케일링후).csv"))

In [39]:
test_data

Unnamed: 0,Set ID,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE STANDBY POSITION X Collect Result_Dam,...,Head Clean Position X Collect Result_Fill2,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,Production Qty Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,0001be084fbc4aaa9d921f39e595961b,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XF767-1,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.021669,0.401235,0.000000,
1,0005bbd180064abd99e63f9ed3e1ac80,Dam Dispenser,Dam dispenser #2,AJX75334501,4B1XD472-2,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.694444,0.028807,0.058824,
2,000948934c4140d883d670adcb609584,Dam Dispenser,Dam dispenser #1,AJX75334501,3H1XE355-1,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.020637,0.201646,0.000000,
3,000a6bfd02874c6296dc7b2e9c5678a7,Dam Dispenser,Dam dispenser #2,AJX75334501,3L1XA128-1,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.694444,0.028807,0.058824,
4,0018e78ce91343678716e2ea27a51c95,Dam Dispenser,Dam dispenser #1,AJX75334501,4A1XA639-1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.694444,0.002058,0.058824,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,ffea508b59934d689b540f95eb3fa730,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1XB597-1,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.694444,0.028807,0.058824,
17357,ffed8923c8a448a98afc641b770be153,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XB974-1,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.694444,0.024691,0.058824,
17358,fff1e73734da40adbe805359b3efb462,Dam Dispenser,Dam dispenser #1,AJX75334501,3L1XA998-1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.694444,0.008230,0.058824,
17359,fff8e38bdd09470baf95f71e92075dec,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1XC376-1,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.012382,0.240741,0.000000,


In [41]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_data_cleaned(스케일링후).csv"))
# 테스트 데이터에서 학습에 사용된 features만 선택 및 피처 일치
df_test_x_filtered = test_data[train_x_filtered.columns.intersection(test_data.columns)]

# 학습 데이터와 동일한 순서로 테스트 데이터의 피처 정렬
df_test_x_filtered = df_test_x_filtered.reindex(columns=train_x_filtered.columns, fill_value=0)

# 최적 하이퍼파라미터로 모델 설정 및 재학습
best_model_filtered = CatBoostClassifier(
    cat_features=[i for i, col in enumerate(train_x_filtered.columns) if train_x_filtered[col].dtype == 'object'],
    **hyperparams, class_weights=[1.2, 1.0]
)

# 필터링된 학습 데이터로 모델 재학습
best_model_filtered.fit(train_x_filtered, train_y)

# CatBoost에서 사용할 Pool 객체 생성 (범주형 열 지정)
test_pool_filtered = Pool(df_test_x_filtered, cat_features=[i for i, col in enumerate(df_test_x_filtered.columns) if df_test_x_filtered[col].dtype == 'object'])

# 최적 모델로 예측 수행
test_pred_filtered = best_model_filtered.predict(test_pool_filtered)
test_pred_filtered

0:	learn: 0.6860006	total: 59.1ms	remaining: 54.4s
1:	learn: 0.6815573	total: 110ms	remaining: 50.4s
2:	learn: 0.6783205	total: 135ms	remaining: 41.2s
3:	learn: 0.6734495	total: 160ms	remaining: 36.7s
4:	learn: 0.6658575	total: 201ms	remaining: 36.8s
5:	learn: 0.6598426	total: 236ms	remaining: 36.1s
6:	learn: 0.6539098	total: 275ms	remaining: 36s
7:	learn: 0.6507497	total: 317ms	remaining: 36.2s
8:	learn: 0.6471252	total: 356ms	remaining: 36.1s
9:	learn: 0.6430365	total: 393ms	remaining: 35.8s
10:	learn: 0.6411772	total: 438ms	remaining: 36.3s
11:	learn: 0.6377451	total: 461ms	remaining: 34.9s
12:	learn: 0.6331754	total: 481ms	remaining: 33.6s
13:	learn: 0.6305746	total: 505ms	remaining: 32.8s
14:	learn: 0.6273695	total: 528ms	remaining: 31.9s
15:	learn: 0.6272276	total: 534ms	remaining: 30.2s
16:	learn: 0.6260081	total: 552ms	remaining: 29.4s
17:	learn: 0.6247812	total: 575ms	remaining: 28.9s
18:	learn: 0.6236688	total: 590ms	remaining: 28.1s
19:	learn: 0.6216750	total: 615ms	remainin

array(['AbNormal', 'Normal', 'AbNormal', ..., 'AbNormal', 'AbNormal',
       'Normal'], dtype=object)

In [42]:
from sklearn.metrics import f1_score
# 검증 데이터에서 사용할 특성과 타겟 설정
test_x = df_val[features]  # df_val에서 사용할 특성들
test_y = df_val["target"]  # df_val에서 실제 타겟값

# 검증 데이터에 대한 예측 수행
# 범주형 피처는 train_x_filtered와 동일하게 지정
test_pool_filtered = Pool(
    test_x,
    cat_features=[i for i, col in enumerate(test_x.columns) if test_x[col].dtype == 'object']
)

test_pred_filtered = best_model_filtered.predict(test_pool_filtered)

# F1-score 계산 (AbNormal 기준)
f1 = f1_score(test_y, test_pred_filtered, average='binary', pos_label='AbNormal')
print(f"F1-Score: {f1:.4f}")

F1-Score: 0.5690
