In [0]:
import pandas as pd

unsupervised_pdf = pd.read_csv("/dbfs/mnt/my-mount/unsupervised_pdf.csv")
print(unsupervised_pdf.head()) 

      transaction_time  TransactionDT  ... pseudo_label_avg  pseudo_label_weighted
0  2025-01-14 09:56:23          86400  ...                0                      0
1  2025-04-20 22:42:41          86401  ...                0                      0
2  2025-04-09 19:27:13          86469  ...                0                      0
3  2025-01-30 01:26:54          86499  ...                0                      0
4  2025-01-24 22:58:30          86506  ...                0                      0

[5 rows x 44 columns]


In [0]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from bayes_opt import BayesianOptimization


In [0]:
from sklearn.preprocessing import StandardScaler

feature_cols = [
    'TransactionAmt', 'hour', 'voice_match', 'is_new_account_for_user',
    'avg_amount_to_bank', 'amount_ratio_to_bank_avg', 'is_nighttime', 
    'recent_transaction_gap','is_new_device', 'vpn', 'rooting'
]

# 입력 데이터
X_raw = unsupervised_pdf[feature_cols]
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)
print(f'X:{X}')

#타겟
y = unsupervised_pdf['pseudo_label_avg']
print(f'y:{y}')

X:[[-0.27816747 -0.35987014  0.99989163 ... -1.00064369 -0.99886271
  -0.99915706]
 [-0.44332726  1.51973769 -1.00010838 ... -1.00064369 -0.99886271
  -0.99915706]
 [-0.31788944  1.08598204  0.99989163 ... -1.00064369  1.00113859
   1.00084365]
 ...
 [-0.4351738   1.51973769 -1.00010838 ... -1.00064369 -0.99886271
   1.00084365]
 [-0.07537632  1.23056725  0.99989163 ... -1.00064369 -0.99886271
   1.00084365]
 [ 0.6059601  -0.7936258  -1.00010838 ... -1.00064369  1.00113859
  -0.99915706]]
y:0         0
1         0
2         0
3         0
4         0
         ..
590535    0
590536    0
590537    0
590538    0
590539    1
Name: pseudo_label_avg, Length: 590540, dtype: int64


In [0]:
# X, y는 전체 데이터셋
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [0]:
#XGBoost 훈련
xgb_model =XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    scale_pos_weight=(y==0).sum() / (y==1).sum(),
    random_state=42
)
xgb_model.fit(X_train, y_train)

In [0]:
rf_model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42
)
rf_model.fit(X_train, y_train)

In [0]:
lgbm_model = LGBMClassifier(
    class_weight='balanced',
    random_state=42
)
lgbm_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 23622, number of negative: 448810
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 472432, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [0]:
stack_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('lgbm', lgbm_model)
    ],
    final_estimator=LogisticRegression(class_weight='balanced'),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    passthrough=True,
    n_jobs=-1
)

stack_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 18897, number of negative: 359048
[LightGBM] [Info] Number of positive: 18898, number of negative: 359048
[LightGBM] [Info] Number of positive: 18898, number of negative: 359048
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028824 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 377946, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041496 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data point

In [0]:
y_pred = stack_model.predict(X_val)
print(y_pred)

[0 0 0 ... 1 0 0]


In [0]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

print("📊 Classification Report")
print(classification_report(y_val, y_pred, target_names=["정상", "이상"]))

print("🧩 Confusion Matrix")
print(confusion_matrix(y_val, y_pred))

print(f"🎯 F1 Score: {f1_score(y_val, y_pred):.4f}")


📊 Classification Report
              precision    recall  f1-score   support

          정상       1.00      0.94      0.97    112203
          이상       0.47      0.95      0.63      5905

    accuracy                           0.94    118108
   macro avg       0.73      0.95      0.80    118108
weighted avg       0.97      0.94      0.95    118108

🧩 Confusion Matrix
[[105816   6387]
 [   277   5628]]
🎯 F1 Score: 0.6281


In [0]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import f1_score

y_proba = xgb_model.predict_proba(X_val)
y_proba_class1 = y_proba[:, 1]

# 목적 함수 정의
def optimize_threshold(threshold):
    threshold = float(threshold)
    y_pred = (y_proba_class1 >= threshold).astype(int)
    return f1_score(y_val, y_pred)

# Bayesian Optimization 설정
optimizer = BayesianOptimization(
    f=optimize_threshold,
    pbounds={"threshold": (0.1, 0.9)},
    random_state=42,
    verbose=2
)

optimizer.maximize(init_points=5, n_iter=25)

best_threshold = optimizer.max['params']['threshold']
print(f"✅ Best threshold by Bayesian Optimization: {best_threshold:.4f}")


|   iter    |  target   | threshold |
-------------------------------------
| [39m1        [39m | [39m0.5727   [39m | [39m0.3996   [39m |
| [35m2        [39m | [35m0.7562   [39m | [35m0.8606   [39m |
| [39m3        [39m | [39m0.6799   [39m | [39m0.6856   [39m |
| [39m4        [39m | [39m0.6413   [39m | [39m0.5789   [39m |
| [39m5        [39m | [39m0.4879   [39m | [39m0.2248   [39m |
| [35m6        [39m | [35m0.7702   [39m | [35m0.8998   [39m |
| [39m7        [39m | [39m0.7684   [39m | [39m0.895    [39m |
| [39m8        [39m | [39m0.7702   [39m | [39m0.8999   [39m |
| [35m9        [39m | [35m0.7703   [39m | [35m0.8998   [39m |
| [39m10       [39m | [39m0.7701   [39m | [39m0.9      [39m |
| [39m11       [39m | [39m0.7701   [39m | [39m0.9      [39m |
| [35m12       [39m | [35m0.7703   [39m | [35m0.8998   [39m |
| [39m13       [39m | [39m0.7699   [39m | [39m0.9      [39m |
| [39m14       [39m | [39m0.77     [

In [0]:
custom_threshold = 0.89
y_pred_custom = (y_proba[:, 1] >= custom_threshold).astype(int)
from sklearn.metrics import classification_report, confusion_matrix

print(f"📊 Classification Report (threshold={custom_threshold})")
print(classification_report(y_val, y_pred_custom, target_names=["정상", "이상"]))

print("🧩 Confusion Matrix")
print(confusion_matrix(y_val, y_pred_custom))

📊 Classification Report (threshold=0.89)
              precision    recall  f1-score   support

          정상       0.99      0.98      0.99    112203
          이상       0.73      0.81      0.77      5905

    accuracy                           0.98    118108
   macro avg       0.86      0.90      0.88    118108
weighted avg       0.98      0.98      0.98    118108

🧩 Confusion Matrix
[[110436   1767]
 [  1140   4765]]
