<a href="https://colab.research.google.com/github/bineee39/first-repository/blob/master/MainQuest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Read the training and test CSV files from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [None]:
# Show the (rows, columns) dimensions of the training frame.
train_df.shape

In [None]:
# Show the (rows, columns) dimensions of the test frame.
test_df.shape

In [None]:
# Display pandas' descriptive summary of the training frame.
train_df.describe()

In [None]:
# Count the distinct values in the 'id' column.
train_df['id'].nunique()

In [None]:
# Plot Time against id. The frame is passed by keyword and the columns by name
# so the call does not depend on `data` being accepted as a positional argument.
sns.scatterplot(data=train_df, x='id', y='Time')

id에 따라 Time이 증가한다. id를 부여할 때 Time을 오름차순으로 정렬한 뒤 부여한 것으로 보인다.

In [None]:
# Plot Amount against Time. The frame is passed by keyword and the columns by
# name so the call does not depend on `data` being accepted positionally.
sns.scatterplot(data=train_df, x='Time', y='Amount')

In [None]:
# Outliers may themselves be signals of fraudulent transactions, so they are
# kept as-is and the features are only scaled rather than filtered.

In [None]:
# Bar chart of how many training rows fall into each Class value.
sns.countplot(x='Class', data=train_df)

In [None]:
# Absolute number of rows per Class value.
train_df['Class'].value_counts()

In [None]:
# Same counts expressed as proportions of all rows (normalize=True).
train_df['Class'].value_counts(normalize=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between every column except 'id'.
corr_matrix = train_df.drop(columns='id').corr()

# Keep only each column's correlation with the target, highest first.
corr_with_class = corr_matrix.loc[:, ['Class']]
class_corr_sorted = corr_with_class.sort_values(by='Class', ascending=False)

plt.figure(figsize=(15, 8))
sns.heatmap(class_corr_sorted, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation of all features with Class')
plt.show()

이미 PCA를 거친 변수들이므로 상관계수가 낮게 나온다.

In [None]:
from sklearn.preprocessing import RobustScaler

# Columns to rescale.
cols_to_scale = ['Time', 'Amount']

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both frames (the test frame is never used for fitting).
scaler = RobustScaler().fit(train_df[cols_to_scale])
train_df[cols_to_scale] = scaler.transform(train_df[cols_to_scale])
test_df[cols_to_scale] = scaler.transform(test_df[cols_to_scale])

train_df[cols_to_scale].describe()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Remove the 'id' column from the training frame. Rebinding the name (instead
# of inplace=True) with errors='ignore' keeps this cell safe to run twice.
train_df = train_df.drop(columns='id', errors='ignore')

In [None]:
# Remove the 'id' column from the test frame. Rebinding the name (instead of
# inplace=True) with errors='ignore' keeps this cell safe to run twice.
test_df = test_df.drop(columns='id', errors='ignore')

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Separate the target column from the predictor columns.
y = train_df['Class']
X = train_df.drop(columns='Class')

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score, recall_score, precision_score

# Hold out 20% of the rows for validation. shuffle=False keeps the original
# row order, so the validation part is the tail of the frame.
split_options = dict(test_size=0.2, shuffle=False, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print(f"원본 y_train 클래스 분포:\n{y_train.value_counts()}")

# Oversample to balance the classes; only the training part is resampled.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"SMOTE 적용 후 y_train 클래스 분포:\n{y_train_smote.value_counts()}")

# Untuned candidate classifiers, each with a fixed seed.
models = {
    "Logistic Regression (Lasso)": LogisticRegression(penalty='l1', C=1.0, solver='liblinear', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1),
    "XGBoost": XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss'),
}

# Result-table column name -> metric function, in display order.
metric_fns = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

results = []

for name, model in models.items():
    # Train on the SMOTE-resampled data, score on the untouched validation split.
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_val)

    row = {'Model': name}
    for metric_name, metric_fn in metric_fns.items():
        row[metric_name] = metric_fn(y_val, y_pred)
    results.append(row)

    f1 = row['F1 Score']
    print(f"[{name}] 검증 데이터 F1 Score: {f1:.4f}")

    print("\n[Classification Report]")
    print(classification_report(y_val, y_pred, target_names=['Normal (0)', 'Fraud (1)']))

final_results_df = pd.DataFrame(results).set_index('Model')

print("최종 성능 지표")
print(final_results_df.sort_values(by='F1 Score', ascending=False))

하이퍼파라미터 최적화를 거치지 않은 상태에서는 XGBoost의 F1 score가 가장 높았다.

In [None]:
!pip install optuna

# 로지스틱 하이퍼파라미터(C) 최적화

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
def objective_lr(trial):
    """Optuna objective: validation F1 of an L1 logistic regression for a sampled C.

    Args:
        trial: Optuna trial used to sample the regularisation parameter ``C``.

    Returns:
        F1 score of the fitted model's predictions on ``X_val`` against ``y_val``.
    """
    # Sample C on a log scale. suggest_float(..., log=True) replaces the
    # deprecated suggest_loguniform call.
    c_param = trial.suggest_float('C', 1e-4, 100, log=True)

    model = LogisticRegression(
        penalty='l1',
        C=c_param,
        solver='liblinear',
        random_state=42
    )

    # Train on the SMOTE-resampled data, score on the untouched validation split.
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_val)

    return f1_score(y_val, y_pred)

# Seed the TPE sampler so the search can be reproduced after a kernel restart.
study_lr = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lr.optimize(objective_lr, n_trials=50, show_progress_bar=True)

print("\n[로지스틱 회귀] 최적 F1 Score:", study_lr.best_value)
print("[로지스틱 회귀] 최적 하이퍼파라미터:", study_lr.best_params)

# Refit a model with the best sampled parameters on the resampled training data.
best_lr_params = study_lr.best_params
best_lr_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42, **best_lr_params)
best_lr_model.fit(X_train_smote, y_train_smote)

F1 score가 높아지긴 했지만 0.2로 유의미한 결과를 주진 못했다.

In [None]:
from lightgbm import LGBMClassifier
def objective_lgbm(trial):
    """Optuna objective: validation F1 of a LightGBM classifier trained on the SMOTE data.

    Args:
        trial: Optuna trial that samples the LightGBM hyperparameters.

    Returns:
        F1 score of the model's predictions on ``X_val`` against ``y_val``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-scale sampling of the former loguniform calls.
    lgbm_params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'num_leaves': trial.suggest_int('num_leaves', 31, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

    model = LGBMClassifier(
        random_state=42,
        n_jobs=-1,
        verbosity=-1,
        **lgbm_params
    )

    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred)
# Seed the TPE sampler so the search can be reproduced after a kernel restart.
study_lgbm = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgbm.optimize(objective_lgbm, n_trials=100, show_progress_bar=True)  # more trials (n_trials) recommended

print("\n[LightGBM] 최적 F1 Score:", study_lgbm.best_value)
print("[LightGBM] 최적 하이퍼파라미터:", study_lgbm.best_params)


# Refit a model with the best sampled parameters on the resampled training data.
best_lgbm_params = study_lgbm.best_params
best_lgbm_model = LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1, **best_lgbm_params)
best_lgbm_model.fit(X_train_smote, y_train_smote)

In [None]:
from xgboost import XGBClassifier

def objective_xgb(trial):
    """Optuna objective: validation F1 of an XGBoost classifier trained on the SMOTE data.

    Args:
        trial: Optuna trial that samples the XGBoost hyperparameters.

    Returns:
        F1 score of the model's predictions on ``X_val`` against ``y_val``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-scale sampling of the former loguniform calls.
    xgb_params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        # Fixed (not sampled), so it does not appear in study.best_params.
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        **xgb_params
    )

    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred)

# Seed the TPE sampler so the search can be reproduced after a kernel restart.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=100, show_progress_bar=True)

print("\n[XGBoost] 최적 F1 Score:", study_xgb.best_value)
print("[XGBoost] 최적 하이퍼파라미터:", study_xgb.best_params)

# Refit a model with the best sampled parameters on the resampled training data.
best_xgb_params = study_xgb.best_params
best_xgb_model = XGBClassifier(random_state=42, n_jobs=-1, use_label_encoder=False, **best_xgb_params)
best_xgb_model.fit(X_train_smote, y_train_smote)

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier


# Hard-coded hyperparameters; they replace the best_lgbm_params / best_xgb_params
# values assigned from the studies earlier in the notebook.
# NOTE(review): presumably copied from an earlier study run — confirm.
best_lgbm_params = {'n_estimators': 949, 'learning_rate': 0.015103133155498064, 'max_depth': 12, 'num_leaves': 85, 'min_child_samples': 52, 'subsample': 0.6465824063100066, 'colsample_bytree': 0.8516036360089998, 'reg_alpha': 1.2742027666408652e-07, 'reg_lambda': 4.704365280380022e-08}
best_xgb_params = {'n_estimators': 792, 'learning_rate': 0.011287998067292129, 'max_depth': 12, 'min_child_weight': 1, 'gamma': 1.0236396419890437e-06, 'subsample': 0.9579569591763855, 'colsample_bytree': 0.7647619069349275, 'reg_alpha': 0.0014023169752579944, 'reg_lambda': 2.45071729037708e-05}

lgbm_clf = LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1, **best_lgbm_params)
xgb_clf = XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss', use_label_encoder=False, **best_xgb_params)

# VotingClassifier.fit trains its own clones of the estimators, so fitting
# lgbm_clf / xgb_clf separately beforehand only repeated the same training.
voting_clf = VotingClassifier(
    estimators=[('lgbm', lgbm_clf), ('xgb', xgb_clf)],
    voting='soft',
    n_jobs=-1
)

print("앙상블 모델 학습 시작...")
voting_clf.fit(X_train_smote, y_train_smote)
print("앙상블 모델 학습 완료.")

앙상블로 묶음 (서로 다른 분류 모델을 사용했으므로 voting 기법으로 묶음)

In [None]:
from sklearn.metrics import f1_score

# Score the soft-voting ensemble's default predictions on the validation split.
y_pred_voting = voting_clf.predict(X_val)
f1_voting = f1_score(y_true=y_val, y_pred=y_pred_voting)

print(f"앙상블 F1 Score: {f1_voting:.4f}")

# Fixed reference figures for the individual models (literal text, not recomputed here).
print("   XGBoost : 0.8438")
print("   LightGBM : 0.8351")

In [None]:
import numpy as np

final_model = voting_clf

# Positive-class probabilities on the validation split.
y_proba = final_model.predict_proba(X_val)[:, 1]

# Scan the cut-offs in np.arange(0.05, 0.95, 0.01) and keep the first one that
# reaches the highest F1; (0, 0.5) stays as the result if none scores above 0.
best_f1, best_threshold = 0, 0.5
for threshold in np.arange(0.05, 0.95, 0.01):
    f1 = f1_score(y_val, (y_proba >= threshold).astype(int))
    if f1 > best_f1:
        best_f1, best_threshold = f1, threshold

print(f"최적 임곗값: {best_threshold:.2f}")
print(f"최종 F1 Score: {best_f1:.4f}")

In [None]:
import pandas as pd
import numpy as np

# Reload the raw test file: the in-memory test_df has already had 'id' dropped,
# and the ids are needed for the submission file.
original_test_df = pd.read_csv('./test.csv')
test_ids_for_submission = original_test_df['id']

cols_to_scale = ['Time', 'Amount']

# Apply the already-fitted scaler (transform only, no refit).
original_test_df[cols_to_scale] = scaler.transform(original_test_df[cols_to_scale])

X_test_final = original_test_df.drop('id', axis=1)

final_model = voting_clf
# best_threshold comes from the validation threshold search above; it is no
# longer overwritten with a hand-copied constant, so it stays in sync on re-runs.

test_proba = final_model.predict_proba(X_test_final)[:, 1]

final_predictions = (test_proba >= best_threshold).astype(int)

submission = pd.DataFrame({
    "id": test_ids_for_submission,
    "Class": final_predictions
})

submission.to_csv("./submission_gyubin.csv", index=False)

print("예측 개수는", len(submission), "건입니다.")

과적합된 것 같아서 좀 더 느슨하게 다시 최적의 파라미터를 찾음

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import f1_score


def objective_xgb_re(trial):
    """Optuna objective for the XGBoost re-tuning run with narrower search ranges.

    Compared with ``objective_xgb``, ``max_depth`` is capped at 9 and the
    ``reg_alpha`` / ``reg_lambda`` lower bounds are raised to 1e-5.

    Args:
        trial: Optuna trial that samples the XGBoost hyperparameters.

    Returns:
        F1 score of the model's predictions on ``X_val`` against ``y_val``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-scale sampling of the former loguniform calls.
    xgb_params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
        # Fixed (not sampled), so it does not appear in study.best_params.
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        **xgb_params
    )

    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred)

# Seed the TPE sampler so the search can be reproduced after a kernel restart.
study_xgb_re = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb_re.optimize(objective_xgb_re, n_trials=50, show_progress_bar=True)

print("\n[ XGBoost] 최적 F1 Score:", study_xgb_re.best_value)
print("[ XGBoost] 최적 하이퍼파라미터:", study_xgb_re.best_params)
best_xgb_params_re = study_xgb_re.best_params

In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

def objective_lgbm_re(trial):
    """Optuna objective for the LightGBM re-tuning run with narrower search ranges.

    Compared with ``objective_lgbm``, ``max_depth`` is capped at 9,
    ``num_leaves`` at 70, and the ``reg_alpha`` / ``reg_lambda`` lower bounds
    are raised to 1e-5.

    Args:
        trial: Optuna trial that samples the LightGBM hyperparameters.

    Returns:
        F1 score of the model's predictions on ``X_val`` against ``y_val``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-scale sampling of the former loguniform calls.
    lgbm_params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 9),
        'num_leaves': trial.suggest_int('num_leaves', 31, 70),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
    }

    model = LGBMClassifier(
        random_state=42,
        n_jobs=-1,
        verbosity=-1,
        **lgbm_params
    )

    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred)

print("---  LightGBM 최적화 시작 (n_trials=50) ---")
# Seed the TPE sampler so the search can be reproduced after a kernel restart.
study_lgbm_re = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgbm_re.optimize(objective_lgbm_re, n_trials=50, show_progress_bar=True)

print("\n LightGBM] 최적 F1 Score:", study_lgbm_re.best_value)
print("LightGBM] 최적 하이퍼파라미터:", study_lgbm_re.best_params)

best_lgbm_params_re = study_lgbm_re.best_params

In [None]:
import optuna
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import numpy as np

# Hard-coded LightGBM / XGBoost hyperparameters used by the weight search in
# this cell; they replace whatever best_lgbm_params / best_xgb_params held before.
# NOTE(review): presumably copied from the re-tuning studies' printed output — confirm.
best_lgbm_params = {'n_estimators': 874, 'learning_rate': 0.08326062054708025, 'max_depth': 8, 'num_leaves': 62, 'min_child_samples': 71, 'subsample': 0.9737480587317234, 'colsample_bytree': 0.8917908434852498, 'reg_alpha': 0.013213503671865254, 'reg_lambda': 5.4226824776857555e-05}
best_xgb_params = {'n_estimators': 921, 'learning_rate': 0.023703360450466202, 'max_depth': 9, 'min_child_weight': 1, 'gamma': 0.38887847738437764, 'subsample': 0.8954154427644301, 'colsample_bytree': 0.6157837367341784, 'reg_alpha': 0.005934695765543821, 'reg_lambda': 0.0010594014155312593}


# Validation-set probabilities of the two base models, filled on first use.
# Re-running this cell resets the cache, so it follows any change to the
# parameters or the training data made before the cell is executed again.
_voting_val_probas = {}


def _base_val_probas():
    """Fit both base models once and return their stacked ``X_val`` probabilities."""
    if 'stacked' not in _voting_val_probas:
        lgbm_clf_opt = LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1, **best_lgbm_params)
        xgb_clf_opt = XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss', use_label_encoder=False, **best_xgb_params)
        lgbm_clf_opt.fit(X_train_smote, y_train_smote)
        xgb_clf_opt.fit(X_train_smote, y_train_smote)
        # Axis 0 indexes the model (LightGBM first, XGBoost second).
        _voting_val_probas['stacked'] = np.asarray([
            lgbm_clf_opt.predict_proba(X_val),
            xgb_clf_opt.predict_proba(X_val),
        ])
    return _voting_val_probas['stacked']


def objective_voting_weights_re(trial):
    """Optuna objective: validation F1 of a weighted soft vote of LightGBM and XGBoost.

    Only the blend weight changes between trials, so the two base models are
    trained once and their validation probabilities are reused, instead of
    refitting a full VotingClassifier in every trial.

    Args:
        trial: Optuna trial that samples ``w1_lgbm`` in [0.5, 1.0]; the XGBoost
            weight is ``1 - w1_lgbm``.

    Returns:
        F1 score of the blended predictions on ``X_val`` against ``y_val``.
    """
    w1_lgbm = trial.suggest_float('w1_lgbm', 0.5, 1.0)
    w2_xgb = 1.0 - w1_lgbm

    # Soft voting: weighted average of the class probabilities, then the class
    # with the larger average. Assumes the labels are 0/1 so that the argmax
    # index equals the label — TODO confirm.
    blended = np.average(_base_val_probas(), axis=0, weights=[w1_lgbm, w2_xgb])
    y_pred = np.argmax(blended, axis=1)

    return f1_score(y_val, y_pred)

print("--- [재튜닝] 앙상블 가중치 최적화 시작 ---")
# Seed the TPE sampler so the search can be reproduced after a kernel restart.
study_voting_weights_re = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_voting_weights_re.optimize(objective_voting_weights_re, n_trials=50, show_progress_bar=True)

print("\n[재튜닝 가중치 튜닝] 최적 F1 Score:", study_voting_weights_re.best_value)
print("[재튜닝 가중치 튜닝] 최적 가중치 (LGBM:XGB):", study_voting_weights_re.best_params['w1_lgbm'], ":", 1.0 - study_voting_weights_re.best_params['w1_lgbm'])

# The two weights sum to 1: LightGBM gets the sampled weight, XGBoost the rest.
best_weights_lgbm = study_voting_weights_re.best_params['w1_lgbm']
best_weights_re = [best_weights_lgbm, 1.0 - best_weights_lgbm]

final_weighted_ensemble_model_re = VotingClassifier(
    estimators=[('lgbm', LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1, **best_lgbm_params)),
                ('xgb', XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss', use_label_encoder=False, **best_xgb_params))],
    voting='soft',
    weights=best_weights_re,
    n_jobs=-1
)

print("\n최적 가중치로 최종 앙상블 모델 재학습 시작...")
final_weighted_ensemble_model_re.fit(X_train_smote, y_train_smote)
print("최적 가중치로 최종 앙상블 모델 학습 완료.")

In [None]:
import numpy as np
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


# Hard-coded hyperparameters and blend weight for the weighted ensemble.
# NOTE(review): presumably copied from earlier study output — confirm.
best_lgbm_params = {'n_estimators': 874, 'learning_rate': 0.08326062054708025, 'max_depth': 8, 'num_leaves': 62, 'min_child_samples': 71, 'subsample': 0.9737480587317234, 'colsample_bytree': 0.8917908434852498, 'reg_alpha': 0.013213503671865254, 'reg_lambda': 5.4226824776857555e-05}
best_xgb_params = {'n_estimators': 921, 'learning_rate': 0.023703360450466202, 'max_depth': 9, 'min_child_weight': 1, 'gamma': 0.38887847738437764, 'subsample': 0.8954154427644301, 'colsample_bytree': 0.6157837367341784, 'reg_alpha': 0.005934695765543821, 'reg_lambda': 0.0010594014155312593}

# LightGBM gets best_weights_lgbm, XGBoost gets the remainder.
best_weights_lgbm = 0.5891367693504789
best_weights_re = [best_weights_lgbm, 1.0 - best_weights_lgbm]

lgbm_member = LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1, **best_lgbm_params)
xgb_member = XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss', use_label_encoder=False, **best_xgb_params)

final_weighted_ensemble_model_re = VotingClassifier(
    estimators=[('lgbm', lgbm_member), ('xgb', xgb_member)],
    voting='soft',
    weights=best_weights_re,
    n_jobs=-1,
)

print("최종 앙상블 모델 재학습 시작...")
final_weighted_ensemble_model_re.fit(X_train_smote, y_train_smote)
print("최종 앙상블 모델 학습 완료.")


final_model = final_weighted_ensemble_model_re

# Positive-class probabilities of the weighted ensemble on the validation split.
y_proba_final = final_model.predict_proba(X_val)[:, 1]

# Fine-grained scan over np.arange(0.50, 0.70, 0.001); the first cut-off that
# reaches the highest F1 wins, and (0, 0.5) stays if none scores above 0.
best_f1_refined, best_threshold_refined = 0, 0.5
for threshold in np.arange(0.50, 0.70, 0.001):
    f1 = f1_score(y_val, (y_proba_final >= threshold).astype(int))
    if f1 > best_f1_refined:
        best_f1_refined, best_threshold_refined = f1, threshold

print("\n--- 최종 임곗값 정밀 튜닝 결과 ---")
print(f"최적 임곗값: {best_threshold_refined:.3f}")
print(f"최종 F1 Score: {best_f1_refined:.4f}")

In [None]:
import pandas as pd
import numpy as np


# Reload the raw test file: the in-memory test_df has already had 'id' dropped,
# and the ids are needed for the submission file.
original_test_df = pd.read_csv('./test.csv')
test_ids_for_submission = original_test_df['id']

cols_to_scale = ['Time', 'Amount']

# Apply the already-fitted scaler (transform only, no refit).
original_test_df[cols_to_scale] = scaler.transform(original_test_df[cols_to_scale])

X_test_final = original_test_df.drop('id', axis=1)

final_model = final_weighted_ensemble_model_re
# Use the cut-off found by the refined validation search instead of a
# hand-copied constant, so the two stay in sync when the notebook is re-run.
best_threshold = best_threshold_refined

test_proba = final_model.predict_proba(X_test_final)[:, 1]

final_predictions = (test_proba >= best_threshold).astype(int)

submission = pd.DataFrame({
    "id": test_ids_for_submission,
    "Class": final_predictions
})

submission.to_csv("./submission.csv", index=False)

print("\n--- 최종 submission.csv 파일 생성 완료 ---")


과적합이 문제라고 생각해서 하이퍼파라미터를 더 느슨하게 만들었는데 오히려 성능이 떨어짐

# SMOTE를 버리고, LightGBM 단일 모델에 scale_pos_weight를 적용하여 재튜닝

In [None]:
import numpy as np

# Count the rows labelled 0 and 1 in the original (non-resampled) training labels.
count_neg, count_pos = (y_train.eq(label).sum() for label in (0, 1))

# Ratio of label-0 rows to label-1 rows, passed to LightGBM as scale_pos_weight.
SCALE_POS_WEIGHT = count_neg / count_pos

print(f"원본 y_train의 사기 거래 비율: {count_pos / len(y_train) * 100:.3f}%")
print(f"계산된 LightGBM 클래스 가중치 (SCALE_POS_WEIGHT): {SCALE_POS_WEIGHT:.2f}")

In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

def objective_lgbm_final(trial):
    """Optuna objective: validation F1 of a class-weighted LightGBM on the original data.

    Unlike the earlier objectives, this one trains on ``X_train`` / ``y_train``
    (no SMOTE) and passes ``SCALE_POS_WEIGHT`` as ``scale_pos_weight``.

    Args:
        trial: Optuna trial that samples the LightGBM hyperparameters.

    Returns:
        F1 score of the model's predictions on ``X_val`` against ``y_val``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-scale sampling of the former loguniform calls.
    lgbm_params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 7),
        'num_leaves': trial.suggest_int('num_leaves', 31, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 80),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 5.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 5.0, log=True),
    }

    model = LGBMClassifier(
        random_state=42,
        n_jobs=-1,
        verbosity=-1,
        scale_pos_weight=SCALE_POS_WEIGHT,
        **lgbm_params
    )

    # Train on the original (non-resampled) split and score on validation.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    return f1_score(y_val, y_pred)

print("--- [최종] LightGBM 단일 모델 최적화 시작 (n_trials=50, 클래스 가중치 적용) ---")
# Seed the TPE sampler so the search can be reproduced after a kernel restart.
study_lgbm_final = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgbm_final.optimize(objective_lgbm_final, n_trials=50, show_progress_bar=True)

print("\n[최종 LightGBM] 최적 F1 Score (단일 모델):", study_lgbm_final.best_value)
print("[최종 LightGBM] 최적 하이퍼파라미터:", study_lgbm_final.best_params)

# Keep the best parameters for the final model fit.
best_lgbm_final_params = study_lgbm_final.best_params

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score
import numpy as np

# Build the final single LightGBM model from the tuned parameters plus the
# class weight, then train it on the original (non-resampled) training split.
final_lgbm_model = LGBMClassifier(
    **best_lgbm_final_params,
    scale_pos_weight=SCALE_POS_WEIGHT,
    verbosity=-1,
    n_jobs=-1,
    random_state=42,
)

print("최종 LightGBM 모델 학습 시작...")
final_lgbm_model.fit(X_train, y_train)
print("최종 LightGBM 모델 학습 완료.")

# Positive-class probabilities of the final LightGBM model on the validation split.
y_proba_final_lgbm = final_lgbm_model.predict_proba(X_val)[:, 1]

# Scan the cut-offs in np.arange(0.01, 0.99, 0.01); the first one reaching the
# highest F1 wins, and (0, 0.5) stays as the result if none scores above 0.
best_f1_lgbm_single, best_threshold_lgbm_single = 0, 0.5
for threshold in np.arange(0.01, 0.99, 0.01):
    f1 = f1_score(y_val, (y_proba_final_lgbm >= threshold).astype(int))
    if f1 > best_f1_lgbm_single:
        best_f1_lgbm_single, best_threshold_lgbm_single = f1, threshold

print("\n--- 최종 LightGBM 단일 모델 성능 평가 (최적 임곗값 적용) ---")
print(f"최적 임곗값: {best_threshold_lgbm_single:.2f}")
print(f"최종 F1 Score: {best_f1_lgbm_single:.4f}")

# Turn the probabilities into labels with the selected cut-off.
y_pred_final_lgbm = (y_proba_final_lgbm >= best_threshold_lgbm_single).astype(int)

# Compute the four validation metrics in the order they are printed.
accuracy, precision, recall, f1 = (
    metric(y_val, y_pred_final_lgbm)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\n[Classification Report]")
print(classification_report(y_val, y_pred_final_lgbm, target_names=['Normal (0)', 'Fraud (1)']))

In [None]:

# 1. Reload the raw test file and keep its ids for the submission.
original_test_df = pd.read_csv('./test.csv')
test_ids_for_submission = original_test_df['id']

# Apply the already-fitted scaler to the test columns (transform only, no fit).
cols_to_scale = ['Time', 'Amount']
original_test_df[cols_to_scale] = scaler.transform(original_test_df[cols_to_scale])
X_test_final = original_test_df.drop(columns='id')

# 2. Predict with the final LightGBM model and apply its selected cut-off.
final_model_single = final_lgbm_model
best_threshold = best_threshold_lgbm_single

test_proba = final_model_single.predict_proba(X_test_final)[:, 1]
final_predictions = (test_proba >= best_threshold).astype(int)

# 3. Write the submission file.
submission = pd.DataFrame({"id": test_ids_for_submission, "Class": final_predictions})
submission.to_csv("./submission2.csv", index=False)

print("\n--- 최종 submission.csv 파일 생성 완료 ---")

오버샘플링이 문제였던 것 같다. 이 방법이 제일 효과가 좋았다.