In [None]:
!pip install --upgrade imbalanced-learn
!pip install --upgrade scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold, cross_val_score
from lightgbm import LGBMClassifier


from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
import time

In [None]:
# 1. Dataview

In [None]:
df = pd.read_csv('./data/train.csv')
df.head()

In [None]:
df[df.duplicated()]

In [None]:
df.duplicated().sum()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# 최대 열 출력 수를 늘림
pd.set_option('display.max_columns', None)

df.filter(like='V').describe()

In [None]:
df.columns

In [None]:
df['Class'].value_counts(normalize=True)

In [None]:
df.hist(bins=30, figsize=(50, 25))

plt.show()

In [None]:
# 2. 데이터 분할 및 교차검증
# StratifiedKFold, Lgbm 모델로 교차검증 (amount 칼럼에만 스케일링 적용)
# 데이터 분할을 교차검증으로 stratifiedkfold로 진행하고 lgbm 모델로 성능평가하고, 이후에 스케일링을 amount 칼럼만 진행해서 데이터 리킹 없이 진행
# 스케일링 이후에 smote 진행

In [None]:
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate

numeric_features = ['Amount'] 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ],
    remainder='passthrough' 
)

# 스케일링 -> SMOTE 오버샘플링 -> LGBM 모델
pipeline = Pipeline([
    ('preprocessor', preprocessor),   
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)), 
    ('classifier', LGBMClassifier())
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = ['precision', 'recall', 'f1', 'average_precision']
scores = cross_validate(pipeline, X, y, cv=skf, scoring=scoring)


print("Cross-Validation Results:")
for metric in scoring:
    print(f"{metric.capitalize()} Scores:", scores[f'test_{metric}'])
    print(f"Mean {metric.capitalize()}:", scores[f'test_{metric}'].mean())

# 스케일링 유
# class 0 이 정상, class 1 이 사기

In [2]:
# 1

import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# id와 time 칼럼 제거
train = train.drop(columns=['id', 'Time'])
test = test.drop(columns=['id', 'Time'])

X_train = train.drop(columns=['Class'])
y_train = train['Class']
X_test = test

needscaling_features = ['Amount'] 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), needscaling_features) 
    ],
    remainder='passthrough' 
)

# Scaling -> SMOTE -> LGBM 
pipeline = Pipeline([
    ('preprocessor', preprocessor),   
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)), 
    ('classifier', LGBMClassifier(random_state=42)) 
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = ['precision', 'recall', 'f1', 'average_precision']

cv_results = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring)

print("교차검증 결과 :")
for metric in scoring:
    print(f"{metric.capitalize()} Scores:", cv_results[f'test_{metric}'])
    print(f"Mean {metric.capitalize()}:", cv_results[f'test_{metric}'].mean())

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nTest Set Predictions:")
print("Predicted Classes:")
print(y_pred)

[LightGBM] [Info] Number of positive: 136418, number of negative: 136418
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 272836, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 136418, number of negative: 136418
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 272836, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 136418, number of negative: 136418
[LightGBM] [Info] Auto-choosing col-wise mu

In [2]:
# Logistic Regression

import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# id와 time 칼럼 제거
train = train.drop(columns=['id', 'Time'])
test = test.drop(columns=['id', 'Time'])

X_train = train.drop(columns=['Class'])
y_train = train['Class']
X_test = test

needscaling_features = ['Amount'] 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), needscaling_features) 
    ],
    remainder='passthrough' 
)

# Scaling -> SMOTE -> LogisticRegression
pipeline = Pipeline([
    ('preprocessor', preprocessor),   
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)), 
    ('classifier', LogisticRegression(random_state=42)) 
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = ['precision', 'recall', 'f1', 'average_precision']

cv_results = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring)

print("교차검증 결과 :")
for metric in scoring:
    print(f"{metric.capitalize()} Scores:", cv_results[f'test_{metric}'])
    print(f"Mean {metric.capitalize()}:", cv_results[f'test_{metric}'].mean())

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nTest Set Predictions:")
print("Predicted Classes:")
print(y_pred)


교차검증 결과 :
Precision Scores: [0.08050314 0.07427938 0.09292035 0.07727797 0.09090909]
Mean Precision: 0.08317798774288836
Recall Scores: [0.88888889 0.93055556 0.875      0.93055556 0.91666667]
Mean Recall: 0.9083333333333334
F1 Scores: [0.14763552 0.137577   0.168      0.14270501 0.16541353]
Mean F1: 0.15226621320218856
Average_precision Scores: [0.77573057 0.73426507 0.76819835 0.73691215 0.79570683]
Mean Average_precision: 0.7621625939684141

Test Set Predictions:
Predicted Classes:
[0 0 0 ... 0 0 0]


In [3]:
# Random Forest

import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# id와 time 칼럼 제거
train = train.drop(columns=['id', 'Time'])
test = test.drop(columns=['id', 'Time'])

X_train = train.drop(columns=['Class'])
y_train = train['Class']
X_test = test

needscaling_features = ['Amount'] 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), needscaling_features) 
    ],
    remainder='passthrough' 
)

# Scaling -> SMOTE -> RandomForestClassifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),   
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)), 
    ('classifier', RandomForestClassifier(random_state=42)) 
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = ['precision', 'recall', 'f1', 'average_precision']

cv_results = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring)

print("교차검증 결과 :")
for metric in scoring:
    print(f"{metric.capitalize()} Scores:", cv_results[f'test_{metric}'])
    print(f"Mean {metric.capitalize()}:", cv_results[f'test_{metric}'].mean())

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nTest Set Predictions:")
print("Predicted Classes:")
print(y_pred)


교차검증 결과 :
Precision Scores: [0.91176471 0.94029851 0.9047619  0.84210526 0.90909091]
Mean Precision: 0.9016042580711495
Recall Scores: [0.86111111 0.875      0.79166667 0.88888889 0.83333333]
Mean Recall: 0.85
F1 Scores: [0.88571429 0.90647482 0.84444444 0.86486486 0.86956522]
Mean F1: 0.8742127265117569
Average_precision Scores: [0.86338234 0.91147757 0.80850363 0.89876723 0.86096225]
Mean Average_precision: 0.8686186038763317

Test Set Predictions:
Predicted Classes:
[0 0 0 ... 0 0 0]


In [None]:
# XGBoost 
!pip install xgboost
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# id와 time 칼럼 제거
train = train.drop(columns=['id', 'Time'])
test = test.drop(columns=['id', 'Time'])

X_train = train.drop(columns=['Class'])
y_train = train['Class']
X_test = test

needscaling_features = ['Amount'] 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), needscaling_features) 
    ],
    remainder='passthrough' 
)

# Scaling -> SMOTE -> XGBoost 
pipeline = Pipeline([
    ('preprocessor', preprocessor),   
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)), 
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)) 
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = ['precision', 'recall', 'f1', 'average_precision']

cv_results = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring)

print("교차검증 결과 :")
for metric in scoring:
    print(f"{metric.capitalize()} Scores:", cv_results[f'test_{metric}'])
    print(f"Mean {metric.capitalize()}:", cv_results[f'test_{metric}'].mean())

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nTest Set Predictions:")
print("Predicted Classes:")
print(y_pred)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# SVM

import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import classification_report

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# id와 time 칼럼 제거
train = train.drop(columns=['id', 'Time'])
test = test.drop(columns=['id', 'Time'])

X_train = train.drop(columns=['Class'])
y_train = train['Class']
X_test = test

needscaling_features = ['Amount'] 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), needscaling_features) 
    ],
    remainder='passthrough' 
)

# Scaling -> SMOTE -> SVM
pipeline = Pipeline([
    ('preprocessor', preprocessor),   
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)), 
    ('classifier', SVC(random_state=42)) 
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = ['precision', 'recall', 'f1', 'average_precision']

cv_results = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring)

print("교차검증 결과 :")
for metric in scoring:
    print(f"{metric.capitalize()} Scores:", cv_results[f'test_{metric}'])
    print(f"Mean {metric.capitalize()}:", cv_results[f'test_{metric}'].mean())

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nTest Set Predictions:")
print("Predicted Classes:")
print(y_pred)

In [None]:
# KNN

import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# id와 time 칼럼 제거
train = train.drop(columns=['id', 'Time'])
test = test.drop(columns=['id', 'Time'])

X_train = train.drop(columns=['Class'])
y_train = train['Class']
X_test = test

needscaling_features = ['Amount'] 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), needscaling_features) 
    ],
    remainder='passthrough' 
)

# Scaling -> SMOTE -> KNN
pipeline = Pipeline([
    ('preprocessor', preprocessor),   
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)), 
    ('classifier', KNeighborsClassifier()) 
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = ['precision', 'recall', 'f1', 'average_precision']

cv_results = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring)

print("교차검증 결과 :")
for metric in scoring:
    print(f"{metric.capitalize()} Scores:", cv_results[f'test_{metric}'])
    print(f"Mean {metric.capitalize()}:", cv_results[f'test_{metric}'].mean())

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nTest Set Predictions:")
print("Predicted Classes:")
print(y_pred)

# 스케일링 무

In [3]:
# 1

import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train = train.drop(columns=['id', 'Time'])
test = test.drop(columns=['id', 'Time'])

X_train = train.drop(columns=['Class'])
y_train = train['Class']
X_test = test

# SMOTE -> LGBM
pipeline = Pipeline([
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)), 
    ('classifier', LGBMClassifier(random_state=42)) 
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['precision', 'recall', 'f1', 'average_precision']

cv_results = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring)

print("교차검증 결과 :")
for metric in scoring:
    print(f"{metric.capitalize()} Scores:", cv_results[f'test_{metric}'])
    print(f"Mean {metric.capitalize()}:", cv_results[f'test_{metric}'].mean())

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nTest Set Predictions:")
print("Predicted Classes:")
print(y_pred)


[LightGBM] [Info] Number of positive: 136418, number of negative: 136418
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 272836, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 136418, number of negative: 136418
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 272836, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 136418, number of negative: 136418
[LightGBM] [Info] Auto-choosing col-wise mu

In [4]:
#logistic regression       ### 2

from sklearn.linear_model import LogisticRegression

import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train = train.drop(columns=['id', 'Time'])
test = test.drop(columns=['id', 'Time'])

X_train = train.drop(columns=['Class'])
y_train = train['Class']
X_test = test

# SMOTE -> RandomForestClassifier
pipeline = Pipeline([
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)), 
    ('classifier', LogisticRegression(random_state=42)) 
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['precision', 'recall', 'f1', 'average_precision']

cv_results = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring)

print("교차검증 결과 :")
for metric in scoring:
    print(f"{metric.capitalize()} Scores:", cv_results[f'test_{metric}'])
    print(f"Mean {metric.capitalize()}:", cv_results[f'test_{metric}'].mean())

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nTest Set Predictions:")
print("Predicted Classes:")
print(y_pred)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

교차검증 결과 :
Precision Scores: [0.10153846 0.09004093 0.13863636 0.10075188 0.13461538]
Mean Precision: 0.1131166034367729
Recall Scores: [0.91666667 0.91666667 0.84722222 0.93055556 0.875     ]
Mean Recall: 0.8972222222222221
F1 Scores: [0.18282548 0.16397516 0.23828125 0.18181818 0.23333333]
Mean F1: 0.20004668103911225
Average_precision Scores: [0.78626292 0.73930282 0.77254451 0.73551616 0.79457255]
Mean Average_precision: 0.765639791917656

Test Set Predictions:
Predicted Classes:
[0 0 0 ... 0 0 0]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [1]:
#random forest 

import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train = train.drop(columns=['id', 'Time'])
test = test.drop(columns=['id', 'Time'])

X_train = train.drop(columns=['Class'])
y_train = train['Class']
X_test = test

# SMOTE -> RandomForestClassifier
pipeline = Pipeline([
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)), 
    ('classifier', RandomForestClassifier(random_state=42)) 
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['precision', 'recall', 'f1', 'average_precision']

cv_results = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring)

print("교차검증 결과 :")
for metric in scoring:
    print(f"{metric.capitalize()} Scores:", cv_results[f'test_{metric}'])
    print(f"Mean {metric.capitalize()}:", cv_results[f'test_{metric}'].mean())

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nTest Set Predictions:")
print("Predicted Classes:")
print(y_pred)


교차검증 결과 :
Precision Scores: [0.89552239 0.87671233 0.90625    0.85333333 0.89393939]
Mean Precision: 0.8851514888199103
Recall Scores: [0.83333333 0.88888889 0.80555556 0.88888889 0.81944444]
Mean Recall: 0.8472222222222221
F1 Scores: [0.86330935 0.88275862 0.85294118 0.8707483  0.85507246]
Mean F1: 0.8649659825532146
Average_precision Scores: [0.8634813  0.91190895 0.81029053 0.89390369 0.86459003]
Mean Average_precision: 0.8688349001252054


KeyboardInterrupt: 

In [None]:
# 성능 지표 공부 - 어떤 성능지표와 시각화 도구 -> 해석하는데 용이하게/ 매치 시켜서 생각하는게 아님 "(ex 혼동행렬은 다른거 같음)"
# smote를 적용한 데이터에서 각각의 성능 지표를 모델에 적용시킬때 유의점 파악 (ex. 훈련데이터에서만 적용되는 것임, 따라서 교차검증을 통해 검증데이터에서는 적용이 안되는거..?) -> 이건 스모트 내부에서 작용 
# -> smote 작동원리 공부!!

In [None]:
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split

# 전체 데이터로 최종 모델 학습
pipeline.fit(X, y)

# 테스트 데이터 분할 (예시용으로 전체 데이터를 훈련 후 테스트 세트로 분할하여 평가)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 최종 모델로 예측 확률 계산
y_prob = pipeline.predict_proba(X_test)[:, 1]

# ROC Curve 계산
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

# ROC Curve 그리기
plt.figure()
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier (AUC = 0.5)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Precision-Recall Curve

# 설명: Precision-Recall Curve는 다양한 임곗값에서 **Precision(정밀도)**과 **Recall(재현율)**의 관계를 시각화한 그래프입니다.
# 용도: 특히 불균형 데이터셋에서 모델의 성능을 평가하는 데 유용합니다. Precision이 높을수록 양성 예측의 정확도가 높고, Recall이 높을수록 실제 양성을 잘 찾아냅니다.
# AUC-PR: 그래프 아래 면적(AUC)을 계산해 성능을 수치로 나타낼 수 있습니다. 

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

y_prob = pipeline.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_prob)
pr_auc = average_precision_score(y_test, y_prob)

plt.plot(recall, precision, label=f'Precision-Recall Curve (AUC = {pr_auc:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()


In [None]:
# Confusion Matrix (혼동 행렬)

# 설명: Confusion Matrix는 모델의 예측 결과를 TP (True Positive), FP (False Positive), FN (False Negative), **TN (True Negative)**으로 나타낸 표입니다.
# 용도: 모델의 예측 결과를 자세히 분석할 수 있어, 특정 오류(예: 1종 오류와 2종 오류)를 줄이는 데 유용합니다.
# 시각화: heatmap으로 시각화하여 직관적으로 오류 분포를 파악할 수 있습니다.

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

y_pred = pipeline.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Pred Neg', 'Pred Pos'], yticklabels=['True Neg', 'True Pos'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Cross-Validation Results:
# Precision Scores: [0.67741935 0.73626374 0.81081081 0.67021277 0.6744186 ]
# Mean Precision: 0.7138250545043733

# Recall Scores: [0.875      0.93055556 0.83333333 0.875      0.80555556]
# Mean Recall: 0.8638888888888889

# F1 Scores: [0.76363636 0.82208589 0.82191781 0.75903614 0.73417722]
# Mean F1: 0.7801706842388562

# Average_precision Scores: [0.83686128 0.91986044 0.81887023 0.78667227 0.80993289]
# Mean Average_precision: 0.8344394228496679

In [None]:
# 이상치 처리 진행 안함

# import matplotlib.pyplot as plt

# for column in df.select_dtypes(include='number').columns:
#     plt.figure(figsize=(10, 6))
#     plt.boxplot(df[column])
#     plt.title(f'Boxplot of {column}')
#     plt.xlabel(column)
#     plt.show()
