In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.svm as svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics 
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score, cross_validate
from lightgbm import LGBMClassifier
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier

#추가
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, fbeta_score


In [41]:
def get_clf_eval(y_test, y_pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    AUC = roc_auc_score(y_test,pred_proba)
    
# 아래 추가함 (pr_score, f2 score, gmean)
# pr_score : 모델이 양성 클래스를 얼마나 정확하게 찾아내는지에 대한 정보를 제공(클래스 불균형이 존재할 때 유용)
# fbeta_score : 정밀도(Precision)와 재현율(Recall)의 조화 평균을 계산하는 지표.재현율에 더 큰 가중치를 두는데, 이는 False Negatives를 최소화하는 데 중점
# G-Mean : Sensitivity와 Specificity의 조화 평균. 클래스 간의 불균형을 고려하여 모델의 성능을 측정하며, 특히 Positive 클래스의 예측 성능이 중요한 경우에 유용
    confusion = confusion_matrix(y_test, y_pred)
    pr_score = average_precision_score(y_test, y_pred)
    f2 = fbeta_score(y_test, y_pred, beta=2)

    # G-mean 계산
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    tpr = tp / (tp + fn)  # True Positive Rate
    tnr = tn / (tn + fp)  # True Negative Rate
    gmean = np.sqrt(tpr * tnr)

    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))
    print('Fbeta :{:.4f}'.format(f2))
    print('평균 정밀도 : {:.4f}'.format(pr_score))
    print('gmean : {:.4f}'.format(gmean))

In [42]:
# dataset 작업요
df = pd.read_csv('./final_datasets.csv', index_col=0)
df.head(3)

Unnamed: 0,TARGET,성별,수입 유형,최종 학력,결혼 여부,주거 형태,휴대전화 소유 여부,이메일 소유 여부,직업,산업군,...,rest_sec,rest_lab,cons_low,cons_drv,cons_sec,cons_lab,bus_low,bus_drv,bus_sec,bus_lab
0,0,2,3,0,0,3,1,0,1,5,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,0,3,1,0,4,16,...,0,0,0,0,0,0,0,0,0,0
2,0,2,1,0,0,3,1,0,5,16,...,0,0,0,0,0,0,0,0,0,0


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59989 entries, 0 to 59999
Data columns (total 29 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   TARGET      59989 non-null  int64
 1   성별          59989 non-null  int64
 2   수입 유형       59989 non-null  int64
 3   최종 학력       59989 non-null  int64
 4   결혼 여부       59989 non-null  int64
 5   주거 형태       59989 non-null  int64
 6   휴대전화 소유 여부  59989 non-null  int64
 7   이메일 소유 여부   59989 non-null  int64
 8   직업          59989 non-null  int64
 9   산업군         59989 non-null  int64
 10  가입연수        59989 non-null  int64
 11  도시구분        59989 non-null  int64
 12  월간 수입       59989 non-null  int64
 13  home_shape  59989 non-null  int64
 14  car_home    59989 non-null  int64
 15  combinedFY  59989 non-null  int64
 16  age_income  59989 non-null  int64
 17  rest_low    59989 non-null  int64
 18  rest_drv    59989 non-null  int64
 19  rest_sec    59989 non-null  int64
 20  rest_lab    59989 non-null  int64

In [44]:
df.columns

Index(['TARGET', '성별', '수입 유형', '최종 학력', '결혼 여부', '주거 형태', '휴대전화 소유 여부',
       '이메일 소유 여부', '직업', '산업군', '가입연수', '도시구분', '월간 수입', 'home_shape',
       'car_home', 'combinedFY', 'age_income', 'rest_low', 'rest_drv',
       'rest_sec', 'rest_lab', 'cons_low', 'cons_drv', 'cons_sec', 'cons_lab',
       'bus_low', 'bus_drv', 'bus_sec', 'bus_lab'],
      dtype='object')

In [45]:
df['TARGET'].value_counts()

TARGET
0    53562
1     6427
Name: count, dtype: int64

In [46]:
# feature와 label 분리
feature = df.drop('TARGET', axis=1)
label = df['TARGET']

In [47]:
# train test 분리
X_train_old, X_test, y_train_old , y_test = train_test_split(feature, label, test_size=0.3 , random_state=42)

In [48]:
y_train_old.value_counts()

TARGET
0    37465
1     4527
Name: count, dtype: int64

In [49]:
# # 언더샘플링 - 소수라벨과 다수라벨 비율 1:1

# from imblearn.under_sampling import RandomUnderSampler
# undersample = RandomUnderSampler(sampling_strategy='majority')
# X_train, y_train = undersample.fit_resample(X_train_old, y_train_old)

In [50]:
# # 오버샘플링 - 소수라벨과 다수라벨 비율 1:1 / 오버 샘플링이 좋음 투터님 피셜

# from imblearn.over_sampling import RandomOverSampler
# oversample = RandomOverSampler(sampling_strategy='minority')
# X_train, y_train = oversample.fit_resample(X_train_old, y_train_old)

In [51]:
# SMOTE

from imblearn.over_sampling import SMOTE
smote_sample = SMOTE(sampling_strategy='minority') 
X_train, y_train = smote_sample.fit_resample(X_train_old, y_train_old)

In [52]:
y_train.value_counts()

TARGET
0    37465
1    37465
Name: count, dtype: int64

In [53]:
# 1. Decision Tree를 이용한 학습

dt = DecisionTreeClassifier(random_state=43)

# 교차검증을 통한 과적합 or 과소적합 여부 확인
cross_val = cross_val_score(dt , X_train , y=y_train ,cv=5, scoring='accuracy')
print(np.mean(cross_val))

0.8087815294274655


In [54]:
# GridSearchCV를 적용해 Decision Tree의 교차검증 및 하이퍼파라미터 튜닝

parameters = {'max_depth' : [2, 3, 4, 5],
             'min_samples_split' : [1, 3, 5, 7, 9]}

grid_dt = GridSearchCV(dt, param_grid = parameters, cv=3, refit=True)
grid_dt.fit(X_train, y_train)

dt = grid_dt.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_dt.best_params_}")
print(f"최고 예측 정확도: {grid_dt.best_score_:.4f}")

최적 하이퍼 파라미터: {'max_depth': 5, 'min_samples_split': 3}
최고 예측 정확도: 0.6920


In [55]:
# 임곗값 조정을 통한 recall값 향상
dt.fit(X_train, y_train)
pred = dt.predict(X_test)
pred_proba = dt.predict_proba(X_test)[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=0.3).fit(pred_proba)
custom_pred = binarizer.transform(pred_proba)

get_clf_eval(y_test, custom_pred, pred_proba)

오차행렬:
 [[ 5766 10331]
 [  497  1403]]

정확도: 0.3983
정밀도: 0.1196
재현율: 0.7384
F1: 0.2058
AUC: 0.5947
Fbeta :0.3628
평균 정밀도 : 0.1159
gmean : 0.5143


In [56]:
# 트리 기반 모델의 특성 중요도 확인
for feature, importance in zip(X_train.columns, dt.feature_importances_):
    print(f"{feature}: {importance}")
# 0에 가까울수록 중요도 낮고, 0.4나 0.5 정도가 중요도 높다할 수 있음

성별: 0.24272196100720686
수입 유형: 0.17013157840718193
최종 학력: 0.2949407838557785
결혼 여부: 0.0
주거 형태: 0.0
휴대전화 소유 여부: 0.0
이메일 소유 여부: 0.0
직업: 0.01619430872433349
산업군: 0.0116569553945742
가입연수: 0.14296403678321312
도시구분: 0.003269314206014566
월간 수입: 0.022341456629613378
home_shape: 0.0
car_home: 0.09485340873747691
combinedFY: 0.0
age_income: 0.0009261962546071329
rest_low: 0.0
rest_drv: 0.0
rest_sec: 0.0
rest_lab: 0.0
cons_low: 0.0
cons_drv: 0.0
cons_sec: 0.0
cons_lab: 0.0
bus_low: 0.0
bus_drv: 0.0
bus_sec: 0.0
bus_lab: 0.0


In [57]:
# 2. RandomForest를 이용한 학습

rf = RandomForestClassifier(random_state=43)

cross_val = cross_val_score(rf , X_train , y=y_train ,cv=5, scoring='accuracy')
print(np.mean(cross_val))


0.8536767649806485


In [58]:
# 하이퍼파라미터 조정

parameters = {
    'max_depth' : [6, 8, 12],
    'min_samples_split' : [16, 24]
}

grid_rf = GridSearchCV(rf, param_grid = parameters, cv=3, refit=True)
grid_rf.fit(X_train, y_train)

rf = grid_rf.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_rf.best_params_}")
print(f"최고 예측 정확도: {grid_rf.best_score_:.4f}")

최적 하이퍼 파라미터: {'max_depth': 12, 'min_samples_split': 16}
최고 예측 정확도: 0.7514


In [59]:
# 임계값 조정 및 적용

rf.fit(X_train, y_train)
pred = rf.predict(X_test)
pred_proba = rf.predict_proba(X_test)[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=0.3).fit(pred_proba)
custom_pred = binarizer.transform(pred_proba)

get_clf_eval(y_test, custom_pred, pred_proba)


오차행렬:
 [[6601 9496]
 [ 539 1361]]

정확도: 0.4424
정밀도: 0.1254
재현율: 0.7163
F1: 0.2134
AUC: 0.6021
Fbeta :0.3687
평균 정밀도 : 0.1197
gmean : 0.5420


In [60]:
# 트리 기반 모델의 특성 중요도 확인
for feature, importance in zip(X_train.columns, rf.feature_importances_):
    print(f"{feature}: {importance}")

성별: 0.14279402620345877
수입 유형: 0.1158488864737136
최종 학력: 0.14804480322607788
결혼 여부: 0.024760643475298574
주거 형태: 0.011061576522012595
휴대전화 소유 여부: 7.613036939918717e-06
이메일 소유 여부: 0.030631078131550176
직업: 0.0473784111453448
산업군: 0.04627585735635944
가입연수: 0.09639170278189739
도시구분: 0.04828054992150942
월간 수입: 0.06862485545367127
home_shape: 0.02471290030505721
car_home: 0.067495007080093
combinedFY: 0.038901556930923065
age_income: 0.07761655086828147
rest_low: 2.038793933120122e-05
rest_drv: 0.00010881895000327317
rest_sec: 0.00013819110199687342
rest_lab: 7.049631246076451e-05
cons_low: 5.763364865175482e-05
cons_drv: 0.00047961259230085634
cons_sec: 0.00015303487236673405
cons_lab: 0.002615155031362884
bus_low: 0.0005010300307779717
bus_drv: 0.0022368914874273307
bus_sec: 0.0016586880336297001
bus_lab: 0.0031340410875021945


In [61]:
# 3. Logistic regrssion 을 이용한 학습

lr = LogisticRegression(random_state=43)

cross_val = cross_val_score(lr, X_train , y=y_train ,cv=5, scoring='accuracy')
print(np.mean(cross_val))


0.6951421326571466


In [62]:
parameters = {'penalty': ['l2','l1'],
          'C':[0.01,0.1,1,10]}

grid_lr = GridSearchCV(lr, param_grid = parameters, cv=3, refit=True)
grid_lr.fit(X_train, y_train)

lr = grid_lr.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_lr.best_params_}")
print(f"최고 예측 정확도: {grid_lr.best_score_:.4f}")

최적 하이퍼 파라미터: {'C': 0.1, 'penalty': 'l2'}
최고 예측 정확도: 0.6952


In [63]:
# 임계값 조정 및 모델 적용

lr.fit(X_train, y_train)
pred = lr.predict(X_test)
pred_proba = lr.predict_proba(X_test)[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=0.3).fit(pred_proba)
custom_pred = binarizer.transform(pred_proba)

get_clf_eval(y_test , custom_pred, pred_proba)

오차행렬:
 [[ 5844 10253]
 [  456  1444]]

정확도: 0.4050
정밀도: 0.1235
재현율: 0.7600
F1: 0.2124
AUC: 0.5974
Fbeta :0.3742
평균 정밀도 : 0.1192
gmean : 0.5253


In [64]:
# 학습된 모델의 특성 중요도 확인
feature_importance = lr.coef_[0]  # 특성의 가중치 또는 중요도

# 특성별 중요도 출력
for feature, importance in zip(X_train.columns, feature_importance):
    print(f"{feature}: {importance}")

# 가중치의 절댓값이 클수록 중요도 높음
# 0에 가까울수록 중요도 낮음

성별: -1.1230039762410964
수입 유형: -0.3124384738652864
최종 학력: -0.8580271129115739
결혼 여부: -0.07549284057591987
주거 형태: -0.17914056009943521
휴대전화 소유 여부: 2.0751142638569893
이메일 소유 여부: -1.5640571565614863
직업: -0.01296568604153789
산업군: -0.00711091794886099
가입연수: -0.5255404080839778
도시구분: -0.16030142157888905
월간 수입: -0.2672288802651252
home_shape: 0.13741566013391826
car_home: -0.42160702582152837
combinedFY: -0.03602249749420847
age_income: -0.03551457503870453
rest_low: -0.006121401362827918
rest_drv: -0.03001272767736507
rest_sec: -0.010601800572843854
rest_lab: -0.006682694356310621
cons_low: 0.010480506431156472
cons_drv: -0.0938149994803944
cons_sec: -0.017066562293013794
cons_lab: -0.020769554311190926
bus_low: -0.046256157695187046
bus_drv: -0.07198577419320697
bus_sec: -0.3269413828893412
bus_lab: -0.26743480025052535


In [65]:
# 4. KNN 을 이용한 학습

knn = KNeighborsClassifier()
# knn에는 무작위성 추출(random_state)를 하지 않음.

cross_val = cross_val_score(knn , X_train , y=y_train ,cv=5, scoring='accuracy')
print(np.mean(cross_val))


0.7944214600293608


In [66]:
# # 하이퍼파라미터 조정

# parameters = {'n_neighbors': [3, 5, 7, 9],
#               'weights': ['uniform', 'distance']
#           }

# grid_knn = GridSearchCV(knn, param_grid = parameters, cv=3, refit=True)
# grid_knn.fit(X_train, y_train)

# knn = grid_knn.best_estimator_

# print(f"최적 하이퍼 파라미터: {grid_knn.best_params_}")
# print(f"최고 예측 정확도: {grid_knn.best_score_:.4f}")

In [67]:
# # 임계값 조정 및 모델 적용

# knn.fit(X_train, y_train)
# pred = knn.predict(X_test)
# pred_proba = knn.predict_proba(X_test)[:, 1].reshape(-1, 1)

# binarizer = Binarizer(threshold=0.5).fit(pred_proba)
# custom_pred = binarizer.transform(pred_proba)

# get_clf_eval(y_test , custom_pred, pred_proba)

In [68]:
# 5. Lightgbm을 이용한 학습

lgbm= LGBMClassifier(random_state=43)

cross_val = cross_val_score(lgbm, X_train , y=y_train ,cv=5, scoring='accuracy')
print(np.mean(cross_val))

[LightGBM] [Info] Number of positive: 29972, number of negative: 29972
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 143
[LightGBM] [Info] Number of data points in the train set: 59944, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 29972, number of negative: 29972
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003446 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 143
[LightGBM] [Info] Number of data points in the train set: 59944, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000

In [69]:
# 하이퍼파라미터 조정

parameters = {'n_neighbors': [100, 500],
              'learning_rate': [0.05, 0.1]
          }

grid_lgbm = GridSearchCV(lgbm, param_grid = parameters, cv=2, verbose=1, refit=True)
grid_lgbm.fit(X_train, y_train)

lgbm = grid_lgbm.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_lgbm.best_params_}")
print(f"최고 예측 정확도: {grid_lgbm.best_score_:.4f}")

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[LightGBM] [Info] Number of positive: 18733, number of negative: 18732
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005805 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 143
[LightGBM] [Info] Number of data points in the train set: 37465, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500013 -> initscore=0.000053
[LightGBM] [Info] Start training from score 0.000053
[LightGBM] [Info] Number of positive: 18732, number of negative: 18733
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018990 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 143
[LightGBM] [Info] Number of data points in the train set

In [70]:
# 임계값 조정 및 모델 적용

lgbm.fit(X_train, y_train)
pred = lgbm.predict(X_test)
pred_proba = lgbm.predict_proba(X_test)[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=0.3).fit(pred_proba)
custom_pred = binarizer.transform(pred_proba)

get_clf_eval(y_test , custom_pred, pred_proba)


[LightGBM] [Info] Number of positive: 37465, number of negative: 37465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 145
[LightGBM] [Info] Number of data points in the train set: 74930, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
오차행렬:
 [[8306 7791]
 [ 757 1143]]

정확도: 0.5250
정밀도: 0.1279
재현율: 0.6016
F1: 0.2110
AUC: 0.5902
Fbeta :0.3457
평균 정밀도 : 0.1190
gmean : 0.5571


In [71]:
# 6. XGboost 
xgb = XGBClassifier(random_state=43)

cross_val = cross_val_score(xgb, X_train, y=y_train, cv=5, scoring='accuracy')
print(np.mean(cross_val))

0.7940611237154678


In [72]:
# 하이퍼파라미터 조정
parameters = {'learning_rat': [0.01, 0.1],  
              'n_estimators': [50, 100, 200]
          }

grid_xgb = GridSearchCV(xgb, param_grid=parameters, cv=5, verbose=1, refit=True)
grid_xgb.fit(X_train, y_train)

xgb = grid_xgb.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_xgb.best_params_}")
print(f"최고 예측 정확도: {grid_xgb.best_score_:.4f}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
최적 하이퍼 파라미터: {'learning_rat': 0.01, 'n_estimators': 200}
최고 예측 정확도: 0.8099


In [73]:
# 임계값 조정 및 모델 적용

xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
pred_proba = xgb.predict_proba(X_test)[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=0.3).fit(pred_proba)
custom_pred = binarizer.transform(pred_proba)

get_clf_eval(y_test , custom_pred, pred_proba)

오차행렬:
 [[10163  5934]
 [ 1012   888]]

정확도: 0.6140
정밀도: 0.1302
재현율: 0.4674
F1: 0.2036
AUC: 0.5732
Fbeta :0.3079
평균 정밀도 : 0.1171
gmean : 0.5432
