In [None]:
# Feature Importance를 살펴보는 것은 모델링 과정에서 어떤 특성이 모델 예측에 미치는 영향이 큰지 파악하는 데 도움을 주며, 
# 모델 해석과 선택에 있어서 유용한 정보를 제공할 수 있습니다. 
# 따라서 회귀 계수가 높은 값이면서 Feature Importance가 높다면 해당 특성이 모델에서 중요하다고 볼 수 있습니다.

In [None]:
# 재현율(Recall)은 이진 분류 모델의 평가 지표 중 하나로, 실제 양성 케이스(Positive cases) 중에서 모델이 정확하게 양성으로 예측한 케이스의 비율
# 정밀도(Precision)는 이진 분류 모델의 평가 지표 중 하나로, 모델이 예측한 양성 케이스 중에서 실제 양성인 케이스의 비율

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.svm as svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [93]:
train_df = pd.read_csv('../EDA/credit_data.csv')
train_df.head(1)

Unnamed: 0.1,Unnamed: 0,TARGET,성별,차량 소유 여부,부동산 소유 여부,자녀 수,연간 수입,수입 유형,최종 학력,결혼 여부,...,이메일 소유 여부,직업,가족 구성원 수,산업군,나이,근속연수,가입연수,인구수,도시구분,소득수준
0,0,0,2,1,1,2,18054000.0,3,0,0,...,0,1,4.0,5,39,1000,23.0,256685,3,2


In [94]:
train_copy = train_df.copy()

In [95]:
# 산업군을 기준으로 그룹화하고 각 산업군별 직업 출력
grouped = train_df.groupby('직업')

for job, group_data in grouped:
    print(f"직업{job} 이 종사하는 산업군:")
    print(group_data['산업군'].unique())  # 각 산업군에서 유일한 직업 목록 출력
    print()

직업0 이 종사하는 산업군:
[13 16  0  5 30 23 24 26 11 17 22 20 10  4 31  7  1  6]

직업1 이 종사하는 산업군:
[ 5 17 16 30 22 24 26 23 28  3 31  6 10 20  4 13 21 18  8  1  0 15 25  7
 11 14  2 32 27 19 12 29  9]

직업2 이 종사하는 산업군:
[24 16 10 18  5  8 23 31 26 21 17  6 20 11  3 30]

직업3 이 종사하는 산업군:
[ 8 24 16 13 29  5 26  3  2 21 20 31 10 30 23 25 22  7  0  6 14  4 12 15
 17 19 28 18  1 32 11  9 27]

직업4 이 종사하는 산업군:
[16 29 26 23 17 10  5 24  7  0 19 28 30 20 21  4 25  6  3 22  9 14  8 31
  1 13 18 11  2 32 15]

직업5 이 종사하는 산업군:
[16 25 24  4 17 20  5  0 26 19 10 28 18 31 30  6 21 11 23 32  1  8  7 29
 13  2 15  3  9 22 27 14]

직업6 이 종사하는 산업군:
[31 16 28 21 26  5 30 24 23 10  7  3 17  8 20  0 22 14 13 32  1 29 11  2
 25 19  6 27]

직업7 이 종사하는 산업군:
[ 5 13  1 16 17  7  6 30 10 24 28  0 23 21 20 26  4 22  8 31  3 15]

직업8 이 종사하는 산업군:
[16 15 18 24  5 26 23  0]

직업9 이 종사하는 산업군:
[ 5 20 16 24 23 26 30  0  3 10 22  7 28 21 12 17  6]

직업10 이 종사하는 산업군:
[16 10 17 24  5  8 26  0 22  6 20 15 12 23 31 18 28 11 14  9  4 29 19  2
 

In [96]:
def create_job_column(df, job_value):

#    새로운 컬럼을 생성하고 특정 조건에 따라 값을 할당하는 함수

#    Parameters:
#    - df: 데이터프레임
#    - job_value: 직업 구분코드

#    Returns:
#    - df: 새로운 컬럼을 추가한 데이터프레임
    
    for i in df['산업군'].unique():
        new_column_name = f"{i}_{job_value}"
        mask = (df['산업군'] == i) & (df['직업'] == job_value)
        df[new_column_name] = mask.astype(int)
    return df
# 함수 사용 예시
# create_job_column(your_df, 'job_value')

In [97]:
# 연체율 높은 직업 Top 5 -> 1, 5, 10, 12, 17

In [98]:
# 직업군과 산업컬럼을 겹쳐서 카테고리컬로
# 카테고리로 할 거면 원핫이코딩으로 헤줘야하고(스케일링 하면 안 됨)
# LogisticRegression 돌렸을 때 회귀계수 크게 나온다면 연체여부에 영향을 많이 끼치는 것
# 카테고리로 안 주고 실수값으로 밸류값을 지정해도 됨

In [99]:
create_job_column(train_copy, '1')

Unnamed: 0.1,Unnamed: 0,TARGET,성별,차량 소유 여부,부동산 소유 여부,자녀 수,연간 수입,수입 유형,최종 학력,결혼 여부,...,6_1,3_1,14_1,18_1,2_1,32_1,15_1,12_1,9_1,27_1
0,0,0,2,1,1,2,18054000.0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,0,0,59472000.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,2,0,1,0,29736000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,2,1,0,1,38232000.0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,2,0,1,0,26550000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59984,59995,0,2,0,1,0,15930000.0,3,0,1,...,0,0,0,0,0,0,0,0,0,0
59985,59996,1,1,1,1,0,53100000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
59986,59997,1,2,0,0,0,47790000.0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
59987,59998,0,2,1,1,0,29736000.0,1,0,3,...,0,0,0,0,0,0,0,0,0,0


In [100]:
train_copy.head(1)

Unnamed: 0.1,Unnamed: 0,TARGET,성별,차량 소유 여부,부동산 소유 여부,자녀 수,연간 수입,수입 유형,최종 학력,결혼 여부,...,6_1,3_1,14_1,18_1,2_1,32_1,15_1,12_1,9_1,27_1
0,0,0,2,1,1,2,18054000.0,3,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
job_ind = pd.read_csv('../EDA/job_ind.csv')
job_ind.drop(job_ind.index[-1], inplace=True)
job_ind

Unnamed: 0,직업,산업군,total,default
0,1,5.0,0.014818,0.138323
1,1,16.0,0.005917,0.055236
2,1,24.0,0.001533,0.014315
3,1,17.0,0.000933,0.008713
4,1,26.0,0.000700,0.006535
...,...,...,...,...
459,0,26.0,0.000000,0.000000
460,0,31.0,0.000000,0.000000
461,0,11.0,0.000000,0.000000
462,0,0.0,0.000000,0.000000


In [102]:
job_ind['산업군'] = job_ind['산업군'].astype(int)
job_ind['직업'] = job_ind['직업'].astype(int)

In [103]:
job_ind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464 entries, 0 to 463
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   직업       464 non-null    int32  
 1   산업군      464 non-null    int32  
 2   total    464 non-null    float64
 3   default  464 non-null    float64
dtypes: float64(2), int32(2)
memory usage: 11.0 KB


In [104]:
job_ind.head()

Unnamed: 0,직업,산업군,total,default
0,1,5,0.014818,0.138323
1,1,16,0.005917,0.055236
2,1,24,0.001533,0.014315
3,1,17,0.000933,0.008713
4,1,26,0.0007,0.006535


In [105]:
job_ind.loc[(job_ind['산업군'] == 1) & (job_ind['직업'] == 5), 'total'].values

array([3.33356e-05])

In [106]:
train_copy.head()

Unnamed: 0.1,Unnamed: 0,TARGET,성별,차량 소유 여부,부동산 소유 여부,자녀 수,연간 수입,수입 유형,최종 학력,결혼 여부,...,6_1,3_1,14_1,18_1,2_1,32_1,15_1,12_1,9_1,27_1
0,0,0,2,1,1,2,18054000.0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,0,0,59472000.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,2,0,1,0,29736000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,2,1,0,1,38232000.0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,2,0,1,0,26550000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
# 산업군_직업 컬럼 추가 (total를 value값으로 대입)
for i in train_copy['산업군'].unique():
    for j in train_copy['직업'].unique():
        # # job_ind 데이터프레임에서 조건에 맞는 total 값을 가져옴
        value = job_ind.loc[(job_ind['산업군'] == i) & (job_ind['직업'] == j), 'total'].values

        # train_copy에서 해당 조건에 맞는 행에 새로운 컬럼 '산업군_직업'을 만들어 value 값을 할당
        train_copy.loc[(train_copy['산업군'] == i) & (train_copy['직업'] == j), '산업군_직업'] = value[0] if len(value) > 0 else None

train_copy


Unnamed: 0.1,Unnamed: 0,TARGET,성별,차량 소유 여부,부동산 소유 여부,자녀 수,연간 수입,수입 유형,최종 학력,결혼 여부,...,3_1,14_1,18_1,2_1,32_1,15_1,12_1,9_1,27_1,산업군_직업
0,0,0,2,1,1,2,18054000.0,3,0,0,...,0,0,0,0,0,0,0,0,0,0.014818
1,1,0,1,1,0,0,59472000.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0.001100
2,2,0,2,0,1,0,29736000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.010751
3,3,0,2,1,0,1,38232000.0,2,0,0,...,0,0,0,0,0,0,0,0,0,0.000933
4,4,0,2,0,1,0,26550000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.005917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59984,59995,0,2,0,1,0,15930000.0,3,0,1,...,0,0,0,0,0,0,0,0,0,0.014818
59985,59996,1,1,1,1,0,53100000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.002567
59986,59997,1,2,0,0,0,47790000.0,1,1,1,...,0,0,0,0,0,0,0,0,0,0.000100
59987,59998,0,2,1,1,0,29736000.0,1,0,3,...,0,0,0,0,0,0,0,0,0,0.000233


In [108]:
train_copy[['산업군','직업','산업군_직업']]

Unnamed: 0,산업군,직업,산업군_직업
0,5,1,0.014818
1,16,4,0.001100
2,16,5,0.010751
3,17,1,0.000933
4,16,1,0.005917
...,...,...,...
59984,5,1,0.014818
59985,24,12,0.002567
59986,22,1,0.000100
59987,19,17,0.000233


In [109]:
# 산업군_직업2 컬럼 추가 (default를 value값으로 대입)
for i in train_copy['산업군'].unique():
    for j in train_copy['직업'].unique():
        # # job_ind 데이터프레임에서 조건에 맞는 total 값을 가져옴
        value = job_ind.loc[(job_ind['산업군'] == i) & (job_ind['직업'] == j), 'default'].values

        # train_copy에서 해당 조건에 맞는 행에 새로운 컬럼 '산업군_직업'을 만들어 value 값을 할당
        train_copy.loc[(train_copy['산업군'] == i) & (train_copy['직업'] == j), '산업군_직업2'] = value[0] if len(value) > 0 else None

train_copy

Unnamed: 0.1,Unnamed: 0,TARGET,성별,차량 소유 여부,부동산 소유 여부,자녀 수,연간 수입,수입 유형,최종 학력,결혼 여부,...,14_1,18_1,2_1,32_1,15_1,12_1,9_1,27_1,산업군_직업,산업군_직업2
0,0,0,2,1,1,2,18054000.0,3,0,0,...,0,0,0,0,0,0,0,0,0.014818,0.138323
1,1,0,1,1,0,0,59472000.0,1,1,0,...,0,0,0,0,0,0,0,0,0.001100,0.010269
2,2,0,2,0,1,0,29736000.0,1,0,0,...,0,0,0,0,0,0,0,0,0.010751,0.100358
3,3,0,2,1,0,1,38232000.0,2,0,0,...,0,0,0,0,0,0,0,0,0.000933,0.008713
4,4,0,2,0,1,0,26550000.0,1,0,0,...,0,0,0,0,0,0,0,0,0.005917,0.055236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59984,59995,0,2,0,1,0,15930000.0,3,0,1,...,0,0,0,0,0,0,0,0,0.014818,0.138323
59985,59996,1,1,1,1,0,53100000.0,1,0,0,...,0,0,0,0,0,0,0,0,0.002567,0.023961
59986,59997,1,2,0,0,0,47790000.0,1,1,1,...,0,0,0,0,0,0,0,0,0.000100,0.000934
59987,59998,0,2,1,1,0,29736000.0,1,0,3,...,0,0,0,0,0,0,0,0,0.000233,0.002178


In [110]:
train_copy[['산업군','직업','산업군_직업2']]

Unnamed: 0,산업군,직업,산업군_직업2
0,5,1,0.138323
1,16,4,0.010269
2,16,5,0.100358
3,17,1,0.008713
4,16,1,0.055236
...,...,...,...
59984,5,1,0.138323
59985,24,12,0.023961
59986,22,1,0.000934
59987,19,17,0.002178


In [111]:
# 임금수준 & 나이
conditions1 = [
(df['나이구간'] == 0) & (df['소득수준'] == 0),
(df['나이구간'] == 0) & (df['소득수준'] == 1),
(df['나이구간'] == 0) & (df['소득수준'] == 2),
(df['나이구간'] == 0) & (df['소득수준'] == 3),
(df['나이구간'] == 0) & (df['소득수준'] == 4),
(df['나이구간'] == 1) & (df['소득수준'] == 0),
(df['나이구간'] == 1) & (df['소득수준'] == 1),
(df['나이구간'] == 1) & (df['소득수준'] == 2),
(df['나이구간'] == 1) & (df['소득수준'] == 3),
(df['나이구간'] == 1) & (df['소득수준'] == 4),
]
# 할당할 값 설정
values = [0,1, 2, 3, 4, 5, 6, 7, 8, 9]
# np.select를 사용하여 조건에 맞는 값을 'y' 컬럼에 할당
df['age_income'] = np.select(conditions1, values, default=0)
df.head(2)

NameError: name 'df' is not defined

In [112]:
# feature와 label 분리
feature = train_copy.drop('TARGET', axis=1)
label = train_copy['TARGET']

In [113]:
# train test 분리
X_train_old, X_test, y_train_old , y_test = train_test_split(feature, label, test_size=0.3 , random_state=42)

In [114]:
y_train_old.value_counts()

TARGET
0    37465
1     4527
Name: count, dtype: int64

In [115]:
from imblearn.over_sampling import SMOTE
X_train, y_train = SMOTE(random_state = 22).fit_resample(X_train_old, y_train_old)

In [116]:
y_train.value_counts()

TARGET
0    37465
1    37465
Name: count, dtype: int64

In [117]:
# 1. Decision Tree를 이용한 학습
# 1-1. GridSearchCV를 적용해 Decision Tree의 교차검증 및 하이퍼파라미터 튜닝

dt = DecisionTreeClassifier()

parameters = {'max_depth' : [2, 3, 4, 5],
             'min_samples_split' : [1, 3, 5, 7, 9]}

grid_dt = GridSearchCV(dt, param_grid = parameters, cv=3, refit=True)
grid_dt.fit(X_train, y_train)

dt = grid_dt.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_dt.best_params_}")
print(f"최고 예측 정확도: {grid_dt.best_score_:.4f}")

최적 하이퍼 파라미터: {'max_depth': 5, 'min_samples_split': 5}
최고 예측 정확도: 0.7434


In [118]:
# Decision Tree 적용
pred = dt.predict(X_test)
pred_proba = dt.predict_proba(X_test)
pred_proba_1 = pred_proba[:, 1]

acc = accuracy_score(y_test , pred)
prec = precision_score(y_test , pred)
rec = recall_score(y_test , pred)
f1score = f1_score(y_test, pred)
auc_score = roc_auc_score(y_test , pred_proba_1)
print(f'의사결정나무 정확도 : {acc:.3f}')
print(f'의사결정나무 정밀도 : {prec:.3f}')
print(f'의사결정나무 재현율 : {rec:.3f}')
print(f'의사결정나무 f1_score : {f1score:.3f}')
print(f'의사결정나무 roc_auc : {auc_score:.3f}')

의사결정나무 정확도 : 0.824
의사결정나무 정밀도 : 0.170
의사결정나무 재현율 : 0.173
의사결정나무 f1_score : 0.171
의사결정나무 roc_auc : 0.587


In [119]:
# 트리 기반 모델의 특성 중요도 확인
for feature, importance in zip(X_train.columns, dt.feature_importances_):
    print(f"{feature}: {importance}")

Unnamed: 0: 0.0
성별: 0.2710018986766985
차량 소유 여부: 0.20772339205111126
부동산 소유 여부: 0.00039290495932091385
자녀 수: 0.0
연간 수입: 0.0028749456700666114
수입 유형: 0.049471838448613546
최종 학력: 0.01879244406259424
결혼 여부: 0.0
주거 형태: 0.0
거주지 인구 비율: 0.0
휴대전화 소유 여부: 0.0
업무용 휴대전화 소유 여부: 0.19310818833425278
이메일 소유 여부: 0.0
직업: 0.042827779031611335
가족 구성원 수: 0.06741193129972155
산업군: 0.0
나이: 0.0
근속연수: 1.2696074384288864e-05
가입연수: 0.0
인구수: 0.0
도시구분: 0.0
소득수준: 0.0
5_1: 0.0
16_1: 0.0
17_1: 0.0
10_1: 0.0
25_1: 0.0
20_1: 0.0
30_1: 0.0
23_1: 0.0
24_1: 0.0
7_1: 0.0
4_1: 0.0
8_1: 0.0
1_1: 0.0
22_1: 0.0
29_1: 0.0
26_1: 0.0
0_1: 0.0
11_1: 0.0
31_1: 0.0
28_1: 0.0
21_1: 0.0
19_1: 0.0
13_1: 0.0
6_1: 0.0
3_1: 0.0
14_1: 0.0
18_1: 0.0
2_1: 0.0
32_1: 0.0
15_1: 0.0
12_1: 0.0
9_1: 0.0
27_1: 0.0
산업군_직업: 0.10880089576103694
산업군_직업2: 0.03758108563058797


In [120]:
# 2. RandomForest를 이용한 학습
# 2-1. GridSearchCV를 적용해 RandomForest의 교차검증 및 하이퍼파라미터 튜닝

rf = RandomForestClassifier()

parameters = {
    'max_depth' : [6, 8, 12],
    'min_samples_split' : [16, 24]
}

grid_rf = GridSearchCV(rf, param_grid = parameters, cv=3, refit=True)
grid_rf.fit(X_train, y_train)

rf = grid_rf.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_rf.best_params_}")
print(f"최고 예측 정확도: {grid_rf.best_score_:.4f}")

최적 하이퍼 파라미터: {'max_depth': 12, 'min_samples_split': 16}
최고 예측 정확도: 0.8679


In [121]:
# RandomForest 적용

rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
pred_proba = rf.predict_proba(X_test)
pred_proba_1 = pred_proba[:, 1]

acc = accuracy_score(y_test , pred)
prec = precision_score(y_test , pred)
rec = recall_score(y_test , pred)
f1score = f1_score(y_test, pred)
auc_score = roc_auc_score(y_test , pred_proba_1)

print(f'랜덤포레스트 정확도 : {acc:.3f}')
print(f'랜덤포레스트 정밀도 : {prec:.3f}')
print(f'랜덤포레스트 재현율 : {rec:.3f}')
print(f'랜덤포레스트 f1_score : {f1score:.3f}')
print(f'랜덤포레스트 roc_auc : {auc_score:.3f}')

랜덤포레스트 정확도 : 0.880
랜덤포레스트 정밀도 : 0.204
랜덤포레스트 재현율 : 0.047
랜덤포레스트 f1_score : 0.076
랜덤포레스트 roc_auc : 0.614


In [122]:
# 트리 기반 모델의 특성 중요도 확인
for feature, importance in zip(X_train.columns, rf.feature_importances_):
    print(f"{feature}: {importance}")

Unnamed: 0: 0.050500698867512905
성별: 0.0593148271484762
차량 소유 여부: 0.04654341737760776
부동산 소유 여부: 0.015221291780453823
자녀 수: 0.020659249742840707
연간 수입: 0.04188221657863742
수입 유형: 0.03761677142588114
최종 학력: 0.029334972288208618
결혼 여부: 0.024026597763192936
주거 형태: 0.010443466037337545
거주지 인구 비율: 0.040926783130587105
휴대전화 소유 여부: 2.2960812032696348e-06
업무용 휴대전화 소유 여부: 0.029540827928364963
이메일 소유 여부: 0.006343032393266098
직업: 0.054131372052481024
가족 구성원 수: 0.08872529181073127
산업군: 0.051035747611599375
나이: 0.04937966837281438
근속연수: 0.0640321871499332
가입연수: 0.052766506225011146
인구수: 0.04072091679582731
도시구분: 0.009394170787886959
소득수준: 0.0068267453600939386
5_1: 0.0
16_1: 0.0
17_1: 0.0
10_1: 0.0
25_1: 0.0
20_1: 0.0
30_1: 0.0
23_1: 0.0
24_1: 0.0
7_1: 0.0
4_1: 0.0
8_1: 0.0
1_1: 0.0
22_1: 0.0
29_1: 0.0
26_1: 0.0
0_1: 0.0
11_1: 0.0
31_1: 0.0
28_1: 0.0
21_1: 0.0
19_1: 0.0
13_1: 0.0
6_1: 0.0
3_1: 0.0
14_1: 0.0
18_1: 0.0
2_1: 0.0
32_1: 0.0
15_1: 0.0
12_1: 0.0
9_1: 0.0
27_1: 0.0
산업군_직업: 0.08237472601194

In [123]:
# 3. Logistic regrssion 을 이용한 학습
# 3-1. GridSearchCV를 적용해 Logistic regrssion의 교차검증 및 하이퍼파라미터 튜닝

lr = LogisticRegression()

parameters = {'penalty': ['l2','l1'],
          'C':[0.01,0.1,1,10]}

grid_lr = GridSearchCV(lr, param_grid = parameters, cv=3, refit=True)
grid_lr.fit(X_train, y_train)

lr = grid_lr.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_lr.best_params_}")
print(f"최고 예측 정확도: {grid_lr.best_score_:.4f}")

최적 하이퍼 파라미터: {'C': 0.01, 'penalty': 'l2'}
최고 예측 정확도: 0.5168


In [124]:
#3-2. Logistic Regression 적용
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
pred_proba = lr.predict_proba(X_test)
pred_proba_1 = pred_proba[:, 1]

acc = accuracy_score(y_test , pred)
prec = precision_score(y_test , pred)
rec = recall_score(y_test , pred)
auc_score = roc_auc_score(y_test , pred_proba_1)
f1score = f1_score(y_test, pred)

print(f'Logistic regrssion 정확도 : {acc:.3f}')
print(f'Logistic regrssion 정밀도 : {prec:.3f}')
print(f'Logistic regrssion 재현율 : {rec:.3f}')
print(f'Logistic regrssion f1_score : {f1score:.3f}')
print(f'Logistic regrssion roc_auc : {auc_score:.3f}')

Logistic regrssion 정확도 : 0.585
Logistic regrssion 정밀도 : 0.111
Logistic regrssion 재현율 : 0.419
Logistic regrssion f1_score : 0.176
Logistic regrssion roc_auc : 0.533


In [125]:
# 학습된 모델의 특성 중요도 확인
feature_importance = lr.coef_[0]  # 특성의 가중치 또는 중요도

# 특성별 중요도 출력
for feature, importance in zip(X_train.columns, feature_importance):
    print(f"{feature}: {importance}")

Unnamed: 0: 4.1809396411797e-06
성별: -5.078592182983814e-10
차량 소유 여부: -4.4538612113579434e-10
부동산 소유 여부: -3.324261513478449e-10
자녀 수: -2.7251399728449205e-10
연간 수입: -2.9198750074618027e-10
수입 유형: -6.537375789622785e-10
최종 학력: -4.784502435071298e-10
결혼 여부: -3.326402869270209e-10
주거 형태: 1.410386402897201e-10
거주지 인구 비율: -2.7518371431856147e-15
휴대전화 소유 여부: 1.302222858040472e-10
업무용 휴대전화 소유 여부: 5.0193624605885314e-11
이메일 소유 여부: -9.897584816463837e-11
직업: 7.59913680918654e-10
가족 구성원 수: 4.381881808136712e-10
산업군: 2.819040314190602e-09
나이: -3.314050079894934e-09
근속연수: -1.369463462176056e-07
가입연수: -1.5783959444276755e-09
인구수: -1.42900961624675e-07
도시구분: 8.724470494045961e-11
소득수준: 2.110635837008218e-10
5_1: 0.0
16_1: 0.0
17_1: 0.0
10_1: 0.0
25_1: 0.0
20_1: 0.0
30_1: 0.0
23_1: 0.0
24_1: 0.0
7_1: 0.0
4_1: 0.0
8_1: 0.0
1_1: 0.0
22_1: 0.0
29_1: 0.0
26_1: 0.0
0_1: 0.0
11_1: 0.0
31_1: 0.0
28_1: 0.0
21_1: 0.0
19_1: 0.0
13_1: 0.0
6_1: 0.0
3_1: 0.0
14_1: 0.0
18_1: 0.0
2_1: 0.0
32_1: 0.0
15_1: 0.0
12_1: 0

In [126]:
# 5. KNN 을 이용한 학습
# 5-1. GridSearchCV를 적용해 KNN의 교차검증 및 하이퍼파라미터 튜닝

knn = KNeighborsClassifier()

parameters = {'n_neighbors': [3, 5, 7, 9],
              'weights': ['uniform', 'distance']
          }

grid_knn = GridSearchCV(knn, param_grid = parameters, cv=3, refit=True)
grid_knn.fit(X_train, y_train)

knn = grid_knn.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_knn.best_params_}")
print(f"최고 예측 정확도: {grid_knn.best_score_:.4f}")

최적 하이퍼 파라미터: {'n_neighbors': 3, 'weights': 'distance'}
최고 예측 정확도: 0.7283


In [127]:
# 5-2 knn 적용
knn = KNeighborsClassifier()

knn.fit(X_train, y_train)
pred = knn.predict(X_test) 

acc = accuracy_score(y_test , pred)
prec = precision_score(y_test , pred)
rec = recall_score(y_test , pred)
auc_score = roc_auc_score(y_test , pred)
f1score = f1_score(y_test, pred)

print(f'KNN 정확도 : {acc:.3f}')
print(f'KNN 정밀도 : {prec:.3f}')
print(f'KNN 재현율 : {rec:.3f}')
print(f'KNN f1_score : {f1score:.3f}')
print(f'KNN roc_auc : {auc_score:.3f}')

KNN 정확도 : 0.645
KNN 정밀도 : 0.110
KNN 재현율 : 0.331
KNN f1_score : 0.165
KNN roc_auc : 0.507


In [128]:
# 6. xgboost 를 이용한 학습
from xgboost import XGBClassifier

xgb_model = XGBClassifier(n_estimators = 150,
                            learning_rate = 0.2,
                            max_depth = 10,
                            min_child_weight = 5,
                            gamma = 10)

# fit
xgb_model.fit(X_train, y_train)

# 예측
y_pred = xgb_model.predict(X_test)


# 모델 성능평가
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(y_test, y_pred)

acc = accuracy_score(y_test , pred)
prec = precision_score(y_test , pred)
rec = recall_score(y_test , pred)
auc_score = roc_auc_score(y_test , pred)
f1score = f1_score(y_test, pred)


# score print
print(f'xgboost 정확도 : {acc:.3f}')
print(f'xgboost 정밀도 : {prec:.3f}')
print(f'xgboost 재현율 : {rec:.3f}')
print(f'xgboost f1_score : {f1score:.3f}')
print(f'xgboost roc_auc : {auc_score:.3f}')

xgboost 정확도 : 0.645
xgboost 정밀도 : 0.110
xgboost 재현율 : 0.331
xgboost f1_score : 0.165
xgboost roc_auc : 0.507
