In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.svm as svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics


In [2]:
train_df = pd.read_csv('../EDA/dataset_modify_age.csv')
train_df.head(1)

Unnamed: 0,TARGET,성별,차량 소유 여부,부동산 소유 여부,자녀 수,연간 수입,수입 유형,최종 학력,결혼 여부,주거 형태,...,직업,가족 구성원 수,산업군,나이,근속연수,가입연수,인구수,도시구분,나이구간,소득수준
0,0,1,1,1,2,18054000.0,5,0,0,3,...,1,4.0,5,39,1000,23.0,256685,1,2,0


In [3]:
train_copy = train_df.copy()

In [47]:
train_copy['직업'].unique()

array([ 1,  4,  5, 17, 12, 11, 10, 13,  3,  2, 18, 15, 16,  6,  9,  7,  8,
        0, 14], dtype=int64)

In [None]:
# 내가 확인하고 싶은 건 특정 직업들의 산업군간 비교를 하고싶음
# 즉 모든 산업에서 특정 직업인 사람들만 뽑아서 그 사람들의 밸류를 1로 두고 타겟 데이터와 연관성을 확인하고싶음

In [4]:
def create_job_column(df, job_value):

#    새로운 컬럼을 생성하고 특정 조건에 따라 값을 할당하는 함수

#    Parameters:
#    - df: 데이터프레임
#    - job_value: 직업 구분코드

#    Returns:
#    - df: 새로운 컬럼을 추가한 데이터프레임

    for i in df['산업군'].unique():
        new_column_name = f"{i}_{job_value}"
        mask = (df['산업군'] == i) & (df['직업'] == job_value)
        df[new_column_name] = mask.astype(int)
    return df
# 함수 사용 예시
# create_job_column(your_df, 'job_value')

In [5]:
# 함수 사용하여 각 산업군 내의 특정 직업(고정) 컬럼 생성 ex) ㅇㅇ산업군의 영업직, ㅁㅁ산업군의 영업직 ...
# 각 산업군 내에 직업 10에 해당하는 데이터에 밸류값 부여(0,1)
create_job_column(train_copy, '10')

Unnamed: 0,TARGET,성별,차량 소유 여부,부동산 소유 여부,자녀 수,연간 수입,수입 유형,최종 학력,결혼 여부,주거 형태,...,6_10,3_10,14_10,18_10,2_10,32_10,15_10,12_10,9_10,27_10
0,0,1,1,1,2,18054000.0,5,0,0,3,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,59472000.0,1,1,0,3,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,29736000.0,1,0,0,3,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,0,1,38232000.0,2,0,0,3,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,1,0,26550000.0,1,0,0,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59991,0,1,0,1,0,15930000.0,5,0,1,3,...,0,0,0,0,0,0,0,0,0,0
59992,1,0,1,1,0,53100000.0,1,0,0,3,...,0,0,0,0,0,0,0,0,0,0
59993,1,1,0,0,0,47790000.0,1,1,1,3,...,0,0,0,0,0,0,0,0,0,0
59994,0,1,1,1,0,29736000.0,1,0,3,3,...,0,0,0,0,0,0,0,0,0,0


In [None]:
train_copy = train_copy.drop(['이메일 소유 여부', '소득수준', '도시구분', '수입 유형', '최종 학력', '업무용 휴대전화 소유 여부', '성별', '자녀 수'], axis=1)
train_copy.head(2)

In [6]:
# feature와 label 분리
X = train_copy.drop('TARGET', axis=1)
y = train_copy['TARGET']

# train test 분리
X_train, X_test, y_train , y_test = train_test_split(X, y , test_size=0.2 , random_state=42)

In [7]:
# train test 분리
X_train_old, X_test, y_train_old , y_test = train_test_split(X, y, test_size=0.3 , random_state=42)

In [8]:
y_train_old.value_counts()

TARGET
0    37477
1     4520
Name: count, dtype: int64

In [9]:
# over sampling
from imblearn.over_sampling import SMOTE

X_train, y_train = SMOTE(random_state = 22).fit_resample(X_train_old, y_train_old)

In [82]:
y_train.value_counts()

TARGET
1    37477
0    37477
Name: count, dtype: int64

In [10]:
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
rf = RandomForestClassifier(random_state=42)
lr = LogisticRegression()
svm =svm.SVC(gamma=0.01, C=100)
knn = KNeighborsClassifier(n_neighbors=6)

In [85]:
# 1. Decision Tree를 이용한 학습
# 1-1. GridSearchCV를 적용해 Decision Tree의 교차검증 및 하이퍼파라미터 튜닝

dt = DecisionTreeClassifier()

parameters = {'max_depth' : [2, 3, 4, 5],
             'min_samples_split' : [1, 3, 5, 7, 9]}

grid_dt = GridSearchCV(dt, param_grid = parameters, cv=3, refit=True)
grid_dt.fit(X_train, y_train)

dt = grid_dt.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_dt.best_params_}")
print(f"최고 예측 정확도: {grid_dt.best_score_:.4f}")

최적 하이퍼 파라미터: {'max_depth': 5, 'min_samples_split': 3}
최고 예측 정확도: 0.7505


In [87]:
# 트리 기반 모델의 특성 중요도 확인
for feature, importance in zip(X_train.columns, dt.feature_importances_):
    print(f"{feature}: {importance}")

차량 소유 여부: 0.19355482859130216
부동산 소유 여부: 0.0
연간 수입: 0.0011098595652963728
결혼 여부: 0.00036784448023273757
주거 형태: 0.0
거주지 인구 비율: 0.0
휴대전화 소유 여부: 0.0
직업: 0.12951465672506027
가족 구성원 수: 0.16934501298776516
산업군: 0.0
나이: 0.07399025993307431
근속연수: 0.13651538456607443
가입연수: 0.0006575590003967781
인구수: 0.0
나이구간: 0.29494459415079777
5_10: 0.0
16_10: 0.0
17_10: 0.0
10_10: 0.0
25_10: 0.0
20_10: 0.0
30_10: 0.0
23_10: 0.0
24_10: 0.0
7_10: 0.0
4_10: 0.0
8_10: 0.0
1_10: 0.0
22_10: 0.0
29_10: 0.0
26_10: 0.0
0_10: 0.0
11_10: 0.0
31_10: 0.0
28_10: 0.0
21_10: 0.0
19_10: 0.0
13_10: 0.0
6_10: 0.0
3_10: 0.0
14_10: 0.0
18_10: 0.0
2_10: 0.0
32_10: 0.0
15_10: 0.0
12_10: 0.0
9_10: 0.0
27_10: 0.0


In [88]:
# Decision Tree 적용
## 위의 모델 dt를 사용하여 예측을 하거나 평가를 진행
pred = dt.predict(X_test)
pred_proba = dt.predict_proba(X_test)
pred_proba_1 = pred_proba[:, 1]

acc = accuracy_score(y_test , pred)
prec = precision_score(y_test , pred)
rec = recall_score(y_test , pred)
f1score = f1_score(y_test, pred)
auc_score = roc_auc_score(y_test , pred_proba_1)
print(f'의사결정나무 정확도 : {acc:.3f}')
print(f'의사결정나무 정밀도 : {prec:.3f}')
print(f'의사결정나무 재현율 : {rec:.3f}')
print(f'의사결정나무 f1_score : {f1score:.3f}')

의사결정나무 정확도 : 0.700
의사결정나무 정밀도 : 0.139
의사결정나무 재현율 : 0.352
의사결정나무 f1_score : 0.200


In [89]:
# 2. RandomForest를 이용한 학습
# 2-1. GridSearchCV를 적용해 RandomForest의 교차검증 및 하이퍼파라미터 튜닝

rf = RandomForestClassifier()

parameters = {
    'max_depth' : [6, 8, 12],
    'min_samples_split' : [16, 24]
}

grid_rf = GridSearchCV(rf, param_grid = parameters, cv=3, refit=True)
grid_rf.fit(X_train, y_train)

rf = grid_rf.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_rf.best_params_}")
print(f"최고 예측 정확도: {grid_rf.best_score_:.4f}")


최적 하이퍼 파라미터: {'max_depth': 12, 'min_samples_split': 16}
최고 예측 정확도: 0.8413


In [90]:
# RandomForest 적용

rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
pred_proba = rf.predict_proba(X_test)
pred_proba_1 = pred_proba[:, 1]

acc = accuracy_score(y_test , pred)
prec = precision_score(y_test , pred)
rec = recall_score(y_test , pred)
f1score = f1_score(y_test, pred)
auc_score = roc_auc_score(y_test , pred_proba_1)

print(f'랜덤포레스트 정확도 : {acc:.3f}')
print(f'랜덤포레스트 정밀도 : {prec:.3f}')
print(f'랜덤포레스트 재현율 : {rec:.3f}')
print(f'랜덤포레스트 f1_score : {f1score:.3f}')
print(f'랜덤포레스트 roc_auc : {auc_score:.3f}')

랜덤포레스트 정확도 : 0.862
랜덤포레스트 정밀도 : 0.159
랜덤포레스트 재현율 : 0.071
랜덤포레스트 f1_score : 0.098
랜덤포레스트 roc_auc : 0.582


In [91]:
print(rf.feature_importances_)

[4.78564321e-02 1.84942780e-02 7.16037602e-02 3.78749681e-02
 1.51856591e-02 7.02579317e-02 7.87208120e-06 8.99983993e-02
 1.49379941e-01 8.32475483e-02 8.91041072e-02 1.01545114e-01
 8.86375828e-02 7.01626058e-02 6.66438000e-02 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]


In [92]:
# 3. Logistic regrssion 을 이용한 학습
# 3-1. GridSearchCV를 적용해 Logistic regrssion의 교차검증 및 하이퍼파라미터 튜닝

lr = LogisticRegression()

parameters = {'penalty': ['l2','l1'],
          'C':[0.01,0.1,1,10]}

grid_lr = GridSearchCV(lr, param_grid = parameters, cv=3, refit=True)
grid_lr.fit(X_train, y_train)

lr = grid_lr.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_lr.best_params_}")
print(f"최고 예측 정확도: {grid_lr.best_score_:.4f}")

최적 하이퍼 파라미터: {'C': 0.01, 'penalty': 'l2'}
최고 예측 정확도: 0.5122


In [93]:
#3-2. Logistic Regression 적용
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
pred_proba = lr.predict_proba(X_test)
pred_proba_1 = pred_proba[:, 1]

acc = accuracy_score(y_test , pred)
prec = precision_score(y_test , pred)
rec = recall_score(y_test , pred)
auc_score = roc_auc_score(y_test , pred_proba_1)
f1score = f1_score(y_test, pred)

print(f'Logistic regrssion 정확도 : {acc:.3f}')
print(f'Logistic regrssion 정밀도 : {prec:.3f}')
print(f'Logistic regrssion 재현율 : {rec:.3f}')
print(f'Logistic regrssion f1_score : {f1score:.3f}')
print(f'Logistic regrssion roc_auc : {auc_score:.3f}')

Logistic regrssion 정확도 : 0.700
Logistic regrssion 정밀도 : 0.114
Logistic regrssion 재현율 : 0.270
Logistic regrssion f1_score : 0.160
Logistic regrssion roc_auc : 0.530


In [94]:
# 학습된 모델의 특성 중요도 확인
feature_importance = lr.coef_[0]  # 특성의 가중치 또는 중요도

# 특성별 중요도 출력
for feature, importance in zip(X_train.columns, feature_importance):
    print(f"{feature}: {importance}")

차량 소유 여부: -3.474615022535368e-13
부동산 소유 여부: -2.776238138298365e-13
연간 수입: 1.8325573098891797e-09
결혼 여부: -3.099054859410271e-13
주거 형태: 3.669725012908596e-14
거주지 인구 비율: -2.4396109711741284e-15
휴대전화 소유 여부: 6.410008367661657e-14
직업: 5.585556337732725e-13
가족 구성원 수: 3.0998716830336534e-13
산업군: 2.1700415579760806e-12
나이: -5.679769100509746e-12
근속연수: -1.5221876907501946e-10
가입연수: -2.7459345428044152e-12
인구수: -1.262527668827148e-07
나이구간: -1.1597284619572216e-12
5_10: 0.0
16_10: 0.0
17_10: 0.0
10_10: 0.0
25_10: 0.0
20_10: 0.0
30_10: 0.0
23_10: 0.0
24_10: 0.0
7_10: 0.0
4_10: 0.0
8_10: 0.0
1_10: 0.0
22_10: 0.0
29_10: 0.0
26_10: 0.0
0_10: 0.0
11_10: 0.0
31_10: 0.0
28_10: 0.0
21_10: 0.0
19_10: 0.0
13_10: 0.0
6_10: 0.0
3_10: 0.0
14_10: 0.0
18_10: 0.0
2_10: 0.0
32_10: 0.0
15_10: 0.0
12_10: 0.0
9_10: 0.0
27_10: 0.0


In [95]:
# 5. KNN 을 이용한 학습
# 5-1. GridSearchCV를 적용해 KNN의 교차검증 및 하이퍼파라미터 튜닝

knn = KNeighborsClassifier()

parameters = {'n_neighbors': [3, 5, 7, 9],
              'weights': ['uniform', 'distance']
          }

grid_knn = GridSearchCV(knn, param_grid = parameters, cv=3, refit=True)
grid_knn.fit(X_train, y_train)

knn = grid_knn.best_estimator_

print(f"최적 하이퍼 파라미터: {grid_knn.best_params_}")
print(f"최고 예측 정확도: {grid_knn.best_score_:.4f}")

최적 하이퍼 파라미터: {'n_neighbors': 3, 'weights': 'distance'}
최고 예측 정확도: 0.8181


In [96]:
# 5-2 knn 적용
knn = KNeighborsClassifier()

knn.fit(X_train, y_train)
pred = knn.predict(X_test) 

acc = accuracy_score(y_test , pred)
prec = precision_score(y_test , pred)
rec = recall_score(y_test , pred)
auc_score = roc_auc_score(y_test , pred)
f1score = f1_score(y_test, pred)

print(f'KNN 정확도 : {acc:.3f}')
print(f'KNN 정밀도 : {prec:.3f}')
print(f'KNN 재현율 : {rec:.3f}')
print(f'KNN f1_score : {f1score:.3f}')
print(f'KNN roc_auc : {auc_score:.3f}')

KNN 정확도 : 0.701
KNN 정밀도 : 0.124
KNN 재현율 : 0.298
KNN f1_score : 0.175
KNN roc_auc : 0.524
