# [ PART 05. 정형 데이터마이닝 ] - 2장. 분류 분석
- 분류분석은 데이터가 어떤 그룹에 속하는지 예측하는데 사용하는 기법.
- 의사결정나무, 앙상블기법, 인공신경망 등이 있음.
- ADP 실기에서는 "분류기법들을 활용해 가장 정확도같은 지표가 좋은 기법과 결과를 보고서에 나타내라"는 형식의 문제가 출제됨.

## - 불러올 패키지 목록

In [1]:
# 데이터 핸들링을 위한 패키지
import numpy as np
import pandas as pd

import statsmodels.api as sm
from statsmodels.formula.api import ols

import statsmodels.regression.linear_model


from sklearn.model_selection import train_test_split    # training/test data set을 추출하는 패키지
from sklearn.linear_model import *    # 로지스틱 회귀 분석을 위한 패키지
from sklearn.metrics import *
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler



## 1. 로지스틱 회귀분석
- 반응변수가 범주형인 경우 적용되는 회귀분석
- 새로운 설명변수가 주어질 때 반응변수의 각 범주에 속할 확률이 얼마인지를 추정하여 추정확률을 기준치에 따라 분류하는 목적으로 활용

In [2]:
# 데이터 로드 및 데이터프레임 생성
df_credit = pd.read_csv( 'C:/Users/Administrator/GitHub/TIL/ADP_study/rawdata/credit_final.csv')


# 회귀모델의 상수항에 해당하는 컬럼 추가
df_credit = sm.add_constant( df_credit,
                             has_constant = "add" )

df_credit.head()

Unnamed: 0,const,credit.rating,account.balance,credit.duration.months,previous.credit.payment.status,credit.purpose,credit.amount,savings,employment.duration,installment.rate,...,residence.duration,current.assets,age,other.credits,apartment.type,bank.credits,occupation,dependents,telephone,foreign.worker
0,1.0,1,1,18,3,2,1049,1,1,4,...,4,2,21,2,1,1,3,1,1,1
1,1.0,1,1,9,3,4,2799,1,2,2,...,2,1,36,2,1,2,3,2,1,1
2,1.0,1,2,12,2,4,841,2,3,2,...,4,1,23,2,1,1,2,1,1,1
3,1.0,1,1,12,3,4,2122,1,2,3,...,2,1,39,2,1,2,2,2,1,2
4,1.0,1,1,12,3,4,2171,1,2,4,...,4,2,38,1,2,2,2,1,1,2


In [3]:
# 독립변수, 종속변수 지정
x = df_credit.drop('credit.rating', axis = 1)
y = df_credit['credit.rating']


# 데이터 분할 : train 70%, test 30%
train_x, test_x, train_y, test_y = train_test_split( x, y,
                                                     train_size = 0.7, test_size = 0.3,
                                                     random_state = 153 )

print( 'train_x', train_x.shape,'\n', 'test_x', test_x.shape)
print( 'train_y', train_y.shape,'\n', 'test_y', test_y.shape)

train_x (700, 21) 
 test_x (300, 21)
train_y (700,) 
 test_y (300,)


### sklearn 패키지 사용

In [4]:
# 로지스틱 회귀를 이용하여 학습 및 예측 수행. 
lr_clf = LogisticRegressionCV(max_iter = 1000)
lr_clf.fit( train_x, train_y )

LogisticRegressionCV(max_iter=1000)

In [5]:
lr_preds = lr_clf.predict(test_x)

# accuracy와 roc_auc 측정
print('accuracy: {:0.3f}'.format(accuracy_score(test_y, lr_preds)))
print('roc_auc: {:0.3f}'.format(roc_auc_score(test_y , lr_preds)))

accuracy: 0.723
roc_auc: 0.628


### statsmodels 패키지 사용

In [6]:
# 로지스틱 회귀분석 실행

model = sm.Logit(train_y, train_x)
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.461446
         Iterations 7


In [7]:
print( result.summary() )

                           Logit Regression Results                           
Dep. Variable:          credit.rating   No. Observations:                  700
Model:                          Logit   Df Residuals:                      679
Method:                           MLE   Df Model:                           20
Date:                Sat, 07 Aug 2021   Pseudo R-squ.:                  0.2369
Time:                        16:41:57   Log-Likelihood:                -323.01
converged:                       True   LL-Null:                       -423.28
Covariance Type:            nonrobust   LLR p-value:                 8.790e-32
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const                             -2.4741      1.557     -1.589      0.112      -5.527       0.579
account.balance                    0.8728      0.125      7.004      0.

In [8]:
df_credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   const                           1000 non-null   float64
 1   credit.rating                   1000 non-null   int64  
 2   account.balance                 1000 non-null   int64  
 3   credit.duration.months          1000 non-null   int64  
 4   previous.credit.payment.status  1000 non-null   int64  
 5   credit.purpose                  1000 non-null   int64  
 6   credit.amount                   1000 non-null   int64  
 7   savings                         1000 non-null   int64  
 8   employment.duration             1000 non-null   int64  
 9   installment.rate                1000 non-null   int64  
 10  marital.status                  1000 non-null   int64  
 11  guarantor                       1000 non-null   int64  
 12  residence.duration              100

In [9]:
# 독립변수, 종속변수 지정
x2 = df_credit[ ['account.balance', 'credit.duration.months', 'previous.credit.payment.status',
                 'savings', 'credit.purpose', 'current.assets', 'apartment.type', 'foreign.worker',
                 'guarantor', 'age', 'other.credits', 'marital.status', 'installment.rate'] ]
y = df_credit['credit.rating']


# 데이터 분할 : train 70%, test 30%
train_x, test_x, train_y, test_y = train_test_split( x2, y,
                                                     train_size = 0.7, test_size = 0.3,
                                                     stratify = y,
                                                     random_state = 153 )

print( 'train_x', train_x.shape,'\n', 'test_x', test_x.shape)
print( 'train_y', train_y.shape,'\n', 'test_y', test_y.shape)

train_x (700, 13) 
 test_x (300, 13)
train_y (700,) 
 test_y (300,)


In [10]:
# 로지스틱 회귀분석 실행

model = sm.Logit(train_y, train_x)
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.481089
         Iterations 6


In [11]:
print( result.summary() )

                           Logit Regression Results                           
Dep. Variable:          credit.rating   No. Observations:                  700
Model:                          Logit   Df Residuals:                      687
Method:                           MLE   Df Model:                           12
Date:                Sat, 07 Aug 2021   Pseudo R-squ.:                  0.2124
Time:                        16:41:58   Log-Likelihood:                -336.76
converged:                       True   LL-Null:                       -427.61
Covariance Type:            nonrobust   LLR p-value:                 1.924e-32
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
account.balance                    0.7475      0.117      6.365      0.000       0.517       0.978
credit.duration.months            -0.0495      0.008     -6.017      0.