In [1]:
# 모듈 가져오기
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the credit data CSV into a DataFrame
credit_data = pd.read_csv('./Data/german_credit_new.csv')

In [3]:
# Inspect the data - head()
credit_data.head()

Unnamed: 0,Status of existing checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker,class
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [4]:
# Check the shape of the data (rows, columns)
credit_data.shape

(1000, 21)

In [5]:
# Check the column dtypes and non-null counts
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                                                    Non-Null Count  Dtype 
---  ------                                                    --------------  ----- 
 0   Status of existing checking account                       1000 non-null   object
 1   Duration in month                                         1000 non-null   int64 
 2   Credit history                                            1000 non-null   object
 3   Purpose                                                   1000 non-null   object
 4   Credit amount                                             1000 non-null   int64 
 5   Savings account/bonds                                     1000 non-null   object
 6   Present employment since                                  1000 non-null   object
 7   Installment rate in percentage of disposable income       1000 non-null   int64 
 8   Personal status and sex      

In [6]:
# The columns come in two dtypes: object and int.
# 'class' is the dependent (target) variable.
# Look at the values the target takes.
credit_data['class'].unique()

array([1, 2], dtype=int64)

In [7]:
# Data preprocessing
# Recode the target 'class' from {1, 2} to {0, 1}  => 0: good, 1: bad
# Replace spaces in the column names with '_'
#
# The recode is guarded on the raw coding (max label == 2) so that re-running
# this cell does not shift already-recoded labels down to {-1, 0}.
if credit_data['class'].max() == 2:
    credit_data['class'] = credit_data['class'] - 1
# Renaming is naturally idempotent; reassign instead of using inplace=True.
credit_data = credit_data.rename(columns=lambda x: x.replace(" ", "_"))

In [8]:
# Analysis strategy
# object => treated as a categorical variable
# int => treated as a continuous variable

In [9]:
# Inspect the distinct values of Credit history.
credit_data['Credit_history'].unique()

array(['A34', 'A32', 'A33', 'A30', 'A31'], dtype=object)

In [10]:
# Helper that groups the data and computes the Information Value (IV) per group
def IV_calc(data, var):
    """Build the per-group Information Value (IV) table of ``var`` against ``class``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``var`` and a ``class`` column. ``bad`` is computed as the
        sum of ``class`` within each group, so ``class`` is expected to be
        coded 1 = bad, 0 = good.
    var : str
        Column to evaluate. Numeric columns are ranked (``method='first'``,
        i.e. ties broken by order of appearance) and cut into 10
        equal-frequency bins; every other column is grouped by its raw values.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns ``Total``, ``bad``, ``good``,
        ``bad_per``, ``good_per`` and ``I_V``. The index is named after
        ``var`` for non-numeric columns and ``'bin_var'`` for binned numeric
        columns. Summing ``I_V`` gives the IV of the variable.

    Notes
    -----
    * ``data`` is not modified: the bins are kept in a local Series instead of
      being written back as a ``bin_var`` column on the caller's frame.
    * A group with zero good or zero bad rows yields ``inf`` (or ``NaN`` for an
      empty group) in ``I_V`` because of the division / logarithm; no
      smoothing is applied.
    """
    if pd.api.types.is_numeric_dtype(data[var]):
        # Ranking first guarantees unique values, so qcut can always form
        # equal-frequency bins even when the raw column has many ties.
        groups = pd.qcut(data[var].rank(method='first'), 10).rename('bin_var')
    else:
        groups = data[var]

    # observed=False keeps every bin/category in the result.
    dataf = data['class'].groupby(groups, observed=False).agg(['count', 'sum'])
    dataf.columns = ['Total', 'bad']
    dataf['good'] = dataf['Total'] - dataf['bad']
    # Share of all bad / good rows that fall into each group.
    dataf['bad_per'] = dataf['bad'] / dataf['bad'].sum()
    dataf['good_per'] = dataf['good'] / dataf['good'].sum()
    # IV contribution: (good% - bad%) * ln(good% / bad%)
    dataf['I_V'] = (dataf['good_per'] - dataf['bad_per']) * np.log(dataf['good_per'] / dataf['bad_per'])
    return dataf


In [11]:
# Try the function on an object-dtype column
IV_calc(credit_data, 'Credit_history')

Unnamed: 0_level_0,Total,bad,good,bad_per,good_per,I_V
Credit_history,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A30,40,25,15,0.083333,0.021429,0.084074
A31,49,28,21,0.093333,0.03,0.071882
A32,530,169,361,0.563333,0.515714,0.004206
A33,88,28,60,0.093333,0.085714,0.000649
A34,293,50,243,0.166667,0.347143,0.132423


In [12]:
# Try the function on an int-dtype column
IV_calc(credit_data, 'Duration_in_month')

Unnamed: 0_level_0,Total,bad,good,bad_per,good_per,I_V
bin_var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(0.999, 100.9]",100,11,89,0.036667,0.127143,0.112502
"(100.9, 200.8]",100,20,80,0.066667,0.114286,0.025667
"(200.8, 300.7]",100,27,73,0.09,0.104286,0.002105
"(300.7, 400.6]",100,25,75,0.083333,0.107143,0.005984
"(400.6, 500.5]",100,26,74,0.086667,0.105714,0.003784
"(500.5, 600.4]",100,38,62,0.126667,0.088571,0.013629
"(600.4, 700.3]",100,32,68,0.106667,0.097143,0.000891
"(700.3, 800.2]",100,31,69,0.103333,0.098571,0.000225
"(800.2, 900.1]",100,42,58,0.14,0.082857,0.029973
"(900.1, 1000.0]",100,48,52,0.16,0.074286,0.065765


In [27]:
# Show the ranks of Duration_in_month with ties broken by order of appearance
# (method='first'); this is the same ranking call IV_calc passes to pd.qcut.
credit_data['Duration_in_month'].rank(method = 'first')

0        8.0
1      937.0
2      181.0
3      920.0
4      587.0
       ...  
995    358.0
996    827.0
997    359.0
998    934.0
999    935.0
Name: Duration_in_month, Length: 1000, dtype: float64

In [48]:
# First, check the dtypes -> the result is a Series
credit_data.dtypes

Status_of_existing_checking_account                           object
Duration_in_month                                              int64
Credit_history                                                object
Purpose                                                       object
Credit_amount                                                  int64
Savings_account/bonds                                         object
Present_employment_since                                      object
Installment_rate_in_percentage_of_disposable_income            int64
Personal_status_and_sex                                       object
Other_debtors_/_guarantors                                    object
Present_residence_since                                        int64
Property                                                      object
Age_in_years                                                   int64
Other_installment_plans                                       object
Housing                           

In [53]:
# Split the columns into categorical (object dtype) and continuous (int64 dtype) lists.
# Iterate over (column, dtype) pairs instead of indexing `dtypes` by integer
# position: integer keys on a label-indexed Series are deprecated in pandas.
# Note: the int64 target 'class' ends up in continuous_columns as well.
discrete_columns = [col for col, dtype in credit_data.dtypes.items() if dtype == 'object']
continuous_columns = [col for col, dtype in credit_data.dtypes.items() if dtype == 'int64']

In [55]:
# Check the two column lists and their lengths
print(discrete_columns, len(discrete_columns))
print(continuous_columns, len(continuous_columns) )

['Status_of_existing_checking_account', 'Credit_history', 'Purpose', 'Savings_account/bonds', 'Present_employment_since', 'Personal_status_and_sex', 'Other_debtors_/_guarantors', 'Property', 'Other_installment_plans', 'Housing', 'Job', 'Telephone', 'foreign_worker'] 13
['Duration_in_month', 'Credit_amount', 'Installment_rate_in_percentage_of_disposable_income', 'Present_residence_since', 'Age_in_years', 'Number_of_existing_credits_at_this_bank', 'Number_of_people_being_liable_to_provide_maintenance_for', 'class'] 8


In [57]:
# Define the full column list (categorical columns first, then continuous)
total_columns = discrete_columns + continuous_columns
print(len(total_columns))

21


In [59]:
# Collect one (IV, column name, dtype) tuple per column.
# The IV of a column is the sum of its per-group I_V values, rounded to 3 decimals.
IV_list = [
    (
        round(IV_calc(data=credit_data, var=name)["I_V"].sum(), 3),
        name,
        credit_data[name].dtypes,
    )
    for name in total_columns
]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [64]:
# Sort in descending order (tuples compare on the IV value first)
IV_list = sorted(IV_list, reverse = True)

In [65]:
# Inspect the sorted list
IV_list

[(inf, 'class', dtype('int64')),
 (0.666, 'Status_of_existing_checking_account', dtype('O')),
 (0.293, 'Credit_history', dtype('O')),
 (0.261, 'Duration_in_month', dtype('int64')),
 (0.196, 'Savings_account/bonds', dtype('O')),
 (0.169, 'Purpose', dtype('O')),
 (0.113, 'Property', dtype('O')),
 (0.113, 'Credit_amount', dtype('int64')),
 (0.103, 'Age_in_years', dtype('int64')),
 (0.098, 'Number_of_existing_credits_at_this_bank', dtype('int64')),
 (0.086, 'Present_employment_since', dtype('O')),
 (0.083, 'Housing', dtype('O')),
 (0.062,
  'Installment_rate_in_percentage_of_disposable_income',
  dtype('int64')),
 (0.058, 'Other_installment_plans', dtype('O')),
 (0.049, 'Present_residence_since', dtype('int64')),
 (0.045, 'Personal_status_and_sex', dtype('O')),
 (0.044, 'foreign_worker', dtype('O')),
 (0.034,
  'Number_of_people_being_liable_to_provide_maintenance_for',
  dtype('int64')),
 (0.032, 'Other_debtors_/_guarantors', dtype('O')),
 (0.009, 'Job', dtype('O')),
 (0.006, 'Telephone',

In [66]:
# Remove the target's own entry: 'class' scored against itself has IV = inf.
# Filter on the column name (tuple position 1) rather than slicing off the
# first element, so the cell does not depend on the sort order and re-running
# it cannot silently drop a real feature.
IV_list = [entry for entry in IV_list if entry[1] != 'class']