In [1]:
# 모듈 가져오기
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# 데이터 로드
credit_data = pd.read_csv('./Data/german_credit_new.csv')

In [3]:
# 데이터 확인 - head()
credit_data.head()

Unnamed: 0,Status of existing checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker,class
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [4]:
# 데이터 shape 확인
credit_data.shape

(1000, 21)

In [5]:
# 데이터 타입 확인
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                                                    Non-Null Count  Dtype 
---  ------                                                    --------------  ----- 
 0   Status of existing checking account                       1000 non-null   object
 1   Duration in month                                         1000 non-null   int64 
 2   Credit history                                            1000 non-null   object
 3   Purpose                                                   1000 non-null   object
 4   Credit amount                                             1000 non-null   int64 
 5   Savings account/bonds                                     1000 non-null   object
 6   Present employment since                                  1000 non-null   object
 7   Installment rate in percentage of disposable income       1000 non-null   int64 
 8   Personal status and sex      

In [6]:
# 데이터 타입이 object와 int 두가지가 있다.
# class가 종속변수
# 종속변수를 확인해보자.
credit_data['class'].unique()

array([1, 2], dtype=int64)

In [7]:
# 데이터 전처리
# class를 0과 1로 바꿔주자. => 0 : good, 1 : bad
# 공백을 '_'로 바꿔 주자
credit_data['class'] = credit_data['class'] - 1
credit_data.rename(columns = lambda x : x.replace(" ", "_"), inplace = True)

In [8]:
# 데이터 분석 처리 전략
# object => 범주형 변수로 간주
# int => 연속형 변수로 간주

In [9]:
# Credit history에 대해서 데이터 확인 해보자.
credit_data['Credit_history'].unique()

array(['A34', 'A32', 'A33', 'A30', 'A31'], dtype=object)

In [10]:
# 데이터 그루핑을 위한 함수 만들기
def IV_calc(data, var):
    
    if data[var].dtypes == "object":
        dataf = data.groupby([var])['class'].agg(['count', 'sum'])
        dataf.columns = ['Total', 'bad']
        dataf['good'] = dataf['Total'] - dataf['bad']
        dataf['bad_per'] = dataf['bad']/ dataf['bad'].sum()
        dataf['good_per'] = dataf['good']/dataf['good'].sum()
        dataf['I_V'] = (dataf['good_per'] - dataf['bad_per']) * np.log(dataf['good_per']/dataf['bad_per'])
        return dataf
    else:
        data['bin_var'] = pd.qcut(data[var].rank(method = 'first'), 10)
        dataf = data.groupby(['bin_var'])['class'].agg(['count', 'sum'])
        dataf.columns = ['Total', 'bad'] 
        dataf['good'] = dataf['Total'] - dataf['bad']
        dataf['bad_per'] = dataf['bad']/ dataf['bad'].sum()
        dataf['good_per'] = dataf['good']/dataf['good'].sum()
        dataf['I_V'] = (dataf['good_per'] - dataf['bad_per']) * np.log(dataf['good_per']/dataf['bad_per'])
    return dataf


In [11]:
# object형 데이터 테스트
IV_calc(credit_data, 'Credit_history')

Unnamed: 0_level_0,Total,bad,good,bad_per,good_per,I_V
Credit_history,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A30,40,25,15,0.083333,0.021429,0.084074
A31,49,28,21,0.093333,0.03,0.071882
A32,530,169,361,0.563333,0.515714,0.004206
A33,88,28,60,0.093333,0.085714,0.000649
A34,293,50,243,0.166667,0.347143,0.132423


In [12]:
# int형 데이터 테스트
IV_calc(credit_data, 'Duration_in_month')

Unnamed: 0_level_0,Total,bad,good,bad_per,good_per,I_V
bin_var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(0.999, 100.9]",100,11,89,0.036667,0.127143,0.112502
"(100.9, 200.8]",100,20,80,0.066667,0.114286,0.025667
"(200.8, 300.7]",100,27,73,0.09,0.104286,0.002105
"(300.7, 400.6]",100,25,75,0.083333,0.107143,0.005984
"(400.6, 500.5]",100,26,74,0.086667,0.105714,0.003784
"(500.5, 600.4]",100,38,62,0.126667,0.088571,0.013629
"(600.4, 700.3]",100,32,68,0.106667,0.097143,0.000891
"(700.3, 800.2]",100,31,69,0.103333,0.098571,0.000225
"(800.2, 900.1]",100,42,58,0.14,0.082857,0.029973
"(900.1, 1000.0]",100,48,52,0.16,0.074286,0.065765


In [13]:
credit_data['Duration_in_month'].rank(method = 'first')

0        8.0
1      937.0
2      181.0
3      920.0
4      587.0
       ...  
995    358.0
996    827.0
997    359.0
998    934.0
999    935.0
Name: Duration_in_month, Length: 1000, dtype: float64

In [14]:
# 일단, 데이터 타입 확인 -> 자료형 series
credit_data.dtypes

Status_of_existing_checking_account                           object
Duration_in_month                                              int64
Credit_history                                                object
Purpose                                                       object
Credit_amount                                                  int64
Savings_account/bonds                                         object
Present_employment_since                                      object
Installment_rate_in_percentage_of_disposable_income            int64
Personal_status_and_sex                                       object
Other_debtors_/_guarantors                                    object
Present_residence_since                                        int64
Property                                                      object
Age_in_years                                                   int64
Other_installment_plans                                       object
Housing                           

In [15]:
# 범주형 변수와 연속형 변수를 나눠서 컬럼 지정을 하자
discrete_columns = list()
continuous_columns = list()
for i in range(len(credit_data.dtypes)):
    if credit_data.dtypes[i] == 'object':
        discrete_columns.append(credit_data.columns[i])  
    elif credit_data.dtypes[i] == 'int64':
        continuous_columns.append(credit_data.columns[i])   

In [16]:
# 데이터 확인
print(discrete_columns, len(discrete_columns))
print(continuous_columns, len(continuous_columns) )

['Status_of_existing_checking_account', 'Credit_history', 'Purpose', 'Savings_account/bonds', 'Present_employment_since', 'Personal_status_and_sex', 'Other_debtors_/_guarantors', 'Property', 'Other_installment_plans', 'Housing', 'Job', 'Telephone', 'foreign_worker'] 13
['Duration_in_month', 'Credit_amount', 'Installment_rate_in_percentage_of_disposable_income', 'Present_residence_since', 'Age_in_years', 'Number_of_existing_credits_at_this_bank', 'Number_of_people_being_liable_to_provide_maintenance_for', 'class'] 8


In [17]:
# 전체 컬럼 정의
total_columns = discrete_columns + continuous_columns
print(len(total_columns))

21


In [18]:
# IV값 리스트 들고 오기
IV_list = list()
for col in total_columns:
    assigned_data = IV_calc(data = credit_data, var = col)
    IV_val = round(assigned_data["I_V"].sum(),3)
    dt_type = credit_data[col].dtypes
    IV_list.append((IV_val, col, dt_type))

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [19]:
# 데이터 확인 및 내림차순으로 정리
IV_list = sorted(IV_list, reverse = True)

In [20]:
# 데이터 확인
IV_list

[(inf, 'class', dtype('int64')),
 (0.666, 'Status_of_existing_checking_account', dtype('O')),
 (0.293, 'Credit_history', dtype('O')),
 (0.261, 'Duration_in_month', dtype('int64')),
 (0.196, 'Savings_account/bonds', dtype('O')),
 (0.169, 'Purpose', dtype('O')),
 (0.113, 'Property', dtype('O')),
 (0.113, 'Credit_amount', dtype('int64')),
 (0.103, 'Age_in_years', dtype('int64')),
 (0.098, 'Number_of_existing_credits_at_this_bank', dtype('int64')),
 (0.086, 'Present_employment_since', dtype('O')),
 (0.083, 'Housing', dtype('O')),
 (0.062,
  'Installment_rate_in_percentage_of_disposable_income',
  dtype('int64')),
 (0.058, 'Other_installment_plans', dtype('O')),
 (0.049, 'Present_residence_since', dtype('int64')),
 (0.045, 'Personal_status_and_sex', dtype('O')),
 (0.044, 'foreign_worker', dtype('O')),
 (0.034,
  'Number_of_people_being_liable_to_provide_maintenance_for',
  dtype('int64')),
 (0.032, 'Other_debtors_/_guarantors', dtype('O')),
 (0.009, 'Job', dtype('O')),
 (0.006, 'Telephone',

In [21]:
# 데이터 수정
IV_list = IV_list[1:]

In [22]:
# 20200831
# 데이터 재확인
IV_list

[(0.666, 'Status_of_existing_checking_account', dtype('O')),
 (0.293, 'Credit_history', dtype('O')),
 (0.261, 'Duration_in_month', dtype('int64')),
 (0.196, 'Savings_account/bonds', dtype('O')),
 (0.169, 'Purpose', dtype('O')),
 (0.113, 'Property', dtype('O')),
 (0.113, 'Credit_amount', dtype('int64')),
 (0.103, 'Age_in_years', dtype('int64')),
 (0.098, 'Number_of_existing_credits_at_this_bank', dtype('int64')),
 (0.086, 'Present_employment_since', dtype('O')),
 (0.083, 'Housing', dtype('O')),
 (0.062,
  'Installment_rate_in_percentage_of_disposable_income',
  dtype('int64')),
 (0.058, 'Other_installment_plans', dtype('O')),
 (0.049, 'Present_residence_since', dtype('int64')),
 (0.045, 'Personal_status_and_sex', dtype('O')),
 (0.044, 'foreign_worker', dtype('O')),
 (0.034,
  'Number_of_people_being_liable_to_provide_maintenance_for',
  dtype('int64')),
 (0.032, 'Other_debtors_/_guarantors', dtype('O')),
 (0.009, 'Job', dtype('O')),
 (0.006, 'Telephone', dtype('O'))]

In [23]:
# 명목형 변수는 더미화
# 더미화 이전에 테스트
pd.get_dummies(credit_data['Status_of_existing_checking_account'], prefix = 'status_exs_accnt')

Unnamed: 0,status_exs_accnt_A11,status_exs_accnt_A12,status_exs_accnt_A13,status_exs_accnt_A14
0,1,0,0,0
1,0,1,0,0
2,0,0,0,1
3,1,0,0,0
4,1,0,0,0
...,...,...,...,...
995,0,0,0,1
996,1,0,0,0
997,0,0,0,1
998,1,0,0,0


In [24]:
# 명목형 변수 컬럼 명 => discrete_columns
dummies = list()
for name in discrete_columns:
    tmp = pd.get_dummies(credit_data[name], prefix = name)
    dummies.append(tmp)

In [25]:
# 연속형 변수 컬럼명 => continuous_columns
# 연속형 변수만 있는 데이터 프레임 만들기
credit_continuous = credit_data[continuous_columns]

In [26]:
# 데이터 프레임 확인
credit_continuous.head()

Unnamed: 0,Duration_in_month,Credit_amount,Installment_rate_in_percentage_of_disposable_income,Present_residence_since,Age_in_years,Number_of_existing_credits_at_this_bank,Number_of_people_being_liable_to_provide_maintenance_for,class
0,6,1169,4,4,67,2,1,0
1,48,5951,2,2,22,1,1,1
2,12,2096,2,3,49,1,2,0
3,42,7882,2,4,45,1,2,0
4,24,4870,3,4,53,2,2,1


In [27]:
# 명목형 변수만 있는 데이터 프레임 만들기
credit_data_new =pd.concat([credit_continuous, pd.concat(dummies, axis = 1)], axis = 1 )

In [28]:
# 데이터 확인
credit_data_new

Unnamed: 0,Duration_in_month,Credit_amount,Installment_rate_in_percentage_of_disposable_income,Present_residence_since,Age_in_years,Number_of_existing_credits_at_this_bank,Number_of_people_being_liable_to_provide_maintenance_for,class,Status_of_existing_checking_account_A11,Status_of_existing_checking_account_A12,...,Housing_A152,Housing_A153,Job_A171,Job_A172,Job_A173,Job_A174,Telephone_A191,Telephone_A192,foreign_worker_A201,foreign_worker_A202
0,6,1169,4,4,67,2,1,0,1,0,...,1,0,0,0,1,0,0,1,1,0
1,48,5951,2,2,22,1,1,1,0,1,...,1,0,0,0,1,0,1,0,1,0
2,12,2096,2,3,49,1,2,0,0,0,...,1,0,0,1,0,0,1,0,1,0
3,42,7882,2,4,45,1,2,0,1,0,...,0,1,0,0,1,0,1,0,1,0
4,24,4870,3,4,53,2,2,1,1,0,...,0,1,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,12,1736,3,4,31,1,1,0,0,0,...,1,0,0,1,0,0,1,0,1,0
996,30,3857,4,4,40,1,1,0,1,0,...,1,0,0,0,0,1,0,1,1,0
997,12,804,4,4,38,1,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
998,45,1845,4,4,23,1,1,1,1,0,...,0,1,0,0,1,0,0,1,1,0


In [29]:
# train 데이터, test 데이터 나누기
x_train, x_test, y_train, y_test = train_test_split(credit_data_new.drop(['class'], axis = 1), credit_data_new['class'], train_size = 0.7, random_state = 42)

In [30]:
# 데이터 확인하기
type(x_train), type(x_test), type(y_train), type(y_test)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.series.Series,
 pandas.core.series.Series)

In [31]:
# 데이터 확인하기
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((700, 61), (300, 61), (700,), (300,))

In [32]:
# 데이터 타입 변한
y_train = y_train.to_frame()
y_test = y_test.to_frame()

In [33]:
credit_data_new

Unnamed: 0,Duration_in_month,Credit_amount,Installment_rate_in_percentage_of_disposable_income,Present_residence_since,Age_in_years,Number_of_existing_credits_at_this_bank,Number_of_people_being_liable_to_provide_maintenance_for,class,Status_of_existing_checking_account_A11,Status_of_existing_checking_account_A12,...,Housing_A152,Housing_A153,Job_A171,Job_A172,Job_A173,Job_A174,Telephone_A191,Telephone_A192,foreign_worker_A201,foreign_worker_A202
0,6,1169,4,4,67,2,1,0,1,0,...,1,0,0,0,1,0,0,1,1,0
1,48,5951,2,2,22,1,1,1,0,1,...,1,0,0,0,1,0,1,0,1,0
2,12,2096,2,3,49,1,2,0,0,0,...,1,0,0,1,0,0,1,0,1,0
3,42,7882,2,4,45,1,2,0,1,0,...,0,1,0,0,1,0,1,0,1,0
4,24,4870,3,4,53,2,2,1,1,0,...,0,1,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,12,1736,3,4,31,1,1,0,0,0,...,1,0,0,1,0,0,1,0,1,0
996,30,3857,4,4,40,1,1,0,1,0,...,1,0,0,0,0,1,0,1,1,0
997,12,804,4,4,38,1,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
998,45,1845,4,4,23,1,1,1,1,0,...,0,1,0,0,1,0,0,1,1,0


In [34]:
# N개의 부류는 N-1개의 더미 변수로 표현할 수 있다.
# 따라서, 여분의 변수를 하나씩 삭제한다
remove_cols_extra_dummy = ['Status_of_existing_checking_account_A11', 'Credit_history_A30', 'Purpose_A40', 'Savings_account/bonds_A61', 'Present_employment_since_A71', 'Personal_status_and_sex_A91', 'Other_debtors_/_guarantors_A101', 'Property_A121', 'Other_installment_plans_A141', 'Housing_A151', 'Job_A171', 'foreign_worker_A201']

In [35]:
# 후진 제거법을 통해 무의미한 변수를 하나씩 제거해 나가기 위해 별도의 리스트를 하나 생성
# 실행이 반복될 때마다 가장 무의미한 변수와 다중공선성을 갖는 변수를 remove_cols_insig 리스트에 추가해 모델을 훈련하는 동안에 제거할 수 있도록 한다
remove_cols_insig = []
remove_cols = list(set(remove_cols_extra_dummy + remove_cols_insig))

In [36]:
import statsmodels.api as sm
logistic_model = sm.Logit(y_train, sm.add_constant(x_train.drop(remove_cols, axis = 1))).fit()

Optimization terminated successfully.
         Current function value: 0.452608
         Iterations 11


In [37]:
print(logistic_model.summary())

                           Logit Regression Results                           
Dep. Variable:                  class   No. Observations:                  700
Model:                          Logit   Df Residuals:                      651
Method:                           MLE   Df Model:                           48
Date:                Tue, 01 Sep 2020   Pseudo R-squ.:                  0.2576
Time:                        20:53:10   Log-Likelihood:                -316.83
converged:                       True   LL-Null:                       -426.75
Covariance Type:            nonrobust   LLR p-value:                 7.805e-24
                                                               coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------------------
const                                                        0.2104   1.43e+07   1.47e-08      1.000   -2.81e+07    2.

In [38]:
# VIF를 계산하자
cnames = x_train.drop(remove_cols, axis = 1).columns
for i in range(0, len(cnames)):
    xvars = list(cnames)
    yvar = xvars.pop(i)
    mod = sm.OLS(x_train.drop(remove_cols, axis = 1)[yvar], sm.add_constant(x_train.drop(remove_cols, axis = 1))[xvars])
    res = mod.fit()
    vif = 1 /(1 - res.rsquared)
    print(yvar,round(vif,3))

Duration_in_month 2.063
Credit_amount 2.668
Installment_rate_in_percentage_of_disposable_income 1.392
Present_residence_since 1.356
Age_in_years 1.514
Number_of_existing_credits_at_this_bank 1.6
Number_of_people_being_liable_to_provide_maintenance_for 1.226
Status_of_existing_checking_account_A12 1.752
Status_of_existing_checking_account_A13 1.267
Status_of_existing_checking_account_A14 1.868
Credit_history_A31 2.228
Credit_history_A32 7.035
Credit_history_A33 2.926
Credit_history_A34 6.058
Purpose_A41 1.464
Purpose_A410 1.24
Purpose_A42 1.602
Purpose_A43 1.797
Purpose_A44 1.092
Purpose_A45 1.129
Purpose_A46 1.205
Purpose_A48 1.104
Purpose_A49 1.513
Savings_account/bonds_A62 1.157
Savings_account/bonds_A63 1.126
Savings_account/bonds_A64 1.118
Savings_account/bonds_A65 1.204
Present_employment_since_A72 4.148
Present_employment_since_A73 5.746
Present_employment_since_A74 4.293
Present_employment_since_A75 4.913
Personal_status_and_sex_A92 6.455
Personal_status_and_sex_A93 7.131
Person

In [39]:
# VIF값이 5 이상인 컬럼들
# 'Credit_history_A32', 'Credit_history_A34', 'Present_employment_since_A73', 'Personal_status_and_sex_A92', 'Personal_status_and_sex_A93', 'Telephone_A191 38.478', 'Telephone_A192 11.222'

In [40]:
# c 통계량 계산
# 예측 확률
y_pred = logistic_model.predict(sm.add_constant(x_train.drop(remove_cols, axis=1))).to_frame()
y_pred.columns = ['probs']
both = pd.concat([y_train,y_pred], axis = 1)
both

Unnamed: 0,class,probs
541,0,0.168461
440,0,0.172219
482,0,0.306132
422,0,0.117376
778,0,0.017026
...,...,...
106,1,0.636216
270,0,0.054809
860,0,0.022928
435,1,0.117557


In [44]:
zeros = both[both['class'] == 0]
ones = both[both['class'] == 1]

In [45]:
def df_crossjoin(df1, df2):
    df1['_tmpkey'] = 1
    df2['_tmpkey'] = 1
    res = pd.merge(df1, df2, on = '_tmpkey').drop('_tmpkey', axis = 1)
    res.index = pd.MultiIndex.from_product((df1.index, df2.index))
    df1.drop('_tmpkey', axis = 1, inplace = True)
    df2.drop('_tmpkey', axis = 1, inplace = True)
    return res

In [56]:
joined_data = df_crossjoin(ones, zeros)
joined_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['_tmpkey'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['_tmpkey'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Unnamed: 1,class_x,probs_x,class_y,probs_y
949,541,1,0.092969,0,0.168461
949,440,1,0.092969,0,0.172219
949,482,1,0.092969,0,0.306132
949,422,1,0.092969,0,0.117376
949,778,1,0.092969,0,0.017026
...,...,...,...,...,...
435,20,1,0.117557,0,0.113294
435,71,1,0.117557,0,0.020558
435,270,1,0.117557,0,0.054809
435,860,1,0.117557,0,0.022928


In [57]:
joined_data['concordant_pair'] = 0

In [59]:
joined_data.loc[joined_data['probs_x'] > joined_data['probs_y'], 'concordant_pair'] = 1

In [60]:
joined_data['discordant_pair'] = 0
joined_data.loc[joined_data['probs_x'] < joined_data['probs_y'], 'discordant_pair'] = 1

In [62]:
joined_data['tied_pair'] = 0
joined_data.loc[joined_data['probs_x'] == joined_data['probs_y'], 'tied_pair'] = 1

In [63]:
joined_data

Unnamed: 0,Unnamed: 1,class_x,probs_x,class_y,probs_y,concordant_pair,discordant_pair,tied_pair
949,541,1,0.092969,0,0.168461,0,1,0
949,440,1,0.092969,0,0.172219,0,1,0
949,482,1,0.092969,0,0.306132,0,1,0
949,422,1,0.092969,0,0.117376,0,1,0
949,778,1,0.092969,0,0.017026,1,0,0
...,...,...,...,...,...,...,...,...
435,20,1,0.117557,0,0.113294,1,0,0
435,71,1,0.117557,0,0.020558,1,0,0
435,270,1,0.117557,0,0.054809,1,0,0
435,860,1,0.117557,0,0.022928,1,0,0


In [67]:
p_conc = joined_data['concordant_pair'].sum()/joined_data.shape[0]
p_disc = joined_data['discordant_pair'].sum()/joined_data.shape[0]

In [68]:
c_statistic = 0.5 + (p_conc - p_disc)/2.0
round(c_statistic, 4)

0.8295

In [None]:
# 여기까지 하면서 로지스틱 회귀가 어떤 내용인지 정확히 몰랐다.
# 다시 한 번 복습하면서 내용 정리하자.