In [1]:
import pandas as pd
import numpy as np
import gc
import pickle
import time
import datetime
import copy

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import LabelEncoder

## read_me (꼭 읽어주세요)
- 이 코드는 raw_data에서 user_spec.csv, loan_result.csv 파일을 읽어 최종 예측까지 진행하는 코드입니다. 코드 실행 도중 pkl 파일로 중간 결과를 저장하는 과정이 존재합니다.(메모리 부족으로 인해 단계별로 실행하였습니다.) 따라서 제출한 압축 파일에 있는 raw_data, tmp_data, data_split, 평가데이터 폴더를 유지한 채로 코드를 돌려야 오류없이 코드를 전체 실행할 수 있습니다.

In [2]:
# user_spec, loan_result 테이블 불러오기 및 lonapply_insert_time type 변경
user_spec = pd.read_csv('./raw_data/user_spec.csv')
loan_result = pd.read_csv('./raw_data/loan_result.csv')
loan_result['loanapply_insert_time'] = pd.to_datetime(loan_result['loanapply_insert_time'])

## 결측치처리

#### gender, age, amt, credit_score, employment_period 변수 제외한 결측치 처리

In [3]:
def cem_var_preproc(x):
    if len(str(x)) == 10:
        return str(x)[:6] + '01'
    elif len(str(x)) == 8:
        return str(x)[:6] + '01'
    elif str(x) == 'nan':
        return np.nan
    else: 
        print(x)
        return x

# datetime 형변환
user_spec['company_enter_month'] = pd.to_datetime(user_spec['company_enter_month'].apply(cem_var_preproc))
user_spec['insert_time'] = pd.to_datetime(user_spec.insert_time)

# employment_period, age 변수 생성
user_spec['employment_period'] = (user_spec['insert_time'] - user_spec['company_enter_month']).dt.days
user_spec['age'] = user_spec['insert_time'].dt.year - user_spec['birth_year']

# company_enter_month, birth_year drop
user_spec = user_spec.drop(columns = ['company_enter_month', 'birth_year'])

In [4]:
# employment_period가 음수인 사람 같은 user_id의 값으로 대체 - 잘못 입력한 값
row_idx = np.where(user_spec['employment_period'] < 0)[0]
col_idx = np.where(user_spec.columns == 'employment_period')[0][0]
user_spec.iloc[row_idx, col_idx] = 136

In [5]:
# 19세 미만 변수 삭제
print("19세 미만 변수 삭제 전 min / max: ", user_spec.age.min(), user_spec.age.max())
user_spec = pd.concat([user_spec[user_spec.age >= 19], user_spec[user_spec.age.isna()]])
print("19세 미만 변수 삭제 후min / max: ", user_spec.age.min(), user_spec.age.max())

19세 미만 변수 삭제 전 min / max:  14.0 95.0
19세 미만 변수 삭제 후min / max:  19.0 95.0


In [6]:
# income_type, employment_type, houseown_type, desired_amount, purpose nan인 데이터 삭제
user_spec = user_spec[~user_spec.income_type.isna()]

In [7]:
# amt와 cnt가 모두 nan이면 0으로 채운다.
amt_idx = np.where(user_spec.columns == 'existing_loan_amt')[0][0]
cnt_idx = np.where(user_spec.columns == 'existing_loan_cnt')[0][0]
nan_idx = np.where((user_spec.existing_loan_cnt.isna()) & (user_spec.existing_loan_amt.isna()))[0]
user_spec.iloc[nan_idx,amt_idx] = 0
user_spec.iloc[nan_idx,cnt_idx] = 0

In [8]:
# 개인회생 0, 1 조합 삭제
user_spec = user_spec[~((user_spec.personal_rehabilitation_yn == 0)&(user_spec.personal_rehabilitation_complete_yn == 1))]

In [9]:
# 개인회생 nan, nan인 데이터 0으로 처리
yn_idx = np.where(user_spec.columns == 'personal_rehabilitation_yn')[0][0]
cyn_idx = np.where(user_spec.columns == 'personal_rehabilitation_complete_yn')[0][0]
nan_idx = np.where(user_spec.personal_rehabilitation_yn.isna())[0]
user_spec.iloc[nan_idx,yn_idx] = 0
user_spec.iloc[nan_idx,cyn_idx] = 0 

In [10]:
# 개인회생 0, nan인 데이터 0으로 처리
cyn_idx = np.where(user_spec.columns == 'personal_rehabilitation_complete_yn')[0][0]
nan_idx = np.where(user_spec.personal_rehabilitation_complete_yn.isna())[0]
user_spec.iloc[nan_idx,cyn_idx] = 0 

In [11]:
# yearly_income 결측치 데이터 목록
user_spec[user_spec.yearly_income.isna()]

Unnamed: 0,application_id,user_id,gender,insert_time,credit_score,yearly_income,income_type,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age
165422,1026915,861363,1.0,2022-06-30 21:11:51,,,OTHERINCOME,기타,전월세,5000000.0,생활비,0.0,0.0,0.0,0.0,,27.0
233316,2045613,702899,1.0,2022-06-30 18:37:04,520.0,,FREELANCER,기타,전월세,5000000.0,전월세보증금,0.0,0.0,1.0,1000000.0,29.0,27.0
597591,391560,329226,0.0,2022-06-30 19:27:28,590.0,,OTHERINCOME,기타,전월세,20000000.0,생활비,0.0,0.0,9.0,27000000.0,,26.0
597613,341149,670502,1.0,2022-06-30 21:32:18,930.0,,OTHERINCOME,기타,기타가족소유,6000000.0,사업자금,0.0,0.0,1.0,3000000.0,,41.0
971960,1330905,771592,1.0,2022-06-30 18:57:05,750.0,,OTHERINCOME,기타,자가,3000000.0,생활비,0.0,0.0,1.0,3000000.0,,19.0


In [12]:
# 같은 user_id 정보가 없는 데이터 (1개) 삭제
user_spec = user_spec[~(user_spec.user_id == 861363)]

In [13]:
# 같은 user_id 정보가 있는 데이터 mean()으로 대체
col_idx = np.where(user_spec.columns == 'yearly_income')[0][0]
# 1
row_idx = np.where(user_spec.application_id == 2045613)[0]
user_spec.iloc[row_idx, col_idx] = user_spec[user_spec.user_id == 702899].yearly_income.mean()
# 2
row_idx = np.where(user_spec.application_id == 391560)[0]
user_spec.iloc[row_idx, col_idx] = user_spec[user_spec.user_id == 329226].yearly_income.mean()
# 3
row_idx = np.where(user_spec.application_id == 341149)[0]
user_spec.iloc[row_idx, col_idx] = user_spec[user_spec.user_id == 670502].yearly_income.mean()
# 4
row_idx = np.where(user_spec.application_id == 1330905)[0]
user_spec.iloc[row_idx, col_idx] = user_spec[user_spec.user_id == 771592].yearly_income.mean()

In [14]:
user_spec.isna().sum()

application_id                              0
user_id                                     0
gender                                  12959
insert_time                                 0
credit_score                           105014
yearly_income                               0
income_type                                 0
employment_type                             0
houseown_type                               0
desired_amount                              0
purpose                                     0
personal_rehabilitation_yn                  0
personal_rehabilitation_complete_yn         0
existing_loan_cnt                           0
existing_loan_amt                      115148
employment_period                      171578
age                                     12959
dtype: int64

- LIVING -> 생활비
- ETC -> 기타
- HOUSEDEPOSIT -> 전월세보증금
- BUYHOUSE -> 주택구입
- BUYCAR -> 자동차구입
- INVEST -> 투자
- BUSINESS -> 사업자금
- SWITCHLOAN -> 대환대출

In [15]:
# purpose 영어, 한글 통일
user_spec.purpose.unique()

array(['기타', '대환대출', '생활비', '사업자금', '주택구입', '전월세보증금', '투자', 'LIVING',
       'SWITCHLOAN', 'ETC', 'INVEST', '자동차구입', 'BUSINESS', 'BUYCAR',
       'HOUSEDEPOSIT', 'BUYHOUSE'], dtype=object)

In [16]:
user_spec.purpose = user_spec.purpose.replace(['LIVING', 'ETC', 'HOUSEDEPOSIT', 'BUYHOUSE', 'BUYCAR', 'INVEST', 'BUSINESS', 'SWITCHLOAN'],
                  ['생활비', '기타', '전월세보증금', '주택구입', '자동차구입', '투자', '사업자금', '대환대출'])
user_spec.purpose.unique()

array(['기타', '대환대출', '생활비', '사업자금', '주택구입', '전월세보증금', '투자', '자동차구입'],
      dtype=object)

In [17]:
# bank_id == 16 인 데이터 삭제
loan_result = loan_result[~(loan_result.bank_id == 16)]

In [18]:
# loan_result랑 merge
user_loan = pd.merge(loan_result, user_spec, on = 'application_id')
print(user_loan.shape)
print(user_loan.isna().sum())
user_loan.head()

(13527229, 23)
application_id                               0
loanapply_insert_time                        0
bank_id                                      0
product_id                                   0
loan_limit                                7382
loan_rate                                 7382
is_applied                             3257239
user_id                                      0
gender                                  128096
insert_time                                  0
credit_score                           1509274
yearly_income                                0
income_type                                  0
employment_type                              0
houseown_type                                0
desired_amount                               0
purpose                                      0
personal_rehabilitation_yn                   0
personal_rehabilitation_complete_yn          0
existing_loan_cnt                            0
existing_loan_amt                      120444

Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied,user_id,gender,insert_time,...,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age
0,1748340,2022-06-07 13:05:41,7,191,42000000.0,13.6,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
1,1748340,2022-06-07 13:05:41,25,169,24000000.0,17.9,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
2,1748340,2022-06-07 13:05:41,2,7,24000000.0,18.5,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
3,1748340,2022-06-07 13:05:41,4,268,29000000.0,10.8,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
4,1748340,2022-06-07 13:05:41,11,118,5000000.0,16.4,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0


In [19]:
# month 변수 생성 (loanapply_insert_time 기준)
user_loan['month'] = user_loan['loanapply_insert_time'].dt.month
user_loan['day'] = user_loan['loanapply_insert_time'].dt.day

In [20]:
print(user_loan.shape)

(13527229, 25)


In [21]:
# get_dummies
cat_features = ['income_type', 'employment_type', 'houseown_type', 'purpose']
user_loan = pd.get_dummies(user_loan, prefix_sep = "_", columns = cat_features, drop_first = True)

#### 결측을 나타내는 변수 추가

In [22]:
user_loan['gen_age_nan'] = user_loan.gender.isna()
user_loan['credit_nan'] = user_loan.credit_score.isna()
user_loan['amt_nan'] = user_loan.existing_loan_amt.isna()
user_loan['period_nan'] = user_loan.employment_period.isna()

#### 전체 데이터 수치형 변수 impute (MICE - extra tree)

- imp_num_all.pkl : 수치형 결측치 변수들을 MICE로 채운 파일입니다.

In [23]:
drop_columns = ['application_id', 'loanapply_insert_time', 'bank_id','product_id',
                 'loan_limit', 'loan_rate', 'is_applied', 'user_id', 'gender',
                 'insert_time','month', 'day', 'gen_age_nan', 'credit_nan', 'amt_nan','period_nan']

all_for_impute = user_loan.drop(columns = drop_columns).drop_duplicates(keep = 'first')
all_for_impute_col_name = all_for_impute.columns

In [24]:
start = time.time()
et = ExtraTreesRegressor(n_estimators=10, random_state=1004)
imp = IterativeImputer(estimator = et, max_iter=10, tol=0.1
                    ,skip_complete = True, random_state = 1004, min_value = 0)
imp.fit(all_for_impute)
end = time.time()
print(datetime.timedelta(seconds = end-start))

0:06:19.062517


In [25]:
start = time.time()
imp_all = imp.transform(user_loan.drop(columns = drop_columns))
end = time.time()
print(datetime.timedelta(seconds = end-start))

0:00:52.005738


In [26]:
imp_all = pd.DataFrame(columns = all_for_impute_col_name, index = user_loan.index, data = imp_all)

In [27]:
imp_all = pd.concat([imp_all, user_loan.loc[:,drop_columns]], axis = 1)
print(imp_all.shape)

(13527229, 43)


In [28]:
imp_all.isna().sum()

credit_score                                 0
yearly_income                                0
desired_amount                               0
personal_rehabilitation_yn                   0
personal_rehabilitation_complete_yn          0
existing_loan_cnt                            0
existing_loan_amt                            0
employment_period                            0
age                                          0
income_type_EARNEDINCOME2                    0
income_type_FREELANCER                       0
income_type_OTHERINCOME                      0
income_type_PRACTITIONER                     0
income_type_PRIVATEBUSINESS                  0
employment_type_기타                           0
employment_type_일용직                          0
employment_type_정규직                          0
houseown_type_배우자                            0
houseown_type_자가                             0
houseown_type_전월세                            0
purpose_대환대출                                 0
purpose_사업자금 

In [29]:
del et
del imp
gc.collect()

45

In [30]:
imp_all.iloc[:, 3:5] = imp_all.iloc[:, 3:5].astype('uint8')
imp_all.iloc[:, 9:27] = imp_all.iloc[:, 9:27].astype('uint8')

#### gender 결측치 처리(1에 속할 확률으로 대체)

In [31]:
unusing_features = ['application_id', 'loanapply_insert_time', 'product_id', 
                    'user_id', 'insert_time','loan_limit', 'loan_rate', 
                    'is_applied','month', 'day', 'bank_id',
                    'gen_age_nan', 'credit_nan', 'amt_nan','period_nan']
tmp_all = imp_all.drop(columns = unusing_features)
print(tmp_all.shape)

(13527229, 28)


In [32]:
tmp_all_X = tmp_all[~tmp_all.gender.isna()].drop(columns = ['gender'])
tmp_all_y = tmp_all[~tmp_all.gender.isna()]['gender']
print(tmp_all_X.shape)
print(tmp_all_y.shape)

(13399133, 27)
(13399133,)


In [33]:
# 모델 훈련 (logistic regression)
start = time.time()
lr = LogisticRegression(class_weight = 'balanced')
lr.fit(tmp_all_X, tmp_all_y)
end = time.time()
print(datetime.timedelta(seconds = end - start))

0:00:47.145342


In [34]:
all_nan_idx = np.where(imp_all.gender.isna())[0]
col_idx = np.where(imp_all.columns == 'gender')[0][0]

In [35]:
unusing_features = ['application_id', 'loanapply_insert_time', 'product_id', 
                    'user_id', 'insert_time','loan_limit', 'loan_rate', 
                    'is_applied','month', 'day', 'bank_id','gender',
                   'gen_age_nan', 'credit_nan', 'amt_nan','period_nan']
imp_gender_prob = lr.predict_proba(imp_all.iloc[all_nan_idx].drop(columns = unusing_features))

In [36]:
imp_all.iloc[all_nan_idx, col_idx] = imp_gender_prob[:,1]

In [37]:
imp_all.isna().sum()

credit_score                                 0
yearly_income                                0
desired_amount                               0
personal_rehabilitation_yn                   0
personal_rehabilitation_complete_yn          0
existing_loan_cnt                            0
existing_loan_amt                            0
employment_period                            0
age                                          0
income_type_EARNEDINCOME2                    0
income_type_FREELANCER                       0
income_type_OTHERINCOME                      0
income_type_PRACTITIONER                     0
income_type_PRIVATEBUSINESS                  0
employment_type_기타                           0
employment_type_일용직                          0
employment_type_정규직                          0
houseown_type_배우자                            0
houseown_type_자가                             0
houseown_type_전월세                            0
purpose_대환대출                                 0
purpose_사업자금 

In [38]:
# age 범주화
def age_to_cat(age):
    if   age < 30: return 2
    elif age < 40: return 3
    elif age < 50: return 4
    elif age < 60: return 5
    elif age < 70: return 6
    elif age < 80: return 7
    else: return 8

In [39]:
imp_all['age_cat'] = imp_all.age.apply(age_to_cat)

In [40]:
# bank_id 전처리

class make_bank_apply_prop():
    
    # fit
    def make_bank_dic(self, X_train, y_train):
        self.bank_dic = {}
        tmp = pd.concat([X_train, y_train], axis = 1)[['bank_id', 'is_applied']].value_counts().reset_index(drop = False)
        bank_id_unq = tmp.bank_id.unique()
        for bid in bank_id_unq:
            tmp_row = tmp[(tmp.bank_id == bid)&(tmp.is_applied == 1)].shape[0]
            if tmp_row == 0:
                count_app = 0
            else:
                count_app = int(tmp[(tmp.bank_id == bid)&(tmp.is_applied == 1)].iloc[:,2])
            count_all = sum(tmp[(tmp.bank_id == bid)].iloc[:,2])
            self.bank_dic[bid] = count_app / count_all
        #return self.bank_dic
    
    # transform using apply function
    def add_bank_prop(self, bank_id):
        # 만약 key가 존재하지 않으면 평균값 반환
        if bank_id in self.bank_dic.keys():
            return self.bank_dic[bank_id]
        else:
            return np.array(list(self.bank_dic.values())).mean()

In [41]:
makebank = make_bank_apply_prop()
makebank.make_bank_dic(imp_all.drop(columns = ['is_applied']), imp_all[['is_applied']]) # imp_all로 fit
imp_all['bank_apply_prop'] = imp_all['bank_id'].apply(makebank.add_bank_prop)

In [42]:
# 모든 결측치 처리한 데이터 저장
imp_all.to_pickle('./tmp_data/imp_all.pkl')

#### loan_rate, limit mean, max, min 변수 추가

In [43]:
imp_all = pd.read_pickle('./tmp_data/imp_all.pkl')

In [44]:
df_agg = imp_all.groupby('application_id').agg({'loan_rate' : ['mean', 'min', 'max'], 
                                                'loan_limit' : ['mean', 'min', 'max']})
df_agg = df_agg.reset_index()

In [45]:
merge_df = pd.merge(imp_all, df_agg, how = 'outer', on = 'application_id')

  merge_df = pd.merge(imp_all, df_agg, how = 'outer', on = 'application_id')
  merge_df = pd.merge(imp_all, df_agg, how = 'outer', on = 'application_id')


In [46]:
del imp_all
del df_agg
gc.collect()

21

In [47]:
merge_df.columns = list(merge_df.columns[:-6]) + \
                  ['loan_rate_mean', 'loan_rate_min', 'loan_rate_max',
                   'loan_limit_mean', 'loan_limit_min', 'loan_limit_max']

## data split (train, valid, test, 6 month)

In [48]:
drop_col = ['loanapply_insert_time',
            'bank_id', 'user_id', 'insert_time', 'day']
merge_df = merge_df.drop(columns= drop_col)
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13527229 entries, 0 to 13527228
Data columns (total 46 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   credit_score                         float64
 1   yearly_income                        float64
 2   desired_amount                       float64
 3   personal_rehabilitation_yn           uint8  
 4   personal_rehabilitation_complete_yn  uint8  
 5   existing_loan_cnt                    float64
 6   existing_loan_amt                    float64
 7   employment_period                    float64
 8   age                                  float64
 9   income_type_EARNEDINCOME2            uint8  
 10  income_type_FREELANCER               uint8  
 11  income_type_OTHERINCOME              uint8  
 12  income_type_PRACTITIONER             uint8  
 13  income_type_PRIVATEBUSINESS          uint8  
 14  employment_type_기타                   uint8  
 15  employment_type_일용직           

In [49]:
# 2 modeling에 쓸 변수인 model1_label 변수 생성 - 최종 모델에는 쓰이지 않음.
model1_label = merge_df[['application_id', 'is_applied']].groupby('application_id').agg({'is_applied':['max']})
model1_label.shape

(968745, 1)

In [50]:
model1_label = model1_label.fillna(0)
model1_label.columns = ['model1_label']
model1_label

Unnamed: 0_level_0,model1_label
application_id,Unnamed: 1_level_1
1,0.0
2,0.0
4,0.0
6,0.0
8,0.0
...,...
2167817,1.0
2167819,0.0
2167820,1.0
2167822,0.0


In [51]:
merge_df = pd.merge(merge_df, model1_label, left_on = 'application_id', right_index = True, how ='outer')
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13527229 entries, 0 to 13527228
Data columns (total 47 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   credit_score                         float64
 1   yearly_income                        float64
 2   desired_amount                       float64
 3   personal_rehabilitation_yn           uint8  
 4   personal_rehabilitation_complete_yn  uint8  
 5   existing_loan_cnt                    float64
 6   existing_loan_amt                    float64
 7   employment_period                    float64
 8   age                                  float64
 9   income_type_EARNEDINCOME2            uint8  
 10  income_type_FREELANCER               uint8  
 11  income_type_OTHERINCOME              uint8  
 12  income_type_PRACTITIONER             uint8  
 13  income_type_PRIVATEBUSINESS          uint8  
 14  employment_type_기타                   uint8  
 15  employment_type_일용직           

In [52]:
# 6월 데이터
final_6month_df = merge_df[merge_df.month == 6]

In [53]:
merge_df = merge_df[merge_df.month != 6]
merge_df.shape

(10269990, 47)

In [54]:
np.random.seed(1004)

num_of_app = len(merge_df.application_id.unique())
print(num_of_app)
split_idx = np.random.choice(num_of_app, size = 100000, replace = False)
test_app = merge_df.application_id.unique()[split_idx[:50000]]
valid_app = merge_df.application_id.unique()[split_idx[50000:]]

print(len(valid_app), len(test_app))

728876
50000 50000


In [55]:
set(valid_app) & set(test_app)

set()

In [56]:
# 색인을 위한 index change
merge_df = merge_df.reset_index()
merge_df = merge_df.set_index(merge_df.application_id)
merge_df

Unnamed: 0_level_0,index,credit_score,yearly_income,desired_amount,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age,...,period_nan,age_cat,bank_apply_prop,loan_rate_mean,loan_rate_min,loan_rate_max,loan_limit_mean,loan_limit_min,loan_limit_max,model1_label
application_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2157865,13301,540.0,32000000.0,10000000.0,0,0,12.0,162000000.0,3081.0,52.0,...,False,5,0.042501,16.500000,16.5,16.5,2.000000e+07,20000000.0,20000000.0,1.0
576643,13302,580.0,72000000.0,20000000.0,0,0,7.0,106000000.0,1012.0,45.0,...,False,4,0.042501,18.250000,16.5,20.0,7.000000e+06,3000000.0,11000000.0,0.0
576643,13303,580.0,72000000.0,20000000.0,0,0,7.0,106000000.0,1012.0,45.0,...,False,4,0.017018,18.250000,16.5,20.0,7.000000e+06,3000000.0,11000000.0,0.0
2136706,13304,740.0,39000000.0,80000000.0,0,0,2.0,58000000.0,3721.0,39.0,...,False,3,0.024636,13.762500,6.2,19.9,2.317500e+07,2000000.0,48000000.0,1.0
2136706,13305,740.0,39000000.0,80000000.0,0,0,2.0,58000000.0,3721.0,39.0,...,False,3,0.021634,13.762500,6.2,19.9,2.317500e+07,2000000.0,48000000.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1369315,13519552,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,...,False,4,0.070538,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0
1369315,13519553,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,...,False,4,0.026148,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0
1369315,13519554,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,...,False,4,0.101006,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0
1369315,13519555,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,...,False,4,0.107743,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0


In [57]:
valid_df = merge_df.loc[valid_app]
valid_df.iloc[:,1]

application_id
537614    701.0
537614    701.0
537614    701.0
537614    701.0
537614    701.0
          ...  
123038    930.0
123038    930.0
123038    930.0
123038    930.0
123038    930.0
Name: credit_score, Length: 707053, dtype: float64

In [58]:
len(valid_df.index.unique())

50000

In [59]:
valid_df.iloc[:,0]

application_id
537614    8657043
537614    8657044
537614    8657045
537614    8657046
537614    8657047
           ...   
123038    3979507
123038    3979508
123038    3979509
123038    3979510
123038    3979511
Name: index, Length: 707053, dtype: int64

In [60]:
# valid
valid_df = merge_df.loc[valid_app]
valid_df = valid_df.reset_index(drop = True)
valid_df = valid_df.set_index(valid_df.iloc[:,0])
valid_df = valid_df.iloc[:,1:]
print(len(set(valid_df.index)))
valid_df

707053


Unnamed: 0_level_0,credit_score,yearly_income,desired_amount,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age,income_type_EARNEDINCOME2,...,period_nan,age_cat,bank_apply_prop,loan_rate_mean,loan_rate_min,loan_rate_max,loan_limit_mean,loan_limit_min,loan_limit_max,model1_label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8657043,701.0,68000000.0,20000000.0,0,0,0.0,0.0,1451.0,27.0,0,...,False,2,0.045518,14.7800,9.2,20.0,9600000.0,2000000.0,28000000.0,1.0
8657044,701.0,68000000.0,20000000.0,0,0,0.0,0.0,1451.0,27.0,0,...,False,2,0.068810,14.7800,9.2,20.0,9600000.0,2000000.0,28000000.0,1.0
8657045,701.0,68000000.0,20000000.0,0,0,0.0,0.0,1451.0,27.0,0,...,False,2,0.221203,14.7800,9.2,20.0,9600000.0,2000000.0,28000000.0,1.0
8657046,701.0,68000000.0,20000000.0,0,0,0.0,0.0,1451.0,27.0,0,...,False,2,0.028592,14.7800,9.2,20.0,9600000.0,2000000.0,28000000.0,1.0
8657047,701.0,68000000.0,20000000.0,0,0,0.0,0.0,1451.0,27.0,0,...,False,2,0.071640,14.7800,9.2,20.0,9600000.0,2000000.0,28000000.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3979507,930.0,30000000.0,150000000.0,0,0,3.0,428000000.0,1130.0,34.0,0,...,False,3,0.045518,11.3875,8.9,15.5,14750000.0,3000000.0,50000000.0,0.0
3979508,930.0,30000000.0,150000000.0,0,0,3.0,428000000.0,1130.0,34.0,0,...,False,3,0.019825,11.3875,8.9,15.5,14750000.0,3000000.0,50000000.0,0.0
3979509,930.0,30000000.0,150000000.0,0,0,3.0,428000000.0,1130.0,34.0,0,...,False,3,0.068810,11.3875,8.9,15.5,14750000.0,3000000.0,50000000.0,0.0
3979510,930.0,30000000.0,150000000.0,0,0,3.0,428000000.0,1130.0,34.0,0,...,False,3,0.016036,11.3875,8.9,15.5,14750000.0,3000000.0,50000000.0,0.0


In [61]:
valid_df.index

Int64Index([ 8657043,  8657044,  8657045,  8657046,  8657047, 10244596,
            10244597, 10244598, 10244599, 10244600,
            ...
            10725794,  3463087,  3979504,  3979505,  3979506,  3979507,
             3979508,  3979509,  3979510,  3979511],
           dtype='int64', name='index', length=707053)

In [62]:
# test
test_df = merge_df.loc[test_app]
test_df = test_df.reset_index(drop = True)
test_df = test_df.set_index(test_df.iloc[:,0])
test_df = test_df.iloc[:,1:]
print(len(set(test_df.index)))
test_df

701847


Unnamed: 0_level_0,credit_score,yearly_income,desired_amount,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age,income_type_EARNEDINCOME2,...,period_nan,age_cat,bank_apply_prop,loan_rate_mean,loan_rate_min,loan_rate_max,loan_limit_mean,loan_limit_min,loan_limit_max,model1_label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1440223,950.0,45000000.0,150000000.0,0,0,0.0,0.0,1122.0,36.0,0,...,False,3,0.020610,10.085000,3.8,16.4,3.866667e+07,3000000.0,80000000.0,0.0
1440224,950.0,45000000.0,150000000.0,0,0,0.0,0.0,1122.0,36.0,0,...,False,3,0.057393,10.085000,3.8,16.4,3.866667e+07,3000000.0,80000000.0,0.0
1440225,950.0,45000000.0,150000000.0,0,0,0.0,0.0,1122.0,36.0,0,...,False,3,0.104059,10.085000,3.8,16.4,3.866667e+07,3000000.0,80000000.0,0.0
1440226,950.0,45000000.0,150000000.0,0,0,0.0,0.0,1122.0,36.0,0,...,False,3,0.021614,10.085000,3.8,16.4,3.866667e+07,3000000.0,80000000.0,0.0
1440227,950.0,45000000.0,150000000.0,0,0,0.0,0.0,1122.0,36.0,0,...,False,3,0.054244,10.085000,3.8,16.4,3.866667e+07,3000000.0,80000000.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10851528,940.0,35000000.0,100000000.0,0,0,2.0,115000000.0,2626.0,41.0,0,...,False,4,0.091395,10.238182,4.7,19.9,3.329091e+07,2000000.0,50000000.0,0.0
10851529,940.0,35000000.0,100000000.0,0,0,2.0,115000000.0,2626.0,41.0,0,...,False,4,0.091395,10.238182,4.7,19.9,3.329091e+07,2000000.0,50000000.0,0.0
10851530,940.0,35000000.0,100000000.0,0,0,2.0,115000000.0,2626.0,41.0,0,...,False,4,0.042242,10.238182,4.7,19.9,3.329091e+07,2000000.0,50000000.0,0.0
10851531,940.0,35000000.0,100000000.0,0,0,2.0,115000000.0,2626.0,41.0,0,...,False,4,0.046390,10.238182,4.7,19.9,3.329091e+07,2000000.0,50000000.0,0.0


In [63]:
# index 돌려놓기
merge_df = merge_df.set_index(merge_df.iloc[:,0])
merge_df = merge_df.iloc[:,1:]
merge_df

Unnamed: 0_level_0,credit_score,yearly_income,desired_amount,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age,income_type_EARNEDINCOME2,...,period_nan,age_cat,bank_apply_prop,loan_rate_mean,loan_rate_min,loan_rate_max,loan_limit_mean,loan_limit_min,loan_limit_max,model1_label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13301,540.0,32000000.0,10000000.0,0,0,12.0,162000000.0,3081.0,52.0,0,...,False,5,0.042501,16.500000,16.5,16.5,2.000000e+07,20000000.0,20000000.0,1.0
13302,580.0,72000000.0,20000000.0,0,0,7.0,106000000.0,1012.0,45.0,0,...,False,4,0.042501,18.250000,16.5,20.0,7.000000e+06,3000000.0,11000000.0,0.0
13303,580.0,72000000.0,20000000.0,0,0,7.0,106000000.0,1012.0,45.0,0,...,False,4,0.017018,18.250000,16.5,20.0,7.000000e+06,3000000.0,11000000.0,0.0
13304,740.0,39000000.0,80000000.0,0,0,2.0,58000000.0,3721.0,39.0,0,...,False,3,0.024636,13.762500,6.2,19.9,2.317500e+07,2000000.0,48000000.0,1.0
13305,740.0,39000000.0,80000000.0,0,0,2.0,58000000.0,3721.0,39.0,0,...,False,3,0.021634,13.762500,6.2,19.9,2.317500e+07,2000000.0,48000000.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13519552,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,1,...,False,4,0.070538,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0
13519553,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,1,...,False,4,0.026148,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0
13519554,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,1,...,False,4,0.101006,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0
13519555,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,1,...,False,4,0.107743,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0


In [64]:
# train_data 생성
train_idx_set = set(merge_df.index) - set(valid_df.index)
train_idx = train_idx_set - set(test_df.index)
train_df = merge_df.loc[list(train_idx),:]
train_df

Unnamed: 0_level_0,credit_score,yearly_income,desired_amount,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age,income_type_EARNEDINCOME2,...,period_nan,age_cat,bank_apply_prop,loan_rate_mean,loan_rate_min,loan_rate_max,loan_limit_mean,loan_limit_min,loan_limit_max,model1_label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13301,540.0,32000000.0,10000000.0,0,0,12.0,162000000.0,3081.0,52.0,0,...,False,5,0.042501,16.500000,16.5,16.5,2.000000e+07,20000000.0,20000000.0,1.0
13304,740.0,39000000.0,80000000.0,0,0,2.0,58000000.0,3721.0,39.0,0,...,False,3,0.024636,13.762500,6.2,19.9,2.317500e+07,2000000.0,48000000.0,1.0
13305,740.0,39000000.0,80000000.0,0,0,2.0,58000000.0,3721.0,39.0,0,...,False,3,0.021634,13.762500,6.2,19.9,2.317500e+07,2000000.0,48000000.0,1.0
13306,740.0,39000000.0,80000000.0,0,0,2.0,58000000.0,3721.0,39.0,0,...,False,3,0.027749,13.762500,6.2,19.9,2.317500e+07,2000000.0,48000000.0,1.0
13307,740.0,39000000.0,80000000.0,0,0,2.0,58000000.0,3721.0,39.0,0,...,False,3,0.021614,13.762500,6.2,19.9,2.317500e+07,2000000.0,48000000.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13519552,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,1,...,False,4,0.070538,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0
13519553,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,1,...,False,4,0.026148,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0
13519554,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,1,...,False,4,0.101006,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0
13519555,600.0,35000000.0,6000000.0,0,0,3.0,27000000.0,257.0,45.0,1,...,False,4,0.107743,12.378571,6.5,18.4,2.035714e+07,9000000.0,57000000.0,1.0


In [65]:
print(merge_df.shape)
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)
print(final_6month_df.shape)

(10269990, 47)
(8861090, 47)
(707053, 47)
(701847, 47)
(3257239, 47)


In [66]:
8861090 + 707053 + 701847

10269990

In [67]:
merge_df.to_pickle('./data_split/merge_df.pkl')
train_df.to_pickle('./data_split/train_df.pkl')
valid_df.to_pickle('./data_split/valid_df.pkl')
test_df.to_pickle('./data_split/test_df.pkl')
final_6month_df.to_pickle('./data_split/final_6month_df.pkl')

## modeling

In [68]:
train_df = pd.read_pickle('./data_split/train_df.pkl')
valid_df = pd.read_pickle('./data_split/valid_df.pkl')
test_df = pd.read_pickle('./data_split/test_df.pkl')
month6_df = pd.read_pickle('./data_split/final_6month_df.pkl')
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)
print(month6_df.shape)

(8861090, 47)
(707053, 47)
(701847, 47)
(3257239, 47)


In [69]:
train_df = train_df[~train_df.loan_limit.isna()]
valid_df = valid_df[~valid_df.loan_limit.isna()]
test_df = test_df[~test_df.loan_limit.isna()]
month6_df = month6_df[~month6_df.loan_limit.isna()]

In [70]:
drop_col = ['model1_label',
            'month', 'age', 'application_id','product_id']
train_df = train_df.drop(columns = drop_col)
valid_df = valid_df.drop(columns = drop_col)
test_df = test_df.drop(columns = drop_col)
month6_df = month6_df.drop(columns = drop_col)
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)
print(month6_df.shape)

(8856243, 42)
(706663, 42)
(701459, 42)
(3255482, 42)


In [71]:
new_train_df = pd.concat([train_df, valid_df, test_df])

In [72]:
train_X = new_train_df.drop(columns = ['is_applied'])
train_y = new_train_df['is_applied']
month6_X = month6_df.drop(columns = ['is_applied'])
month6_y = month6_df['is_applied']

In [73]:
n_features = train_X.shape[1]
print(n_features)

41


In [74]:
train_X.columns

Index(['credit_score', 'yearly_income', 'desired_amount',
       'personal_rehabilitation_yn', 'personal_rehabilitation_complete_yn',
       'existing_loan_cnt', 'existing_loan_amt', 'employment_period',
       'income_type_EARNEDINCOME2', 'income_type_FREELANCER',
       'income_type_OTHERINCOME', 'income_type_PRACTITIONER',
       'income_type_PRIVATEBUSINESS', 'employment_type_기타',
       'employment_type_일용직', 'employment_type_정규직', 'houseown_type_배우자',
       'houseown_type_자가', 'houseown_type_전월세', 'purpose_대환대출', 'purpose_사업자금',
       'purpose_생활비', 'purpose_자동차구입', 'purpose_전월세보증금', 'purpose_주택구입',
       'purpose_투자', 'loan_limit', 'loan_rate', 'gender', 'gen_age_nan',
       'credit_nan', 'amt_nan', 'period_nan', 'age_cat', 'bank_apply_prop',
       'loan_rate_mean', 'loan_rate_min', 'loan_rate_max', 'loan_limit_mean',
       'loan_limit_min', 'loan_limit_max'],
      dtype='object')

In [76]:
# full data에 대해서 성능 평가 (기준 모델 : randomforest)
start = time.time()
rf = RandomForestClassifier(n_estimators = 200, 
                            min_samples_split = 30, 
#                             max_features = int(np.ceil(n_features/3)),
                            class_weight = "balanced" , n_jobs=4, 
                            random_state = 1004)
rf.fit(train_X, train_y)
end = time.time()
print(datetime.timedelta(seconds = end-start))

1:00:41.933322


In [77]:
pred_train = rf.predict(train_X)
pred_month6 = rf.predict(month6_X)

In [78]:
print("훈련데이터 accuracy : ", accuracy_score(train_y, pred_train))


훈련데이터 accuracy :  0.9555062587895111


In [79]:
print("훈련데이터 f1 score: ", f1_score(train_y, pred_train))


훈련데이터 f1 score:  0.7037070676925872


In [80]:
print("<훈련데이터 confusion matrix>")
print(confusion_matrix(train_y, pred_train))

<훈련데이터 confusion matrix>
[[9265325  444334]
 [  12366  542340]]


In [81]:
final_pred_train = pd.Series(pred_train, index = train_X.index)
final_pred_month6 = pd.Series(pred_month6, index = month6_X.index)

In [82]:
# 중간 값 저장
pd.Series(final_pred_train, index = train_X.index).to_csv('./tmp_data/final_pred_train.csv', index = True)
pd.Series(final_pred_month6, index = month6_X.index).to_csv('./tmp_data/final_pred_month6.csv', index = True)

In [176]:
train_df = pd.read_pickle('./data_split/train_df.pkl')
valid_df = pd.read_pickle('./data_split/valid_df.pkl')
test_df = pd.read_pickle('./data_split/test_df.pkl')
month6_df = pd.read_pickle('./data_split/final_6month_df.pkl')
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)
print(month6_df.shape)

(8861090, 47)
(707053, 47)
(701847, 47)
(3257239, 47)


In [177]:
final_pred_train = pd.read_csv('./tmp_data/final_pred_train.csv')
final_pred_train = final_pred_train.set_index(final_pred_train.iloc[:,0])
final_pred_train = final_pred_train.iloc[:,1:]

final_pred_month6 = pd.read_csv('./tmp_data/final_pred_month6.csv')
final_pred_month6 = final_pred_month6.set_index(final_pred_month6.iloc[:,0])
final_pred_month6 = final_pred_month6.iloc[:,1:]

In [178]:
final_pred_train.columns = ['final_pred']
final_pred_month6.columns = ['final_pred']
print(final_pred_train.shape)
print(final_pred_month6.shape)

(10264365, 1)
(3255482, 1)


In [179]:
new_train_df = pd.concat([train_df, valid_df, test_df])
new_train_df.shape

(10269990, 47)

In [180]:
new_train_df = pd.merge(new_train_df, final_pred_train, left_index = True, right_index = True, how='left')
month6_df = pd.merge(month6_df, final_pred_month6, left_index = True, right_index = True, how = 'left')
new_train_df['final_pred'] = new_train_df.final_pred.fillna(0)
month6_df['final_pred'] = month6_df.final_pred.fillna(0)
month6_df.head()

Unnamed: 0,credit_score,yearly_income,desired_amount,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age,income_type_EARNEDINCOME2,...,age_cat,bank_apply_prop,loan_rate_mean,loan_rate_min,loan_rate_max,loan_limit_mean,loan_limit_min,loan_limit_max,model1_label,final_pred
0,620.0,24000000.0,25000000.0,0,0,2.0,15000000.0,126.0,26.0,0,...,2,0.103032,14.517143,8.1,19.7,21171430.0,1000000.0,50000000.0,0.0,0.0
1,620.0,24000000.0,25000000.0,0,0,2.0,15000000.0,126.0,26.0,0,...,2,0.021634,14.517143,8.1,19.7,21171430.0,1000000.0,50000000.0,0.0,0.0
2,620.0,24000000.0,25000000.0,0,0,2.0,15000000.0,126.0,26.0,0,...,2,0.098055,14.517143,8.1,19.7,21171430.0,1000000.0,50000000.0,0.0,0.0
3,620.0,24000000.0,25000000.0,0,0,2.0,15000000.0,126.0,26.0,0,...,2,0.199582,14.517143,8.1,19.7,21171430.0,1000000.0,50000000.0,0.0,1.0
4,620.0,24000000.0,25000000.0,0,0,2.0,15000000.0,126.0,26.0,0,...,2,0.017018,14.517143,8.1,19.7,21171430.0,1000000.0,50000000.0,0.0,0.0


In [181]:
new_train_df.loc[train_df[train_df.loan_limit.isna()].index, 'final_pred'] = 1
new_train_df.loc[valid_df[valid_df.loan_limit.isna()].index, 'final_pred'] = 1
new_train_df.loc[test_df[test_df.loan_limit.isna()].index, 'final_pred'] = 1

month6_df.loc[month6_df[month6_df.loan_limit.isna()].index, 'final_pred'] = 1

In [182]:
print("훈련데이터 accuracy : ", accuracy_score(new_train_df['is_applied'], new_train_df['final_pred']))

훈련데이터 accuracy :  0.9555306285595215


In [183]:
print("훈련데이터 f1 score: ", f1_score(new_train_df['is_applied'], new_train_df['final_pred']))

훈련데이터 f1 score:  0.7058539381565473


In [184]:
print("<훈련데이터 confusion matrix>")
print(confusion_matrix(new_train_df['is_applied'], new_train_df['final_pred']))

<훈련데이터 confusion matrix>
[[9265325  444334]
 [  12366  547965]]


## 최종 제출 파일 생성

In [185]:
final_pred_csv = pd.read_csv('./raw_data/데이터분석분야_퓨처스부문_평가데이터.csv')
#final_pred_csv = final_pred_csv[['application_id', 'product_id']]
print(final_pred_csv.shape)

(3255194, 3)


In [186]:
month6_df = month6_df[['application_id', 'product_id', 'final_pred']]
print(month6_df.shape)

(3257239, 3)


In [187]:
tmp_df = month6_df.drop_duplicates(subset = ['application_id', 'product_id'],keep = 'first')
tmp_df.shape

(3257034, 3)

In [188]:
final_pred = pd.merge(final_pred_csv[['application_id','product_id']], tmp_df, on = ['application_id', 'product_id'], how = 'left')
final_pred.columns = ['application_id', 'product_id', 'final_pred']
final_pred.shape

(3255194, 3)

In [189]:
final_pred.to_csv('./평가데이터/nan알아요_평가데이터.csv', index = False)