In [1]:
## required packages
import os

import numpy as np
import pandas as pd

In [2]:
# Load the raw user-spec and loan-result tables from the working directory.
user_spec = pd.read_csv('./user_spec.csv')
loan_result = pd.read_csv('./loan_result.csv')

In [3]:
# Show (rows, columns) of each raw table.
print(user_spec.shape)
print(loan_result.shape)

(1394216, 17)
(13527363, 7)


In [4]:
# Out of the full data, the prediction set should contain 3257239 rows.
# Count missing values per column of loan_result.
loan_result.isna().sum()

application_id                 0
loanapply_insert_time          0
bank_id                        0
product_id                     0
loan_limit                  7495
loan_rate                   7495
is_applied               3257239
dtype: int64

In [5]:
# Parse loanapply_insert_time as datetime; later cells filter on its .dt.month.
loan_result['loanapply_insert_time'] = pd.to_datetime(loan_result.loanapply_insert_time)

In [6]:
## Convert insert_time: object -> datetime
user_spec['insert_time'] = pd.to_datetime(user_spec['insert_time'])

In [7]:
## company_enter_month (date of joining the company) -> employment_period (tenure, in days)
def cem_var_preproc(x):
    """Normalise a raw company_enter_month value to a 'YYYYMM01' string.

    Parameters
    ----------
    x : scalar
        Raw cell value. Only the length of ``str(x)`` is inspected:
        8 or 10 characters are accepted (presumably float-formatted values
        such as 201909.0 and 20190915.0 -- TODO confirm against the data).

    Returns
    -------
    str, float or object
        ``str(x)[:6] + '01'`` for 8- and 10-character inputs, ``np.nan`` for
        missing values, and ``x`` unchanged (after printing it) otherwise.
    """
    text = str(x)
    # Both accepted layouts start with YYYYMM; pin the day to the 1st.
    if len(text) in (8, 10):
        return text[:6] + '01'
    # Treat every missing marker alike (NaN, None, pd.NA, NaT, or the text 'nan')
    # instead of letting some of them fall through to the debug branch below.
    if text == 'nan' or (pd.api.types.is_scalar(x) and pd.isna(x)):
        return np.nan
    # Unexpected layout: surface the value for inspection and pass it through.
    print(x)
    return x

# Normalise the raw join-month values, then parse them into timestamps.
enter_month_clean = user_spec['company_enter_month'].apply(cem_var_preproc)
user_spec['company_enter_month'] = pd.to_datetime(enter_month_clean)

# employment_period: whole days between company_enter_month and insert_time.
elapsed = user_spec['insert_time'] - user_spec['company_enter_month']
user_spec['employment_period'] = elapsed.dt.days

In [8]:
## Create the age variable: (year of insert_time) - birth_year
user_spec['age'] = user_spec['insert_time'].dt.year - user_spec['birth_year']

In [9]:
## Keep only rows with age >= 19 (prints the row count before and after).
# NOTE: rows whose age is NaN also fail the comparison and are dropped here.
print(len(user_spec))
user_spec = user_spec.loc[user_spec['age'].ge(19)]
print(len(user_spec))

1394216
1381141


In [10]:
# user_loan: loan_result joined with user_spec on application_id (inner join,
# so loan rows without a matching user_spec row are left out).
user_loan = loan_result.merge(user_spec, how = 'inner', on = 'application_id')
print(len(user_loan))
user_loan.head()

13399154


Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied,user_id,birth_year,gender,...,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age
0,1748340,2022-06-07 13:05:41,7,191,42000000.0,13.6,,430982,1996.0,1.0,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
1,1748340,2022-06-07 13:05:41,25,169,24000000.0,17.9,,430982,1996.0,1.0,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
2,1748340,2022-06-07 13:05:41,2,7,24000000.0,18.5,,430982,1996.0,1.0,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
3,1748340,2022-06-07 13:05:41,4,268,29000000.0,10.8,,430982,1996.0,1.0,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
4,1748340,2022-06-07 13:05:41,11,118,5000000.0,16.4,,430982,1996.0,1.0,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0


In [11]:
# Drop columns no longer needed: age and employment_period were derived from them.
user_loan = user_loan.drop(['company_enter_month', 'birth_year'], axis = 1)

In [12]:
# Print the index/column/dtype summary of the merged table.
user_loan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13399154 entries, 0 to 13399153
Data columns (total 23 columns):
 #   Column                               Dtype         
---  ------                               -----         
 0   application_id                       int64         
 1   loanapply_insert_time                datetime64[ns]
 2   bank_id                              int64         
 3   product_id                           int64         
 4   loan_limit                           float64       
 5   loan_rate                            float64       
 6   is_applied                           float64       
 7   user_id                              int64         
 8   gender                               float64       
 9   insert_time                          datetime64[ns]
 10  credit_score                         float64       
 11  yearly_income                        float64       
 12  income_type                          object        
 13  employment_type          

In [13]:
# June rows (by loanapply_insert_time) form predict_final_set: the data that
# has to be predicted in the end.
is_june = user_loan['loanapply_insert_time'].dt.month.eq(6)
predict_final_set = user_loan.loc[is_june]
print(predict_final_set.shape)

(3220769, 23)


In [14]:
# March, April and May rows; validation (20000) and test (10000) rows are
# randomly sampled from them in a 30% / 30% / 40% ratio.
apply_month = user_loan['loanapply_insert_time'].dt.month
tmp3 = user_loan[apply_month == 3]
tmp4 = user_loan[apply_month == 4]
tmp5 = user_loan[apply_month == 5]
print(len(user_loan))
# Sum of the four monthly subsets, printed for comparison with the total above.
print(len(tmp3) + len(tmp4) + len(tmp5) + len(predict_final_set))

13399154
13399154


In [15]:
# Fix the seed, then draw row positions without replacement for each month:
# 9000 from March, 9000 from April, 12000 from May (draw order matters for
# reproducibility).
np.random.seed(1004)
idx3 = np.random.choice(len(tmp3), 9000, replace = False)
idx4 = np.random.choice(len(tmp4), 9000, replace = False)
idx5 = np.random.choice(len(tmp5), 12000, replace = False)
for sampled in (idx3, idx4, idx5):
    print(sampled[:10])

[1124548 1250633 1601556 1893004 2274618  626917  871745  292564 2386945
 2240512]
[1791054 1337041 3058522 1548403 1427558 1013888 1358240  264653 3352146
 2083692]
[1088738 2848893  981890 3220751 1747173  622087  550474 2667202  841994
 2435964]


In [16]:
# Row positions that were NOT drawn for validation/test, i.e. the training rows
# of each month. np.setdiff1d returns them as a sorted array, so the result
# does not depend on set iteration order and no multi-million-element Python
# sets have to be built.
not_idx3 = np.setdiff1d(np.arange(tmp3.shape[0]), idx3)
not_idx4 = np.setdiff1d(np.arange(tmp4.shape[0]), idx4)
not_idx5 = np.setdiff1d(np.arange(tmp5.shape[0]), idx5)
# Month size next to the number of remaining (training) positions.
print(len(tmp3) , "  " , len(not_idx3))
print(len(tmp4) , "  " , len(not_idx4))
print(len(tmp5) , "  " , len(not_idx5))
print(tmp3.shape[0] + tmp4.shape[0] + tmp5.shape[0])

3231594    3222594
3396647    3387647
3550144    3538144
10178385


In [17]:
# Training data: every position that was not sampled.
train_tmp3 = tmp3.iloc[not_idx3]
train_tmp4 = tmp4.iloc[not_idx4]
train_tmp5 = tmp5.iloc[not_idx5]

# Validation data (first 6000 / 6000 / 8000 sampled positions) and
# test data (the remaining sampled positions) for each month.
val_tmp3, test_tmp3 = tmp3.iloc[idx3[:6000]], tmp3.iloc[idx3[6000:]]
val_tmp4, test_tmp4 = tmp4.iloc[idx4[:6000]], tmp4.iloc[idx4[6000:]]
val_tmp5, test_tmp5 = tmp5.iloc[idx5[:8000]], tmp5.iloc[idx5[8000:]]

In [18]:
# Stack the March/April/May pieces of each split into one frame per split.
user_loan_train = pd.concat((train_tmp3, train_tmp4, train_tmp5))
user_loan_val = pd.concat((val_tmp3, val_tmp4, val_tmp5))
user_loan_test = pd.concat((test_tmp3, test_tmp4, test_tmp5))
print(user_loan_train.shape, user_loan_val.shape, user_loan_test.shape, sep = '\n')
# Total number of rows across the three splits.
print(len(user_loan_train) + len(user_loan_val) + len(user_loan_test))

(10148385, 23)
(20000, 23)
(10000, 23)
10178385


In [19]:
# Missing values per column in the training split.
user_loan_train.isna().sum()

application_id                               0
loanapply_insert_time                        0
bank_id                                      0
product_id                                   0
loan_limit                                5568
loan_rate                                 5568
is_applied                                   0
user_id                                      0
gender                                       0
insert_time                                  0
credit_score                           1226263
yearly_income                                0
income_type                                  0
employment_type                              0
houseown_type                                0
desired_amount                               0
purpose                                      0
personal_rehabilitation_yn             5804720
personal_rehabilitation_complete_yn    9128584
existing_loan_cnt                      2094997
existing_loan_amt                      2985508
employment_pe

In [20]:
# Missing values per column in the validation split.
user_loan_val.isna().sum()

application_id                             0
loanapply_insert_time                      0
bank_id                                    0
product_id                                 0
loan_limit                                11
loan_rate                                 11
is_applied                                 0
user_id                                    0
gender                                     0
insert_time                                0
credit_score                            2418
yearly_income                              0
income_type                                0
employment_type                            0
houseown_type                              0
desired_amount                             0
purpose                                    0
personal_rehabilitation_yn             10653
personal_rehabilitation_complete_yn    17889
existing_loan_cnt                       4013
existing_loan_amt                       5744
employment_period                        572
age       

In [21]:
# Missing values per column in the test split.
user_loan_test.isna().sum()

application_id                            0
loanapply_insert_time                     0
bank_id                                   0
product_id                                0
loan_limit                                5
loan_rate                                 5
is_applied                                0
user_id                                   0
gender                                    0
insert_time                               0
credit_score                           1260
yearly_income                             0
income_type                               0
employment_type                           0
houseown_type                             0
desired_amount                            0
purpose                                   0
personal_rehabilitation_yn             5346
personal_rehabilitation_complete_yn    8891
existing_loan_cnt                      2106
existing_loan_amt                      2962
employment_period                       279
age                             

In [22]:
# Missing values per column in the June prediction set.
predict_final_set.isna().sum()

application_id                               0
loanapply_insert_time                        0
bank_id                                      0
product_id                                   0
loan_limit                                1740
loan_rate                                 1740
is_applied                             3220769
user_id                                      0
gender                                       0
insert_time                                  0
credit_score                            261097
yearly_income                                6
income_type                                  0
employment_type                              0
houseown_type                                0
desired_amount                               0
purpose                                      0
personal_rehabilitation_yn               15453
personal_rehabilitation_complete_yn    2534180
existing_loan_cnt                       526027
existing_loan_amt                       827349
employment_pe

In [32]:
## Save the splits as CSV
# Create the output directory first; to_csv fails if it does not exist.
os.makedirs('./data', exist_ok = True)
user_loan_train.to_csv('./data/user_loan_train.csv', index = False)
user_loan_val.to_csv('./data/user_loan_val.csv', index = False)
user_loan_test.to_csv('./data/user_loan_test.csv', index = False)
predict_final_set.to_csv('./data/predict_final_set.csv', index = False)

- 확인용 코드

In [33]:
## Check: read the four saved CSV files back in
train, val, test, predict = map(pd.read_csv, (
    './data/user_loan_train.csv',
    './data/user_loan_val.csv',
    './data/user_loan_test.csv',
    './data/predict_final_set.csv',
))

In [34]:
# Reloaded training file: shape, missing values per column, first rows.
print(train.shape)
print(train.isna().sum())
train.head()

(10148385, 23)
application_id                               0
loanapply_insert_time                        0
bank_id                                      0
product_id                                   0
loan_limit                                5568
loan_rate                                 5568
is_applied                                   0
user_id                                      0
gender                                       0
insert_time                                  0
credit_score                           1226263
yearly_income                                0
income_type                                  0
employment_type                              0
houseown_type                                0
desired_amount                               0
purpose                                      0
personal_rehabilitation_yn             5804720
personal_rehabilitation_complete_yn    9128584
existing_loan_cnt                      2094997
existing_loan_amt                      298550

Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied,user_id,gender,insert_time,...,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age
0,1754832,2022-03-09 17:18:10,47,138,40000000.0,14.8,0.0,848651,1.0,2022-03-09 17:18:02,...,정규직,전월세,20000000.0,대환대출,,,4.0,54000000.0,890.0,33.0
1,1754832,2022-03-09 17:18:02,50,142,44000000.0,15.9,0.0,848651,1.0,2022-03-09 17:18:02,...,정규직,전월세,20000000.0,대환대출,,,4.0,54000000.0,890.0,33.0
2,1754832,2022-03-09 17:18:03,1,61,6000000.0,13.9,0.0,848651,1.0,2022-03-09 17:18:02,...,정규직,전월세,20000000.0,대환대출,,,4.0,54000000.0,890.0,33.0
3,1754832,2022-03-09 17:18:03,28,217,2000000.0,19.4,0.0,848651,1.0,2022-03-09 17:18:02,...,정규직,전월세,20000000.0,대환대출,,,4.0,54000000.0,890.0,33.0
4,1754832,2022-03-09 17:18:11,62,200,20000000.0,12.8,0.0,848651,1.0,2022-03-09 17:18:02,...,정규직,전월세,20000000.0,대환대출,,,4.0,54000000.0,890.0,33.0


In [35]:
# Reloaded validation file: shape, missing values per column, first rows.
print(val.shape)
print(val.isna().sum())
val.head()

(20000, 23)
application_id                             0
loanapply_insert_time                      0
bank_id                                    0
product_id                                 0
loan_limit                                11
loan_rate                                 11
is_applied                                 0
user_id                                    0
gender                                     0
insert_time                                0
credit_score                            2418
yearly_income                              0
income_type                                0
employment_type                            0
houseown_type                              0
desired_amount                             0
purpose                                    0
personal_rehabilitation_yn             10653
personal_rehabilitation_complete_yn    17889
existing_loan_cnt                       4013
existing_loan_amt                       5744
employment_period                        57

Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied,user_id,gender,insert_time,...,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age
0,695412,2022-03-31 06:16:21,10,149,70000000.0,12.4,0.0,522477,1.0,2022-03-31 06:16:20,...,정규직,전월세,100000000.0,사업자금,,,2.0,21000000.0,607.0,42.0
1,1112076,2022-03-05 08:52:14,6,36,31000000.0,15.4,0.0,126746,1.0,2022-03-05 08:52:13,...,정규직,전월세,5000000.0,생활비,,,,,247.0,26.0
2,922570,2022-03-19 15:43:24,54,235,11000000.0,17.5,0.0,274771,1.0,2022-03-19 15:43:22,...,계약직,전월세,10000000.0,생활비,,,5.0,109000000.0,199.0,35.0
3,990846,2022-03-07 16:30:52,34,139,15000000.0,18.9,0.0,99223,0.0,2022-03-07 16:30:28,...,정규직,전월세,20000000.0,대환대출,,,4.0,75000000.0,1041.0,39.0
4,1360224,2022-03-25 16:42:02,55,185,28000000.0,9.3,0.0,715752,1.0,2022-03-25 16:42:01,...,정규직,자가,20000000.0,사업자금,,,,,205.0,46.0


In [36]:
# Reloaded test file: shape, missing values per column, first rows.
print(test.shape)
print(test.isna().sum())
test.head()

(10000, 23)
application_id                            0
loanapply_insert_time                     0
bank_id                                   0
product_id                                0
loan_limit                                5
loan_rate                                 5
is_applied                                0
user_id                                   0
gender                                    0
insert_time                               0
credit_score                           1260
yearly_income                             0
income_type                               0
employment_type                           0
houseown_type                             0
desired_amount                            0
purpose                                   0
personal_rehabilitation_yn             5346
personal_rehabilitation_complete_yn    8891
existing_loan_cnt                      2106
existing_loan_amt                      2962
employment_period                       279
age                 

Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied,user_id,gender,insert_time,...,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age
0,968089,2022-03-01 20:33:10,36,103,9000000.0,7.8,0.0,205135,1.0,2022-03-01 20:33:08,...,정규직,기타가족소유,10000000.0,생활비,,,7.0,77000000.0,974.0,50.0
1,1812284,2022-03-30 18:16:54,33,189,9000000.0,7.9,0.0,577796,0.0,2022-03-30 18:16:53,...,계약직,전월세,40000000.0,생활비,,,2.0,90000000.0,57.0,22.0
2,263039,2022-03-29 19:55:12,30,85,10000000.0,17.4,0.0,629323,1.0,2022-03-29 19:55:12,...,정규직,기타가족소유,10000000.0,전월세보증금,,,1.0,,87.0,27.0
3,1857670,2022-03-08 14:29:12,11,170,37000000.0,14.2,0.0,467347,0.0,2022-03-08 14:29:00,...,정규직,기타가족소유,3000000.0,생활비,,,2.0,23000000.0,1042.0,56.0
4,1179337,2022-03-11 09:38:26,30,121,25000000.0,13.4,0.0,250212,1.0,2022-03-11 09:38:25,...,계약직,전월세,5000000.0,생활비,,,1.0,4000000.0,191.0,40.0


In [37]:
# Reloaded prediction file: shape, missing values per column, first rows.
print(predict.shape)
print(predict.isna().sum())
predict.head()

(3220769, 23)
application_id                               0
loanapply_insert_time                        0
bank_id                                      0
product_id                                   0
loan_limit                                1740
loan_rate                                 1740
is_applied                             3220769
user_id                                      0
gender                                       0
insert_time                                  0
credit_score                            261097
yearly_income                                6
income_type                                  0
employment_type                              0
houseown_type                                0
desired_amount                               0
purpose                                      0
personal_rehabilitation_yn               15453
personal_rehabilitation_complete_yn    2534180
existing_loan_cnt                       526027
existing_loan_amt                       827349

Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied,user_id,gender,insert_time,...,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age
0,1748340,2022-06-07 13:05:41,7,191,42000000.0,13.6,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
1,1748340,2022-06-07 13:05:41,25,169,24000000.0,17.9,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
2,1748340,2022-06-07 13:05:41,2,7,24000000.0,18.5,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
3,1748340,2022-06-07 13:05:41,4,268,29000000.0,10.8,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
4,1748340,2022-06-07 13:05:41,11,118,5000000.0,16.4,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0


###  loan_result에는 있지만 (age >= 19 필터링 이후의) user_spec에는 없는 application_id가 존재 (6월 데이터 포함)

application id가 loan_result에는 존재하지만 user_spec에는 존재하지 않는 loan_result의 데이터셋 생성

In [23]:
# application_ids that occur in loan_result but not in user_spec.
missing_ids = set(loan_result.application_id.unique()) - set(user_spec.application_id.unique())
print(len(missing_ids))
no_info_idx = np.array(list(missing_ids))
# loan_result rows that belong to those application_ids.
no_info_data = loan_result[loan_result['application_id'].isin(no_info_idx)]

8706


In [24]:
# Shape and first rows of the loan_result rows without user_spec information.
print(no_info_data.shape)
no_info_data.head()

(128209, 7)


Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied
318,107628,2022-06-07 23:38:26,38,223,3000000.0,7.4,
319,107628,2022-06-07 23:38:27,25,163,3000000.0,11.9,
320,107628,2022-06-07 23:38:27,21,196,3000000.0,8.6,
321,107628,2022-06-07 23:38:26,42,258,3000000.0,9.1,
322,107628,2022-06-07 23:38:26,44,8,3000000.0,9.8,


In [25]:
# Split the unmatched rows into June (to be predicted) and all other months.
no_info_is_june = no_info_data['loanapply_insert_time'].dt.month == 6
predict_final_no_info = no_info_data[no_info_is_june]
no_info_data_345 = no_info_data[~no_info_is_june]
print(predict_final_no_info.shape)
print(no_info_data_345.shape)

(36470, 7)
(91739, 7)


In [26]:
# Missing values per column in the June rows without user_spec information.
predict_final_no_info.isna().sum()

application_id               0
loanapply_insert_time        0
bank_id                      0
product_id                   0
loan_limit                  17
loan_rate                   17
is_applied               36470
dtype: int64

In [27]:
# Number of June rows in the raw loan_result table.
print(loan_result[loan_result.loanapply_insert_time.dt.month == 6].shape[0])

3257239


In [28]:
# Sanity check: June rows without user_spec info plus June rows with it must
# equal all June rows in loan_result. The counts are computed from the frames
# so the check cannot go stale when the data changes.
n_june_total = predict_final_no_info.shape[0] + predict_final_set.shape[0]
assert n_june_total == loan_result.loanapply_insert_time.dt.month.eq(6).sum(), "June row counts do not add up"
n_june_total

3257239

In [29]:
# Create the output directory first; to_csv fails if it does not exist.
os.makedirs('./data', exist_ok = True)
# Save the rows without user_spec information (June / other months) as CSV.
predict_final_no_info.to_csv('./data/predict_final_no_info.csv', index = False)
no_info_data_345.to_csv('./data/no_info_data_345.csv', index = False)

- 확인용 코드

In [30]:
# Read the saved June no-info file back: shape, missing values, first rows.
tmp = pd.read_csv('./data/predict_final_no_info.csv')
print(tmp.shape)
print(tmp.isna().sum())
tmp.head()

(36470, 7)
application_id               0
loanapply_insert_time        0
bank_id                      0
product_id                   0
loan_limit                  17
loan_rate                   17
is_applied               36470
dtype: int64


Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied
0,107628,2022-06-07 23:38:26,38,223,3000000.0,7.4,
1,107628,2022-06-07 23:38:27,25,163,3000000.0,11.9,
2,107628,2022-06-07 23:38:27,21,196,3000000.0,8.6,
3,107628,2022-06-07 23:38:26,42,258,3000000.0,9.1,
4,107628,2022-06-07 23:38:26,44,8,3000000.0,9.8,


In [31]:
# Read the saved non-June no-info file back: shape, missing values, first rows.
tmp = pd.read_csv('./data/no_info_data_345.csv')
print(tmp.shape)
print(tmp.isna().sum())
tmp.head()

(91739, 7)
application_id             0
loanapply_insert_time      0
bank_id                    0
product_id                 0
loan_limit               154
loan_rate                154
is_applied                 0
dtype: int64


Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied
0,436033,2022-05-09 22:54:41,27,148,12000000.0,7.0,0.0
1,436033,2022-05-09 22:55:01,49,136,12000000.0,6.5,0.0
2,436033,2022-05-09 22:54:55,22,164,12000000.0,8.0,0.0
3,436033,2022-05-09 22:54:42,32,257,12000000.0,7.2,0.0
4,436033,2022-05-09 22:54:48,19,127,11000000.0,9.4,0.0


- 최종 데이터 개수 확인

In [38]:
# Shape of the raw loan_result table, for comparison with the outputs below.
print(loan_result.shape)

(13527363, 7)


In [39]:
# Shapes of the six output tables produced by this notebook.
print(user_loan_train.shape)
print(user_loan_val.shape)
print(user_loan_test.shape)
print(predict_final_set.shape)
print(predict_final_no_info.shape)
print(no_info_data_345.shape)

(10148385, 23)
(20000, 23)
(10000, 23)
(3220769, 23)
(36470, 7)
(91739, 7)


In [41]:
# Sanity check: the six output tables together must contain every loan_result
# row. The counts are computed from the frames instead of being copied by hand.
n_rows_total = (
    len(user_loan_train) + len(user_loan_val) + len(user_loan_test)
    + len(predict_final_set) + len(predict_final_no_info) + len(no_info_data_345)
)
assert n_rows_total == len(loan_result), "output tables do not add up to loan_result"
n_rows_total

13527363