In [1]:
import pandas as pd
import numpy as np
import gc
import pickle
import time
import datetime
import copy


from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

In [2]:
# Load the user_spec and loan_result tables, then convert
# loan_result.loanapply_insert_time to a datetime column.
loan_result = pd.read_csv('./loan_result.csv')
user_spec = pd.read_csv('./user_spec.csv')
loan_result['loanapply_insert_time'] = loan_result['loanapply_insert_time'].pipe(pd.to_datetime)

#### gender, age, amt, credit_score, employment_period 변수 제외한 결측치 처리

In [3]:
def cem_var_preproc(x):
    """Normalise one raw company_enter_month value to a 'YYYYMM01' string.

    The value is inspected through its string form:
      * 'nan'              -> returns np.nan
      * 8 or 10 characters -> returns the first six characters plus '01'
        (presumably float-formatted YYYYMM.0 / YYYYMMDD.0 values -- confirm
        against the raw CSV)
      * anything else      -> the value is printed for inspection and
        returned unchanged
    """
    text = str(x)
    if text == 'nan':
        return np.nan
    if len(text) in (8, 10):
        # Keep year + month only and pin the day to the 1st.
        return text[:6] + '01'
    # Unexpected format: surface it instead of silently guessing.
    print(x)
    return x

# Convert both timestamp columns to datetime.
user_spec['company_enter_month'] = (
    user_spec['company_enter_month'].apply(cem_var_preproc).pipe(pd.to_datetime)
)
user_spec['insert_time'] = pd.to_datetime(user_spec['insert_time'])

# Derived features: employment_period is the number of days between the
# company entry date and insert_time; age is the insert year minus birth_year.
user_spec['employment_period'] = (
    user_spec['insert_time'].sub(user_spec['company_enter_month']).dt.days
)
user_spec['age'] = user_spec['insert_time'].dt.year.sub(user_spec['birth_year'])

# The source columns are no longer needed once the features exist.
user_spec = user_spec.drop(['company_enter_month', 'birth_year'], axis=1)

In [4]:
# A negative employment_period is treated as a data-entry error and overwritten.
# NOTE(review): 136 is hard-coded; presumably it is the value taken from the
# same user_id's other applications -- confirm before reusing on new data.
negative_period = user_spec['employment_period'] < 0
user_spec.loc[negative_period, 'employment_period'] = 136

In [5]:
# Drop applicants younger than 19; rows with unknown age are kept and end up
# after the known-age rows in the resulting frame.
print("19세 미만 변수 삭제 전 min / max: ", user_spec.age.min(), user_spec.age.max())
adult_rows = user_spec[user_spec['age'] >= 19]
unknown_age_rows = user_spec[user_spec['age'].isna()]
user_spec = pd.concat([adult_rows, unknown_age_rows])
print("19세 미만 변수 삭제 후min / max: ", user_spec.age.min(), user_spec.age.max())

19세 미만 변수 삭제 전 min / max:  14.0 95.0
19세 미만 변수 삭제 후min / max:  19.0 95.0


In [6]:
# Remove rows whose income_type is missing. Only income_type is tested here;
# the NaN summary printed further down shows employment_type, houseown_type,
# desired_amount and purpose at zero missing values afterwards as well.
user_spec = user_spec[user_spec['income_type'].notna()]

In [7]:
# When existing_loan_cnt and existing_loan_amt are BOTH missing, set both to 0.
# Rows where only one of the two is missing are left untouched.
both_missing = user_spec['existing_loan_cnt'].isna() & user_spec['existing_loan_amt'].isna()
user_spec.loc[both_missing, 'existing_loan_amt'] = 0
user_spec.loc[both_missing, 'existing_loan_cnt'] = 0

In [8]:
# Drop the personal-rehabilitation combination (yn == 0, complete_yn == 1).
inconsistent_rehab = (
    (user_spec['personal_rehabilitation_yn'] == 0)
    & (user_spec['personal_rehabilitation_complete_yn'] == 1)
)
user_spec = user_spec[~inconsistent_rehab]

In [9]:
# Rows with personal_rehabilitation_yn missing: set both rehabilitation flags to 0.
rehab_unknown = user_spec['personal_rehabilitation_yn'].isna()
user_spec.loc[rehab_unknown, 'personal_rehabilitation_yn'] = 0
user_spec.loc[rehab_unknown, 'personal_rehabilitation_complete_yn'] = 0

In [10]:
# Remaining rows with personal_rehabilitation_complete_yn missing: set it to 0.
complete_unknown = user_spec['personal_rehabilitation_complete_yn'].isna()
user_spec.loc[complete_unknown, 'personal_rehabilitation_complete_yn'] = 0

In [11]:
# Show the rows whose yearly_income is missing.
user_spec.loc[user_spec['yearly_income'].isna()]

Unnamed: 0,application_id,user_id,gender,insert_time,credit_score,yearly_income,income_type,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age
165422,1026915,861363,1.0,2022-06-30 21:11:51,,,OTHERINCOME,기타,전월세,5000000.0,생활비,0.0,0.0,0.0,0.0,,27.0
233316,2045613,702899,1.0,2022-06-30 18:37:04,520.0,,FREELANCER,기타,전월세,5000000.0,전월세보증금,0.0,0.0,1.0,1000000.0,29.0,27.0
597591,391560,329226,0.0,2022-06-30 19:27:28,590.0,,OTHERINCOME,기타,전월세,20000000.0,생활비,0.0,0.0,9.0,27000000.0,,26.0
597613,341149,670502,1.0,2022-06-30 21:32:18,930.0,,OTHERINCOME,기타,기타가족소유,6000000.0,사업자금,0.0,0.0,1.0,3000000.0,,41.0
971960,1330905,771592,1.0,2022-06-30 18:57:05,750.0,,OTHERINCOME,기타,자가,3000000.0,생활비,0.0,0.0,1.0,3000000.0,,19.0


In [12]:
# Remove the single row that has no other record with the same user_id to
# borrow a yearly_income from. The filter drops every row of user_id 861363.
user_spec = user_spec[user_spec['user_id'] != 861363]

In [13]:
# For the remaining rows with missing yearly_income, use the mean yearly_income
# of the same user's records (mean() skips the NaN row itself).
# Each pair is (application_id to fix, user_id to average over).
income_fixes = (
    (2045613, 702899),
    (391560, 329226),
    (341149, 670502),
    (1330905, 771592),
)
for app_id, owner_id in income_fixes:
    owner_mean = user_spec.loc[user_spec['user_id'] == owner_id, 'yearly_income'].mean()
    user_spec.loc[user_spec['application_id'] == app_id, 'yearly_income'] = owner_mean

In [14]:
# Per-column count of missing values remaining after the cleaning above.
user_spec.isnull().sum()

application_id                              0
user_id                                     0
gender                                  12959
insert_time                                 0
credit_score                           105014
yearly_income                               0
income_type                                 0
employment_type                             0
houseown_type                               0
desired_amount                              0
purpose                                     0
personal_rehabilitation_yn                  0
personal_rehabilitation_complete_yn         0
existing_loan_cnt                           0
existing_loan_amt                      115148
employment_period                      171578
age                                     12959
dtype: int64

- LIVING -> 생활비
- ETC -> 기타
- HOUSEDEPOSIT -> 전월세보증금
- BUYHOUSE -> 주택구입
- BUYCAR -> 자동차구입
- INVEST -> 투자
- BUSINESS -> 사업자금
- SWITCHLOAN -> 대환대출

In [15]:
# purpose mixes English and Korean labels; list the distinct values first.
user_spec['purpose'].unique()

array(['기타', '대환대출', '생활비', '사업자금', '주택구입', '전월세보증금', '투자', 'LIVING',
       'SWITCHLOAN', 'ETC', 'INVEST', '자동차구입', 'BUSINESS', 'BUYCAR',
       'HOUSEDEPOSIT', 'BUYHOUSE'], dtype=object)

In [16]:
# Map every English purpose label to its Korean equivalent so each category
# has exactly one spelling.
purpose_to_korean = {
    'LIVING': '생활비',
    'ETC': '기타',
    'HOUSEDEPOSIT': '전월세보증금',
    'BUYHOUSE': '주택구입',
    'BUYCAR': '자동차구입',
    'INVEST': '투자',
    'BUSINESS': '사업자금',
    'SWITCHLOAN': '대환대출',
}
user_spec['purpose'] = user_spec['purpose'].replace(purpose_to_korean)
user_spec['purpose'].unique()

array(['기타', '대환대출', '생활비', '사업자금', '주택구입', '전월세보증금', '투자', '자동차구입'],
      dtype=object)

#### user_spec + loan_result table merge and data split
- user_spec과 loan_result table을 application_id 기준으로 합친 뒤, 6월 데이터는 별도로 보관하고, 3월(30%), 4월(30%), 5월(40%) 비율로 층화 추출하여 valid 20,000개와 test 20,000개를 분리한다. 나머지 3~5월 데이터는 train으로 사용한다.

In [17]:
# Exclude every loan_result row that belongs to bank_id 16.
loan_result = loan_result[loan_result['bank_id'] != 16]

In [18]:
# Inner-join loan_result with the cleaned user_spec on application_id, then
# report the shape and per-column missing counts of the result.
user_loan = loan_result.merge(user_spec, on='application_id')
print(user_loan.shape)
print(user_loan.isna().sum())
user_loan.head()

(13527229, 23)
application_id                               0
loanapply_insert_time                        0
bank_id                                      0
product_id                                   0
loan_limit                                7382
loan_rate                                 7382
is_applied                             3257239
user_id                                      0
gender                                  128096
insert_time                                  0
credit_score                           1509274
yearly_income                                0
income_type                                  0
employment_type                              0
houseown_type                                0
desired_amount                               0
purpose                                      0
personal_rehabilitation_yn                   0
personal_rehabilitation_complete_yn          0
existing_loan_cnt                            0
existing_loan_amt                      120444

Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied,user_id,gender,insert_time,...,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,employment_period,age
0,1748340,2022-06-07 13:05:41,7,191,42000000.0,13.6,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
1,1748340,2022-06-07 13:05:41,25,169,24000000.0,17.9,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
2,1748340,2022-06-07 13:05:41,2,7,24000000.0,18.5,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
3,1748340,2022-06-07 13:05:41,4,268,29000000.0,10.8,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0
4,1748340,2022-06-07 13:05:41,11,118,5000000.0,16.4,,430982,1.0,2022-06-07 13:05:39,...,정규직,자가,25000000.0,대환대출,0.0,0.0,2.0,15000000.0,126.0,26.0


In [19]:
# Add month and day columns derived from loanapply_insert_time.
for date_part in ('month', 'day'):
    user_loan[date_part] = getattr(user_loan['loanapply_insert_time'].dt, date_part)

In [20]:
# Hold out the June rows as real_6month (written to real_6month.pkl in a later cell).
real_6month = user_loan.loc[user_loan['month'] == 6]

In [21]:
# Keep only the non-June rows in user_loan.
user_loan = user_loan.loc[~(user_loan['month'] == 6)]

In [22]:
# train, valid(20000), test(20000) split

In [23]:
# Build positional indices for the hold-out sample of each month.
# idxN     : positions (within month N's rows) drawn for valid + test
# not_idxN : every remaining position of month N, used for train
# 12000 / 12000 / 16000 rows are drawn from March / April / May, i.e. the
# 30% / 30% / 40% share of the 40000 valid + test rows.
np.random.seed(1004)
num3 = (user_loan.month == 3).sum()
num4 = (user_loan.month == 4).sum()
num5 = (user_loan.month == 5).sum()
# The order of the three draws is kept so the seeded random stream is unchanged.
idx3 = np.random.choice(num3, size = 12000, replace = False)
idx4 = np.random.choice(num4, size = 12000, replace = False)
idx5 = np.random.choice(num5, size = 16000, replace = False)
# np.setdiff1d returns the sorted complement without materialising Python sets
# of millions of ints, and its ordering is deterministic (set iteration order
# is an implementation detail).
not_idx3 = np.setdiff1d(np.arange(num3), idx3)
not_idx4 = np.setdiff1d(np.arange(num4), idx4)
not_idx5 = np.setdiff1d(np.arange(num5), idx5)

In [24]:
# Split each month into train / valid / test using the positional indices.
# The first half of every idxN goes to valid, the second half to test.
rows_by_month = {m: user_loan[user_loan.month == m] for m in (3, 4, 5)}

train3 = rows_by_month[3].iloc[not_idx3]
valid3 = rows_by_month[3].iloc[idx3[:6000]]
test3 = rows_by_month[3].iloc[idx3[6000:]]

train4 = rows_by_month[4].iloc[not_idx4]
valid4 = rows_by_month[4].iloc[idx4[:6000]]
test4 = rows_by_month[4].iloc[idx4[6000:]]

train5 = rows_by_month[5].iloc[not_idx5]
valid5 = rows_by_month[5].iloc[idx5[:8000]]
test5 = rows_by_month[5].iloc[idx5[8000:]]

# Release the per-month frames right away; only the pieces above are needed.
del rows_by_month

real_train = pd.concat([train3, train4, train5])
real_valid = pd.concat([valid3, valid4, valid5])
real_test = pd.concat([test3, test4, test5])
for split_frame in (real_train, real_valid, real_test):
    print(split_frame.shape)
del split_frame

(10229990, 25)
(20000, 25)
(20000, 25)


In [25]:
# Free the temporary index arrays and per-month pieces, then force a GC pass.
del idx3, idx4, idx5
del not_idx3, not_idx4, not_idx5
del train3, train4, train5
del valid3, valid4, valid5
del test3, test4, test5
gc.collect()

55

In [26]:
# Check that every split has the same number of distinct values per
# categorical column (NaN counts as a value in unique()). Result: they match.
# Printed order: for each column -> train, valid, test, June.
# NOTE(review): income_type is one-hot encoded later but is not checked here.
for cat_col in ('bank_id', 'gender', 'employment_type', 'houseown_type', 'purpose'):
    for split_frame in (real_train, real_valid, real_test, real_6month):
        print(len(split_frame[cat_col].unique()))
# Do not keep the loop variables alive: split_frame would pin a large frame.
del cat_col, split_frame

61
61
61
61
3
3
3
3
4
4
4
4
4
4
4
4
8
8
8
8


In [27]:
# One-hot encode the categorical columns of every split (first level dropped).
# Each split is encoded independently and rebound immediately so the
# un-encoded frame can be released before the next one is processed.
cat_features = ['income_type', 'employment_type', 'houseown_type', 'purpose']


def _one_hot(frame):
    """Return `frame` with cat_features replaced by drop-first dummy columns."""
    return pd.get_dummies(frame, prefix_sep="_", columns=cat_features, drop_first=True)


real_train = _one_hot(real_train)
real_valid = _one_hot(real_valid)
real_test = _one_hot(real_test)
real_6month = _one_hot(real_6month)

In [28]:
# Persist the four encoded splits.
split_targets = [
    (real_train, './결측치처리data/real_train.pkl'),
    (real_valid, './결측치처리data/real_valid.pkl'),
    (real_test, './결측치처리data/real_test.pkl'),
    (real_6month, './결측치처리data/real_6month.pkl'),
]
for split_frame, pickle_path in split_targets:
    split_frame.to_pickle(pickle_path)
# Drop the extra references so a later `del` of the frames really frees them.
del split_targets, split_frame, pickle_path

In [29]:
# The test and June splits are saved on disk and no longer needed in memory.
del cat_features, real_6month, real_test
gc.collect()

48

#### train과 valid에서 결측 있는 데이터와 없는 데이터로 나누기

In [30]:
# Split train and valid into rows without any missing value (*_full) and rows
# with at least one missing value (*_null).
# The row mask is computed once per frame instead of twice.
train_has_nan = real_train.isna().any(axis=1)
valid_has_nan = real_valid.isna().any(axis=1)

# *_full frames are mutated later (NaNs are injected through .iloc), so take
# explicit copies: this makes them independent of real_train / real_valid and
# avoids pandas' SettingWithCopyWarning on those assignments.
train_full = real_train[~train_has_nan].copy()  # rows without missing values
train_null = real_train[train_has_nan]          # rows with missing values
valid_full = real_valid[~valid_has_nan].copy()
valid_null = real_valid[valid_has_nan]
del train_has_nan, valid_has_nan

In [31]:
# Persist the complete-row and missing-row partitions of train and valid.
partition_targets = [
    (train_full, './결측치처리data/train_full.pkl'),
    (train_null, './결측치처리data/train_null.pkl'),
    (valid_full, './결측치처리data/valid_full.pkl'),
    (valid_null, './결측치처리data/valid_null.pkl'),
]
for part_frame, pickle_path in partition_targets:
    part_frame.to_pickle(pickle_path)
# Drop the extra references so a later `del` of the frames really frees them.
del partition_targets, part_frame, pickle_path

In [32]:
# The rows with real missing values are saved on disk; release them.
del train_null, valid_null
gc.collect()

48

#### 기준 모델 성능 측정
- train_full, valid_full을 기준으로 rf 모델 성능 측정

#### train_full과 valid_full에 임의로 결측 생성

In [33]:
# Share of missing values per column in the cleaned user_spec (built in the
# cells above). These ratios set how many NaNs are injected into the
# complete rows further down.
total_users = user_spec.shape[0]
print(user_spec.shape)
user_spec.isna().sum()

missing_counts = user_spec[
    ['gender', 'credit_score', 'existing_loan_amt', 'employment_period']
].isna().sum()
gen_nan_n = missing_counts['gender'] / total_users
credit_nan_n = missing_counts['credit_score'] / total_users
amt_nan_n = missing_counts['existing_loan_amt'] / total_users
emp_nan_n = missing_counts['employment_period'] / total_users

print("gender/age nan ratio : ", gen_nan_n)
print("credit_score nan ratio : ", credit_nan_n)
print("amt nan ratio : ", amt_nan_n)
print("employment_period nan ratio : ", emp_nan_n)

(1394012, 17)
gender/age nan ratio :  0.00929618970281461
credit_score nan ratio :  0.07533220660941226
amt nan ratio :  0.08260187143295754
employment_period nan ratio :  0.12308215424257467


In [34]:
# Positional column index (in train_full) of each feature that will receive
# artificial NaNs, keyed by a short alias.
alias_to_column = {
    "gender": 'gender',
    "credit": 'credit_score',
    "amt": 'existing_loan_amt',
    "emp": 'employment_period',
    "age": 'age',
}
ft_dic = {
    alias: np.where(train_full.columns == column)[0][0]
    for alias, column in alias_to_column.items()
}
missing_feature_idx = list(ft_dic.values())
print(missing_feature_idx)

ft_dic

[8, 10, 16, 17, 18]


{'gender': 8, 'credit': 10, 'amt': 16, 'emp': 17, 'age': 18}

In [35]:
# train_full: draw the row positions that will be set to NaN per feature.
# Sample sizes are the observed NaN ratios scaled to train_full, rounded up.
# idx_gen is shared by gender and age (they are missing together).
n_samples = train_full.shape[0]
n_gen, n_credit, n_amt, n_emp = (
    int(np.ceil(n_samples * ratio))
    for ratio in (gen_nan_n, credit_nan_n, amt_nan_n, emp_nan_n)
)
np.random.seed(1004)
# The draw order matters for reproducibility of the seeded stream.
idx_gen = np.random.choice(n_samples, size=n_gen, replace=False)
idx_credit = np.random.choice(n_samples, size=n_credit, replace=False)
idx_amt = np.random.choice(n_samples, size=n_amt, replace=False)
idx_emp = np.random.choice(n_samples, size=n_emp, replace=False)

In [36]:
# Inject the artificial missing values into train_full.
# gender and age share the same rows.
nan_plan = (
    (idx_gen, 'gender'),
    (idx_gen, 'age'),
    (idx_credit, 'credit'),
    (idx_amt, 'amt'),
    (idx_emp, 'emp'),
)
for nan_rows, alias in nan_plan:
    train_full.iloc[nan_rows, ft_dic[alias]] = np.nan

In [37]:
# Per-column count of the injected missing values in train_full.
train_full.isnull().sum()

application_id                              0
loanapply_insert_time                       0
bank_id                                     0
product_id                                  0
loan_limit                                  0
loan_rate                                   0
is_applied                                  0
user_id                                     0
gender                                  72713
insert_time                                 0
credit_score                           589232
yearly_income                               0
desired_amount                              0
personal_rehabilitation_yn                  0
personal_rehabilitation_complete_yn         0
existing_loan_cnt                           0
existing_loan_amt                      646094
employment_period                      962721
age                                     72713
month                                       0
day                                         0
income_type_EARNEDINCOME2         

In [38]:
# valid_full: draw the row positions that will be set to NaN per feature,
# using the same ratios and seed as for train_full.
n_samples = valid_full.shape[0]
n_gen, n_credit, n_amt, n_emp = (
    int(np.ceil(n_samples * ratio))
    for ratio in (gen_nan_n, credit_nan_n, amt_nan_n, emp_nan_n)
)
np.random.seed(1004)
# The draw order matters for reproducibility of the seeded stream.
idx_gen = np.random.choice(n_samples, size=n_gen, replace=False)
idx_credit = np.random.choice(n_samples, size=n_credit, replace=False)
idx_amt = np.random.choice(n_samples, size=n_amt, replace=False)
idx_emp = np.random.choice(n_samples, size=n_emp, replace=False)

In [39]:
# Inject the artificial missing values into valid_full.
# gender and age share the same rows.
nan_plan = (
    (idx_gen, 'gender'),
    (idx_gen, 'age'),
    (idx_credit, 'credit'),
    (idx_amt, 'amt'),
    (idx_emp, 'emp'),
)
for nan_rows, alias in nan_plan:
    valid_full.iloc[nan_rows, ft_dic[alias]] = np.nan

In [40]:
# Per-column count of the injected missing values in valid_full.
valid_full.isnull().sum()

application_id                            0
loanapply_insert_time                     0
bank_id                                   0
product_id                                0
loan_limit                                0
loan_rate                                 0
is_applied                                0
user_id                                   0
gender                                  143
insert_time                               0
credit_score                           1151
yearly_income                             0
desired_amount                            0
personal_rehabilitation_yn                0
personal_rehabilitation_complete_yn       0
existing_loan_cnt                         0
existing_loan_amt                      1262
employment_period                      1881
age                                     143
month                                     0
day                                       0
income_type_EARNEDINCOME2                 0
income_type_FREELANCER          

In [41]:
# Persist the complete-row frames after NaN injection (the *_full_null files).
for masked_frame, pickle_path in (
    (train_full, './결측치처리data/train_full_null.pkl'),
    (valid_full, './결측치처리data/valid_full_null.pkl'),
):
    masked_frame.to_pickle(pickle_path)
del masked_frame, pickle_path

#### impute 모델 훈련

In [None]:
# From here on, train_full and valid_full refer to the NaN-injected data
# stored in train_full_null.pkl and valid_full_null.pkl respectively.
train_full, valid_full = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './결측치처리data/train_full_null.pkl',
        './결측치처리data/valid_full_null.pkl',
    )
)

In [42]:
# Print train_full's index range, column names and dtypes.
train_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7821772 entries, 68377 to 13519556
Data columns (total 39 columns):
 #   Column                               Dtype         
---  ------                               -----         
 0   application_id                       int64         
 1   loanapply_insert_time                datetime64[ns]
 2   bank_id                              int64         
 3   product_id                           int64         
 4   loan_limit                           float64       
 5   loan_rate                            float64       
 6   is_applied                           float64       
 7   user_id                              int64         
 8   gender                               float64       
 9   insert_time                          datetime64[ns]
 10  credit_score                         float64       
 11  yearly_income                        float64       
 12  desired_amount                       float64       
 13  personal_rehabilitatio

In [43]:
# Columns left out of the imputation model. gender is in this list, so it is
# not imputed and keeps its NaNs.
drop_columns = [
    'application_id', 'loanapply_insert_time', 'bank_id', 'product_id',
    'loan_limit', 'loan_rate', 'is_applied', 'user_id', 'gender',
    'insert_time', 'month', 'day',
]
# Fit on distinct feature rows only: after dropping the columns above, many
# loan rows collapse to identical feature vectors.
train_for_impute = train_full.drop(drop_columns, axis=1).drop_duplicates()
train_for_impute_col_name = train_for_impute.columns

- linear_reg(n_nearest_features = None)

In [44]:
# Fit an IterativeImputer with a LinearRegression estimator using all
# features (n_nearest_features left at its default) and report the wall time.
start = time.time()
imp = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=200,
    tol=1e-10,
    skip_complete=True,
    random_state=1004,
    min_value=0,
)
imp.fit(train_for_impute)
elapsed_seconds = time.time() - start
print(datetime.timedelta(seconds=elapsed_seconds))

0:04:15.931631


In [45]:
# Impute train and valid with the fitted imputer (returns plain arrays).
imp_train = imp.transform(train_full.drop(drop_columns, axis=1))
imp_valid = imp.transform(valid_full.drop(drop_columns, axis=1))

In [46]:
# Wrap the imputed arrays back into DataFrames with the original row index
# and the feature column names.
imp_train = pd.DataFrame(imp_train, index=train_full.index, columns=train_for_impute_col_name)
imp_valid = pd.DataFrame(imp_valid, index=valid_full.index, columns=train_for_impute_col_name)

In [47]:
# Re-attach the columns that were excluded from imputation.
# axis=1 joins them column-wise on the shared row index. The default axis=0
# would stack them as extra rows: twice the row count, with each half NaN in
# the other half's columns.
imp_train = pd.concat([imp_train, train_full.loc[:, drop_columns]], axis=1)
imp_valid = pd.concat([imp_valid, valid_full.loc[:, drop_columns]], axis=1)

In [48]:
# Save the LinearRegression (all features) imputation results.
for imputed_frame, pickle_path in (
    (imp_train, './결측치처리data/train_impute_lr_n.pkl'),
    (imp_valid, './결측치처리data/valid_impute_lr_n.pkl'),
):
    imputed_frame.to_pickle(pickle_path)
del imputed_frame, pickle_path

In [49]:
# Release the imputer and its outputs before fitting the next variant.
del imp, imp_train, imp_valid
gc.collect()

80

- linear_reg(n_nearest_features = 10)

In [50]:
# Fit an IterativeImputer with a LinearRegression estimator restricted to
# n_nearest_features=10 and report the wall time.
start = time.time()
imp = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=200,
    tol=1e-10,
    n_nearest_features=10,
    skip_complete=True,
    random_state=1004,
    min_value=0,
)
imp.fit(train_for_impute)
elapsed_seconds = time.time() - start
print(datetime.timedelta(seconds=elapsed_seconds))



0:27:28.679890


In [51]:
# Impute train and valid with the fitted imputer (returns plain arrays).
imp_train = imp.transform(train_full.drop(drop_columns, axis=1))
imp_valid = imp.transform(valid_full.drop(drop_columns, axis=1))

In [52]:
# Wrap the imputed arrays back into DataFrames with the original row index
# and the feature column names.
imp_train = pd.DataFrame(imp_train, index=train_full.index, columns=train_for_impute_col_name)
imp_valid = pd.DataFrame(imp_valid, index=valid_full.index, columns=train_for_impute_col_name)

In [53]:
# Re-attach the columns that were excluded from imputation.
# axis=1 joins them column-wise on the shared row index. The default axis=0
# would stack them as extra rows: twice the row count, with each half NaN in
# the other half's columns.
imp_train = pd.concat([imp_train, train_full.loc[:, drop_columns]], axis=1)
imp_valid = pd.concat([imp_valid, valid_full.loc[:, drop_columns]], axis=1)

In [54]:
# Save the LinearRegression (n_nearest_features=10) imputation results.
for imputed_frame, pickle_path in (
    (imp_train, './결측치처리data/train_impute_lr_y.pkl'),
    (imp_valid, './결측치처리data/valid_impute_lr_y.pkl'),
):
    imputed_frame.to_pickle(pickle_path)
del imputed_frame, pickle_path

In [55]:
# Release the imputer and its outputs before fitting the next variant.
del imp, imp_train, imp_valid
gc.collect()

32

- extratree (n_nearest_features = None)

In [None]:
# Fit an IterativeImputer with a 10-tree ExtraTreesRegressor estimator using
# all features and report the wall time.
start = time.time()
et = ExtraTreesRegressor(n_estimators=10, random_state=1004)
imp = IterativeImputer(
    estimator=et,
    max_iter=200,
    tol=1e-10,
    skip_complete=True,
    random_state=1004,
    min_value=0,
)
imp.fit(train_for_impute)
elapsed_seconds = time.time() - start
print(datetime.timedelta(seconds=elapsed_seconds))

In [None]:
# Impute train and valid with the fitted imputer (returns plain arrays).
imp_train = imp.transform(train_full.drop(drop_columns, axis=1))
imp_valid = imp.transform(valid_full.drop(drop_columns, axis=1))

In [None]:
# Wrap the imputed arrays back into DataFrames with the original row index
# and the feature column names.
imp_train = pd.DataFrame(imp_train, index=train_full.index, columns=train_for_impute_col_name)
imp_valid = pd.DataFrame(imp_valid, index=valid_full.index, columns=train_for_impute_col_name)

In [None]:
# Re-attach the columns that were excluded from imputation.
# axis=1 joins them column-wise on the shared row index. The default axis=0
# would stack them as extra rows: twice the row count, with each half NaN in
# the other half's columns.
imp_train = pd.concat([imp_train, train_full.loc[:, drop_columns]], axis=1)
imp_valid = pd.concat([imp_valid, valid_full.loc[:, drop_columns]], axis=1)

In [None]:
# Save the ExtraTrees imputation results.
for imputed_frame, pickle_path in (
    (imp_train, './결측치처리data/train_impute_extra.pkl'),
    (imp_valid, './결측치처리data/valid_impute_extra.pkl'),
):
    imputed_frame.to_pickle(pickle_path)
del imputed_frame, pickle_path

In [None]:
# Release the estimator, the imputer and its outputs.
del et, imp, imp_train, imp_valid
gc.collect()