In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('../dataset/housing/train.csv', encoding = 'utf-8')
test = pd.read_csv('../dataset/housing/test.csv', encoding = 'utf-8')
submission = pd.read_csv('../dataset/housing/sample_submission.csv', encoding = 'utf-8')

In [3]:
train = train.iloc[:, 1:]
test = test.iloc[:, 1:]

In [4]:
train = train[train['Garage Yr Blt'] < 2050]

## Target Encoding

In [5]:
# Column median Encoding
cat_cols = ['Exter Qual', 'Kitchen Qual', 'Bsmt Qual']

In [6]:
for c in cat_cols :
    ord_df = train.groupby(c).target.median().reset_index(name = f'ord_{c}')
    train = pd.merge(train, ord_df, how = 'left')
    test = pd.merge(test, ord_df, how = 'left')

In [7]:
# 기존 column 삭제
train.drop(cat_cols, axis = 1, inplace = True)
test.drop(cat_cols, axis = 1, inplace = True)

In [8]:
# 'Exter Qual', 'Bsmt Qual', 'Kitchen Qual' 변수는 Overall Qual로 보면 되기 때문에 삭제
train=train.drop(['Exter Qual', 'Bsmt Qual', 'Kitchen Qual'], axis=1, inplace=False)
test=test.drop(['Exter Qual', 'Bsmt Qual', 'Kitchen Qual'], axis=1, inplace=False)

KeyError: "['Exter Qual' 'Bsmt Qual' 'Kitchen Qual'] not found in axis"

In [9]:
print(f'로그 변환 전 타겟 왜도 = {train.target.skew()} / 로그 변환 후 타겟 왜도 = {np.log1p(train.target).skew()}')

로그 변환 전 타겟 왜도 = 1.7204669822790544 / 로그 변환 후 타겟 왜도 = 0.08225196452806845


In [10]:
X = train.drop('target', axis = 1)
y = np.log1p(train.target)

target = test[X.columns]

In [11]:
target.fillna(target.mean(), inplace = True)

## Modeling

In [12]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from catboost import CatBoostRegressor, Pool
from ngboost import NGBRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

In [13]:
def NMAE(true, pred) -> float:
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score

In [14]:
nmae_score = make_scorer(NMAE, greater_is_better=False)

In [15]:
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

## RandomForest

In [16]:
rf_pred = np.zeros(target.shape[0])
rf_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    rf = RandomForestRegressor(random_state = 42, criterion = 'mae')
    rf.fit(tr_x, tr_y)
    
    val_pred = np.expm1(rf.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    rf_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    fold_pred = rf.predict(target) / 10
    rf_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(rf_val)} & std = {np.std(rf_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.0936869815642399

2 FOLD Training.....
2 FOLD NMAE = 0.11372083162720076

3 FOLD Training.....
3 FOLD NMAE = 0.09443308083710712

4 FOLD Training.....
4 FOLD NMAE = 0.10951429437846574

5 FOLD Training.....
5 FOLD NMAE = 0.08987572925170044

6 FOLD Training.....
6 FOLD NMAE = 0.09997599529487447

7 FOLD Training.....
7 FOLD NMAE = 0.08966072470724444

8 FOLD Training.....
8 FOLD NMAE = 0.10569501920345054

9 FOLD Training.....
9 FOLD NMAE = 0.09540092022299637

10 FOLD Training.....
10 FOLD NMAE = 0.10049471546417325

10FOLD Mean of NMAE = 0.0992458292551453 & std = 0.007792121941923879


## GradientBoosting

In [17]:
gbr_pred = np.zeros(target.shape[0])
gbr_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    gbr = GradientBoostingRegressor(random_state = 42, max_depth = 4, learning_rate = 0.05, n_estimators = 1000)
    gbr.fit(tr_x, tr_y)
    
    val_pred = np.expm1(gbr.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    gbr_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    fold_pred = gbr.predict(target) / 10
    gbr_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(gbr_val)} & std = {np.std(gbr_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.09367372438745086

2 FOLD Training.....
2 FOLD NMAE = 0.10452949253790086

3 FOLD Training.....
3 FOLD NMAE = 0.0822404359570606

4 FOLD Training.....
4 FOLD NMAE = 0.10848179566764835

5 FOLD Training.....
5 FOLD NMAE = 0.09544021807976999

6 FOLD Training.....
6 FOLD NMAE = 0.09940533669629992

7 FOLD Training.....
7 FOLD NMAE = 0.08873019327524258

8 FOLD Training.....
8 FOLD NMAE = 0.10423773655894322

9 FOLD Training.....
9 FOLD NMAE = 0.10540905806087426

10 FOLD Training.....
10 FOLD NMAE = 0.0939495270409796

10FOLD Mean of NMAE = 0.09760975182621703 & std = 0.007891302159289498


## CatBoost

In [18]:
cb_pred = np.zeros(target.shape[0])
cb_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    tr_data = Pool(data = tr_x, label = tr_y)
    val_data = Pool(data = val_x, label = val_y)
    
    cb = CatBoostRegressor(depth = 4, random_state = 42, loss_function = 'MAE', n_estimators = 3000, learning_rate = 0.03, verbose = 0)
    cb.fit(tr_data, eval_set = val_data, early_stopping_rounds = 750, verbose = 1000)
    
    val_pred = np.expm1(cb.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    cb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    target_data = Pool(data = target, label = None)
    fold_pred = cb.predict(target) / 10
    cb_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(cb_val)} & std = {np.std(cb_val)}')

1 FOLD Training.....
0:	learn: 0.2928192	test: 189506.6733447	best: 189506.6733447 (0)	total: 139ms	remaining: 6m 56s
Stopped by overfitting detector  (750 iterations wait)

bestTest = 189506.5933
bestIteration = 182

Shrink model to first 183 iterations.
1 FOLD NMAE = 0.09777341102660744

2 FOLD Training.....
0:	learn: 0.2929645	test: 186991.7521093	best: 186991.7521093 (0)	total: 1.18ms	remaining: 3.55s
1000:	learn: 0.0677636	test: 186991.7205303	best: 186991.7187119 (578)	total: 735ms	remaining: 1.47s
Stopped by overfitting detector  (750 iterations wait)

bestTest = 186991.7187
bestIteration = 578

Shrink model to first 579 iterations.
2 FOLD NMAE = 0.10260214892787221

3 FOLD Training.....
0:	learn: 0.2919462	test: 176765.4793246	best: 176765.4793246 (0)	total: 1.14ms	remaining: 3.42s
Stopped by overfitting detector  (750 iterations wait)

bestTest = 176765.4658
bestIteration = 66

Shrink model to first 67 iterations.
3 FOLD NMAE = 0.12234936741494892

4 FOLD Training.....
0:	lear

## NGBoost

In [19]:
ngb_pred = np.zeros(target.shape[0])
ngb_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    ngb = NGBRegressor(random_state = 42, n_estimators = 1000, verbose = 0, learning_rate = 0.03)
    ngb.fit(tr_x, tr_y, val_x, val_y, early_stopping_rounds = 300)
    
    val_pred = np.expm1(ngb.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    ngb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    target_data = Pool(data = target, label = None)
    fold_pred = ngb.predict(target) / 10
    ngb_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(ngb_val)} & std = {np.std(ngb_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.09141854198095778

2 FOLD Training.....
2 FOLD NMAE = 0.10453644003185501

3 FOLD Training.....
3 FOLD NMAE = 0.09012342916688346

4 FOLD Training.....
4 FOLD NMAE = 0.10385877081603535

5 FOLD Training.....
5 FOLD NMAE = 0.08823461666867576

6 FOLD Training.....
6 FOLD NMAE = 0.0992715689789523

7 FOLD Training.....
7 FOLD NMAE = 0.09022827054658515

8 FOLD Training.....
8 FOLD NMAE = 0.09899957912118629

9 FOLD Training.....
9 FOLD NMAE = 0.10036670901025278

10 FOLD Training.....
10 FOLD NMAE = 0.09584672894941221

10FOLD Mean of NMAE = 0.09628846552707962 & std = 0.005672117092757084


## XGBoost

In [20]:
# 초기화
xgb_pred = np.zeros(target.shape[0])
xgb_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    # xgb 학습
    xgb = XGBRegressor(random_state = 42, criterion = 'mae')
    xgb.fit(tr_x, tr_y)
    
    val_pred = np.expm1(xgb.predict(val_x))
    val_nmae = NMAE(val_y, val_pred) # NMAE
    xgb_val.append(val_nmae) # 초기화한 리스트에 NMAE를 삽입.
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    fold_pred = xgb.predict(target) / 10
    xgb_pred += fold_pred

print(f'10FOLD Mean of NMAE = {np.mean(xgb_val)} & std = {np.std(xgb_val)}')

1 FOLD Training.....
Parameters: { "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


1 FOLD NMAE = 0.10174294309067064

2 FOLD Training.....
Parameters: { "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


2 FOLD NMAE = 0.11585616326848466

3 FOLD Training.....
Parameters: { "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but 

## LGBM

In [21]:
# 초기화
lgbm_pred = np.zeros(target.shape[0])
lgbm_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    # xgb 학습
    lgbm = LGBMRegressor(random_state = 42, criterion = 'mae')
    lgbm.fit(tr_x, tr_y)
    
    val_pred = np.expm1(lgbm.predict(val_x))
    val_nmae = NMAE(val_y, val_pred) # NMAE
    lgbm_val.append(val_nmae) # 초기화한 리스트에 NMAE를 삽입.
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    fold_pred = lgbm.predict(target) / 10
    lgbm_pred += fold_pred

print(f'10FOLD Mean of NMAE = {np.mean(lgbm_val)} & std = {np.std(lgbm_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.09201157240469633

2 FOLD Training.....
2 FOLD NMAE = 0.10895172392077022

3 FOLD Training.....
3 FOLD NMAE = 0.09168478976167703

4 FOLD Training.....
4 FOLD NMAE = 0.11362699748623657

5 FOLD Training.....
5 FOLD NMAE = 0.09112515275480314

6 FOLD Training.....
6 FOLD NMAE = 0.10007852232252476

7 FOLD Training.....
7 FOLD NMAE = 0.09450594667743578

8 FOLD Training.....
8 FOLD NMAE = 0.10768739403591426

9 FOLD Training.....
9 FOLD NMAE = 0.10647087418496351

10 FOLD Training.....
10 FOLD NMAE = 0.10003430042614071

10FOLD Mean of NMAE = 0.10061772739751622 & std = 0.007777489084609819


## Ensemble

In [22]:
(rf_pred + gbr_pred + cb_pred + ngb_pred + xgb_pred + lgbm_pred) / 6

array([12.71889824, 11.75681089, 12.07424625, ..., 11.24701861,
       12.19540471, 11.86582694])

In [23]:
submission['target'] = np.expm1((rf_pred + gbr_pred + cb_pred + ngb_pred + xgb_pred + lgbm_pred) / 6)

In [24]:
submission.to_csv('5th.csv', index = False)